23c689aa5b
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
497 lines
14 KiB
Go
497 lines
14 KiB
Go
package probes
|
|
|
|
import (
|
|
"bufio"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// FirmwareSnapshot describes one firmware-bearing component in the shape
// the agent POSTs alongside the Firmware stage result. It mirrors
// internal/store.FirmwareSnapshot without importing it; the /result
// handler converts to the store type and persists.
//
// A single run produces many snapshots: one per BIOS, BMC, NIC port,
// HBA, microcode, and NVMe controller.
type FirmwareSnapshot struct {
	// Component is the firmware class: bios|bmc|nic|hba|microcode|nvme_fw.
	Component string `json:"component"`
	// Identifier distinguishes siblings of the same component
	// (e.g. "eth0" / "eth1").
	Identifier string `json:"identifier"`
	// Version is the canonical string to diff.
	Version string `json:"version"`
	// Vendor is probe-specific: the BIOS vendor, BMC manufacturer name,
	// NIC driver, NVMe model, HBA device description, or CPU vendor_id.
	Vendor string `json:"vendor,omitempty"`
	// Raw carries the key/value detail the probe parsed, when it kept any.
	Raw map[string]string `json:"raw,omitempty"`
}
|
|
|
|
// Firmware runs every sub-probe in sequence. Each one is bounded with
|
|
// a short timeout so a hung dmidecode / ipmitool / nvme tool can't
|
|
// freeze the stage — the probe is best-effort, missing tools produce
|
|
// empty output rather than an error. Returns the aggregated slice
|
|
// along with a list of probe-level warnings (surfaced in the stage
|
|
// summary so operators see which subsystem couldn't be read).
|
|
func Firmware(ctx context.Context) ([]FirmwareSnapshot, []string) {
|
|
var out []FirmwareSnapshot
|
|
var warnings []string
|
|
|
|
if snap, warn := probeBIOS(ctx); snap != nil {
|
|
out = append(out, *snap)
|
|
} else if warn != "" {
|
|
warnings = append(warnings, warn)
|
|
}
|
|
if snap, warn := probeBMC(ctx); snap != nil {
|
|
out = append(out, *snap)
|
|
} else if warn != "" {
|
|
warnings = append(warnings, warn)
|
|
}
|
|
out = append(out, probeNICFirmware(ctx)...)
|
|
out = append(out, probeNVMeFirmware(ctx)...)
|
|
out = append(out, probeHBAFirmware(ctx)...)
|
|
if snap := probeMicrocode(); snap != nil {
|
|
out = append(out, *snap)
|
|
}
|
|
|
|
return out, warnings
|
|
}
|
|
|
|
// runCmd runs a short-lived command and returns its combined
// stdout+stderr as a string, together with the command's error (if any).
// The output is returned even when the command fails so callers can
// include it in diagnostics.
//
// Every call is capped at 5 s on top of whatever deadline ctx already
// carries. The cap is deliberately aggressive: firmware probes read
// device registers and can block on a wedged controller, and the stage
// should report "nothing readable" rather than hang the pipeline.
func runCmd(ctx context.Context, name string, args ...string) (string, error) {
	bounded, stop := context.WithTimeout(ctx, 5*time.Second)
	defer stop()

	output, err := exec.CommandContext(bounded, name, args...).CombinedOutput()
	return string(output), err
}
|
|
|
|
// ----- BIOS --------------------------------------------------------------
|
|
|
|
// probeBIOS invokes dmidecode -t bios and parses the vendor + version
|
|
// lines. dmidecode must run as root; we let it fail gracefully when the
|
|
// agent is mis-deployed without privileges.
|
|
func probeBIOS(ctx context.Context) (*FirmwareSnapshot, string) {
|
|
if _, err := exec.LookPath("dmidecode"); err != nil {
|
|
return nil, "bios: dmidecode not installed"
|
|
}
|
|
out, err := runCmd(ctx, "dmidecode", "-t", "bios")
|
|
if err != nil {
|
|
return nil, fmt.Sprintf("bios: dmidecode failed: %v", trimErr(err, out))
|
|
}
|
|
snap := parseDmidecodeBIOS(strings.NewReader(out))
|
|
if snap == nil {
|
|
return nil, "bios: dmidecode produced no usable output"
|
|
}
|
|
return snap, ""
|
|
}
|
|
|
|
// parseDmidecodeBIOS consumes `dmidecode -t bios` output and pulls
|
|
// Vendor / Version / Release Date. Kept as an io.Reader for unit tests.
|
|
func parseDmidecodeBIOS(r io.Reader) *FirmwareSnapshot {
|
|
kv := parseDmidecodeSection(r, "BIOS Information")
|
|
if kv == nil {
|
|
return nil
|
|
}
|
|
snap := &FirmwareSnapshot{
|
|
Component: "bios",
|
|
Identifier: "system",
|
|
Version: firstNonEmpty(kv["Version"], kv["Firmware Revision"]),
|
|
Vendor: kv["Vendor"],
|
|
Raw: kv,
|
|
}
|
|
if snap.Version == "" {
|
|
return nil
|
|
}
|
|
return snap
|
|
}
|
|
|
|
// parseDmidecodeSection returns the key/value fields of the first
// dmidecode block whose title line equals title, or nil when no such
// block exists. dmidecode blocks look like:
//
//	Handle 0x0000, ...
//	BIOS Information
//		Vendor: American Megatrends
//		Version: 3.0
//		...
//
// A block ends at the next line starting with "Handle " (or at end of
// input); blank lines inside it are ignored. Only "key: value" lines
// with a non-empty value are recorded, so a heading such as
// "Characteristics:" and the colon-free bullet lines under it are
// dropped rather than swallowed into the map.
func parseDmidecodeSection(r io.Reader, title string) map[string]string {
	scanner := bufio.NewScanner(r)
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	// fields is non-nil exactly while we are inside the wanted block.
	var fields map[string]string
	for scanner.Scan() {
		raw := scanner.Text()
		switch {
		case strings.HasPrefix(raw, "Handle "):
			// A new handle closes the wanted block if we were in it.
			if fields != nil {
				return fields
			}
		case fields == nil:
			if strings.TrimSpace(raw) == title {
				fields = map[string]string{}
			}
		default:
			key, val, found := strings.Cut(strings.TrimSpace(raw), ":")
			if !found {
				continue
			}
			if val = strings.TrimSpace(val); val != "" {
				fields[strings.TrimSpace(key)] = val
			}
		}
	}
	return fields
}
|
|
|
|
// ----- BMC / IPMI --------------------------------------------------------
|
|
|
|
// probeBMC walks `ipmitool mc info`. Home-lab hosts often lack a BMC —
|
|
// missing binary or a non-zero exit returns a warning without failing
|
|
// the stage. We capture Firmware Revision + Manufacturer as the version.
|
|
func probeBMC(ctx context.Context) (*FirmwareSnapshot, string) {
|
|
if _, err := exec.LookPath("ipmitool"); err != nil {
|
|
return nil, "bmc: ipmitool not installed"
|
|
}
|
|
out, err := runCmd(ctx, "ipmitool", "mc", "info")
|
|
if err != nil {
|
|
return nil, fmt.Sprintf("bmc: ipmitool mc info failed: %v", trimErr(err, out))
|
|
}
|
|
snap := parseIpmitoolMCInfo(strings.NewReader(out))
|
|
if snap == nil {
|
|
return nil, "bmc: ipmitool output not parseable"
|
|
}
|
|
return snap, ""
|
|
}
|
|
|
|
// parseIpmitoolMCInfo pulls "Firmware Revision" + "Manufacturer Name"
|
|
// from the textual output. Format is indented key : value lines.
|
|
func parseIpmitoolMCInfo(r io.Reader) *FirmwareSnapshot {
|
|
sc := bufio.NewScanner(r)
|
|
kv := map[string]string{}
|
|
for sc.Scan() {
|
|
line := strings.TrimSpace(sc.Text())
|
|
if k, v, ok := strings.Cut(line, ":"); ok {
|
|
kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
|
|
}
|
|
}
|
|
version := firstNonEmpty(kv["Firmware Revision"], kv["Aux Firmware Rev Info"])
|
|
if version == "" {
|
|
return nil
|
|
}
|
|
return &FirmwareSnapshot{
|
|
Component: "bmc",
|
|
Identifier: "bmc0",
|
|
Version: version,
|
|
Vendor: kv["Manufacturer Name"],
|
|
Raw: kv,
|
|
}
|
|
}
|
|
|
|
// ----- NIC firmware ------------------------------------------------------
|
|
|
|
// probeNICFirmware enumerates /sys/class/net/*/device and calls
|
|
// `ethtool -i <iface>` on each real NIC (skip lo, bridges, virtuals).
|
|
// One snapshot per interface so a mismatched port lights up in the diff
|
|
// without silencing sibling ports.
|
|
func probeNICFirmware(ctx context.Context) []FirmwareSnapshot {
|
|
if _, err := exec.LookPath("ethtool"); err != nil {
|
|
return nil
|
|
}
|
|
ifaces, err := os.ReadDir("/sys/class/net")
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
var out []FirmwareSnapshot
|
|
for _, entry := range ifaces {
|
|
name := entry.Name()
|
|
if !isRealNIC(name) {
|
|
continue
|
|
}
|
|
raw, err := runCmd(ctx, "ethtool", "-i", name)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
snap := parseEthtoolI(strings.NewReader(raw), name)
|
|
if snap != nil {
|
|
out = append(out, *snap)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// parseEthtoolI extracts driver/firmware-version from `ethtool -i`
|
|
// output. Lines are "key: value" with a consistent prefix order.
|
|
func parseEthtoolI(r io.Reader, iface string) *FirmwareSnapshot {
|
|
sc := bufio.NewScanner(r)
|
|
kv := map[string]string{}
|
|
for sc.Scan() {
|
|
line := sc.Text()
|
|
if k, v, ok := strings.Cut(line, ":"); ok {
|
|
kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
|
|
}
|
|
}
|
|
if kv["firmware-version"] == "" && kv["driver"] == "" {
|
|
return nil
|
|
}
|
|
return &FirmwareSnapshot{
|
|
Component: "nic",
|
|
Identifier: iface,
|
|
Version: kv["firmware-version"],
|
|
Vendor: kv["driver"],
|
|
Raw: kv,
|
|
}
|
|
}
|
|
|
|
// isRealNIC reports whether name looks like a physical network
// interface. It rejects the empty name, loopback, and names starting
// with a known virtual prefix (docker, br-, veth, virbr, tun, tap,
// bond), then requires /sys/class/net/<name>/device to exist: real PCI
// NICs have that link, pure virtuals (dummy0, wg*) don't.
func isRealNIC(name string) bool {
	if name == "" || name == "lo" {
		return false
	}

	virtualPrefixes := [...]string{"docker", "br-", "veth", "virbr", "tun", "tap", "bond"}
	for i := range virtualPrefixes {
		if strings.HasPrefix(name, virtualPrefixes[i]) {
			return false
		}
	}

	_, statErr := os.Stat(filepath.Join("/sys/class/net", name, "device"))
	return statErr == nil
}
|
|
|
|
// ----- NVMe --------------------------------------------------------------
|
|
|
|
// probeNVMeFirmware reads /sys/class/nvme/nvmeN/firmware_rev for every
|
|
// controller. Falls back to `nvme id-ctrl` if the sysfs file is missing
|
|
// (older kernels). Identifier is the controller path so a run with two
|
|
// drives produces two snapshots.
|
|
func probeNVMeFirmware(ctx context.Context) []FirmwareSnapshot {
|
|
entries, err := os.ReadDir("/sys/class/nvme")
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
var out []FirmwareSnapshot
|
|
for _, e := range entries {
|
|
ctrl := e.Name()
|
|
rev := strings.TrimSpace(readFile(filepath.Join("/sys/class/nvme", ctrl, "firmware_rev")))
|
|
model := strings.TrimSpace(readFile(filepath.Join("/sys/class/nvme", ctrl, "model")))
|
|
if rev == "" {
|
|
// Fallback: nvme id-ctrl -H /dev/<ctrl>. Available on hosts
|
|
// where sysfs doesn't export firmware_rev.
|
|
if _, err := exec.LookPath("nvme"); err == nil {
|
|
raw, _ := runCmd(ctx, "nvme", "id-ctrl", "/dev/"+ctrl)
|
|
rev = parseNVMeIDCtrl(strings.NewReader(raw), "fr")
|
|
if model == "" {
|
|
model = parseNVMeIDCtrl(strings.NewReader(raw), "mn")
|
|
}
|
|
}
|
|
}
|
|
if rev == "" {
|
|
continue
|
|
}
|
|
out = append(out, FirmwareSnapshot{
|
|
Component: "nvme_fw",
|
|
Identifier: ctrl,
|
|
Version: rev,
|
|
Vendor: model,
|
|
Raw: map[string]string{"model": model, "firmware_rev": rev},
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// parseNVMeIDCtrl returns the value of one field from `nvme id-ctrl`
// output, or "" when the field is absent. Lines look like
// "fr        : FW1234" or "mn        : Samsung SSD 980 PRO": leading
// whitespace varies and values may contain spaces. A line matches when,
// after trimming, it starts with key followed by a space; the value is
// everything after the first colon, trimmed. The first matching line
// that has a colon wins.
func parseNVMeIDCtrl(r io.Reader, key string) string {
	want := key + " "
	for scanner := bufio.NewScanner(r); scanner.Scan(); {
		row := strings.TrimSpace(scanner.Text())
		if strings.HasPrefix(row, want) {
			if _, val, found := strings.Cut(row, ":"); found {
				return strings.TrimSpace(val)
			}
		}
	}
	return ""
}
|
|
|
|
// ----- HBA ---------------------------------------------------------------
|
|
|
|
var lspciClassHBA = regexp.MustCompile(`(?i)(serial attached scsi|sas controller|raid bus controller)`)
|
|
|
|
// probeHBAFirmware looks for SAS/RAID HBAs via `lspci -Dvvnn`. The
|
|
// firmware string is typically exposed as "Product Name" +
|
|
// "Capabilities" but in practice the LSI/Broadcom driver writes a
|
|
// "revision" on the device line. We capture what's printed and rely on
|
|
// SpecValidate to diff — this keeps us off tool-specific CLIs (storcli,
|
|
// mpt-status) that aren't always installed.
|
|
func probeHBAFirmware(ctx context.Context) []FirmwareSnapshot {
|
|
if _, err := exec.LookPath("lspci"); err != nil {
|
|
return nil
|
|
}
|
|
out, err := runCmd(ctx, "lspci", "-Dvvnn")
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
return parseLspciHBA(strings.NewReader(out))
|
|
}
|
|
|
|
// parseLspciHBA walks `lspci -Dvvnn` stanzas and picks SAS/RAID
|
|
// controllers. One snapshot per device; identifier is the PCI address.
|
|
// Version is the device line's revision (rev NN) or the Kernel modules
|
|
// string when no rev is printed.
|
|
func parseLspciHBA(r io.Reader) []FirmwareSnapshot {
|
|
sc := bufio.NewScanner(r)
|
|
sc.Buffer(make([]byte, 0, 64*1024), 1024*1024)
|
|
var out []FirmwareSnapshot
|
|
var cur *FirmwareSnapshot
|
|
revRe := regexp.MustCompile(`\(rev\s+([0-9a-fA-F]+)\)`)
|
|
flush := func() {
|
|
if cur != nil && cur.Version != "" {
|
|
out = append(out, *cur)
|
|
}
|
|
cur = nil
|
|
}
|
|
for sc.Scan() {
|
|
line := sc.Text()
|
|
if !strings.HasPrefix(line, "\t") && strings.Contains(line, " ") {
|
|
// New device line.
|
|
flush()
|
|
if lspciClassHBA.MatchString(line) {
|
|
addr, rest, _ := strings.Cut(line, " ")
|
|
cur = &FirmwareSnapshot{
|
|
Component: "hba",
|
|
Identifier: addr,
|
|
Vendor: strings.TrimSpace(rest),
|
|
Raw: map[string]string{"device_line": line},
|
|
}
|
|
if m := revRe.FindStringSubmatch(line); len(m) == 2 {
|
|
cur.Version = "rev " + m[1]
|
|
}
|
|
}
|
|
continue
|
|
}
|
|
if cur == nil {
|
|
continue
|
|
}
|
|
trim := strings.TrimSpace(line)
|
|
if strings.HasPrefix(trim, "Kernel modules:") {
|
|
cur.Raw["kernel_modules"] = strings.TrimPrefix(trim, "Kernel modules:")
|
|
}
|
|
if strings.HasPrefix(trim, "Kernel driver in use:") {
|
|
cur.Raw["kernel_driver"] = strings.TrimPrefix(trim, "Kernel driver in use:")
|
|
}
|
|
}
|
|
flush()
|
|
return out
|
|
}
|
|
|
|
// ----- Microcode ---------------------------------------------------------
|
|
|
|
// probeMicrocode reads /proc/cpuinfo for the "microcode" line. All
|
|
// cores report the same value post-boot, so one snapshot is enough.
|
|
func probeMicrocode() *FirmwareSnapshot {
|
|
f, err := os.Open("/proc/cpuinfo")
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
defer func() { _ = f.Close() }()
|
|
snap := parseMicrocode(f)
|
|
return snap
|
|
}
|
|
|
|
func parseMicrocode(r io.Reader) *FirmwareSnapshot {
|
|
sc := bufio.NewScanner(r)
|
|
version := ""
|
|
vendor := ""
|
|
for sc.Scan() {
|
|
line := sc.Text()
|
|
k, v, ok := strings.Cut(line, ":")
|
|
if !ok {
|
|
continue
|
|
}
|
|
key := strings.TrimSpace(k)
|
|
val := strings.TrimSpace(v)
|
|
switch key {
|
|
case "microcode":
|
|
if version == "" {
|
|
version = val
|
|
}
|
|
case "vendor_id":
|
|
if vendor == "" {
|
|
vendor = val
|
|
}
|
|
}
|
|
if version != "" && vendor != "" {
|
|
break
|
|
}
|
|
}
|
|
if version == "" {
|
|
return nil
|
|
}
|
|
return &FirmwareSnapshot{
|
|
Component: "microcode",
|
|
Identifier: "cpu",
|
|
Version: version,
|
|
Vendor: vendor,
|
|
}
|
|
}
|
|
|
|
// ----- helpers -----------------------------------------------------------
|
|
|
|
// firstNonEmpty returns the first argument that is not blank (empty or
// whitespace-only), exactly as passed in — it is not trimmed. Returns
// "" when every argument is blank or none are given.
func firstNonEmpty(ss ...string) string {
	for i := 0; i < len(ss); i++ {
		if strings.TrimSpace(ss[i]) == "" {
			continue
		}
		return ss[i]
	}
	return ""
}
|
|
|
|
// readFile returns the contents of the file at p as a string, or ""
// when the file cannot be read. Callers treat a missing sysfs attribute
// and an empty one the same way.
func readFile(p string) string {
	if data, err := os.ReadFile(p); err == nil {
		return string(data)
	}
	return ""
}
|
|
|
|
// trimErr renders err together with the first line of the command's
// combined output, as "<err> (<first line>)", so a warning carries
// enough diagnostic context without dumping a screenful of
// dmidecode/ipmitool noise. When the output is blank it returns just
// err.Error(). err must be non-nil.
func trimErr(err error, out string) string {
	head, _, _ := strings.Cut(strings.TrimSpace(out), "\n")
	if head != "" {
		return fmt.Sprintf("%v (%s)", err, head)
	}
	return err.Error()
}
|