Files
josh 8acef92a60
CI / Lint + build + test (push) Successful in 1m35s
Release / release (push) Successful in 9m34s
feat(inventory): deep hardware capture + per-probe substeps + verbose logs
Extend Inventory stage from a one-liner summary to a per-probe substep
emitter with ~20-30 narrative log lines per run.

- spec: per-DIMM memory (slot/size/speed/manufacturer/part_number),
  richer CPU (vendor/stepping/physical_cores/flags), disk
  model/transport/rotational, NIC driver/pci_addr, GPU vram/pci/driver,
  new System/Baseboard/PSU/OS top-level sections. All fields omitempty
  so existing expected-spec YAML and artifacts stay compatible.
- spec.Diff: new diffDIMMs/diffSystem/diffBaseboard/diffPSU/diffOS
  helpers; extended diffDisks/diffNICs/diffGPUs for new fields. GPU
  diff gains PCIAddr-pinned matching alongside count-by-model.
- agent/probes/inventory: CPU (/proc/cpuinfo extended), Memory
  (dmidecode -t 17 multi-block), Disks (+model/transport/rotational),
  NICs (+driver/pci from sysfs), GPUs (VRAM from lspci -vv),
  new System/Baseboard (dmidecode -t system/baseboard), PSU
  (dmidecode -t 39), OS (/proc/sys/kernel/osrelease + /etc/os-release).
  All probes accept a Logger and emit per-finding info/warn lines.
- agent/probes/firmware: parseDmidecodeAllSections for multi-block
  fixtures (memory / PSU).
- agent/runner: Inventory case becomes 9 substep rows (CPU / Memory /
  Disks / NICs / GPUs / System / Baseboard / PSU / OS) with per-probe
  start/complete timestamps.
- report: new Inventory HTML section between Stages and Firmware;
  resolveReporting loads the inventory.json artifact.
- agent/tests/fakes/dmidecode: dispatches on -t flag to serve bios /
  memory / system / baseboard / 39 fixtures for unit tests.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-19 22:21:17 -04:00

552 lines
16 KiB
Go

package probes
import (
"bufio"
"context"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"time"
)
// FirmwareSnapshot is the on-wire shape the agent POSTs alongside the
// Firmware stage result. Mirrors internal/store.FirmwareSnapshot without
// the import — the /result handler converts to the store type and
// persists. One run produces many snapshots (one per BIOS / BMC / NIC
// port / HBA / microcode / NVMe); identifier distinguishes siblings
// (e.g. "eth0" / "eth1"), version is the canonical string to diff.
type FirmwareSnapshot struct {
	// Component is the firmware class this snapshot belongs to.
	Component string `json:"component"` // bios|bmc|nic|hba|microcode|nvme_fw
	// Identifier tells siblings of the same component apart. The probes in
	// this file use "system", "bmc0", "cpu", the interface name, the NVMe
	// controller name, or the PCI address.
	Identifier string `json:"identifier"`
	// Version is the firmware version string reported by the probe.
	Version string `json:"version"`
	// Vendor is a probe-specific origin hint: the BIOS/BMC vendor, the NIC
	// driver name, the NVMe model, the lspci device description, or the CPU
	// vendor_id. Omitted from JSON when empty.
	Vendor string `json:"vendor,omitempty"`
	// Raw carries the parsed key/value pairs the snapshot was built from.
	// Omitted from JSON when empty.
	Raw map[string]string `json:"raw,omitempty"`
}
// Firmware runs the firmware sub-probes one after another, in a fixed
// order: BIOS, BMC, NIC, NVMe, HBA, microcode. The probes are best-effort:
// a probe that cannot read its subsystem contributes no snapshot instead
// of failing the stage.
//
// It returns every snapshot collected plus the probe-level warnings. For
// the single-snapshot probes (BIOS, BMC) a warning is recorded only when
// no snapshot came back; for the multi-snapshot probes (NIC, HBA) the
// snapshots and a non-empty warning are both kept. The NVMe and microcode
// probes report no warnings.
func Firmware(ctx context.Context) ([]FirmwareSnapshot, []string) {
	var (
		snaps []FirmwareSnapshot
		warns []string
	)

	// addOne folds in the result of a probe that yields at most one
	// snapshot; the warning only counts when the snapshot is missing.
	addOne := func(s *FirmwareSnapshot, w string) {
		switch {
		case s != nil:
			snaps = append(snaps, *s)
		case w != "":
			warns = append(warns, w)
		}
	}
	// addMany folds in the result of a probe that yields a slice; the
	// snapshots and the warning are independent of each other.
	addMany := func(ss []FirmwareSnapshot, w string) {
		snaps = append(snaps, ss...)
		if w != "" {
			warns = append(warns, w)
		}
	}

	addOne(probeBIOS(ctx))
	addOne(probeBMC(ctx))
	addMany(probeNICFirmware(ctx))
	addMany(probeNVMeFirmware(ctx), "")
	addMany(probeHBAFirmware(ctx))
	addOne(probeMicrocode(), "")

	return snaps, warns
}
// runCmd runs name with args under a 5-second timeout derived from ctx and
// returns the command's combined stdout+stderr as a string, together with
// the execution error (nil on success). The output is returned even when
// the command fails so callers can include it in diagnostics.
//
// The timeout is deliberately short: firmware probes talk to hardware and
// a wedged controller should surface as a failed probe, not a hung stage.
func runCmd(ctx context.Context, name string, args ...string) (string, error) {
	deadlineCtx, stop := context.WithTimeout(ctx, 5*time.Second)
	defer stop()

	combined, err := exec.CommandContext(deadlineCtx, name, args...).CombinedOutput()
	return string(combined), err
}
// ----- BIOS --------------------------------------------------------------
// probeBIOS runs `dmidecode -t bios` and turns its output into a BIOS
// snapshot. It returns (snapshot, "") on success and (nil, warning) when
// dmidecode is not on PATH, when the command fails (for example because
// the agent lacks the privileges dmidecode needs), or when the output
// contains no usable BIOS version.
func probeBIOS(ctx context.Context) (*FirmwareSnapshot, string) {
	const tool = "dmidecode"

	if _, lookErr := exec.LookPath(tool); lookErr != nil {
		return nil, "bios: dmidecode not installed"
	}

	raw, runErr := runCmd(ctx, tool, "-t", "bios")
	if runErr != nil {
		return nil, fmt.Sprintf("bios: dmidecode failed: %v", trimErr(runErr, raw))
	}

	if parsed := parseDmidecodeBIOS(strings.NewReader(raw)); parsed != nil {
		return parsed, ""
	}
	return nil, "bios: dmidecode produced no usable output"
}
// parseDmidecodeBIOS reads `dmidecode -t bios` output from r and builds the
// BIOS snapshot from the "BIOS Information" block. The version is taken
// from "Version", falling back to "Firmware Revision"; the vendor from
// "Vendor"; every parsed key/value pair is kept in Raw. It returns nil when
// the block is absent or carries no version. Taking an io.Reader keeps the
// parser unit-testable.
func parseDmidecodeBIOS(r io.Reader) *FirmwareSnapshot {
	fields := parseDmidecodeSection(r, "BIOS Information")
	if fields == nil {
		return nil
	}

	ver := firstNonEmpty(fields["Version"], fields["Firmware Revision"])
	if ver == "" {
		return nil
	}

	return &FirmwareSnapshot{
		Component:  "bios",
		Identifier: "system",
		Version:    ver,
		Vendor:     fields["Vendor"],
		Raw:        fields,
	}
}
// parseDmidecodeSection returns the key/value pairs of the first dmidecode
// block whose title line equals title. dmidecode blocks look like:
//
//	Handle 0x0000, ...
//	BIOS Information
//		Vendor: American Megatrends
//		Version: 3.0
//		...
//
// A block runs from its title line to the next line starting with
// "Handle " (or to the end of input). Within the block each line is split
// at its first ':'; keys and values are whitespace-trimmed. Lines without
// a colon and keys with an empty value (such as the "Characteristics:"
// header of a bulleted sub-list) are skipped.
//
// The result is nil when no block matches, and a non-nil (possibly empty)
// map when one does.
func parseDmidecodeSection(r io.Reader, title string) map[string]string {
	scanner := bufio.NewScanner(r)
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	// Phase 1: skip ahead to the title line. Handle lines are never
	// considered as a title, even if they would compare equal.
	found := false
	for !found && scanner.Scan() {
		raw := scanner.Text()
		if strings.HasPrefix(raw, "Handle ") {
			continue
		}
		found = strings.TrimSpace(raw) == title
	}
	if !found {
		return nil
	}

	// Phase 2: collect "key: value" lines until the next Handle line.
	fields := map[string]string{}
	for scanner.Scan() {
		raw := scanner.Text()
		if strings.HasPrefix(raw, "Handle ") {
			break
		}
		name, value, hasColon := strings.Cut(strings.TrimSpace(raw), ":")
		if !hasColon {
			continue
		}
		if value = strings.TrimSpace(value); value != "" {
			fields[strings.TrimSpace(name)] = value
		}
	}
	return fields
}
// parseDmidecodeAllSections is the plural counterpart of
// parseDmidecodeSection: it returns one key/value map for every block
// whose title line equals title, in input order. Memory (-t 17) and PSU
// (-t 39) emit one block per slot, so the inventory probes need the full
// list.
//
// A block is closed by the next line starting with "Handle " or by the end
// of input; blank lines inside a block are ignored. Lines are split at the
// first ':' with both sides trimmed; lines without a colon and keys with an
// empty value are skipped. The result is nil when nothing matches.
func parseDmidecodeAllSections(r io.Reader, title string) []map[string]string {
	scanner := bufio.NewScanner(r)
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	var (
		blocks  []map[string]string
		current map[string]string // non-nil only while inside a matching block
	)

	for scanner.Scan() {
		raw := scanner.Text()
		text := strings.TrimSpace(raw)

		switch {
		case strings.HasPrefix(raw, "Handle "):
			// A new handle closes whatever block was open.
			if current != nil {
				blocks = append(blocks, current)
				current = nil
			}
		case current == nil:
			// Outside a block: only the title line is interesting.
			if text == title {
				current = map[string]string{}
			}
		default:
			name, value, found := strings.Cut(text, ":")
			if !found {
				continue
			}
			if value = strings.TrimSpace(value); value != "" {
				current[strings.TrimSpace(name)] = value
			}
		}
	}

	// The last block has no trailing Handle line to close it.
	if current != nil {
		blocks = append(blocks, current)
	}
	return blocks
}
// ----- BMC / IPMI --------------------------------------------------------
// probeBMC runs `ipmitool mc info` and converts the output into a BMC
// snapshot. Hosts without a BMC are expected: a missing ipmitool binary, a
// failing command, or output without a firmware revision each yield
// (nil, warning) rather than an error, so the stage carries on.
func probeBMC(ctx context.Context) (*FirmwareSnapshot, string) {
	const tool = "ipmitool"

	if _, lookErr := exec.LookPath(tool); lookErr != nil {
		return nil, "bmc: ipmitool not installed"
	}

	raw, runErr := runCmd(ctx, tool, "mc", "info")
	if runErr != nil {
		return nil, fmt.Sprintf("bmc: ipmitool mc info failed: %v", trimErr(runErr, raw))
	}

	if parsed := parseIpmitoolMCInfo(strings.NewReader(raw)); parsed != nil {
		return parsed, ""
	}
	return nil, "bmc: ipmitool output not parseable"
}
// parseIpmitoolMCInfo reads `ipmitool mc info` text from r. Every line is
// trimmed and split at its first ':' into a key/value pair (both sides
// trimmed); lines without a colon are ignored. The snapshot's version is
// "Firmware Revision", falling back to "Aux Firmware Rev Info", and its
// vendor is "Manufacturer Name". Returns nil when neither version key has
// a value.
//
// NOTE(review): only the text on the same line as a key is captured, so a
// value printed on following lines would not be picked up by the fallback
// — confirm against real ipmitool output.
func parseIpmitoolMCInfo(r io.Reader) *FirmwareSnapshot {
	fields := make(map[string]string)
	for lines := bufio.NewScanner(r); lines.Scan(); {
		name, value, found := strings.Cut(strings.TrimSpace(lines.Text()), ":")
		if !found {
			continue
		}
		fields[strings.TrimSpace(name)] = strings.TrimSpace(value)
	}

	fw := firstNonEmpty(fields["Firmware Revision"], fields["Aux Firmware Rev Info"])
	if fw == "" {
		return nil
	}

	snap := FirmwareSnapshot{
		Component:  "bmc",
		Identifier: "bmc0",
		Version:    fw,
		Vendor:     fields["Manufacturer Name"],
		Raw:        fields,
	}
	return &snap
}
// ----- NIC firmware ------------------------------------------------------
// probeNICFirmware lists /sys/class/net and runs `ethtool -i <iface>` on
// every entry that isRealNIC accepts (loopback, bridges and other virtual
// devices are skipped). It produces one snapshot per interface so a
// mismatched port shows up in the diff without hiding its siblings.
//
// The second return value is a warning for the stage summary, empty when
// everything was readable. A warning is produced when ethtool is not
// installed, when /sys/class/net cannot be listed, or when `ethtool -i`
// failed for one or more interfaces. In the last case the snapshots of the
// interfaces that did succeed are still returned alongside the warning, so
// the probe stays best-effort.
func probeNICFirmware(ctx context.Context) ([]FirmwareSnapshot, string) {
	if _, err := exec.LookPath("ethtool"); err != nil {
		return nil, "nic: ethtool not installed"
	}
	ifaces, err := os.ReadDir("/sys/class/net")
	if err != nil {
		// Report the failure instead of looking like a host without NICs.
		return nil, fmt.Sprintf("nic: cannot list /sys/class/net: %v", err)
	}

	var (
		out    []FirmwareSnapshot
		failed []string // interfaces whose ethtool call errored
	)
	for _, entry := range ifaces {
		name := entry.Name()
		if !isRealNIC(name) {
			continue
		}
		raw, err := runCmd(ctx, "ethtool", "-i", name)
		if err != nil {
			failed = append(failed, name)
			continue
		}
		if snap := parseEthtoolI(strings.NewReader(raw), name); snap != nil {
			out = append(out, *snap)
		}
	}

	if len(failed) > 0 {
		return out, "nic: ethtool -i failed for " + strings.Join(failed, ", ")
	}
	return out, ""
}
// parseEthtoolI reads `ethtool -i` output for iface from r. Each line is
// split at its first ':' into a trimmed key/value pair. The resulting
// snapshot uses "firmware-version" as Version and "driver" as Vendor, and
// keeps all pairs in Raw. Returns nil when both of those keys are missing
// or empty; a snapshot with an empty Version is still returned when only
// the driver is known.
func parseEthtoolI(r io.Reader, iface string) *FirmwareSnapshot {
	fields := make(map[string]string)
	for lines := bufio.NewScanner(r); lines.Scan(); {
		name, value, found := strings.Cut(lines.Text(), ":")
		if !found {
			continue
		}
		fields[strings.TrimSpace(name)] = strings.TrimSpace(value)
	}

	fw, driver := fields["firmware-version"], fields["driver"]
	if fw == "" && driver == "" {
		return nil
	}

	snap := FirmwareSnapshot{
		Component:  "nic",
		Identifier: iface,
		Version:    fw,
		Vendor:     driver,
		Raw:        fields,
	}
	return &snap
}
// isRealNIC reports whether name looks like a physical network interface.
// It rejects the empty name, loopback ("lo"), and names starting with a
// known virtual-device prefix (docker, br-, veth, virbr, tun, tap, bond).
// Anything left must also have a /sys/class/net/<name>/device entry, which
// real PCI NICs have and pure virtual devices (dummy0, wg*) do not.
func isRealNIC(name string) bool {
	switch name {
	case "", "lo":
		return false
	}

	virtualPrefixes := [...]string{"docker", "br-", "veth", "virbr", "tun", "tap", "bond"}
	for i := range virtualPrefixes {
		if strings.HasPrefix(name, virtualPrefixes[i]) {
			return false
		}
	}

	_, statErr := os.Stat(filepath.Join("/sys/class/net", name, "device"))
	return statErr == nil
}
// ----- NVMe --------------------------------------------------------------
// probeNVMeFirmware emits one snapshot per entry under /sys/class/nvme.
// The firmware revision and model come from the controller's firmware_rev
// and model sysfs files. When firmware_rev is empty or unreadable and the
// `nvme` CLI is on PATH, it falls back to `nvme id-ctrl /dev/<ctrl>` for
// the revision ("fr") and, if still unknown, the model ("mn").
// Controllers for which no revision can be determined are skipped. The
// controller name is the snapshot identifier, so two drives yield two
// snapshots. Returns nil when /sys/class/nvme cannot be read.
func probeNVMeFirmware(ctx context.Context) []FirmwareSnapshot {
	const sysRoot = "/sys/class/nvme"

	controllers, err := os.ReadDir(sysRoot)
	if err != nil {
		return nil
	}

	// sysAttr reads one trimmed sysfs attribute; "" when it is unreadable.
	sysAttr := func(ctrl, attr string) string {
		return strings.TrimSpace(readFile(filepath.Join(sysRoot, ctrl, attr)))
	}

	var snaps []FirmwareSnapshot
	for _, c := range controllers {
		id := c.Name()
		fw := sysAttr(id, "firmware_rev")
		mn := sysAttr(id, "model")

		if fw == "" {
			if _, lookErr := exec.LookPath("nvme"); lookErr == nil {
				// The command error is ignored on purpose: whatever
				// output there is gets parsed, and an empty result just
				// leaves fw empty so the controller is skipped below.
				idCtrl, _ := runCmd(ctx, "nvme", "id-ctrl", "/dev/"+id)
				fw = parseNVMeIDCtrl(strings.NewReader(idCtrl), "fr")
				if mn == "" {
					mn = parseNVMeIDCtrl(strings.NewReader(idCtrl), "mn")
				}
			}
		}

		if fw == "" {
			continue
		}
		snaps = append(snaps, FirmwareSnapshot{
			Component:  "nvme_fw",
			Identifier: id,
			Version:    fw,
			Vendor:     mn,
			Raw:        map[string]string{"model": mn, "firmware_rev": fw},
		})
	}
	return snaps
}
// parseNVMeIDCtrl extracts one field from `nvme id-ctrl` output, whose
// lines look like "fr : FW1234" or "mn : Samsung SSD 980 PRO".
// A line matches when, after trimming, it starts with key followed by a
// space; the value is everything after the first ':' with surrounding
// whitespace removed, so values may themselves contain spaces. The first
// matching line that has a colon wins. Returns "" when no line matches.
func parseNVMeIDCtrl(r io.Reader, key string) string {
	want := key + " "
	for lines := bufio.NewScanner(r); lines.Scan(); {
		text := strings.TrimSpace(lines.Text())
		if strings.HasPrefix(text, want) {
			if _, value, found := strings.Cut(text, ":"); found {
				return strings.TrimSpace(value)
			}
		}
	}
	return ""
}
// ----- HBA ---------------------------------------------------------------
// lspciClassHBA matches, case-insensitively, the lspci class descriptions
// that are treated as storage HBAs: "Serial Attached SCSI",
// "SAS controller" and "RAID bus controller".
var lspciClassHBA = regexp.MustCompile(`(?i)(serial attached scsi|sas controller|raid bus controller)`)
// probeHBAFirmware finds SAS/RAID HBAs by running `lspci -Dvvnn` and
// handing the listing to parseLspciHBA. It records what lspci prints
// rather than calling controller-specific CLIs (storcli, mpt-status) that
// are not always installed. Returns (snapshots, "") on success and
// (nil, warning) when lspci is missing or the command fails.
func probeHBAFirmware(ctx context.Context) ([]FirmwareSnapshot, string) {
	const tool = "lspci"

	if _, lookErr := exec.LookPath(tool); lookErr != nil {
		return nil, "hba: lspci not installed"
	}

	listing, runErr := runCmd(ctx, tool, "-Dvvnn")
	if runErr != nil {
		return nil, fmt.Sprintf("hba: lspci failed: %v", trimErr(runErr, listing))
	}

	return parseLspciHBA(strings.NewReader(listing)), ""
}
// lspciHBARevRe captures the hexadecimal revision from the "(rev NN)"
// suffix of an lspci device line.
var lspciHBARevRe = regexp.MustCompile(`\(rev\s+([0-9a-fA-F]+)\)`)

// parseLspciHBA walks `lspci -Dvvnn` stanzas read from r and returns one
// snapshot per SAS/RAID controller (device lines matching lspciClassHBA).
//
// A device line is any line that does not start with a tab and contains a
// space; the text before the first space is the PCI address and becomes
// the identifier, the remainder becomes Vendor. Tab-indented lines belong
// to the current device; of those, "Kernel modules:" and
// "Kernel driver in use:" are stored (trimmed) in Raw as kernel_modules
// and kernel_driver.
//
// Version is "rev NN" taken from the device line, or the Kernel modules
// string when the device line prints no revision. Devices with neither are
// omitted.
func parseLspciHBA(r io.Reader) []FirmwareSnapshot {
	sc := bufio.NewScanner(r)
	sc.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	var (
		out []FirmwareSnapshot
		cur *FirmwareSnapshot // HBA stanza being collected, nil otherwise
	)

	// flush finalizes the current stanza. The Kernel modules fallback is
	// applied here because that line comes after the device line.
	flush := func() {
		if cur == nil {
			return
		}
		if cur.Version == "" {
			cur.Version = cur.Raw["kernel_modules"]
		}
		if cur.Version != "" {
			out = append(out, *cur)
		}
		cur = nil
	}

	for sc.Scan() {
		line := sc.Text()

		if !strings.HasPrefix(line, "\t") && strings.Contains(line, " ") {
			// New device line: close the previous stanza first.
			flush()
			if !lspciClassHBA.MatchString(line) {
				continue
			}
			addr, rest, _ := strings.Cut(line, " ")
			cur = &FirmwareSnapshot{
				Component:  "hba",
				Identifier: addr,
				Vendor:     strings.TrimSpace(rest),
				Raw:        map[string]string{"device_line": line},
			}
			if m := lspciHBARevRe.FindStringSubmatch(line); len(m) == 2 {
				cur.Version = "rev " + m[1]
			}
			continue
		}

		if cur == nil {
			// Detail line of a device we are not interested in.
			continue
		}

		trim := strings.TrimSpace(line)
		switch {
		case strings.HasPrefix(trim, "Kernel modules:"):
			cur.Raw["kernel_modules"] = strings.TrimSpace(strings.TrimPrefix(trim, "Kernel modules:"))
		case strings.HasPrefix(trim, "Kernel driver in use:"):
			cur.Raw["kernel_driver"] = strings.TrimSpace(strings.TrimPrefix(trim, "Kernel driver in use:"))
		}
	}
	flush()
	return out
}
// ----- Microcode ---------------------------------------------------------
// probeMicrocode opens /proc/cpuinfo and lets parseMicrocode extract the
// microcode revision. A single snapshot is produced for the whole CPU.
// Returns nil when the file cannot be opened or holds no microcode line.
func probeMicrocode() *FirmwareSnapshot {
	cpuinfo, err := os.Open("/proc/cpuinfo")
	if err != nil {
		return nil
	}
	// Read-only handle: a Close error carries no useful information.
	defer func() { _ = cpuinfo.Close() }()

	return parseMicrocode(cpuinfo)
}
// parseMicrocode scans /proc/cpuinfo-style "key : value" lines from r and
// builds the microcode snapshot from the first non-empty "microcode" value
// (Version) and the first non-empty "vendor_id" value (Vendor). Scanning
// stops as soon as both are known. Returns nil when no microcode value is
// found; the vendor is optional.
func parseMicrocode(r io.Reader) *FirmwareSnapshot {
	var ucode, cpuVendor string

	lines := bufio.NewScanner(r)
	for (ucode == "" || cpuVendor == "") && lines.Scan() {
		name, value, found := strings.Cut(lines.Text(), ":")
		if !found {
			continue
		}
		value = strings.TrimSpace(value)

		switch strings.TrimSpace(name) {
		case "microcode":
			if ucode == "" {
				ucode = value
			}
		case "vendor_id":
			if cpuVendor == "" {
				cpuVendor = value
			}
		}
	}

	if ucode == "" {
		return nil
	}
	snap := FirmwareSnapshot{
		Component:  "microcode",
		Identifier: "cpu",
		Version:    ucode,
		Vendor:     cpuVendor,
	}
	return &snap
}
// ----- helpers -----------------------------------------------------------
// firstNonEmpty returns the first argument that is not empty or
// whitespace-only. The chosen argument is returned unmodified (it is not
// trimmed). Returns "" when there is no such argument.
func firstNonEmpty(ss ...string) string {
	for i := 0; i < len(ss); i++ {
		if candidate := ss[i]; len(strings.TrimSpace(candidate)) > 0 {
			return candidate
		}
	}
	return ""
}
// readFile returns the full contents of the file at p as a string, or ""
// when the file cannot be read. Callers therefore cannot distinguish a
// missing file from an empty one.
func readFile(p string) string {
	data, err := os.ReadFile(p)
	if err == nil {
		return string(data)
	}
	return ""
}
// trimErr formats err for a warning message. When the command output out
// has any non-whitespace content, its first line is appended in
// parentheses so the warning carries some diagnostic context without
// dumping the tool's whole output; otherwise only err's message is
// returned. err must be non-nil.
func trimErr(err error, out string) string {
	head, _, _ := strings.Cut(strings.TrimSpace(out), "\n")
	if head != "" {
		return fmt.Sprintf("%v (%s)", err, head)
	}
	return err.Error()
}