deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,70 @@
|
||||
package probes
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// EDACSample is a single counter reading taken from
// /sys/devices/system/edac/mc/.
//
// Readings are cumulative since boot; the threshold evaluator flags a
// sample the moment its value exceeds 0.
type EDACSample struct {
	// Kind is "edac_ce" for correctable ECC errors or "edac_ue" for
	// uncorrectable ones (always a critical signal).
	Kind string
	// Key identifies the memory controller, e.g. "mc0".
	Key string
	// Value is the cumulative counter value.
	Value float64
	// Unit labels Value.
	Unit string
}
|
||||
|
||||
// EDAC returns one EDACSample per (memory-controller × {ce,ue}) pair
|
||||
// that /sys exposes. Returns an empty slice when EDAC isn't available
|
||||
// (virtualized host, missing kernel driver, mdadm-style boards without
|
||||
// a controller node) — callers treat an empty return as "no data",
|
||||
// not "passed". Errors are swallowed for the same reason: a hot-
|
||||
// swapped DIMM that makes /sys blink briefly shouldn't fail the stage
|
||||
// before the real counter can be read.
|
||||
//
|
||||
// This is intentionally small — the sidecar polls periodically, so one
|
||||
// bad read is recovered on the next tick. The counters are monotonic,
|
||||
// so emitting the current raw value is correct.
|
||||
func EDAC() []EDACSample {
|
||||
root := "/sys/devices/system/edac/mc"
|
||||
entries, err := os.ReadDir(root)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var out []EDACSample
|
||||
for _, e := range entries {
|
||||
name := e.Name()
|
||||
if !strings.HasPrefix(name, "mc") {
|
||||
continue
|
||||
}
|
||||
base := filepath.Join(root, name)
|
||||
if ce, ok := readCount(filepath.Join(base, "ce_count")); ok {
|
||||
out = append(out, EDACSample{Kind: "edac_ce", Key: name, Value: ce, Unit: "count"})
|
||||
}
|
||||
if ue, ok := readCount(filepath.Join(base, "ue_count")); ok {
|
||||
out = append(out, EDACSample{Kind: "edac_ue", Key: name, Value: ue, Unit: "count"})
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// readCount loads the sysfs file at path and parses its
// whitespace-trimmed contents as a base-10 integer, returned as a
// float64. Any read or parse failure yields (0, false) so the caller
// can drop the sample without emitting a diagnostic.
func readCount(path string) (float64, bool) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return 0, false
	}
	count, parseErr := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
	if parseErr != nil {
		return 0, false
	}
	return float64(count), true
}
|
||||
@@ -0,0 +1,496 @@
|
||||
package probes
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FirmwareSnapshot is the wire format the agent POSTs together with
// the Firmware stage result. It mirrors internal/store.FirmwareSnapshot
// without importing it; the /result handler converts to the store type
// and persists it.
//
// A single run yields many snapshots — one per BIOS, BMC, NIC port,
// HBA, microcode and NVMe controller.
type FirmwareSnapshot struct {
	// Component is the firmware class: bios|bmc|nic|hba|microcode|nvme_fw.
	Component string `json:"component"`
	// Identifier tells siblings of the same component apart
	// (e.g. "eth0" / "eth1").
	Identifier string `json:"identifier"`
	// Version is the canonical string to diff.
	Version string `json:"version"`
	// Vendor is optional and left out of the JSON when empty.
	Vendor string `json:"vendor,omitempty"`
	// Raw holds the key/value pairs the probe parsed; left out of the
	// JSON when empty.
	Raw map[string]string `json:"raw,omitempty"`
}
|
||||
|
||||
// Firmware runs every sub-probe in sequence. Each one is bounded with
|
||||
// a short timeout so a hung dmidecode / ipmitool / nvme tool can't
|
||||
// freeze the stage — the probe is best-effort, missing tools produce
|
||||
// empty output rather than an error. Returns the aggregated slice
|
||||
// along with a list of probe-level warnings (surfaced in the stage
|
||||
// summary so operators see which subsystem couldn't be read).
|
||||
func Firmware(ctx context.Context) ([]FirmwareSnapshot, []string) {
|
||||
var out []FirmwareSnapshot
|
||||
var warnings []string
|
||||
|
||||
if snap, warn := probeBIOS(ctx); snap != nil {
|
||||
out = append(out, *snap)
|
||||
} else if warn != "" {
|
||||
warnings = append(warnings, warn)
|
||||
}
|
||||
if snap, warn := probeBMC(ctx); snap != nil {
|
||||
out = append(out, *snap)
|
||||
} else if warn != "" {
|
||||
warnings = append(warnings, warn)
|
||||
}
|
||||
out = append(out, probeNICFirmware(ctx)...)
|
||||
out = append(out, probeNVMeFirmware(ctx)...)
|
||||
out = append(out, probeHBAFirmware(ctx)...)
|
||||
if snap := probeMicrocode(); snap != nil {
|
||||
out = append(out, *snap)
|
||||
}
|
||||
|
||||
return out, warnings
|
||||
}
|
||||
|
||||
// runCmd runs a short-lived command under a 5-second deadline derived
// from ctx and returns its combined stdout+stderr as a string along
// with the execution error, if any. The output is returned even when
// the command fails.
//
// The deadline is intentionally aggressive: firmware probes read
// device registers and can block forever on a wedged controller, and
// the stage should report "no HBA firmware readable" rather than hang
// the pipeline.
func runCmd(ctx context.Context, name string, args ...string) (string, error) {
	const probeTimeout = 5 * time.Second

	bounded, stop := context.WithTimeout(ctx, probeTimeout)
	defer stop()

	output, err := exec.CommandContext(bounded, name, args...).CombinedOutput()
	return string(output), err
}
|
||||
|
||||
// ----- BIOS --------------------------------------------------------------
|
||||
|
||||
// probeBIOS invokes dmidecode -t bios and parses the vendor + version
|
||||
// lines. dmidecode must run as root; we let it fail gracefully when the
|
||||
// agent is mis-deployed without privileges.
|
||||
func probeBIOS(ctx context.Context) (*FirmwareSnapshot, string) {
|
||||
if _, err := exec.LookPath("dmidecode"); err != nil {
|
||||
return nil, "bios: dmidecode not installed"
|
||||
}
|
||||
out, err := runCmd(ctx, "dmidecode", "-t", "bios")
|
||||
if err != nil {
|
||||
return nil, fmt.Sprintf("bios: dmidecode failed: %v", trimErr(err, out))
|
||||
}
|
||||
snap := parseDmidecodeBIOS(strings.NewReader(out))
|
||||
if snap == nil {
|
||||
return nil, "bios: dmidecode produced no usable output"
|
||||
}
|
||||
return snap, ""
|
||||
}
|
||||
|
||||
// parseDmidecodeBIOS consumes `dmidecode -t bios` output and pulls
|
||||
// Vendor / Version / Release Date. Kept as an io.Reader for unit tests.
|
||||
func parseDmidecodeBIOS(r io.Reader) *FirmwareSnapshot {
|
||||
kv := parseDmidecodeSection(r, "BIOS Information")
|
||||
if kv == nil {
|
||||
return nil
|
||||
}
|
||||
snap := &FirmwareSnapshot{
|
||||
Component: "bios",
|
||||
Identifier: "system",
|
||||
Version: firstNonEmpty(kv["Version"], kv["Firmware Revision"]),
|
||||
Vendor: kv["Vendor"],
|
||||
Raw: kv,
|
||||
}
|
||||
if snap.Version == "" {
|
||||
return nil
|
||||
}
|
||||
return snap
|
||||
}
|
||||
|
||||
// parseDmidecodeSection returns the key/value pairs of the first
// dmidecode block whose title line equals title, or nil when no such
// block exists. dmidecode blocks look like:
//
//	Handle 0x0000, ...
//	BIOS Information
//		Vendor: American Megatrends
//		Version: 3.0
//		...
//
// with a blank line between blocks. Collection stops at the next
// "Handle " line, so fields of a following block are never swallowed.
// Lines without a colon (the bullet items under "Characteristics:")
// and keys with an empty value (such as "Characteristics:" itself)
// are dropped.
func parseDmidecodeSection(r io.Reader, title string) map[string]string {
	sc := bufio.NewScanner(r)
	sc.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	// section is nil until the title line has been seen; from then on
	// it collects fields until the next handle or end of input.
	var section map[string]string
	for sc.Scan() {
		raw := sc.Text()
		switch {
		case strings.HasPrefix(raw, "Handle "):
			// A new handle closes the block we were collecting, if any.
			if section != nil {
				return section
			}
		case section == nil:
			if strings.TrimSpace(raw) == title {
				section = map[string]string{}
			}
		default:
			key, val, found := strings.Cut(strings.TrimSpace(raw), ":")
			if !found {
				continue
			}
			if val = strings.TrimSpace(val); val != "" {
				section[strings.TrimSpace(key)] = val
			}
		}
	}
	return section
}
|
||||
|
||||
// ----- BMC / IPMI --------------------------------------------------------
|
||||
|
||||
// probeBMC walks `ipmitool mc info`. Home-lab hosts often lack a BMC —
|
||||
// missing binary or a non-zero exit returns a warning without failing
|
||||
// the stage. We capture Firmware Revision + Manufacturer as the version.
|
||||
func probeBMC(ctx context.Context) (*FirmwareSnapshot, string) {
|
||||
if _, err := exec.LookPath("ipmitool"); err != nil {
|
||||
return nil, "bmc: ipmitool not installed"
|
||||
}
|
||||
out, err := runCmd(ctx, "ipmitool", "mc", "info")
|
||||
if err != nil {
|
||||
return nil, fmt.Sprintf("bmc: ipmitool mc info failed: %v", trimErr(err, out))
|
||||
}
|
||||
snap := parseIpmitoolMCInfo(strings.NewReader(out))
|
||||
if snap == nil {
|
||||
return nil, "bmc: ipmitool output not parseable"
|
||||
}
|
||||
return snap, ""
|
||||
}
|
||||
|
||||
// parseIpmitoolMCInfo pulls "Firmware Revision" + "Manufacturer Name"
|
||||
// from the textual output. Format is indented key : value lines.
|
||||
func parseIpmitoolMCInfo(r io.Reader) *FirmwareSnapshot {
|
||||
sc := bufio.NewScanner(r)
|
||||
kv := map[string]string{}
|
||||
for sc.Scan() {
|
||||
line := strings.TrimSpace(sc.Text())
|
||||
if k, v, ok := strings.Cut(line, ":"); ok {
|
||||
kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
|
||||
}
|
||||
}
|
||||
version := firstNonEmpty(kv["Firmware Revision"], kv["Aux Firmware Rev Info"])
|
||||
if version == "" {
|
||||
return nil
|
||||
}
|
||||
return &FirmwareSnapshot{
|
||||
Component: "bmc",
|
||||
Identifier: "bmc0",
|
||||
Version: version,
|
||||
Vendor: kv["Manufacturer Name"],
|
||||
Raw: kv,
|
||||
}
|
||||
}
|
||||
|
||||
// ----- NIC firmware ------------------------------------------------------
|
||||
|
||||
// probeNICFirmware enumerates /sys/class/net/*/device and calls
|
||||
// `ethtool -i <iface>` on each real NIC (skip lo, bridges, virtuals).
|
||||
// One snapshot per interface so a mismatched port lights up in the diff
|
||||
// without silencing sibling ports.
|
||||
func probeNICFirmware(ctx context.Context) []FirmwareSnapshot {
|
||||
if _, err := exec.LookPath("ethtool"); err != nil {
|
||||
return nil
|
||||
}
|
||||
ifaces, err := os.ReadDir("/sys/class/net")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var out []FirmwareSnapshot
|
||||
for _, entry := range ifaces {
|
||||
name := entry.Name()
|
||||
if !isRealNIC(name) {
|
||||
continue
|
||||
}
|
||||
raw, err := runCmd(ctx, "ethtool", "-i", name)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
snap := parseEthtoolI(strings.NewReader(raw), name)
|
||||
if snap != nil {
|
||||
out = append(out, *snap)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// parseEthtoolI extracts driver/firmware-version from `ethtool -i`
|
||||
// output. Lines are "key: value" with a consistent prefix order.
|
||||
func parseEthtoolI(r io.Reader, iface string) *FirmwareSnapshot {
|
||||
sc := bufio.NewScanner(r)
|
||||
kv := map[string]string{}
|
||||
for sc.Scan() {
|
||||
line := sc.Text()
|
||||
if k, v, ok := strings.Cut(line, ":"); ok {
|
||||
kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
|
||||
}
|
||||
}
|
||||
if kv["firmware-version"] == "" && kv["driver"] == "" {
|
||||
return nil
|
||||
}
|
||||
return &FirmwareSnapshot{
|
||||
Component: "nic",
|
||||
Identifier: iface,
|
||||
Version: kv["firmware-version"],
|
||||
Vendor: kv["driver"],
|
||||
Raw: kv,
|
||||
}
|
||||
}
|
||||
|
||||
// isRealNIC reports whether name looks like a physical network
// interface. The empty name, loopback, and names starting with a
// known virtual prefix (docker, br-, veth, virbr, tun, tap, bond) are
// rejected outright; anything else must expose a `device` entry under
// /sys/class/net/<name> — real PCI NICs do, pure virtuals
// (dummy0, wg*) don't.
func isRealNIC(name string) bool {
	switch name {
	case "", "lo":
		return false
	}

	virtualPrefixes := [...]string{"docker", "br-", "veth", "virbr", "tun", "tap", "bond"}
	for i := range virtualPrefixes {
		if strings.HasPrefix(name, virtualPrefixes[i]) {
			return false
		}
	}

	_, statErr := os.Stat(filepath.Join("/sys/class/net", name, "device"))
	return statErr == nil
}
|
||||
|
||||
// ----- NVMe --------------------------------------------------------------
|
||||
|
||||
// probeNVMeFirmware reads /sys/class/nvme/nvmeN/firmware_rev for every
|
||||
// controller. Falls back to `nvme id-ctrl` if the sysfs file is missing
|
||||
// (older kernels). Identifier is the controller path so a run with two
|
||||
// drives produces two snapshots.
|
||||
func probeNVMeFirmware(ctx context.Context) []FirmwareSnapshot {
|
||||
entries, err := os.ReadDir("/sys/class/nvme")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var out []FirmwareSnapshot
|
||||
for _, e := range entries {
|
||||
ctrl := e.Name()
|
||||
rev := strings.TrimSpace(readFile(filepath.Join("/sys/class/nvme", ctrl, "firmware_rev")))
|
||||
model := strings.TrimSpace(readFile(filepath.Join("/sys/class/nvme", ctrl, "model")))
|
||||
if rev == "" {
|
||||
// Fallback: nvme id-ctrl -H /dev/<ctrl>. Available on hosts
|
||||
// where sysfs doesn't export firmware_rev.
|
||||
if _, err := exec.LookPath("nvme"); err == nil {
|
||||
raw, _ := runCmd(ctx, "nvme", "id-ctrl", "/dev/"+ctrl)
|
||||
rev = parseNVMeIDCtrl(strings.NewReader(raw), "fr")
|
||||
if model == "" {
|
||||
model = parseNVMeIDCtrl(strings.NewReader(raw), "mn")
|
||||
}
|
||||
}
|
||||
}
|
||||
if rev == "" {
|
||||
continue
|
||||
}
|
||||
out = append(out, FirmwareSnapshot{
|
||||
Component: "nvme_fw",
|
||||
Identifier: ctrl,
|
||||
Version: rev,
|
||||
Vendor: model,
|
||||
Raw: map[string]string{"model": model, "firmware_rev": rev},
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// parseNVMeIDCtrl extracts one field from `nvme id-ctrl` output, whose
// lines look like "fr : FW1234" or "mn : Samsung SSD 980 PRO".
// It returns the trimmed text after the first colon of the first line
// that, once trimmed, starts with key followed by a space; "" when no
// such line exists. Leading spaces vary and values may contain spaces.
func parseNVMeIDCtrl(r io.Reader, key string) string {
	want := key + " "
	for sc := bufio.NewScanner(r); sc.Scan(); {
		field := strings.TrimSpace(sc.Text())
		if !strings.HasPrefix(field, want) {
			continue
		}
		if _, val, found := strings.Cut(field, ":"); found {
			return strings.TrimSpace(val)
		}
	}
	return ""
}
|
||||
|
||||
// ----- HBA ---------------------------------------------------------------
|
||||
|
||||
// lspciClassHBA matches, case-insensitively, text containing
// "serial attached scsi", "sas controller" or "raid bus controller";
// parseLspciHBA applies it to lspci device lines to pick out SAS/RAID
// controllers.
var lspciClassHBA = regexp.MustCompile(`(?i)(serial attached scsi|sas controller|raid bus controller)`)
|
||||
|
||||
// probeHBAFirmware looks for SAS/RAID HBAs via `lspci -Dvvnn`. The
|
||||
// firmware string is typically exposed as "Product Name" +
|
||||
// "Capabilities" but in practice the LSI/Broadcom driver writes a
|
||||
// "revision" on the device line. We capture what's printed and rely on
|
||||
// SpecValidate to diff — this keeps us off tool-specific CLIs (storcli,
|
||||
// mpt-status) that aren't always installed.
|
||||
func probeHBAFirmware(ctx context.Context) []FirmwareSnapshot {
|
||||
if _, err := exec.LookPath("lspci"); err != nil {
|
||||
return nil
|
||||
}
|
||||
out, err := runCmd(ctx, "lspci", "-Dvvnn")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return parseLspciHBA(strings.NewReader(out))
|
||||
}
|
||||
|
||||
// parseLspciHBA walks `lspci -Dvvnn` stanzas and picks SAS/RAID
|
||||
// controllers. One snapshot per device; identifier is the PCI address.
|
||||
// Version is the device line's revision (rev NN) or the Kernel modules
|
||||
// string when no rev is printed.
|
||||
func parseLspciHBA(r io.Reader) []FirmwareSnapshot {
|
||||
sc := bufio.NewScanner(r)
|
||||
sc.Buffer(make([]byte, 0, 64*1024), 1024*1024)
|
||||
var out []FirmwareSnapshot
|
||||
var cur *FirmwareSnapshot
|
||||
revRe := regexp.MustCompile(`\(rev\s+([0-9a-fA-F]+)\)`)
|
||||
flush := func() {
|
||||
if cur != nil && cur.Version != "" {
|
||||
out = append(out, *cur)
|
||||
}
|
||||
cur = nil
|
||||
}
|
||||
for sc.Scan() {
|
||||
line := sc.Text()
|
||||
if !strings.HasPrefix(line, "\t") && strings.Contains(line, " ") {
|
||||
// New device line.
|
||||
flush()
|
||||
if lspciClassHBA.MatchString(line) {
|
||||
addr, rest, _ := strings.Cut(line, " ")
|
||||
cur = &FirmwareSnapshot{
|
||||
Component: "hba",
|
||||
Identifier: addr,
|
||||
Vendor: strings.TrimSpace(rest),
|
||||
Raw: map[string]string{"device_line": line},
|
||||
}
|
||||
if m := revRe.FindStringSubmatch(line); len(m) == 2 {
|
||||
cur.Version = "rev " + m[1]
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
if cur == nil {
|
||||
continue
|
||||
}
|
||||
trim := strings.TrimSpace(line)
|
||||
if strings.HasPrefix(trim, "Kernel modules:") {
|
||||
cur.Raw["kernel_modules"] = strings.TrimPrefix(trim, "Kernel modules:")
|
||||
}
|
||||
if strings.HasPrefix(trim, "Kernel driver in use:") {
|
||||
cur.Raw["kernel_driver"] = strings.TrimPrefix(trim, "Kernel driver in use:")
|
||||
}
|
||||
}
|
||||
flush()
|
||||
return out
|
||||
}
|
||||
|
||||
// ----- Microcode ---------------------------------------------------------
|
||||
|
||||
// probeMicrocode reads /proc/cpuinfo for the "microcode" line. All
|
||||
// cores report the same value post-boot, so one snapshot is enough.
|
||||
func probeMicrocode() *FirmwareSnapshot {
|
||||
f, err := os.Open("/proc/cpuinfo")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
defer func() { _ = f.Close() }()
|
||||
snap := parseMicrocode(f)
|
||||
return snap
|
||||
}
|
||||
|
||||
func parseMicrocode(r io.Reader) *FirmwareSnapshot {
|
||||
sc := bufio.NewScanner(r)
|
||||
version := ""
|
||||
vendor := ""
|
||||
for sc.Scan() {
|
||||
line := sc.Text()
|
||||
k, v, ok := strings.Cut(line, ":")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
key := strings.TrimSpace(k)
|
||||
val := strings.TrimSpace(v)
|
||||
switch key {
|
||||
case "microcode":
|
||||
if version == "" {
|
||||
version = val
|
||||
}
|
||||
case "vendor_id":
|
||||
if vendor == "" {
|
||||
vendor = val
|
||||
}
|
||||
}
|
||||
if version != "" && vendor != "" {
|
||||
break
|
||||
}
|
||||
}
|
||||
if version == "" {
|
||||
return nil
|
||||
}
|
||||
return &FirmwareSnapshot{
|
||||
Component: "microcode",
|
||||
Identifier: "cpu",
|
||||
Version: version,
|
||||
Vendor: vendor,
|
||||
}
|
||||
}
|
||||
|
||||
// ----- helpers -----------------------------------------------------------
|
||||
|
||||
// firstNonEmpty returns the first argument that is not blank once
// surrounding whitespace is ignored, or "" when every argument is
// blank. The chosen argument is returned unmodified (not trimmed).
func firstNonEmpty(ss ...string) string {
	for i := range ss {
		if len(strings.TrimSpace(ss[i])) > 0 {
			return ss[i]
		}
	}
	return ""
}
|
||||
|
||||
// readFile returns the contents of the file at p as a string, or ""
// when the file cannot be read. Callers cannot tell an empty file
// from an unreadable one.
func readFile(p string) string {
	if data, err := os.ReadFile(p); err == nil {
		return string(data)
	}
	return ""
}
|
||||
|
||||
// trimErr formats err together with the first line of the command's
// (whitespace-trimmed) combined output as "<err> (<first line>)", so a
// warning carries enough diagnostic context without dumping a
// screenful of dmidecode/ipmitool noise. When the output is blank,
// only the error text is returned.
func trimErr(err error, out string) string {
	head, _, _ := strings.Cut(strings.TrimSpace(out), "\n")
	if head == "" {
		return err.Error()
	}
	return fmt.Sprintf("%v (%s)", err, head)
}
|
||||
@@ -0,0 +1,232 @@
|
||||
package probes
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// dmidecodeBIOS is golden `dmidecode -t bios` output (trimmed,
// representative), fed to parseDmidecodeBIOS by TestParseDmidecodeBIOS.
// It holds a "BIOS Information" handle followed by a
// "System Information" handle.
|
||||
// A real host will have more lines; parse must tolerate the unknown
// fields.
|
||||
const dmidecodeBIOS = `# dmidecode 3.3
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.2.0 present.
|
||||
|
||||
Handle 0x0000, DMI type 0, 26 bytes
|
||||
BIOS Information
|
||||
Vendor: American Megatrends Inc.
|
||||
Version: 3.2
|
||||
Release Date: 07/15/2021
|
||||
Address: 0xF0000
|
||||
Runtime Size: 64 kB
|
||||
ROM Size: 32 MB
|
||||
Characteristics:
|
||||
PCI is supported
|
||||
BIOS is upgradeable
|
||||
|
||||
Handle 0x0001, DMI type 1, 27 bytes
|
||||
System Information
|
||||
Manufacturer: Supermicro
|
||||
Product Name: X11SSL-F
|
||||
`
|
||||
|
||||
func TestParseDmidecodeBIOS(t *testing.T) {
|
||||
snap := parseDmidecodeBIOS(strings.NewReader(dmidecodeBIOS))
|
||||
if snap == nil {
|
||||
t.Fatal("parseDmidecodeBIOS returned nil")
|
||||
}
|
||||
if snap.Component != "bios" {
|
||||
t.Errorf("component = %q, want bios", snap.Component)
|
||||
}
|
||||
if snap.Version != "3.2" {
|
||||
t.Errorf("version = %q, want 3.2", snap.Version)
|
||||
}
|
||||
if snap.Vendor != "American Megatrends Inc." {
|
||||
t.Errorf("vendor = %q, want American Megatrends Inc.", snap.Vendor)
|
||||
}
|
||||
if snap.Raw["Release Date"] != "07/15/2021" {
|
||||
t.Errorf("release date = %q, want 07/15/2021", snap.Raw["Release Date"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseDmidecodeBIOSMissingBlock(t *testing.T) {
|
||||
// No BIOS Information block → nil result, not a crash.
|
||||
input := "Handle 0x0001, DMI type 1, 27 bytes\nSystem Information\n\tManufacturer: Acme\n"
|
||||
if snap := parseDmidecodeBIOS(strings.NewReader(input)); snap != nil {
|
||||
t.Fatalf("expected nil when BIOS block absent, got %+v", snap)
|
||||
}
|
||||
}
|
||||
|
||||
// ipmitoolMCInfo is sample `ipmitool mc info`-style "key : value"
// output, fed to parseIpmitoolMCInfo by TestParseIpmitoolMCInfo. The
// fields the test asserts on are "Firmware Revision" (1.74) and
// "Manufacturer Name" (Supermicro).
const ipmitoolMCInfo = `Device ID : 32
|
||||
Device Revision : 1
|
||||
Firmware Revision : 1.74
|
||||
IPMI Version : 2.0
|
||||
Manufacturer ID : 10876
|
||||
Manufacturer Name : Supermicro
|
||||
Product ID : 2051 (0x0803)
|
||||
Product Name : Unknown (0x803)
|
||||
`
|
||||
|
||||
func TestParseIpmitoolMCInfo(t *testing.T) {
|
||||
snap := parseIpmitoolMCInfo(strings.NewReader(ipmitoolMCInfo))
|
||||
if snap == nil {
|
||||
t.Fatal("parseIpmitoolMCInfo returned nil")
|
||||
}
|
||||
if snap.Component != "bmc" {
|
||||
t.Errorf("component = %q, want bmc", snap.Component)
|
||||
}
|
||||
if snap.Version != "1.74" {
|
||||
t.Errorf("version = %q, want 1.74", snap.Version)
|
||||
}
|
||||
if snap.Vendor != "Supermicro" {
|
||||
t.Errorf("vendor = %q, want Supermicro", snap.Vendor)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseIpmitoolMCInfoEmpty(t *testing.T) {
|
||||
if snap := parseIpmitoolMCInfo(strings.NewReader("")); snap != nil {
|
||||
t.Fatalf("expected nil on empty input, got %+v", snap)
|
||||
}
|
||||
}
|
||||
|
||||
// ethtoolEth0 is sample `ethtool -i`-style "key: value" output, fed to
// parseEthtoolI by TestParseEthtoolI. Its expansion-rom-version line
// has an empty value.
const ethtoolEth0 = `driver: mlx5_core
|
||||
version: 5.15.0
|
||||
firmware-version: 16.32.1010 (MT_0000000008)
|
||||
expansion-rom-version:
|
||||
bus-info: 0000:5e:00.0
|
||||
supports-statistics: yes
|
||||
`
|
||||
|
||||
func TestParseEthtoolI(t *testing.T) {
|
||||
snap := parseEthtoolI(strings.NewReader(ethtoolEth0), "eth0")
|
||||
if snap == nil {
|
||||
t.Fatal("parseEthtoolI returned nil")
|
||||
}
|
||||
if snap.Component != "nic" || snap.Identifier != "eth0" {
|
||||
t.Errorf("component/id = %q/%q, want nic/eth0", snap.Component, snap.Identifier)
|
||||
}
|
||||
if snap.Version != "16.32.1010 (MT_0000000008)" {
|
||||
t.Errorf("version = %q, want 16.32.1010 (MT_0000000008)", snap.Version)
|
||||
}
|
||||
if snap.Vendor != "mlx5_core" {
|
||||
t.Errorf("vendor = %q, want mlx5_core", snap.Vendor)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseEthtoolIEmpty(t *testing.T) {
|
||||
if snap := parseEthtoolI(strings.NewReader("not a valid output"), "eth0"); snap != nil {
|
||||
t.Fatalf("expected nil on garbage input, got %+v", snap)
|
||||
}
|
||||
}
|
||||
|
||||
// nvmeIDCtrl is sample `nvme id-ctrl`-style "key : value" output, fed
// to parseNVMeIDCtrl by TestParseNVMeIDCtrl, which looks up the "fr"
// and "mn" fields.
const nvmeIDCtrl = `NVME Identify Controller:
|
||||
vid : 0x144d
|
||||
ssvid : 0x144d
|
||||
sn : S5GYNX0R500123X
|
||||
mn : Samsung SSD 980 PRO 1TB
|
||||
fr : 5B2QGXA7
|
||||
rab : 2
|
||||
`
|
||||
|
||||
func TestParseNVMeIDCtrl(t *testing.T) {
|
||||
if got := parseNVMeIDCtrl(strings.NewReader(nvmeIDCtrl), "fr"); got != "5B2QGXA7" {
|
||||
t.Errorf("fr = %q, want 5B2QGXA7", got)
|
||||
}
|
||||
if got := parseNVMeIDCtrl(strings.NewReader(nvmeIDCtrl), "mn"); got != "Samsung SSD 980 PRO 1TB" {
|
||||
t.Errorf("mn = %q, want Samsung SSD 980 PRO 1TB", got)
|
||||
}
|
||||
if got := parseNVMeIDCtrl(strings.NewReader(nvmeIDCtrl), "missing"); got != "" {
|
||||
t.Errorf("missing key should be empty, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// lspciHBA is sample `lspci -Dvvnn`-style output with three devices:
// an Ethernet controller, a Serial Attached SCSI controller and a RAID
// bus controller. TestParseLspciHBA expects parseLspciHBA to keep the
// last two and skip the Ethernet device.
const lspciHBA = `0000:01:00.0 Ethernet controller [0200]: Intel Corporation I350 [8086:1521] (rev 01)
|
||||
Subsystem: Intel Corporation I350 [8086:0001]
|
||||
Kernel driver in use: igb
|
||||
Kernel modules: igb
|
||||
|
||||
0000:03:00.0 Serial Attached SCSI controller [0107]: Broadcom / LSI SAS3008 PCI-Express Fusion-MPT SAS-3 [1000:0097] (rev 02)
|
||||
Subsystem: Broadcom / LSI SAS9300-8i [1000:30e0]
|
||||
Kernel driver in use: mpt3sas
|
||||
Kernel modules: mpt3sas
|
||||
|
||||
0000:04:00.0 RAID bus controller [0104]: LSI MegaRAID SAS-3 3108 [1000:005d] (rev 02)
|
||||
Subsystem: LSI MegaRAID SAS 9361-8i [1000:9361]
|
||||
Kernel driver in use: megaraid_sas
|
||||
Kernel modules: megaraid_sas
|
||||
`
|
||||
|
||||
func TestParseLspciHBA(t *testing.T) {
|
||||
got := parseLspciHBA(strings.NewReader(lspciHBA))
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("got %d HBA snapshots, want 2 (SAS + RAID; Ethernet must be skipped)", len(got))
|
||||
}
|
||||
for _, s := range got {
|
||||
if s.Component != "hba" {
|
||||
t.Errorf("component = %q, want hba", s.Component)
|
||||
}
|
||||
if s.Version != "rev 02" {
|
||||
t.Errorf("version = %q, want 'rev 02'", s.Version)
|
||||
}
|
||||
}
|
||||
if got[0].Identifier != "0000:03:00.0" {
|
||||
t.Errorf("first identifier = %q, want 0000:03:00.0", got[0].Identifier)
|
||||
}
|
||||
if got[1].Identifier != "0000:04:00.0" {
|
||||
t.Errorf("second identifier = %q, want 0000:04:00.0", got[1].Identifier)
|
||||
}
|
||||
}
|
||||
|
||||
// cpuinfo is a sample /proc/cpuinfo-style "key : value" stanza for a
// single processor, fed to parseMicrocode by TestParseMicrocode, which
// reads its vendor_id and microcode lines.
const cpuinfo = `processor : 0
|
||||
vendor_id : GenuineIntel
|
||||
cpu family : 6
|
||||
model : 85
|
||||
model name : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
|
||||
stepping : 7
|
||||
microcode : 0x5003006
|
||||
cpu MHz : 2100.000
|
||||
`
|
||||
|
||||
func TestParseMicrocode(t *testing.T) {
|
||||
snap := parseMicrocode(strings.NewReader(cpuinfo))
|
||||
if snap == nil {
|
||||
t.Fatal("parseMicrocode returned nil")
|
||||
}
|
||||
if snap.Version != "0x5003006" {
|
||||
t.Errorf("version = %q, want 0x5003006", snap.Version)
|
||||
}
|
||||
if snap.Vendor != "GenuineIntel" {
|
||||
t.Errorf("vendor = %q, want GenuineIntel", snap.Vendor)
|
||||
}
|
||||
if snap.Identifier != "cpu" {
|
||||
t.Errorf("identifier = %q, want cpu", snap.Identifier)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseMicrocodeMissing(t *testing.T) {
|
||||
// A /proc/cpuinfo without a microcode line returns nil.
|
||||
input := "processor\t: 0\nvendor_id\t: GenuineIntel\n"
|
||||
if snap := parseMicrocode(strings.NewReader(input)); snap != nil {
|
||||
t.Fatalf("expected nil when microcode line absent, got %+v", snap)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsRealNIC(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
want bool // want=true means a real-looking name (the /sys/class/net/<name>/device check is skipped here)
|
||||
}{
|
||||
{"lo", false},
|
||||
{"", false},
|
||||
{"docker0", false},
|
||||
{"br-abc", false},
|
||||
{"veth1234", false},
|
||||
{"virbr0", false},
|
||||
{"bond0", false},
|
||||
{"tun0", false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
if got := isRealNIC(tc.name); got != tc.want {
|
||||
t.Errorf("isRealNIC(%q) = %v, want %v", tc.name, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,85 @@
|
||||
package probes
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// NetDevSnapshot holds one interface's counters from /proc/net/dev at
// a single instant. The Network stage uses pairs of snapshots to
// compute deltas across an iperf window — rising rx_errors or
// tx_dropped on a loaded link point at a real NIC problem, not general
// noise.
type NetDevSnapshot struct {
	// Iface is the interface name (the text before the colon).
	Iface string

	// Receive-side counters: bytes, errs and drop columns.
	RxBytes, RxErrs, RxDrop uint64

	// Transmit-side counters: bytes, errs and drop columns.
	TxBytes, TxErrs, TxDrop uint64
}
|
||||
|
||||
// NetDev reads /proc/net/dev and returns one snapshot per non-loopback
|
||||
// interface. Returns nil on read/parse failure (best-effort: a missing
|
||||
// /proc is survivable; the caller skips delta reporting that tick).
|
||||
func NetDev() []NetDevSnapshot {
|
||||
f, err := os.Open("/proc/net/dev")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
defer func() { _ = f.Close() }()
|
||||
return parseNetDev(f)
|
||||
}
|
||||
|
||||
// parseNetDev is split from NetDev so tests can feed a fixture without
|
||||
// touching the real /proc. The /proc/net/dev format is two header lines
|
||||
// followed by rows of "iface: rx_bytes rx_packets rx_errs rx_drop ... tx_bytes tx_packets tx_errs tx_drop ..."
|
||||
// — 16 whitespace-separated counters, of which we pull a curated six.
|
||||
func parseNetDev(r io.Reader) []NetDevSnapshot {
|
||||
var out []NetDevSnapshot
|
||||
sc := bufio.NewScanner(r)
|
||||
// Skip the two header lines (iface || bytes ... || bytes ...).
|
||||
for i := 0; i < 2 && sc.Scan(); i++ {
|
||||
}
|
||||
for sc.Scan() {
|
||||
line := strings.TrimSpace(sc.Text())
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
colon := strings.IndexByte(line, ':')
|
||||
if colon < 0 {
|
||||
continue
|
||||
}
|
||||
iface := strings.TrimSpace(line[:colon])
|
||||
if iface == "" || iface == "lo" {
|
||||
continue
|
||||
}
|
||||
fields := strings.Fields(line[colon+1:])
|
||||
if len(fields) < 16 {
|
||||
continue
|
||||
}
|
||||
// /proc/net/dev columns:
|
||||
// 0 rx_bytes 1 rx_packets 2 rx_errs 3 rx_drop 4 fifo 5 frame 6 compressed 7 multicast
|
||||
// 8 tx_bytes 9 tx_packets 10 tx_errs 11 tx_drop 12 fifo 13 colls 14 carrier 15 compressed
|
||||
snap := NetDevSnapshot{Iface: iface}
|
||||
snap.RxBytes = parseU64(fields[0])
|
||||
snap.RxErrs = parseU64(fields[2])
|
||||
snap.RxDrop = parseU64(fields[3])
|
||||
snap.TxBytes = parseU64(fields[8])
|
||||
snap.TxErrs = parseU64(fields[10])
|
||||
snap.TxDrop = parseU64(fields[11])
|
||||
out = append(out, snap)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// parseU64 parses s as a base-10 unsigned 64-bit integer and returns
// 0 when s is not a valid number or is out of range.
func parseU64(s string) uint64 {
	if v, err := strconv.ParseUint(s, 10, 64); err == nil {
		return v
	}
	return 0
}
|
||||
@@ -0,0 +1,84 @@
|
||||
package probes
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestParseNetDev_RealSample exercises parseNetDev against a synthetic
|
||||
// /proc/net/dev fixture with the full 16-column layout. Confirms the
|
||||
// loopback interface is dropped, headers are skipped, and each of the
|
||||
// six curated counters lands in the right field.
|
||||
func TestParseNetDev_RealSample(t *testing.T) {
|
||||
// Columns after "iface:":
|
||||
// 0 rx_bytes 1 rx_packets 2 rx_errs 3 rx_drop
|
||||
// 4 fifo 5 frame 6 compressed 7 multicast
|
||||
// 8 tx_bytes 9 tx_packets 10 tx_errs 11 tx_drop
|
||||
// 12 fifo 13 colls 14 carrier 15 compressed
|
||||
fixture := `Inter-| Receive | Transmit
|
||||
face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed
|
||||
lo: 1000000 10000 0 0 0 0 0 0 1000000 10000 0 0 0 0 0 0
|
||||
eth0: 50000000 100000 7 12 0 0 0 0 40000000 90000 3 5 0 0 0 0
|
||||
eth1: 12345 200 0 0 0 0 0 0 54321 180 0 0 0 0 0 0
|
||||
`
|
||||
snaps := parseNetDev(strings.NewReader(fixture))
|
||||
if len(snaps) != 2 {
|
||||
t.Fatalf("got %d snapshots, want 2 (lo should be dropped)", len(snaps))
|
||||
}
|
||||
byIface := map[string]NetDevSnapshot{}
|
||||
for _, s := range snaps {
|
||||
byIface[s.Iface] = s
|
||||
}
|
||||
eth0, ok := byIface["eth0"]
|
||||
if !ok {
|
||||
t.Fatalf("eth0 missing from parsed snapshots")
|
||||
}
|
||||
if eth0.RxBytes != 50000000 {
|
||||
t.Errorf("eth0 RxBytes=%d, want 50000000", eth0.RxBytes)
|
||||
}
|
||||
if eth0.RxErrs != 7 {
|
||||
t.Errorf("eth0 RxErrs=%d, want 7", eth0.RxErrs)
|
||||
}
|
||||
if eth0.RxDrop != 12 {
|
||||
t.Errorf("eth0 RxDrop=%d, want 12", eth0.RxDrop)
|
||||
}
|
||||
if eth0.TxBytes != 40000000 {
|
||||
t.Errorf("eth0 TxBytes=%d, want 40000000", eth0.TxBytes)
|
||||
}
|
||||
if eth0.TxErrs != 3 {
|
||||
t.Errorf("eth0 TxErrs=%d, want 3", eth0.TxErrs)
|
||||
}
|
||||
if eth0.TxDrop != 5 {
|
||||
t.Errorf("eth0 TxDrop=%d, want 5", eth0.TxDrop)
|
||||
}
|
||||
if _, ok := byIface["lo"]; ok {
|
||||
t.Errorf("lo should have been filtered out")
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseNetDev_Empty: an empty reader returns no snapshots, not a
|
||||
// crash. Callers treat nil as "no data" and skip the delta step.
|
||||
func TestParseNetDev_Empty(t *testing.T) {
|
||||
snaps := parseNetDev(strings.NewReader(""))
|
||||
if len(snaps) != 0 {
|
||||
t.Errorf("got %d snapshots from empty reader, want 0", len(snaps))
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseNetDev_MalformedRow skips rows that don't have the expected
|
||||
// 16 columns rather than panicking. A truncated line shouldn't hide the
|
||||
// good rows that follow.
|
||||
func TestParseNetDev_MalformedRow(t *testing.T) {
|
||||
fixture := `header line 1
|
||||
header line 2
|
||||
bad0: 123 456
|
||||
eth0: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
|
||||
`
|
||||
snaps := parseNetDev(strings.NewReader(fixture))
|
||||
if len(snaps) != 1 {
|
||||
t.Fatalf("got %d snapshots, want 1 (bad0 should be dropped)", len(snaps))
|
||||
}
|
||||
if snaps[0].Iface != "eth0" {
|
||||
t.Errorf("got iface=%q, want eth0", snaps[0].Iface)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user