deep profile + threshold gating + firmware stage + Burn super-stage
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled

Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
+70
View File
@@ -0,0 +1,70 @@
package probes
import (
"os"
"path/filepath"
"strconv"
"strings"
)
// EDACSample is one counter reading taken from a memory-controller
// directory under /sys/devices/system/edac/mc/.
//
// The samples built by EDAC in this file use Kind "edac_ce" for the
// ce_count file (correctable ECC errors) and "edac_ue" for the ue_count
// file (uncorrectable errors), Key set to the controller directory name
// (e.g. "mc0"), and Unit "count". Value is the number read from the
// file, converted to float64.
// NOTE(review): the threshold evaluator that consumes these samples is
// outside this file; per the original note it flags any value above 0.
type EDACSample struct {
// Kind names the counter type: "edac_ce" or "edac_ue".
Kind string
// Key is the memory-controller directory name the counter came from.
Key string
// Value is the counter as read from sysfs.
Value float64
// Unit is the measurement unit; EDAC always sets "count".
Unit string
}
// EDAC walks /sys/devices/system/edac/mc and emits up to two samples for
// every entry whose name starts with "mc": one for ce_count ("edac_ce")
// and one for ue_count ("edac_ue"), in that order.
//
// The probe is best-effort. It returns nil when the directory cannot be
// listed, and silently skips any counter file that readCount cannot read
// or parse. Callers should therefore treat an empty result as "no data"
// rather than as a clean bill of health.
func EDAC() []EDACSample {
	const mcRoot = "/sys/devices/system/edac/mc"

	// Counter files probed per controller, paired with the Kind they map to.
	counters := [...]struct{ file, kind string }{
		{"ce_count", "edac_ce"},
		{"ue_count", "edac_ue"},
	}

	dirs, err := os.ReadDir(mcRoot)
	if err != nil {
		return nil
	}

	var samples []EDACSample
	for _, d := range dirs {
		ctrl := d.Name()
		if !strings.HasPrefix(ctrl, "mc") {
			continue
		}
		for _, c := range counters {
			v, ok := readCount(filepath.Join(mcRoot, ctrl, c.file))
			if !ok {
				continue
			}
			samples = append(samples, EDACSample{Kind: c.kind, Key: ctrl, Value: v, Unit: "count"})
		}
	}
	return samples
}
// readCount loads the file at path, strips surrounding whitespace and
// parses the remainder as a base-10 signed 64-bit integer, returning it
// as a float64. Any read or parse failure yields (0, false) so the
// caller can drop the sample without further diagnostics.
func readCount(path string) (float64, bool) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return 0, false
	}
	count, parseErr := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
	if parseErr != nil {
		return 0, false
	}
	return float64(count), true
}
+496
View File
@@ -0,0 +1,496 @@
package probes
import (
"bufio"
"context"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"time"
)
// FirmwareSnapshot is the on-wire shape the agent POSTs alongside the
// Firmware stage result. Per the original note it mirrors
// internal/store.FirmwareSnapshot without importing it; the /result
// handler (not in this file) converts to the store type and persists.
// One run produces many snapshots (one per BIOS / BMC / NIC port / HBA /
// microcode / NVMe controller). Identifier distinguishes siblings of the
// same component (e.g. "eth0" / "eth1"); Version is the string to diff.
type FirmwareSnapshot struct {
// Component is the firmware class; the probes in this file emit the
// values listed in the trailing comment.
Component string `json:"component"` // bios|bmc|nic|hba|microcode|nvme_fw
// Identifier names the specific device within the component class
// (e.g. "system", "bmc0", an interface name, a PCI address, "cpu").
Identifier string `json:"identifier"`
// Version is the firmware version string as reported by the probe.
Version string `json:"version"`
// Vendor is a probe-specific vendor/driver/model string; omitted from
// JSON when empty.
Vendor string `json:"vendor,omitempty"`
// Raw carries the key/value pairs the probe parsed, for diagnostics;
// omitted from JSON when empty.
Raw map[string]string `json:"raw,omitempty"`
}
// Firmware runs every firmware sub-probe sequentially in a fixed order:
// BIOS, BMC, NIC, NVMe, HBA, microcode. It returns the collected
// snapshots together with any warnings.
//
// Only the BIOS and BMC probes report warnings; a warning is recorded
// when such a probe yields no snapshot but a non-empty message. The
// remaining probes simply contribute zero or more snapshots.
func Firmware(ctx context.Context) ([]FirmwareSnapshot, []string) {
	var (
		snaps    []FirmwareSnapshot
		warnings []string
	)

	// Probes that yield at most one snapshot and can explain a failure.
	single := []func(context.Context) (*FirmwareSnapshot, string){probeBIOS, probeBMC}
	for _, probe := range single {
		snap, warn := probe(ctx)
		switch {
		case snap != nil:
			snaps = append(snaps, *snap)
		case warn != "":
			warnings = append(warnings, warn)
		}
	}

	// Probes that yield one snapshot per discovered device.
	multi := []func(context.Context) []FirmwareSnapshot{
		probeNICFirmware,
		probeNVMeFirmware,
		probeHBAFirmware,
	}
	for _, probe := range multi {
		snaps = append(snaps, probe(ctx)...)
	}

	if ucode := probeMicrocode(); ucode != nil {
		snaps = append(snaps, *ucode)
	}
	return snaps, warnings
}
// runCmd runs name with args under a context derived from ctx that
// expires after five seconds, and returns the command's combined
// stdout+stderr as a string along with the error from running it.
// Whatever output was captured is returned even when the error is
// non-nil. The short timeout keeps a wedged firmware tool from stalling
// the whole stage.
func runCmd(ctx context.Context, name string, args ...string) (string, error) {
	const probeTimeout = 5 * time.Second

	bounded, stop := context.WithTimeout(ctx, probeTimeout)
	defer stop()

	raw, err := exec.CommandContext(bounded, name, args...).CombinedOutput()
	return string(raw), err
}
// ----- BIOS --------------------------------------------------------------
// probeBIOS runs `dmidecode -t bios` and converts its output into a
// single BIOS snapshot. On failure it returns a nil snapshot plus a
// "bios: ..." warning describing which step failed: the binary is not on
// PATH, the command returned an error, or the output had no usable BIOS
// block. dmidecode normally needs root, so an unprivileged agent lands
// in the "failed" branch rather than crashing the stage.
func probeBIOS(ctx context.Context) (*FirmwareSnapshot, string) {
	_, lookErr := exec.LookPath("dmidecode")
	if lookErr != nil {
		return nil, "bios: dmidecode not installed"
	}

	raw, runErr := runCmd(ctx, "dmidecode", "-t", "bios")
	if runErr != nil {
		return nil, "bios: dmidecode failed: " + trimErr(runErr, raw)
	}

	if snap := parseDmidecodeBIOS(strings.NewReader(raw)); snap != nil {
		return snap, ""
	}
	return nil, "bios: dmidecode produced no usable output"
}
// parseDmidecodeBIOS reads `dmidecode -t bios` output from r and builds
// the BIOS snapshot from its "BIOS Information" block. Version comes
// from the "Version" key, falling back to "Firmware Revision"; Vendor
// from "Vendor"; every parsed key (Release Date included) is kept in
// Raw. Returns nil when the block is absent or no version is present.
// It takes an io.Reader so tests can feed fixtures.
func parseDmidecodeBIOS(r io.Reader) *FirmwareSnapshot {
	fields := parseDmidecodeSection(r, "BIOS Information")
	if fields == nil {
		return nil
	}

	version := firstNonEmpty(fields["Version"], fields["Firmware Revision"])
	if version == "" {
		return nil
	}

	return &FirmwareSnapshot{
		Component:  "bios",
		Identifier: "system",
		Version:    version,
		Vendor:     fields["Vendor"],
		Raw:        fields,
	}
}
// parseDmidecodeSection returns the key/value pairs of the first
// dmidecode block whose title line (after trimming) equals title.
// dmidecode blocks look like:
//
//	Handle 0x0000, ...
//	BIOS Information
//		Vendor: American Megatrends
//		Version: 3.0
//		...
//
// A line starting with "Handle " opens a new block, so the first such
// line after the wanted title ends the scan. Within the block each line
// is split at its first ':'; lines without a colon (bulleted sub-list
// items) and keys with an empty value (e.g. "Characteristics:") are
// skipped. Returns nil when the title never appears, and a non-nil but
// possibly empty map when it does.
func parseDmidecodeSection(r io.Reader, title string) map[string]string {
	scanner := bufio.NewScanner(r)
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	// fields doubles as the parser state: nil until the title is seen.
	var fields map[string]string
	for scanner.Scan() {
		raw := scanner.Text()
		switch {
		case strings.HasPrefix(raw, "Handle "):
			// Start of another block: finished if ours was already collected.
			if fields != nil {
				return fields
			}
		case fields == nil:
			if strings.TrimSpace(raw) == title {
				fields = map[string]string{}
			}
		default:
			key, val, found := strings.Cut(strings.TrimSpace(raw), ":")
			if !found {
				continue
			}
			if val = strings.TrimSpace(val); val != "" {
				fields[strings.TrimSpace(key)] = val
			}
		}
	}
	return fields
}
// ----- BMC / IPMI --------------------------------------------------------
// probeBMC runs `ipmitool mc info` and converts its output into a single
// BMC snapshot. Hosts without a BMC are expected: a missing binary, a
// command error, or unparseable output each produce a nil snapshot plus
// a "bmc: ..." warning instead of failing the stage.
func probeBMC(ctx context.Context) (*FirmwareSnapshot, string) {
	_, lookErr := exec.LookPath("ipmitool")
	if lookErr != nil {
		return nil, "bmc: ipmitool not installed"
	}

	raw, runErr := runCmd(ctx, "ipmitool", "mc", "info")
	if runErr != nil {
		return nil, "bmc: ipmitool mc info failed: " + trimErr(runErr, raw)
	}

	if snap := parseIpmitoolMCInfo(strings.NewReader(raw)); snap != nil {
		return snap, ""
	}
	return nil, "bmc: ipmitool output not parseable"
}
// parseIpmitoolMCInfo reads `ipmitool mc info` text from r. Every line
// containing a ':' is split at the first colon into a trimmed key and
// value (empty values are kept). The snapshot's Version is taken from
// "Firmware Revision", falling back to "Aux Firmware Rev Info", and
// Vendor from "Manufacturer Name". Returns nil when neither version key
// has a non-blank value.
func parseIpmitoolMCInfo(r io.Reader) *FirmwareSnapshot {
	fields := make(map[string]string)
	for lines := bufio.NewScanner(r); lines.Scan(); {
		key, val, found := strings.Cut(strings.TrimSpace(lines.Text()), ":")
		if !found {
			continue
		}
		fields[strings.TrimSpace(key)] = strings.TrimSpace(val)
	}

	rev := firstNonEmpty(fields["Firmware Revision"], fields["Aux Firmware Rev Info"])
	if rev == "" {
		return nil
	}
	return &FirmwareSnapshot{
		Component:  "bmc",
		Identifier: "bmc0",
		Version:    rev,
		Vendor:     fields["Manufacturer Name"],
		Raw:        fields,
	}
}
// ----- NIC firmware ------------------------------------------------------
// probeNICFirmware lists /sys/class/net, keeps the entries isRealNIC
// accepts, and runs `ethtool -i <iface>` on each. Every interface whose
// output parses yields its own snapshot, so one bad port never hides its
// siblings. Returns nil when ethtool is not on PATH or the directory
// cannot be listed; interfaces whose ethtool call fails are skipped.
func probeNICFirmware(ctx context.Context) []FirmwareSnapshot {
	if _, lookErr := exec.LookPath("ethtool"); lookErr != nil {
		return nil
	}
	entries, listErr := os.ReadDir("/sys/class/net")
	if listErr != nil {
		return nil
	}

	var snaps []FirmwareSnapshot
	for _, entry := range entries {
		iface := entry.Name()
		if !isRealNIC(iface) {
			continue
		}
		output, runErr := runCmd(ctx, "ethtool", "-i", iface)
		if runErr != nil {
			continue
		}
		if snap := parseEthtoolI(strings.NewReader(output), iface); snap != nil {
			snaps = append(snaps, *snap)
		}
	}
	return snaps
}
// parseEthtoolI reads `ethtool -i` output ("key: value" lines) from r
// and builds the NIC snapshot for iface. Version is the
// "firmware-version" value and Vendor is the "driver" value; all parsed
// pairs go into Raw. Returns nil only when both of those values are
// empty, so a driver that reports no firmware version still produces a
// snapshot (with an empty Version).
func parseEthtoolI(r io.Reader, iface string) *FirmwareSnapshot {
	fields := make(map[string]string)
	for lines := bufio.NewScanner(r); lines.Scan(); {
		key, val, found := strings.Cut(lines.Text(), ":")
		if !found {
			continue
		}
		fields[strings.TrimSpace(key)] = strings.TrimSpace(val)
	}

	fw, driver := fields["firmware-version"], fields["driver"]
	if fw != "" || driver != "" {
		return &FirmwareSnapshot{
			Component:  "nic",
			Identifier: iface,
			Version:    fw,
			Vendor:     driver,
			Raw:        fields,
		}
	}
	return nil
}
// isRealNIC reports whether name looks like a physical network
// interface. It rejects the empty name, "lo", and any name starting with
// a known virtual prefix (docker, br-, veth, virbr, tun, tap, bond).
// Everything else must have a /sys/class/net/<name>/device entry, which
// hardware-backed NICs have and pure virtual devices (dummy0, wg*) lack.
func isRealNIC(name string) bool {
	switch name {
	case "", "lo":
		return false
	}

	virtualPrefixes := [...]string{"docker", "br-", "veth", "virbr", "tun", "tap", "bond"}
	for _, p := range virtualPrefixes {
		if strings.HasPrefix(name, p) {
			return false
		}
	}

	_, statErr := os.Stat(filepath.Join("/sys/class/net", name, "device"))
	return statErr == nil
}
// ----- NVMe --------------------------------------------------------------
// probeNVMeFirmware emits one snapshot per entry in /sys/class/nvme. The
// firmware revision and model are read from the controller's sysfs
// firmware_rev and model files. When sysfs yields no revision and the
// nvme CLI is on PATH, `nvme id-ctrl /dev/<ctrl>` is consulted instead
// ("fr" for the revision, "mn" for the model if it is still unknown).
// Controllers that still have no revision are skipped. The controller
// name is the Identifier, so two drives produce two snapshots.
func probeNVMeFirmware(ctx context.Context) []FirmwareSnapshot {
	const classDir = "/sys/class/nvme"

	ctrls, err := os.ReadDir(classDir)
	if err != nil {
		return nil
	}

	// attr reads one trimmed sysfs attribute; "" when unreadable.
	attr := func(ctrl, file string) string {
		return strings.TrimSpace(readFile(filepath.Join(classDir, ctrl, file)))
	}

	var snaps []FirmwareSnapshot
	for _, entry := range ctrls {
		name := entry.Name()
		fw, model := attr(name, "firmware_rev"), attr(name, "model")

		if fw == "" {
			// sysfs had nothing; try the CLI. The command error is ignored
			// on purpose: whatever output exists is parsed best-effort.
			if _, lookErr := exec.LookPath("nvme"); lookErr == nil {
				idCtrl, _ := runCmd(ctx, "nvme", "id-ctrl", "/dev/"+name)
				fw = parseNVMeIDCtrl(strings.NewReader(idCtrl), "fr")
				if model == "" {
					model = parseNVMeIDCtrl(strings.NewReader(idCtrl), "mn")
				}
			}
		}
		if fw == "" {
			continue
		}

		snaps = append(snaps, FirmwareSnapshot{
			Component:  "nvme_fw",
			Identifier: name,
			Version:    fw,
			Vendor:     model,
			Raw:        map[string]string{"model": model, "firmware_rev": fw},
		})
	}
	return snaps
}
// parseNVMeIDCtrl returns the value of one field from `nvme id-ctrl`
// output, e.g. "fr : FW1234" or "mn : Samsung SSD 980 PRO". A line
// matches when, after trimming, it starts with key followed by a space
// and contains a ':'; the trimmed text after the first colon is
// returned. The trailing space in the match keeps "fr" from matching
// longer keys such as "frmw". Returns "" when no line matches.
func parseNVMeIDCtrl(r io.Reader, key string) string {
	want := key + " "
	for lines := bufio.NewScanner(r); lines.Scan(); {
		text := strings.TrimSpace(lines.Text())
		if _, val, found := strings.Cut(text, ":"); found && strings.HasPrefix(text, want) {
			return strings.TrimSpace(val)
		}
	}
	return ""
}
// ----- HBA ---------------------------------------------------------------
// lspciClassHBA matches, case-insensitively, the lspci class strings
// that parseLspciHBA treats as storage HBAs: "Serial Attached SCSI",
// "SAS controller" and "RAID bus controller".
var lspciClassHBA = regexp.MustCompile(`(?i)(serial attached scsi|sas controller|raid bus controller)`)
// probeHBAFirmware discovers SAS/RAID HBAs by running `lspci -Dvvnn` and
// handing the output to parseLspciHBA. Using lspci keeps the probe
// independent of vendor CLIs (storcli, mpt-status) that are not always
// installed; the captured strings are left for SpecValidate to diff.
// Returns nil when lspci is not on PATH or the command fails.
func probeHBAFirmware(ctx context.Context) []FirmwareSnapshot {
	_, lookErr := exec.LookPath("lspci")
	if lookErr != nil {
		return nil
	}

	listing, runErr := runCmd(ctx, "lspci", "-Dvvnn")
	if runErr != nil {
		return nil
	}
	return parseLspciHBA(strings.NewReader(listing))
}
// lspciRevisionRe captures the hexadecimal revision from the "(rev NN)"
// suffix of an lspci device line. Compiled once at package scope rather
// than on every parse.
var lspciRevisionRe = regexp.MustCompile(`\(rev\s+([0-9a-fA-F]+)\)`)

// parseLspciHBA walks `lspci -Dvvnn` stanzas read from r and returns one
// snapshot per SAS/RAID controller (device lines matching lspciClassHBA).
//
// A device line is any line that does not start with a tab and contains
// a space; the text before the first space is the PCI address and
// becomes the Identifier, the rest of the line becomes Vendor. Version
// is "rev NN" when the device line carries a "(rev NN)" suffix. lspci
// omits that suffix for some devices, so Version then falls back to the
// stanza's "Kernel modules:" value. A controller with neither is
// dropped, because there is nothing to diff against.
//
// Raw holds the full device line plus the trimmed "Kernel modules:" and
// "Kernel driver in use:" values when present.
func parseLspciHBA(r io.Reader) []FirmwareSnapshot {
	sc := bufio.NewScanner(r)
	sc.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	var out []FirmwareSnapshot
	var cur *FirmwareSnapshot

	// flush finalizes the stanza being collected, applying the
	// kernel-modules fallback only once the whole stanza has been read.
	flush := func() {
		if cur == nil {
			return
		}
		if cur.Version == "" {
			cur.Version = cur.Raw["kernel_modules"]
		}
		if cur.Version != "" {
			out = append(out, *cur)
		}
		cur = nil
	}

	for sc.Scan() {
		line := sc.Text()
		if !strings.HasPrefix(line, "\t") && strings.Contains(line, " ") {
			// New device line: close the previous stanza first.
			flush()
			if !lspciClassHBA.MatchString(line) {
				continue
			}
			addr, rest, _ := strings.Cut(line, " ")
			cur = &FirmwareSnapshot{
				Component:  "hba",
				Identifier: addr,
				Vendor:     strings.TrimSpace(rest),
				Raw:        map[string]string{"device_line": line},
			}
			if m := lspciRevisionRe.FindStringSubmatch(line); len(m) == 2 {
				cur.Version = "rev " + m[1]
			}
			continue
		}
		if cur == nil {
			continue
		}

		trim := strings.TrimSpace(line)
		switch {
		case strings.HasPrefix(trim, "Kernel modules:"):
			// TrimSpace drops the blank left after the colon.
			cur.Raw["kernel_modules"] = strings.TrimSpace(strings.TrimPrefix(trim, "Kernel modules:"))
		case strings.HasPrefix(trim, "Kernel driver in use:"):
			cur.Raw["kernel_driver"] = strings.TrimSpace(strings.TrimPrefix(trim, "Kernel driver in use:"))
		}
	}
	flush()
	return out
}
// ----- Microcode ---------------------------------------------------------
// probeMicrocode opens /proc/cpuinfo and returns the microcode snapshot
// parseMicrocode extracts from it, or nil when the file cannot be
// opened or carries no microcode line. One snapshot is enough because
// all cores are expected to report the same revision after boot.
func probeMicrocode() *FirmwareSnapshot {
	fh, err := os.Open("/proc/cpuinfo")
	if err != nil {
		return nil
	}
	// Read-only handle: a Close error carries no useful information.
	defer func() { _ = fh.Close() }()

	return parseMicrocode(fh)
}
// parseMicrocode scans /proc/cpuinfo-formatted text from r for the first
// non-empty "microcode" and "vendor_id" values and stops reading as soon
// as both are known. Returns nil when no microcode value is found;
// otherwise a snapshot with Identifier "cpu", the microcode revision as
// Version and the vendor id (possibly empty) as Vendor.
func parseMicrocode(r io.Reader) *FirmwareSnapshot {
	var ucode, cpuVendor string

	lines := bufio.NewScanner(r)
	// Stop pulling lines once both values have been captured.
	for (ucode == "" || cpuVendor == "") && lines.Scan() {
		name, value, found := strings.Cut(lines.Text(), ":")
		if !found {
			continue
		}
		value = strings.TrimSpace(value)
		switch strings.TrimSpace(name) {
		case "microcode":
			if ucode == "" {
				ucode = value
			}
		case "vendor_id":
			if cpuVendor == "" {
				cpuVendor = value
			}
		}
	}

	if ucode == "" {
		return nil
	}
	return &FirmwareSnapshot{
		Component:  "microcode",
		Identifier: "cpu",
		Version:    ucode,
		Vendor:     cpuVendor,
	}
}
// ----- helpers -----------------------------------------------------------
// firstNonEmpty returns the first argument that contains something other
// than whitespace. The chosen value is returned unmodified (it is not
// trimmed). Returns "" when every argument is blank or none are given.
func firstNonEmpty(ss ...string) string {
	for i := range ss {
		if len(strings.TrimSpace(ss[i])) > 0 {
			return ss[i]
		}
	}
	return ""
}
// readFile returns the entire content of the file at p as a string, or
// "" when the file cannot be read. The error is deliberately dropped:
// callers treat an unreadable sysfs attribute as simply missing.
func readFile(p string) string {
	if data, err := os.ReadFile(p); err == nil {
		return string(data)
	}
	return ""
}
// trimErr builds a compact diagnostic from a command error and its
// combined output. When the whitespace-trimmed output is non-empty, the
// result is "<err> (<first output line>)"; otherwise it is just the
// error text. Only the first line is kept so a warning never carries a
// screenful of tool noise.
func trimErr(err error, out string) string {
	head, _, _ := strings.Cut(strings.TrimSpace(out), "\n")
	if head != "" {
		return fmt.Sprintf("%v (%s)", err, head)
	}
	return err.Error()
}
+232
View File
@@ -0,0 +1,232 @@
package probes
import (
"strings"
"testing"
)
// dmidecodeBIOS is a trimmed, representative `dmidecode -t bios` capture
// used as the fixture for TestParseDmidecodeBIOS. It holds a
// "BIOS Information" block (including an empty-valued "Characteristics:"
// key with a sub-list) followed by a second "System Information" handle.
// A real host prints more lines; the parser must tolerate unknown fields.
const dmidecodeBIOS = `# dmidecode 3.3
Getting SMBIOS data from sysfs.
SMBIOS 3.2.0 present.
Handle 0x0000, DMI type 0, 26 bytes
BIOS Information
Vendor: American Megatrends Inc.
Version: 3.2
Release Date: 07/15/2021
Address: 0xF0000
Runtime Size: 64 kB
ROM Size: 32 MB
Characteristics:
PCI is supported
BIOS is upgradeable
Handle 0x0001, DMI type 1, 27 bytes
System Information
Manufacturer: Supermicro
Product Name: X11SSL-F
`
// TestParseDmidecodeBIOS feeds the dmidecodeBIOS fixture through
// parseDmidecodeBIOS and checks the component, version, vendor, and that
// the release date is kept in Raw.
func TestParseDmidecodeBIOS(t *testing.T) {
	got := parseDmidecodeBIOS(strings.NewReader(dmidecodeBIOS))
	if got == nil {
		t.Fatal("parseDmidecodeBIOS returned nil")
	}

	if c := got.Component; c != "bios" {
		t.Errorf("component = %q, want bios", c)
	}
	if v := got.Version; v != "3.2" {
		t.Errorf("version = %q, want 3.2", v)
	}
	if v := got.Vendor; v != "American Megatrends Inc." {
		t.Errorf("vendor = %q, want American Megatrends Inc.", v)
	}
	if d := got.Raw["Release Date"]; d != "07/15/2021" {
		t.Errorf("release date = %q, want 07/15/2021", d)
	}
}
// TestParseDmidecodeBIOSMissingBlock verifies that output lacking a
// "BIOS Information" block produces a nil snapshot instead of a crash.
func TestParseDmidecodeBIOSMissingBlock(t *testing.T) {
	const input = "Handle 0x0001, DMI type 1, 27 bytes\nSystem Information\n\tManufacturer: Acme\n"

	snap := parseDmidecodeBIOS(strings.NewReader(input))
	if snap != nil {
		t.Fatalf("expected nil when BIOS block absent, got %+v", snap)
	}
}
// ipmitoolMCInfo is a sample of `ipmitool mc info` output ("key : value"
// lines) used as the fixture for TestParseIpmitoolMCInfo.
const ipmitoolMCInfo = `Device ID : 32
Device Revision : 1
Firmware Revision : 1.74
IPMI Version : 2.0
Manufacturer ID : 10876
Manufacturer Name : Supermicro
Product ID : 2051 (0x0803)
Product Name : Unknown (0x803)
`
// TestParseIpmitoolMCInfo checks that the ipmitoolMCInfo fixture yields a
// "bmc" snapshot with the firmware revision as Version and the
// manufacturer name as Vendor.
func TestParseIpmitoolMCInfo(t *testing.T) {
	got := parseIpmitoolMCInfo(strings.NewReader(ipmitoolMCInfo))
	if got == nil {
		t.Fatal("parseIpmitoolMCInfo returned nil")
	}

	if c := got.Component; c != "bmc" {
		t.Errorf("component = %q, want bmc", c)
	}
	if v := got.Version; v != "1.74" {
		t.Errorf("version = %q, want 1.74", v)
	}
	if v := got.Vendor; v != "Supermicro" {
		t.Errorf("vendor = %q, want Supermicro", v)
	}
}
// TestParseIpmitoolMCInfoEmpty verifies that empty input produces a nil
// snapshot.
func TestParseIpmitoolMCInfoEmpty(t *testing.T) {
	snap := parseIpmitoolMCInfo(strings.NewReader(""))
	if snap != nil {
		t.Fatalf("expected nil on empty input, got %+v", snap)
	}
}
// ethtoolEth0 is a sample of `ethtool -i` output used as the fixture for
// TestParseEthtoolI. It includes an empty-valued key and a value that
// itself contains colons (bus-info).
const ethtoolEth0 = `driver: mlx5_core
version: 5.15.0
firmware-version: 16.32.1010 (MT_0000000008)
expansion-rom-version:
bus-info: 0000:5e:00.0
supports-statistics: yes
`
// TestParseEthtoolI checks that the ethtoolEth0 fixture yields a "nic"
// snapshot for eth0 with the firmware version as Version and the driver
// name as Vendor.
func TestParseEthtoolI(t *testing.T) {
	got := parseEthtoolI(strings.NewReader(ethtoolEth0), "eth0")
	if got == nil {
		t.Fatal("parseEthtoolI returned nil")
	}

	if !(got.Component == "nic" && got.Identifier == "eth0") {
		t.Errorf("component/id = %q/%q, want nic/eth0", got.Component, got.Identifier)
	}
	if v := got.Version; v != "16.32.1010 (MT_0000000008)" {
		t.Errorf("version = %q, want 16.32.1010 (MT_0000000008)", v)
	}
	if v := got.Vendor; v != "mlx5_core" {
		t.Errorf("vendor = %q, want mlx5_core", v)
	}
}
// TestParseEthtoolIEmpty verifies that input with no "key: value" lines
// produces a nil snapshot.
func TestParseEthtoolIEmpty(t *testing.T) {
	snap := parseEthtoolI(strings.NewReader("not a valid output"), "eth0")
	if snap != nil {
		t.Fatalf("expected nil on garbage input, got %+v", snap)
	}
}
// nvmeIDCtrl is a sample of `nvme id-ctrl` output ("key : value" lines)
// used as the fixture for TestParseNVMeIDCtrl.
const nvmeIDCtrl = `NVME Identify Controller:
vid : 0x144d
ssvid : 0x144d
sn : S5GYNX0R500123X
mn : Samsung SSD 980 PRO 1TB
fr : 5B2QGXA7
rab : 2
`
// TestParseNVMeIDCtrl looks up the "fr" and "mn" fields in the
// nvmeIDCtrl fixture and confirms an unknown key yields "".
func TestParseNVMeIDCtrl(t *testing.T) {
	lookup := func(key string) string {
		return parseNVMeIDCtrl(strings.NewReader(nvmeIDCtrl), key)
	}

	if fr := lookup("fr"); fr != "5B2QGXA7" {
		t.Errorf("fr = %q, want 5B2QGXA7", fr)
	}
	if mn := lookup("mn"); mn != "Samsung SSD 980 PRO 1TB" {
		t.Errorf("mn = %q, want Samsung SSD 980 PRO 1TB", mn)
	}
	if none := lookup("missing"); none != "" {
		t.Errorf("missing key should be empty, got %q", none)
	}
}
// lspciHBA is a sample of `lspci -Dvvnn` output used as the fixture for
// TestParseLspciHBA: one Ethernet controller that must be ignored,
// followed by a SAS controller and a RAID controller, both at rev 02.
const lspciHBA = `0000:01:00.0 Ethernet controller [0200]: Intel Corporation I350 [8086:1521] (rev 01)
Subsystem: Intel Corporation I350 [8086:0001]
Kernel driver in use: igb
Kernel modules: igb
0000:03:00.0 Serial Attached SCSI controller [0107]: Broadcom / LSI SAS3008 PCI-Express Fusion-MPT SAS-3 [1000:0097] (rev 02)
Subsystem: Broadcom / LSI SAS9300-8i [1000:30e0]
Kernel driver in use: mpt3sas
Kernel modules: mpt3sas
0000:04:00.0 RAID bus controller [0104]: LSI MegaRAID SAS-3 3108 [1000:005d] (rev 02)
Subsystem: LSI MegaRAID SAS 9361-8i [1000:9361]
Kernel driver in use: megaraid_sas
Kernel modules: megaraid_sas
`
// TestParseLspciHBA checks that the lspciHBA fixture yields exactly the
// SAS and RAID controllers (the Ethernet device is skipped), each with
// component "hba" and version "rev 02", in PCI-address order.
func TestParseLspciHBA(t *testing.T) {
	snaps := parseLspciHBA(strings.NewReader(lspciHBA))
	if n := len(snaps); n != 2 {
		t.Fatalf("got %d HBA snapshots, want 2 (SAS + RAID; Ethernet must be skipped)", n)
	}

	for i := range snaps {
		if c := snaps[i].Component; c != "hba" {
			t.Errorf("component = %q, want hba", c)
		}
		if v := snaps[i].Version; v != "rev 02" {
			t.Errorf("version = %q, want 'rev 02'", v)
		}
	}

	first, second := snaps[0].Identifier, snaps[1].Identifier
	if first != "0000:03:00.0" {
		t.Errorf("first identifier = %q, want 0000:03:00.0", first)
	}
	if second != "0000:04:00.0" {
		t.Errorf("second identifier = %q, want 0000:04:00.0", second)
	}
}
// cpuinfo is a single-core excerpt in /proc/cpuinfo format used as the
// fixture for TestParseMicrocode; it carries both a vendor_id and a
// microcode line.
const cpuinfo = `processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 85
model name : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
stepping : 7
microcode : 0x5003006
cpu MHz : 2100.000
`
// TestParseMicrocode checks that the cpuinfo fixture yields a snapshot
// with the microcode revision as Version, the vendor id as Vendor, and
// "cpu" as Identifier.
func TestParseMicrocode(t *testing.T) {
	got := parseMicrocode(strings.NewReader(cpuinfo))
	if got == nil {
		t.Fatal("parseMicrocode returned nil")
	}

	if v := got.Version; v != "0x5003006" {
		t.Errorf("version = %q, want 0x5003006", v)
	}
	if v := got.Vendor; v != "GenuineIntel" {
		t.Errorf("vendor = %q, want GenuineIntel", v)
	}
	if id := got.Identifier; id != "cpu" {
		t.Errorf("identifier = %q, want cpu", id)
	}
}
// TestParseMicrocodeMissing verifies that cpuinfo text without a
// microcode line produces a nil snapshot.
func TestParseMicrocodeMissing(t *testing.T) {
	const input = "processor\t: 0\nvendor_id\t: GenuineIntel\n"

	snap := parseMicrocode(strings.NewReader(input))
	if snap != nil {
		t.Fatalf("expected nil when microcode line absent, got %+v", snap)
	}
}
// TestIsRealNIC covers only the name-based rejections in isRealNIC: the
// empty name, loopback, and the virtual-interface prefixes. Every case
// expects false, so the result does not depend on what exists under
// /sys/class/net on the machine running the test.
func TestIsRealNIC(t *testing.T) {
	const want = false

	rejected := []string{
		"lo",
		"",
		"docker0",
		"br-abc",
		"veth1234",
		"virbr0",
		"bond0",
		"tun0",
	}
	for _, name := range rejected {
		got := isRealNIC(name)
		if got == want {
			continue
		}
		t.Errorf("isRealNIC(%q) = %v, want %v", name, got, want)
	}
}
+85
View File
@@ -0,0 +1,85 @@
package probes
import (
"bufio"
"io"
"os"
"strconv"
"strings"
)
// NetDevSnapshot is the per-interface counter row from /proc/net/dev at
// a single instant. Per the original note, the Network stage uses it to
// compute deltas across an iperf window: a rising rx_errors or
// tx_dropped count on a loaded link points at a real NIC problem rather
// than general noise. The field-to-column mapping below is the one
// parseNetDev applies.
type NetDevSnapshot struct {
// Iface is the interface name (text before the ':' on the row).
Iface string
// RxBytes is receive column 0 (bytes).
RxBytes uint64
// RxErrs is receive column 2 (errs).
RxErrs uint64
// RxDrop is receive column 3 (drop).
RxDrop uint64
// TxBytes is column 8, the first transmit column (bytes).
TxBytes uint64
// TxErrs is column 10 (transmit errs).
TxErrs uint64
// TxDrop is column 11 (transmit drop).
TxDrop uint64
}
// NetDev opens /proc/net/dev and returns the snapshots parseNetDev
// extracts from it: one per non-loopback interface. The probe is
// best-effort and returns nil when the file cannot be opened, in which
// case the caller is expected to skip delta reporting for that tick.
func NetDev() []NetDevSnapshot {
	procFile, openErr := os.Open("/proc/net/dev")
	if openErr != nil {
		return nil
	}
	// Read-only handle: a Close error carries no useful information.
	defer func() { _ = procFile.Close() }()

	return parseNetDev(procFile)
}
// parseNetDev parses /proc/net/dev-formatted text from r; it is separate
// from NetDev so tests can supply a fixture. The first two lines are
// headers and are discarded. Each remaining row is
// "iface: <16 whitespace-separated counters>":
//
//	 0 rx_bytes  1 rx_packets  2 rx_errs  3 rx_drop  4 fifo  5 frame  6 compressed  7 multicast
//	 8 tx_bytes  9 tx_packets 10 tx_errs 11 tx_drop 12 fifo 13 colls 14 carrier    15 compressed
//
// Six of those counters are kept. Rows that are blank, lack a colon,
// name the loopback interface ("lo") or an empty interface, or have
// fewer than 16 counters are skipped. A counter that fails to parse is
// recorded as 0.
func parseNetDev(r io.Reader) []NetDevSnapshot {
	const (
		headerLines = 2
		minColumns  = 16

		colRxBytes = 0
		colRxErrs  = 2
		colRxDrop  = 3
		colTxBytes = 8
		colTxErrs  = 10
		colTxDrop  = 11
	)

	var snaps []NetDevSnapshot
	lines := bufio.NewScanner(r)
	for lineNo := 0; lines.Scan(); lineNo++ {
		if lineNo < headerLines {
			continue
		}

		name, counters, found := strings.Cut(strings.TrimSpace(lines.Text()), ":")
		if !found {
			continue
		}
		iface := strings.TrimSpace(name)
		if iface == "" || iface == "lo" {
			continue
		}
		cols := strings.Fields(counters)
		if len(cols) < minColumns {
			continue
		}

		snaps = append(snaps, NetDevSnapshot{
			Iface:   iface,
			RxBytes: parseU64(cols[colRxBytes]),
			RxErrs:  parseU64(cols[colRxErrs]),
			RxDrop:  parseU64(cols[colRxDrop]),
			TxBytes: parseU64(cols[colTxBytes]),
			TxErrs:  parseU64(cols[colTxErrs]),
			TxDrop:  parseU64(cols[colTxDrop]),
		})
	}
	return snaps
}
// parseU64 converts a base-10 string to uint64. Any parse failure
// (empty, negative, non-numeric, or out of range) yields 0 rather than
// an error.
func parseU64(s string) uint64 {
	if n, err := strconv.ParseUint(s, 10, 64); err == nil {
		return n
	}
	return 0
}
+84
View File
@@ -0,0 +1,84 @@
package probes
import (
"strings"
"testing"
)
// TestParseNetDev_RealSample runs parseNetDev over a synthetic
// /proc/net/dev fixture with the full 16-column layout. It confirms the
// two header lines are skipped, the loopback row is dropped, and each of
// the six retained eth0 counters lands in the right field.
func TestParseNetDev_RealSample(t *testing.T) {
	// Columns after "iface:":
	//  0 rx_bytes  1 rx_packets  2 rx_errs  3 rx_drop
	//  4 fifo      5 frame       6 compressed  7 multicast
	//  8 tx_bytes  9 tx_packets 10 tx_errs 11 tx_drop
	// 12 fifo     13 colls      14 carrier    15 compressed
	fixture := `Inter-| Receive | Transmit
face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed
lo: 1000000 10000 0 0 0 0 0 0 1000000 10000 0 0 0 0 0 0
eth0: 50000000 100000 7 12 0 0 0 0 40000000 90000 3 5 0 0 0 0
eth1: 12345 200 0 0 0 0 0 0 54321 180 0 0 0 0 0 0
`
	parsed := parseNetDev(strings.NewReader(fixture))
	if n := len(parsed); n != 2 {
		t.Fatalf("got %d snapshots, want 2 (lo should be dropped)", n)
	}

	// Locate eth0 and note whether loopback leaked through.
	var eth0 *NetDevSnapshot
	loSeen := false
	for i := range parsed {
		switch parsed[i].Iface {
		case "eth0":
			eth0 = &parsed[i]
		case "lo":
			loSeen = true
		}
	}
	if eth0 == nil {
		t.Fatalf("eth0 missing from parsed snapshots")
	}

	if got := eth0.RxBytes; got != 50000000 {
		t.Errorf("eth0 RxBytes=%d, want 50000000", got)
	}
	if got := eth0.RxErrs; got != 7 {
		t.Errorf("eth0 RxErrs=%d, want 7", got)
	}
	if got := eth0.RxDrop; got != 12 {
		t.Errorf("eth0 RxDrop=%d, want 12", got)
	}
	if got := eth0.TxBytes; got != 40000000 {
		t.Errorf("eth0 TxBytes=%d, want 40000000", got)
	}
	if got := eth0.TxErrs; got != 3 {
		t.Errorf("eth0 TxErrs=%d, want 3", got)
	}
	if got := eth0.TxDrop; got != 5 {
		t.Errorf("eth0 TxDrop=%d, want 5", got)
	}

	if loSeen {
		t.Errorf("lo should have been filtered out")
	}
}
// TestParseNetDev_Empty verifies that an empty reader yields no
// snapshots and does not crash. Callers treat that as "no data" and
// skip the delta step.
func TestParseNetDev_Empty(t *testing.T) {
	if n := len(parseNetDev(strings.NewReader(""))); n != 0 {
		t.Errorf("got %d snapshots from empty reader, want 0", n)
	}
}
// TestParseNetDev_MalformedRow verifies that a row with fewer than 16
// counters is skipped without a panic and does not hide the valid row
// that follows it.
func TestParseNetDev_MalformedRow(t *testing.T) {
	fixture := `header line 1
header line 2
bad0: 123 456
eth0: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
`
	parsed := parseNetDev(strings.NewReader(fixture))
	if n := len(parsed); n != 1 {
		t.Fatalf("got %d snapshots, want 1 (bad0 should be dropped)", n)
	}
	if name := parsed[0].Iface; name != "eth0" {
		t.Errorf("got iface=%q, want eth0", name)
	}
}