deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -0,0 +1,112 @@
+package tests
+
+import (
+	"testing"
+	"time"
+)
+
+// TestIsPSULabel pins the isPSULabel allowlist: narrow enough that CPU VRM
+// rails aren't misclassified as PSU-out-of-range failures, yet wide enough
+// that common SuperMicro/Intel hwmon labels are still accepted.
+func TestIsPSULabel(t *testing.T) {
+	cases := []struct {
+		label string
+		want  bool
+	}{
+		{"+12V", true},
+		{"12V", true},
+		{"+5V", true},
+		{"5V", true},
+		{"+3.3V", true},
+		{"3V3", true},
+		{"VCCIN", true},
+		{"vccin", true}, // lowercase spelling is expected to match as well
+		{"Vcore", false},
+		{"CPU VCORE", false},
+		{"AVCC", false},
+		{"", false}, // an empty label must never count as a PSU rail
+	}
+	for _, tc := range cases {
+		if got := isPSULabel(tc.label); got != tc.want {
+			t.Errorf("isPSULabel(%q) = %v, want %v", tc.label, got, tc.want)
+		}
+	}
+}
+
+// TestNominalFor checks the rail-label → nominal-voltage mapping.
+// Labels without a known nominal must return 0 so voltageInRange
+// short-circuits — an accidental nominal would invent out-of-range failures.
+func TestNominalFor(t *testing.T) {
+	cases := []struct {
+		label string
+		want  float64
+	}{
+		{"+12V", 12.0},
+		{"12V", 12.0},
+		{"+5V", 5.0},
+		{"+3.3V", 3.3},
+		{"3V3", 3.3},
+		{"VCCIN", 0}, // no nominal is expected for VCCIN
+		{"unknown", 0},
+	}
+	for _, tc := range cases {
+		if got := nominalFor(tc.label); got != tc.want { // exact comparison against the table's literal values
+			t.Errorf("nominalFor(%q) = %v, want %v", tc.label, got, tc.want)
+		}
+	}
+}
+
+// TestVoltageInRange verifies the ±10% band: 12V passes in [10.8,
+// 13.2] (both edges inclusive) and fails outside it. Labels with no
+// nominal (nominalFor is expected to return 0 for them) always pass.
+func TestVoltageInRange(t *testing.T) {
+	cases := []struct {
+		rail psuRail
+		ok   bool
+	}{
+		{psuRail{Label: "+12V", Volts: 12.0}, true},
+		{psuRail{Label: "+12V", Volts: 10.8}, true},  // exactly at the lower band edge
+		{psuRail{Label: "+12V", Volts: 13.2}, true},  // exactly at the upper band edge
+		{psuRail{Label: "+12V", Volts: 10.7}, false}, // just below
+		{psuRail{Label: "+12V", Volts: 13.3}, false}, // just above
+		{psuRail{Label: "+12V", Volts: 10.5}, false}, // real sag
+		{psuRail{Label: "+5V", Volts: 4.6}, true},    // 8% low on 5V still in band
+		{psuRail{Label: "+5V", Volts: 4.4}, false},   // 12% low on 5V — out of band
+		{psuRail{Label: "+5V", Volts: 5.0}, true},
+		{psuRail{Label: "VCCIN", Volts: 1.8}, true}, // unknown nominal → pass
+	}
+	for _, tc := range cases {
+		got, _ := voltageInRange(tc.rail) // only the first result is checked; the second is ignored
+		if got != tc.ok {
+			t.Errorf("voltageInRange(%+v) = %v, want %v", tc.rail, got, tc.ok)
+		}
+	}
+}
+
+// TestResolvePSUWindow maps stage timeouts to the sampling window.
+// Quick's 1m stage_timeout → 55s window; deep's 10m → 9m55s; 15m soak
+// and 1h → capped at 10m; zero/negative → 30s (test / legacy orchestrator
+// path); 35s or less → the 30s floor so aggregates are non-trivial.
+func TestResolvePSUWindow(t *testing.T) {
+	cases := []struct {
+		name string
+		in   time.Duration
+		want time.Duration
+	}{
+		{"zero → snapshot fallback", 0, 30 * time.Second},
+		{"negative → snapshot fallback", -1 * time.Second, 30 * time.Second},
+		{"tiny timeout clamps up to 30s floor", 10 * time.Second, 30 * time.Second},
+		{"35s - 5s = 30s", 35 * time.Second, 30 * time.Second},
+		{"1m quick → 55s", time.Minute, 55 * time.Second},
+		{"10m deep → 9m55s", 10 * time.Minute, 9*time.Minute + 55*time.Second},
+		{"15m soak → capped at 10m", 15 * time.Minute, 10 * time.Minute},
+		{"1h → capped at 10m", time.Hour, 10 * time.Minute},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) { // one subtest per row so a failure names the offending case
+			if got := resolvePSUWindow(tc.in); got != tc.want {
+				t.Errorf("resolvePSUWindow(%s) = %s, want %s", tc.in, got, tc.want)
+			}
+		})
+	}
+}