deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,112 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestIsPSULabel keeps the allowlist narrow enough that CPU VRM rails
|
||||
// don't get misclassified as PSU-out-of-range failures but wide enough
|
||||
// that common SuperMicro/Intel hwmon labels land in the Yes bucket.
|
||||
func TestIsPSULabel(t *testing.T) {
|
||||
cases := []struct {
|
||||
label string
|
||||
want bool
|
||||
}{
|
||||
{"+12V", true},
|
||||
{"12V", true},
|
||||
{"+5V", true},
|
||||
{"5V", true},
|
||||
{"+3.3V", true},
|
||||
{"3V3", true},
|
||||
{"VCCIN", true},
|
||||
{"vccin", true},
|
||||
{"Vcore", false},
|
||||
{"CPU VCORE", false},
|
||||
{"AVCC", false},
|
||||
{"", false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
if got := isPSULabel(tc.label); got != tc.want {
|
||||
t.Errorf("isPSULabel(%q) = %v, want %v", tc.label, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestNominalFor maps rail labels back to expected nominal voltages.
|
||||
// Unknown labels must return 0 so voltageInRange short-circuits — an
|
||||
// accidental nominal would invent out-of-range failures.
|
||||
func TestNominalFor(t *testing.T) {
|
||||
cases := []struct {
|
||||
label string
|
||||
want float64
|
||||
}{
|
||||
{"+12V", 12.0},
|
||||
{"12V", 12.0},
|
||||
{"+5V", 5.0},
|
||||
{"+3.3V", 3.3},
|
||||
{"3V3", 3.3},
|
||||
{"VCCIN", 0},
|
||||
{"unknown", 0},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
if got := nominalFor(tc.label); got != tc.want {
|
||||
t.Errorf("nominalFor(%q) = %v, want %v", tc.label, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestVoltageInRange verifies the ±10% band: 12V passes in [10.8,
|
||||
// 13.2], fails anywhere outside. Unknown labels always pass (since
|
||||
// nominalFor returned 0 above).
|
||||
func TestVoltageInRange(t *testing.T) {
|
||||
cases := []struct {
|
||||
rail psuRail
|
||||
ok bool
|
||||
}{
|
||||
{psuRail{Label: "+12V", Volts: 12.0}, true},
|
||||
{psuRail{Label: "+12V", Volts: 10.8}, true}, // exactly at the band
|
||||
{psuRail{Label: "+12V", Volts: 13.2}, true}, // exactly at the band
|
||||
{psuRail{Label: "+12V", Volts: 10.7}, false}, // just below
|
||||
{psuRail{Label: "+12V", Volts: 13.3}, false}, // just above
|
||||
{psuRail{Label: "+12V", Volts: 10.5}, false}, // real sag
|
||||
{psuRail{Label: "+5V", Volts: 4.6}, true}, // 8% low on 5V still in band
|
||||
{psuRail{Label: "+5V", Volts: 4.4}, false}, // 12% low on 5V — out of band
|
||||
{psuRail{Label: "+5V", Volts: 5.0}, true},
|
||||
{psuRail{Label: "VCCIN", Volts: 1.8}, true}, // unknown nominal → pass
|
||||
}
|
||||
for _, tc := range cases {
|
||||
got, _ := voltageInRange(tc.rail)
|
||||
if got != tc.ok {
|
||||
t.Errorf("voltageInRange(%+v) = %v, want %v", tc.rail, got, tc.ok)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolvePSUWindow maps stage timeouts to the sampling window.
|
||||
// Quick's 1m stage_timeout → 55s window; deep's 10m → capped at 10m;
|
||||
// missing/zero → 30s (test / legacy orchestrator path); sub-35s → at
|
||||
// least 30s so aggregates are non-trivial.
|
||||
func TestResolvePSUWindow(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
in time.Duration
|
||||
want time.Duration
|
||||
}{
|
||||
{"zero → snapshot fallback", 0, 30 * time.Second},
|
||||
{"negative → snapshot fallback", -1 * time.Second, 30 * time.Second},
|
||||
{"tiny timeout clamps up to 30s floor", 10 * time.Second, 30 * time.Second},
|
||||
{"35s - 5s = 30s", 35 * time.Second, 30 * time.Second},
|
||||
{"1m quick → 55s", time.Minute, 55 * time.Second},
|
||||
{"10m deep → 9m55s", 10 * time.Minute, 9*time.Minute + 55*time.Second},
|
||||
{"15m soak → capped at 10m", 15 * time.Minute, 10 * time.Minute},
|
||||
{"1h → capped at 10m", time.Hour, 10 * time.Minute},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := resolvePSUWindow(tc.in); got != tc.want {
|
||||
t.Errorf("resolvePSUWindow(%s) = %s, want %s", tc.in, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user