Files
Vetting/agent/tests/psu_test.go
T
josh 23c689aa5b
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled
deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00

113 lines
3.4 KiB
Go

package tests
import (
"testing"
"time"
)
// TestIsPSULabel pins the PSU-rail label allowlist. It has to stay wide
// enough that common SuperMicro/Intel hwmon labels (12V, 5V, 3.3V with
// or without a leading "+", and VCCIN in either case) are accepted, yet
// narrow enough that CPU VRM rails such as Vcore or AVCC are rejected
// instead of being misreported as PSU-out-of-range failures.
func TestIsPSULabel(t *testing.T) {
	type labelCase struct {
		label string
		want  bool
	}
	table := []labelCase{
		// Labels that must be classified as PSU rails.
		{label: "+12V", want: true},
		{label: "12V", want: true},
		{label: "+5V", want: true},
		{label: "5V", want: true},
		{label: "+3.3V", want: true},
		{label: "3V3", want: true},
		{label: "VCCIN", want: true},
		{label: "vccin", want: true},
		// Labels that must NOT be classified as PSU rails.
		{label: "Vcore", want: false},
		{label: "CPU VCORE", want: false},
		{label: "AVCC", want: false},
		{label: "", want: false},
	}
	for i := range table {
		c := table[i]
		got := isPSULabel(c.label)
		if got == c.want {
			continue
		}
		t.Errorf("isPSULabel(%q) = %v, want %v", c.label, got, c.want)
	}
}
// TestNominalFor checks the rail-label → nominal-voltage lookup. Labels
// without a known nominal (VCCIN, arbitrary strings) are expected to
// yield 0; per the voltageInRange test, a 0 nominal means the rail is
// not range-checked, so a made-up nominal here would manufacture
// out-of-range failures.
func TestNominalFor(t *testing.T) {
	type nominalCase struct {
		label   string
		nominal float64
	}
	table := []nominalCase{
		{label: "+12V", nominal: 12.0},
		{label: "12V", nominal: 12.0},
		{label: "+5V", nominal: 5.0},
		{label: "+3.3V", nominal: 3.3},
		{label: "3V3", nominal: 3.3},
		// No nominal known for these: the lookup must report 0.
		{label: "VCCIN", nominal: 0},
		{label: "unknown", nominal: 0},
	}
	for i := range table {
		c := table[i]
		got := nominalFor(c.label)
		if got == c.nominal {
			continue
		}
		t.Errorf("nominalFor(%q) = %v, want %v", c.label, got, c.nominal)
	}
}
// TestVoltageInRange pins the ±10% acceptance band around a rail's
// nominal voltage: a 12V rail is accepted anywhere in [10.8, 13.2]
// (both edges inclusive) and rejected outside it. A rail whose label
// has no nominal (VCCIN here) is always accepted.
//
// Only the first (boolean) result of voltageInRange is asserted; the
// second result is deliberately ignored by this test.
func TestVoltageInRange(t *testing.T) {
	type bandCase struct {
		in     psuRail
		inBand bool
	}
	table := []bandCase{
		{in: psuRail{Label: "+12V", Volts: 12.0}, inBand: true},
		{in: psuRail{Label: "+12V", Volts: 10.8}, inBand: true},  // lower edge, inclusive
		{in: psuRail{Label: "+12V", Volts: 13.2}, inBand: true},  // upper edge, inclusive
		{in: psuRail{Label: "+12V", Volts: 10.7}, inBand: false}, // just under the band
		{in: psuRail{Label: "+12V", Volts: 13.3}, inBand: false}, // just over the band
		{in: psuRail{Label: "+12V", Volts: 10.5}, inBand: false}, // genuine sag
		{in: psuRail{Label: "+5V", Volts: 4.6}, inBand: true},    // 8% low on 5V: inside the band
		{in: psuRail{Label: "+5V", Volts: 4.4}, inBand: false},   // 12% low on 5V: outside the band
		{in: psuRail{Label: "+5V", Volts: 5.0}, inBand: true},
		{in: psuRail{Label: "VCCIN", Volts: 1.8}, inBand: true}, // no nominal → accepted
	}
	for i := range table {
		c := table[i]
		got, _ := voltageInRange(c.in)
		if got == c.inBand {
			continue
		}
		t.Errorf("voltageInRange(%+v) = %v, want %v", c.in, got, c.inBand)
	}
}
// TestResolvePSUWindow pins how a stage timeout maps to the PSU
// sampling window, as asserted by the table below:
//
//   - zero or negative timeout → 30s fallback (test / legacy
//     orchestrator path);
//   - otherwise the window is the timeout minus 5s, e.g. quick's 1m →
//     55s and deep's 10m → 9m55s;
//   - never below 30s, so short timeouts still yield non-trivial
//     aggregates (10s → 30s, 35s → 30s);
//   - never above 10m (soak's 15m and 1h both → 10m).
func TestResolvePSUWindow(t *testing.T) {
	const (
		floor   = 30 * time.Second
		ceiling = 10 * time.Minute
	)
	type windowCase struct {
		name    string
		timeout time.Duration
		window  time.Duration
	}
	table := []windowCase{
		{name: "zero → snapshot fallback", timeout: 0, window: floor},
		{name: "negative → snapshot fallback", timeout: -1 * time.Second, window: floor},
		{name: "tiny timeout clamps up to 30s floor", timeout: 10 * time.Second, window: floor},
		{name: "35s - 5s = 30s", timeout: 35 * time.Second, window: floor},
		{name: "1m quick → 55s", timeout: time.Minute, window: 55 * time.Second},
		{name: "10m deep → 9m55s", timeout: 10 * time.Minute, window: 9*time.Minute + 55*time.Second},
		{name: "15m soak → capped at 10m", timeout: 15 * time.Minute, window: ceiling},
		{name: "1h → capped at 10m", timeout: time.Hour, window: ceiling},
	}
	for i := range table {
		c := table[i]
		t.Run(c.name, func(t *testing.T) {
			got := resolvePSUWindow(c.timeout)
			if got == c.window {
				return
			}
			t.Errorf("resolvePSUWindow(%s) = %s, want %s", c.timeout, got, c.window)
		})
	}
}