deep profile + threshold gating + firmware stage + Burn super-stage
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled

Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; a threshold breach flips the
result at /sensor and /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
+96 -8
View File
@@ -75,6 +75,12 @@ func newCaptureRegistry(c *captureNotifier) *notify.Registry {
// (agent, runID, plainTokenForBearer). Caller is responsible for
// transitioning the run out of Queued.
func fullAgent(t *testing.T) (*api.Agent, int64, string) {
	// Mark this wrapper as a helper as well: fullAgentWithSpec already
	// calls t.Helper(), so without this a setup failure would be reported
	// at the line below instead of in the test that asked for the agent.
	t.Helper()
	// Empty expected spec → no diffs for SpecValidate to report.
	return fullAgentWithSpec(t, "")
}
// fullAgentWithSpec is the same as fullAgent but seeds the host with
// an ExpectedSpecYAML so SpecValidate can pick up diffs in the test.
func fullAgentWithSpec(t *testing.T, expectedSpecYAML string) (*api.Agent, int64, string) {
t.Helper()
tmp := t.TempDir()
conn, err := db.Open(filepath.Join(tmp, "vetting.db"))
@@ -89,6 +95,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
artifactStore := &store.Artifacts{DB: conn}
specDiffStore := &store.SpecDiffs{DB: conn}
measurementStore := &store.Measurements{DB: conn}
firmwareStore := &store.Firmware{DB: conn}
hub := events.NewHub()
logHub, err := logs.NewHub(filepath.Join(tmp, "logs"), hub)
@@ -109,7 +116,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
MAC: "aa:bb:cc:dd:ee:10",
WoLBroadcastIP: "10.0.0.255",
WoLPort: 9,
ExpectedSpecYAML: "", // empty spec → no diffs
ExpectedSpecYAML: expectedSpecYAML,
})
if err != nil {
t.Fatalf("create host: %v", err)
@@ -132,6 +139,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
Artifacts: artifactStore,
SpecDiffs: specDiffStore,
Measurements: measurementStore,
Firmware: firmwareStore,
Runner: runner,
EventHub: hub,
Logs: logHub,
@@ -195,20 +203,24 @@ func TestFullPipelineToCompleted(t *testing.T) {
Memory: spec.MemorySpec{TotalGiB: 16},
}
next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv})
// After Inventory → SpecValidate resolves inline → SMART
if next != "SMART" {
t.Fatalf("after Inventory, next_state = %q, want SMART", next)
// After Inventory → Firmware
if next != "Firmware" {
t.Fatalf("after Inventory, next_state = %q, want Firmware", next)
}
// The remaining stages advance one-for-one in order.
// The remaining stages advance one-for-one in order. After Firmware
// the inline SpecValidate resolver advances through SpecValidate to
// SMART without a dedicated /result POST for SpecValidate.
walkPlan := []struct {
stage string
expected string
}{
{"Firmware", "SMART"},
{"SMART", "CPUStress"},
{"CPUStress", "Storage"},
{"Storage", "Network"},
{"Network", "GPU"},
{"Network", "Burn"},
{"Burn", "GPU"},
{"GPU", "PSU"},
{"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed
}
@@ -287,8 +299,11 @@ func TestFaultInjectionSMART(t *testing.T) {
}
inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "SMART" {
t.Fatalf("after Inventory, next = %q want SMART", next)
if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "Firmware" {
t.Fatalf("after Inventory, next = %q want Firmware", next)
}
if next := walkStage(t, a, runID, token, "Firmware", true, nil); next != "SMART" {
t.Fatalf("after Firmware, next = %q want SMART (inline SpecValidate)", next)
}
// Fake SMART failure → expect FailedHolding.
@@ -316,3 +331,76 @@ func TestFaultInjectionSMART(t *testing.T) {
t.Errorf("StageFailed severity = %q, want critical", ev.Severity)
}
}
// TestFirmwarePersistAndSpecMismatch exercises the Phase 4 firmware
// integration: the agent POSTs Firmware snapshots; the server persists
// them; the following inline SpecValidate diff picks up a firmware
// mismatch and parks the run in FailedHolding with
// FailedStage=SpecValidate.
func TestFirmwarePersistAndSpecMismatch(t *testing.T) {
	ctx := context.Background()

	// Host demands BIOS 3.3; agent will POST 3.2 → a critical firmware
	// diff. The spec is assembled line by line so the YAML nesting is
	// explicit: "version" must be indented under the same list item as
	// "component", otherwise the document is not a valid sequence entry.
	expectedSpec := "firmware:\n" +
		"  - component: bios\n" +
		"    version: \"3.3\"\n"
	a, runID, token := fullAgentWithSpec(t, expectedSpec)
	a.Notify = newCaptureRegistry(&captureNotifier{name: "capture"})

	// fullAgent leaves the run Queued; move it into the first stage.
	if err := a.Runs.SetState(ctx, runID, model.StateInventoryCheck); err != nil {
		t.Fatalf("set state: %v", err)
	}

	inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
	if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "Firmware" {
		t.Fatalf("after Inventory, next = %q want Firmware", next)
	}

	// Firmware stage: agent reports actual BIOS 3.2 → one row persisted.
	fw := []map[string]any{
		{"component": "bios", "identifier": "system", "version": "3.2", "vendor": "AMI"},
	}
	next := walkStage(t, a, runID, token, "Firmware", true, map[string]any{"firmware": fw})
	// Inline SpecValidate should detect the firmware mismatch and send
	// the run to FailedHolding without the agent posting SpecValidate.
	if next != "FailedHolding" {
		t.Fatalf("after Firmware mismatch, next = %q want FailedHolding", next)
	}

	run, err := a.Runs.Get(ctx, runID)
	if err != nil {
		t.Fatalf("get run: %v", err)
	}
	if run.State != model.StateFailedHolding {
		t.Fatalf("run.State = %q, want FailedHolding", run.State)
	}
	if run.FailedStage != "SpecValidate" {
		t.Fatalf("run.FailedStage = %q, want SpecValidate", run.FailedStage)
	}

	// Persistence: row landed in firmware_snapshots.
	snaps, err := a.Firmware.ListForRun(ctx, runID)
	if err != nil {
		t.Fatalf("ListForRun firmware: %v", err)
	}
	if len(snaps) != 1 {
		t.Fatalf("firmware rows = %d, want 1: %+v", len(snaps), snaps)
	}
	if snaps[0].Component != "bios" || snaps[0].Version != "3.2" {
		t.Errorf("persisted snapshot = %+v", snaps[0])
	}

	// Diff row: SpecDiffs has a firmware-specific entry (rather than
	// only CPU/memory/disk rows) and is critical.
	diffs, err := a.SpecDiffs.ListForRun(ctx, runID)
	if err != nil {
		t.Fatalf("ListForRun specdiffs: %v", err)
	}
	found := false
	for _, d := range diffs {
		if !strings.HasPrefix(d.Field, "firmware[") {
			continue
		}
		found = true
		if d.Severity != "critical" {
			t.Errorf("firmware diff severity = %q, want critical", d.Severity)
		}
	}
	if !found {
		t.Fatalf("no firmware[...] entry in spec diffs: %+v", diffs)
	}
}