deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -8,26 +8,28 @@ import (
 )

 // node indexes for the default pipeline layout: pre-stages (3) + stage
-// rows (9) + terminal Completed (1) = 13 nodes.
+// rows (11) + terminal Completed (1) = 15 nodes.
 const (
 	idxQueued        = 0
 	idxWaitingReboot = 1
 	idxBooting       = 2
 	idxInventory     = 3
-	idxSpecValidate  = 4
-	idxSMART         = 5
-	idxCPUStress     = 6
-	idxStorage       = 7
-	idxNetwork       = 8
-	idxGPU           = 9
-	idxPSU           = 10
-	idxReporting     = 11
-	idxCompleted     = 12
+	idxFirmware      = 4
+	idxSpecValidate  = 5
+	idxSMART         = 6
+	idxCPUStress     = 7
+	idxStorage       = 8
+	idxNetwork       = 9
+	idxBurn          = 10
+	idxGPU           = 11
+	idxPSU           = 12
+	idxReporting     = 13
+	idxCompleted     = 14
 )

 // seedStages returns a fresh all-pending stage slice in the canonical order.
 func seedStages() []model.Stage {
-	names := []string{"Inventory", "SpecValidate", "SMART", "CPUStress", "Storage", "Network", "GPU", "PSU", "Reporting"}
+	names := []string{"Inventory", "Firmware", "SpecValidate", "SMART", "CPUStress", "Storage", "Network", "Burn", "GPU", "PSU", "Reporting"}
 	out := make([]model.Stage, len(names))
 	for i, n := range names {
 		out[i] = model.Stage{Name: n, Ordinal: i, State: model.StagePending}
@@ -37,10 +39,10 @@ func seedStages() []model.Stage {

 func TestBuildPipeline_NoRun(t *testing.T) {
 	nodes := BuildPipeline(nil, nil)
-	// Ghost pipeline: 3 pre-stages + 9 stage ghosts + 1 terminal = 13
+	// Ghost pipeline: 3 pre-stages + 11 stage ghosts + 1 terminal = 15
 	// nodes, all pending.
-	if len(nodes) != 13 {
-		t.Fatalf("len = %d, want 13", len(nodes))
+	if len(nodes) != 15 {
+		t.Fatalf("len = %d, want 15", len(nodes))
 	}
 	for i, n := range nodes {
 		if n.State != "pending" {
@@ -56,8 +58,8 @@ func TestBuildPipeline_NoRun(t *testing.T) {
 func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
 	run := &model.Run{State: model.StateWaitingReboot}
 	nodes := BuildPipeline(run, nil)
-	if len(nodes) != 13 {
-		t.Fatalf("len = %d, want 13", len(nodes))
+	if len(nodes) != 15 {
+		t.Fatalf("len = %d, want 15", len(nodes))
 	}
 	if nodes[idxQueued].State != "passed" {
 		t.Errorf("Queued = %q, want passed", nodes[idxQueued].State)
@@ -65,7 +67,7 @@ func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
 	if nodes[idxWaitingReboot].State != "running" {
 		t.Errorf("WaitingReboot = %q, want running", nodes[idxWaitingReboot].State)
 	}
-	// All 9 stage ghosts must be pending — nothing has started yet.
+	// All 11 stage ghosts must be pending — nothing has started yet.
 	for i := idxInventory; i <= idxReporting; i++ {
 		if nodes[i].State != "pending" {
 			t.Errorf("%s (ghost) = %q, want pending", nodes[i].Name, nodes[i].State)
@@ -81,19 +83,20 @@ func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
 // pending ghosts rather than silently disappearing.
 func TestBuildPipeline_GhostStagesDuringStage(t *testing.T) {
 	run := &model.Run{State: model.StateSMART}
-	// Only Inventory + SpecValidate seeded; SMART onwards are ghosts.
+	// Only Inventory + Firmware + SpecValidate seeded; SMART onwards are ghosts.
 	stages := []model.Stage{
 		{Name: "Inventory", Ordinal: 0, State: model.StagePassed},
-		{Name: "SpecValidate", Ordinal: 1, State: model.StagePassed},
+		{Name: "Firmware", Ordinal: 1, State: model.StagePassed},
+		{Name: "SpecValidate", Ordinal: 2, State: model.StagePassed},
 	}
 	nodes := BuildPipeline(run, stages)
-	if len(nodes) != 13 {
-		t.Fatalf("len = %d, want 13", len(nodes))
+	if len(nodes) != 15 {
+		t.Fatalf("len = %d, want 15", len(nodes))
 	}
 	if nodes[idxSMART].State != "running" {
 		t.Errorf("SMART (ghost) = %q, want running", nodes[idxSMART].State)
 	}
-	for _, i := range []int{idxCPUStress, idxStorage, idxNetwork, idxGPU, idxPSU, idxReporting} {
+	for _, i := range []int{idxCPUStress, idxStorage, idxNetwork, idxBurn, idxGPU, idxPSU, idxReporting} {
 		if nodes[i].State != "pending" {
 			t.Errorf("%s (ghost) = %q, want pending", nodes[i].Name, nodes[i].State)
 		}
@@ -103,12 +106,13 @@ func TestBuildPipeline_GhostStagesDuringStage(t *testing.T) {
 func TestBuildPipeline_Running(t *testing.T) {
 	run := &model.Run{State: model.StateSMART}
 	stages := seedStages()
-	stages[0].State = model.StagePassed
-	stages[1].State = model.StagePassed
-	stages[2].State = model.StageRunning
+	stages[0].State = model.StagePassed // Inventory
+	stages[1].State = model.StagePassed // Firmware
+	stages[2].State = model.StagePassed // SpecValidate
+	stages[3].State = model.StageRunning // SMART
 	nodes := BuildPipeline(run, stages)
-	if len(nodes) != 13 {
-		t.Fatalf("len = %d, want 13", len(nodes))
+	if len(nodes) != 15 {
+		t.Fatalf("len = %d, want 15", len(nodes))
 	}
 	// Pre-stages are all past for a run that has reached SMART.
 	for i := idxQueued; i <= idxBooting; i++ {
@@ -136,10 +140,10 @@ func TestBuildPipeline_Running(t *testing.T) {
 func TestBuildPipeline_Failed(t *testing.T) {
 	run := &model.Run{State: model.StateFailedHolding, FailedStage: "Storage"}
 	stages := seedStages()
-	for i := 0; i <= 3; i++ {
+	for i := 0; i <= 4; i++ {
 		stages[i].State = model.StagePassed
 	}
-	stages[4].State = model.StageFailed // Storage
+	stages[5].State = model.StageFailed // Storage
 	nodes := BuildPipeline(run, stages)
 	// Pre-stages are past a run that reached Storage.
 	for i := idxQueued; i <= idxBooting; i++ {
@@ -150,7 +154,7 @@ func TestBuildPipeline_Failed(t *testing.T) {
 	if nodes[idxStorage].State != "failed" {
 		t.Errorf("Storage = %q, want failed", nodes[idxStorage].State)
 	}
-	for _, i := range []int{idxNetwork, idxGPU, idxPSU, idxReporting} {
+	for _, i := range []int{idxNetwork, idxBurn, idxGPU, idxPSU, idxReporting} {
 		if nodes[i].State != "skipped" {
 			t.Errorf("%s = %q, want skipped", nodes[i].Name, nodes[i].State)
 		}