deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -75,6 +75,12 @@ func newCaptureRegistry(c *captureNotifier) *notify.Registry {
 // (agent, runID, plainTokenForBearer). Caller is responsible for
 // transitioning the run out of Queued.
 func fullAgent(t *testing.T) (*api.Agent, int64, string) {
+	return fullAgentWithSpec(t, "") // empty expected spec → no spec diffs
+}
+
+// fullAgentWithSpec is the same as fullAgent but seeds the host with
+// an ExpectedSpecYAML so SpecValidate can pick up diffs in the test.
+func fullAgentWithSpec(t *testing.T, expectedSpecYAML string) (*api.Agent, int64, string) {
 	t.Helper()
 	tmp := t.TempDir()
 	conn, err := db.Open(filepath.Join(tmp, "vetting.db"))
@@ -89,6 +95,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
 	artifactStore := &store.Artifacts{DB: conn}
 	specDiffStore := &store.SpecDiffs{DB: conn}
 	measurementStore := &store.Measurements{DB: conn}
+	firmwareStore := &store.Firmware{DB: conn}

 	hub := events.NewHub()
 	logHub, err := logs.NewHub(filepath.Join(tmp, "logs"), hub)
@@ -109,7 +116,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
 		MAC:              "aa:bb:cc:dd:ee:10",
 		WoLBroadcastIP:   "10.0.0.255",
 		WoLPort:          9,
-		ExpectedSpecYAML: "", // empty spec → no diffs
+		ExpectedSpecYAML: expectedSpecYAML,
 	})
 	if err != nil {
 		t.Fatalf("create host: %v", err)
@@ -132,6 +139,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
 		Artifacts:    artifactStore,
 		SpecDiffs:    specDiffStore,
 		Measurements: measurementStore,
+		Firmware:     firmwareStore,
 		Runner:       runner,
 		EventHub:     hub,
 		Logs:         logHub,
@@ -195,20 +203,24 @@ func TestFullPipelineToCompleted(t *testing.T) {
 		Memory: spec.MemorySpec{TotalGiB: 16},
 	}
 	next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv})
-	// After Inventory → SpecValidate resolves inline → SMART
-	if next != "SMART" {
-		t.Fatalf("after Inventory, next_state = %q, want SMART", next)
+	// After Inventory → Firmware
+	if next != "Firmware" {
+		t.Fatalf("after Inventory, next_state = %q, want Firmware", next)
 	}

-	// The remaining stages advance one-for-one in order.
+	// The remaining stages advance one-for-one in order. After Firmware
+	// the inline SpecValidate resolver advances through SpecValidate to
+	// SMART without a dedicated /result POST for SpecValidate.
 	walkPlan := []struct {
 		stage    string
 		expected string
 	}{
+		{"Firmware", "SMART"},
 		{"SMART", "CPUStress"},
 		{"CPUStress", "Storage"},
 		{"Storage", "Network"},
-		{"Network", "GPU"},
+		{"Network", "Burn"},
+		{"Burn", "GPU"},
 		{"GPU", "PSU"},
 		{"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed
 	}
@@ -287,8 +299,11 @@ func TestFaultInjectionSMART(t *testing.T) {
 	}

 	inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
-	if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "SMART" {
-		t.Fatalf("after Inventory, next = %q want SMART", next)
+	if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "Firmware" {
+		t.Fatalf("after Inventory, next = %q want Firmware", next)
+	}
+	if next := walkStage(t, a, runID, token, "Firmware", true, nil); next != "SMART" {
+		t.Fatalf("after Firmware, next = %q want SMART (inline SpecValidate)", next)
 	}

 	// Fake SMART failure → expect FailedHolding.
@@ -316,3 +331,76 @@ func TestFaultInjectionSMART(t *testing.T) {
 		t.Errorf("StageFailed severity = %q, want critical", ev.Severity)
 	}
 }
+
+// TestFirmwarePersistAndSpecMismatch covers the Phase 4 firmware path
+// end to end: a Firmware stage result is posted with one snapshot, the
+// snapshot is persisted, and the inline SpecValidate step records a firmware
+// diff and leaves the run in FailedHolding with FailedStage=SpecValidate.
+func TestFirmwarePersistAndSpecMismatch(t *testing.T) {
+	// Expected spec pins BIOS to 3.3; the Firmware result below reports 3.2, so one critical firmware diff is expected.
+	yaml := "firmware:\n  - component: bios\n    version: \"3.3\"\n"
+	a, runID, token := fullAgentWithSpec(t, yaml)
+	a.Notify = newCaptureRegistry(&captureNotifier{name: "capture"})
+
+	if err := a.Runs.SetState(context.Background(), runID, model.StateInventoryCheck); err != nil {
+		t.Fatalf("set state: %v", err)
+	}
+
+	inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
+	if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "Firmware" {
+		t.Fatalf("after Inventory, next = %q want Firmware", next)
+	}
+
+	// Firmware stage: report an actual BIOS version of 3.2; exactly one snapshot row should be persisted.
+	fw := []map[string]any{
+		{"component": "bios", "identifier": "system", "version": "3.2", "vendor": "AMI"},
+	}
+	next := walkStage(t, a, runID, token, "Firmware", true, map[string]any{"firmware": fw})
+	// The next state returned for the Firmware result must already be FailedHolding:
+	// SpecValidate is expected to run inline, so this test never posts a SpecValidate result.
+	if next != "FailedHolding" {
+		t.Fatalf("after Firmware mismatch, next = %q want FailedHolding", next)
+	}
+
+	run, err := a.Runs.Get(context.Background(), runID)
+	if err != nil {
+		t.Fatalf("get run: %v", err)
+	}
+	if run.State != model.StateFailedHolding {
+		t.Fatalf("run.State = %q, want FailedHolding", run.State)
+	}
+	if run.FailedStage != "SpecValidate" {
+		t.Fatalf("run.FailedStage = %q, want SpecValidate", run.FailedStage)
+	}
+
+	// Persistence: exactly one row landed in firmware_snapshots, carrying the reported component and version.
+	snaps, err := a.Firmware.ListForRun(context.Background(), runID)
+	if err != nil {
+		t.Fatalf("ListForRun firmware: %v", err)
+	}
+	if len(snaps) != 1 {
+		t.Fatalf("firmware rows = %d, want 1: %+v", len(snaps), snaps)
+	}
+	if snaps[0].Component != "bios" || snaps[0].Version != "3.2" {
+		t.Errorf("persisted snapshot = %+v", snaps[0])
+	}
+
+	// Diff rows: SpecDiffs must contain at least one entry whose Field starts with
+	// "firmware[" (not only CPU/memory/disk rows), and every such entry must be critical.
+	diffs, err := a.SpecDiffs.ListForRun(context.Background(), runID)
+	if err != nil {
+		t.Fatalf("ListForRun specdiffs: %v", err)
+	}
+	found := false
+	for _, d := range diffs {
+		if strings.HasPrefix(d.Field, "firmware[") {
+			found = true
+			if d.Severity != "critical" {
+				t.Errorf("firmware diff severity = %q, want critical", d.Severity)
+			}
+		}
+	}
+	if !found {
+		t.Fatalf("no firmware[...] entry in spec diffs: %+v", diffs)
+	}
+}