deep profile + threshold gating + firmware stage + Burn super-stage
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled

Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
+34 -30
View File
@@ -8,26 +8,28 @@ import (
)
// node indexes for the default pipeline layout: pre-stages (3) + stage
// rows (9) + terminal Completed (1) = 13 nodes.
// rows (11) + terminal Completed (1) = 15 nodes.
const (
idxQueued = 0
idxWaitingReboot = 1
idxBooting = 2
idxInventory = 3
idxSpecValidate = 4
idxSMART = 5
idxCPUStress = 6
idxStorage = 7
idxNetwork = 8
idxGPU = 9
idxPSU = 10
idxReporting = 11
idxCompleted = 12
idxFirmware = 4
idxSpecValidate = 5
idxSMART = 6
idxCPUStress = 7
idxStorage = 8
idxNetwork = 9
idxBurn = 10
idxGPU = 11
idxPSU = 12
idxReporting = 13
idxCompleted = 14
)
// seedStages returns a fresh all-pending stage slice in the canonical order.
func seedStages() []model.Stage {
names := []string{"Inventory", "SpecValidate", "SMART", "CPUStress", "Storage", "Network", "GPU", "PSU", "Reporting"}
names := []string{"Inventory", "Firmware", "SpecValidate", "SMART", "CPUStress", "Storage", "Network", "Burn", "GPU", "PSU", "Reporting"}
out := make([]model.Stage, len(names))
for i, n := range names {
out[i] = model.Stage{Name: n, Ordinal: i, State: model.StagePending}
@@ -37,10 +39,10 @@ func seedStages() []model.Stage {
func TestBuildPipeline_NoRun(t *testing.T) {
nodes := BuildPipeline(nil, nil)
// Ghost pipeline: 3 pre-stages + 9 stage ghosts + 1 terminal = 13
// Ghost pipeline: 3 pre-stages + 11 stage ghosts + 1 terminal = 15
// nodes, all pending.
if len(nodes) != 13 {
t.Fatalf("len = %d, want 13", len(nodes))
if len(nodes) != 15 {
t.Fatalf("len = %d, want 15", len(nodes))
}
for i, n := range nodes {
if n.State != "pending" {
@@ -56,8 +58,8 @@ func TestBuildPipeline_NoRun(t *testing.T) {
func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
run := &model.Run{State: model.StateWaitingReboot}
nodes := BuildPipeline(run, nil)
if len(nodes) != 13 {
t.Fatalf("len = %d, want 13", len(nodes))
if len(nodes) != 15 {
t.Fatalf("len = %d, want 15", len(nodes))
}
if nodes[idxQueued].State != "passed" {
t.Errorf("Queued = %q, want passed", nodes[idxQueued].State)
@@ -65,7 +67,7 @@ func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
if nodes[idxWaitingReboot].State != "running" {
t.Errorf("WaitingReboot = %q, want running", nodes[idxWaitingReboot].State)
}
// All 9 stage ghosts must be pending — nothing has started yet.
// All 11 stage ghosts must be pending — nothing has started yet.
for i := idxInventory; i <= idxReporting; i++ {
if nodes[i].State != "pending" {
t.Errorf("%s (ghost) = %q, want pending", nodes[i].Name, nodes[i].State)
@@ -81,19 +83,20 @@ func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
// pending ghosts rather than silently disappearing.
func TestBuildPipeline_GhostStagesDuringStage(t *testing.T) {
run := &model.Run{State: model.StateSMART}
// Only Inventory + SpecValidate seeded; SMART onwards are ghosts.
// Only Inventory + Firmware + SpecValidate seeded; SMART onwards are ghosts.
stages := []model.Stage{
{Name: "Inventory", Ordinal: 0, State: model.StagePassed},
{Name: "SpecValidate", Ordinal: 1, State: model.StagePassed},
{Name: "Firmware", Ordinal: 1, State: model.StagePassed},
{Name: "SpecValidate", Ordinal: 2, State: model.StagePassed},
}
nodes := BuildPipeline(run, stages)
if len(nodes) != 13 {
t.Fatalf("len = %d, want 13", len(nodes))
if len(nodes) != 15 {
t.Fatalf("len = %d, want 15", len(nodes))
}
if nodes[idxSMART].State != "running" {
t.Errorf("SMART (ghost) = %q, want running", nodes[idxSMART].State)
}
for _, i := range []int{idxCPUStress, idxStorage, idxNetwork, idxGPU, idxPSU, idxReporting} {
for _, i := range []int{idxCPUStress, idxStorage, idxNetwork, idxBurn, idxGPU, idxPSU, idxReporting} {
if nodes[i].State != "pending" {
t.Errorf("%s (ghost) = %q, want pending", nodes[i].Name, nodes[i].State)
}
@@ -103,12 +106,13 @@ func TestBuildPipeline_GhostStagesDuringStage(t *testing.T) {
func TestBuildPipeline_Running(t *testing.T) {
run := &model.Run{State: model.StateSMART}
stages := seedStages()
stages[0].State = model.StagePassed
stages[1].State = model.StagePassed
stages[2].State = model.StageRunning
stages[0].State = model.StagePassed // Inventory
stages[1].State = model.StagePassed // Firmware
stages[2].State = model.StagePassed // SpecValidate
stages[3].State = model.StageRunning // SMART
nodes := BuildPipeline(run, stages)
if len(nodes) != 13 {
t.Fatalf("len = %d, want 13", len(nodes))
if len(nodes) != 15 {
t.Fatalf("len = %d, want 15", len(nodes))
}
// Pre-stages are all past for a run that has reached SMART.
for i := idxQueued; i <= idxBooting; i++ {
@@ -136,10 +140,10 @@ func TestBuildPipeline_Running(t *testing.T) {
func TestBuildPipeline_Failed(t *testing.T) {
run := &model.Run{State: model.StateFailedHolding, FailedStage: "Storage"}
stages := seedStages()
for i := 0; i <= 3; i++ {
for i := 0; i <= 4; i++ {
stages[i].State = model.StagePassed
}
stages[4].State = model.StageFailed // Storage
stages[5].State = model.StageFailed // Storage
nodes := BuildPipeline(run, stages)
// Pre-stages are all past for a run that reached Storage.
for i := idxQueued; i <= idxBooting; i++ {
@@ -150,7 +154,7 @@ func TestBuildPipeline_Failed(t *testing.T) {
if nodes[idxStorage].State != "failed" {
t.Errorf("Storage = %q, want failed", nodes[idxStorage].State)
}
for _, i := range []int{idxNetwork, idxGPU, idxPSU, idxReporting} {
for _, i := range []int{idxNetwork, idxBurn, idxGPU, idxPSU, idxReporting} {
if nodes[i].State != "skipped" {
t.Errorf("%s = %q, want skipped", nodes[i].Name, nodes[i].State)
}