deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -119,9 +119,9 @@ func (d *Dispatcher) pickNext(ctx context.Context) {
 				queued = &runs[i]
 			}
 		case model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting,
-			model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART,
+			model.StateInventoryCheck, model.StateFirmware, model.StateSpecValidate, model.StateSMART,
 			model.StateCPUStress, model.StateStorage, model.StateNetwork,
-			model.StateGPU, model.StatePSU, model.StateReporting:
+			model.StateBurn, model.StateGPU, model.StatePSU, model.StateReporting:
 			inFlight++
 		}
 	}
@@ -30,11 +30,13 @@ const (
 // "InventoryCheck". Later stages share a name with their state.
 var stageStates = map[string]model.RunState{
 	"Inventory":    model.StateInventoryCheck,
+	"Firmware":     model.StateFirmware,
 	"SpecValidate": model.StateSpecValidate,
 	"SMART":        model.StateSMART,
 	"CPUStress":    model.StateCPUStress,
 	"Storage":      model.StateStorage,
 	"Network":      model.StateNetwork,
+	"Burn":         model.StateBurn,
 	"GPU":          model.StateGPU,
 	"PSU":          model.StatePSU,
 	"Reporting":    model.StateReporting,
@@ -44,11 +46,13 @@ var stageStates = map[string]model.RunState{
 // first stage to Completed. Kept in sync with store.DefaultStageOrder.
 var stageOrder = []model.RunState{
 	model.StateInventoryCheck,
+	model.StateFirmware,
 	model.StateSpecValidate,
 	model.StateSMART,
 	model.StateCPUStress,
 	model.StateStorage,
 	model.StateNetwork,
+	model.StateBurn,
 	model.StateGPU,
 	model.StatePSU,
 	model.StateReporting,
@@ -143,9 +147,9 @@ func nextStageState(current model.RunState) (model.RunState, error) {
 func allActiveStates() []model.RunState {
 	return []model.RunState{
 		model.StateQueued, model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting,
-		model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART,
+		model.StateInventoryCheck, model.StateFirmware, model.StateSpecValidate, model.StateSMART,
 		model.StateCPUStress, model.StateStorage, model.StateNetwork,
-		model.StateGPU, model.StatePSU, model.StateReporting,
+		model.StateBurn, model.StateGPU, model.StatePSU, model.StateReporting,
 	}
 }

@@ -80,11 +80,13 @@ func TestTriggerAgentClaimedFromWaitingReboot(t *testing.T) {
 func TestTriggerStageMismatch(t *testing.T) {
 	stageStates := []model.RunState{
 		model.StateInventoryCheck,
+		model.StateFirmware,
 		model.StateSpecValidate,
 		model.StateSMART,
 		model.StateCPUStress,
 		model.StateStorage,
 		model.StateNetwork,
+		model.StateBurn,
 		model.StateGPU,
 		model.StatePSU,
 		model.StateReporting,
@@ -114,11 +116,13 @@ func TestTriggerStageMismatch(t *testing.T) {
 func TestStageNameForState(t *testing.T) {
 	pairs := map[string]model.RunState{
 		"Inventory":    model.StateInventoryCheck,
+		"Firmware":     model.StateFirmware,
 		"SpecValidate": model.StateSpecValidate,
 		"SMART":        model.StateSMART,
 		"CPUStress":    model.StateCPUStress,
 		"Storage":      model.StateStorage,
 		"Network":      model.StateNetwork,
+		"Burn":         model.StateBurn,
 		"GPU":          model.StateGPU,
 		"PSU":          model.StatePSU,
 		"Reporting":    model.StateReporting,
@@ -143,11 +147,13 @@ func TestNextStageWalk(t *testing.T) {
 	// one in the canonical order, and from Reporting onto Completed.
 	chain := []model.RunState{
 		model.StateInventoryCheck,
+		model.StateFirmware,
 		model.StateSpecValidate,
 		model.StateSMART,
 		model.StateCPUStress,
 		model.StateStorage,
 		model.StateNetwork,
+		model.StateBurn,
 		model.StateGPU,
 		model.StatePSU,
 		model.StateReporting,
@@ -0,0 +1,182 @@
+package orchestrator
+
+import (
+	"fmt"
+	"strings"
+)
+
+// ThresholdOp names the comparison a threshold applies to a sample.
+// within_pct is the only one that cares about a "nominal" value for
+// the key — used for PSU rails ("+12V within 5% of 12.0").
+type ThresholdOp string
+
+const (
+	OpLT        ThresholdOp = "lt"         // passes when observed < Value
+	OpLTE       ThresholdOp = "lte"        // passes when observed <= Value
+	OpGT        ThresholdOp = "gt"         // passes when observed > Value
+	OpGTE       ThresholdOp = "gte"        // passes when observed >= Value
+	OpWithinPct ThresholdOp = "within_pct" // passes within Value percent of Nominal
+)
+
+// ThresholdSeverity routes a breach to either "fail the run" or "just
+// surface a warning in the report". Evaluate reports it through
+// EvalResult.Threshold.Severity; CriticalBreach checks it for callers.
+type ThresholdSeverity string
+
+const (
+	SeverityCritical ThresholdSeverity = "critical"
+	SeverityWarning  ThresholdSeverity = "warning"
+)
+
+// Threshold is the evaluator's view of a stored threshold row. It's a
+// flat, already-parsed value-object — the evaluator doesn't look at
+// the DB and the store doesn't look at the evaluator.
+type Threshold struct {
+	ID       int64
+	Stage    string // "" or "*" matches any stage; anything else is exact
+	Kind     string // compared literally against Sample.Kind
+	Key      string // glob-ish: "*" / "prefix*" / "*suffix" / "*infix*" / exact
+	Op       ThresholdOp
+	Value    float64 // comparison bound; a percentage when Op is within_pct
+	Nominal  float64 // for within_pct (nominal voltage/frequency)
+	Severity ThresholdSeverity
+}
+
+// Sample is a single observation the evaluator tests against matching
+// thresholds. Stage may be empty when the agent doesn't know which
+// stage posted it (e.g. the thermal sidecar running across stages) —
+// empty-stage samples only match thresholds with Stage "*" or "".
+type Sample struct {
+	Stage string  // reporting stage; "" if unknown
+	Kind  string  // must equal Threshold.Kind exactly
+	Key   string  // tested against Threshold.Key's pattern
+	Value float64 // observed reading
+}
+
+// EvalResult is the outcome of testing one sample against one matching
+// threshold: the rule consulted (which carries the severity), whether
+// the sample passed, and the value that was observed.
+type EvalResult struct {
+	Threshold Threshold
+	Passed    bool
+	Observed  float64
+}
+
+// Breached reports whether the sample violated the threshold (!Passed).
+func (r EvalResult) Breached() bool { return !r.Passed }
+
+// CriticalBreach reports whether this is a breach of a rule whose
+// Severity is SeverityCritical — the "fail the run right now" case.
+func (r EvalResult) CriticalBreach() bool {
+	return r.Breached() && r.Threshold.Severity == SeverityCritical
+}
+
+// Evaluate runs a single sample through every threshold that applies
+// to it and returns one EvalResult per match, in thresholds order. A
+// sample may match several rules (a generic "*" rule + a stage-specific
+// one); with no match the result is an empty, non-nil slice.
+func Evaluate(sample Sample, thresholds []Threshold) []EvalResult {
+	out := make([]EvalResult, 0, 1)
+	for _, t := range thresholds {
+		if !thresholdMatchesSample(t, sample) {
+			continue
+		}
+		passed, err := evaluateOp(t.Op, sample.Value, t.Value, t.Nominal)
+		if err != nil {
+			// Unknown operator — skip this threshold entirely, so it
+			// yields no EvalResult. The caller could validate on insert;
+			// erroring here would force every Sensor write to 500.
+			continue
+		}
+		out = append(out, EvalResult{
+			Threshold: t,
+			Passed:    passed,
+			Observed:  sample.Value,
+		})
+	}
+	return out
+}
+
+// thresholdMatchesSample applies the stage + kind + key filter. Kind
+// is always literal — there's no "any kind" threshold and if there
+// ever is we'll add a `kind: *` escape hatch. Stage is wildcard-or-exact
+// (stageMatches); only the key supports glob-ish patterns (keyMatches).
+func thresholdMatchesSample(t Threshold, s Sample) bool {
+	if t.Kind != s.Kind {
+		return false
+	}
+	if !stageMatches(t.Stage, s.Stage) {
+		return false
+	}
+	if !keyMatches(t.Key, s.Key) {
+		return false
+	}
+	return true
+}
+
+// stageMatches reports whether the threshold's stage selector applies
+// to the sample's stage. "*" matches everything; an empty selector is
+// treated as "*" so a threshold declared without a stage key isn't
+// accidentally inert. Any other selector must equal the sample's stage
+// exactly, so a sample without a stage only matches the wildcard.
+func stageMatches(selector, sampleStage string) bool {
+	if selector == "" || selector == "*" {
+		return true
+	}
+	return selector == sampleStage
+}
+
+// keyMatches handles ""/"*" (any key), "prefix*", "*suffix", "*infix*"
+// (substring) and exact match; a "*" mid-pattern is literal. No
+// filepath.Match: its OS path rules must not apply to "eth0/rx_errors".
+func keyMatches(pattern, key string) bool {
+	if pattern == "" || pattern == "*" {
+		return true
+	}
+	hasPrefix := strings.HasPrefix(pattern, "*") // leading "*": suffix pattern
+	hasSuffix := strings.HasSuffix(pattern, "*") // trailing "*": prefix pattern
+	switch {
+	case hasPrefix && hasSuffix:
+		inner := strings.TrimPrefix(strings.TrimSuffix(pattern, "*"), "*")
+		return strings.Contains(key, inner)
+	case hasSuffix:
+		return strings.HasPrefix(key, strings.TrimSuffix(pattern, "*"))
+	case hasPrefix:
+		return strings.HasSuffix(key, strings.TrimPrefix(pattern, "*"))
+	default:
+		return pattern == key
+	}
+}
+
+// evaluateOp does the numeric comparison. within_pct is the oddball:
+// it tests |observed - nominal| <= |(pct / 100) * nominal|. Returns an
+// error for unknown operators so the caller can log + drop.
+func evaluateOp(op ThresholdOp, observed, threshold, nominal float64) (bool, error) {
+	switch op {
+	case OpLT:
+		return observed < threshold, nil
+	case OpLTE:
+		return observed <= threshold, nil
+	case OpGT:
+		return observed > threshold, nil
+	case OpGTE:
+		return observed >= threshold, nil
+	case OpWithinPct:
+		if nominal == 0 {
+			// within_pct against a 0 nominal is meaningless. Treat as
+			// pass so a misconfigured rule doesn't spuriously fail.
+			return true, nil
+		}
+		allowed := (threshold / 100.0) * nominal
+		if allowed < 0 { // negative nominal or pct: compare magnitudes
+			allowed = -allowed
+		}
+		diff := observed - nominal
+		if diff < 0 { // take |observed - nominal|
+			diff = -diff
+		}
+		return diff <= allowed, nil
+	default:
+		return false, fmt.Errorf("unknown op %q", op)
+	}
+}
@@ -0,0 +1,152 @@
+package orchestrator
+
+import "testing"
+
+// TestEvaluate_Ops drives every operator through Evaluate: lt/lte/gt/gte
+// at the boundary (observed == Value) and away from it, within_pct inside
+// and outside the band plus the zero-nominal pass. Table-driven.
+func TestEvaluate_Ops(t *testing.T) {
+	cases := []struct {
+		name     string
+		op       ThresholdOp
+		value    float64
+		nominal  float64
+		observed float64
+		want     bool
+	}{
+		{"lt strict below", OpLT, 10, 0, 5, true},
+		{"lt equal fails", OpLT, 10, 0, 10, false},
+		{"lt above fails", OpLT, 10, 0, 15, false},
+
+		{"lte below", OpLTE, 10, 0, 5, true},
+		{"lte equal passes", OpLTE, 10, 0, 10, true},
+		{"lte above fails", OpLTE, 10, 0, 11, false},
+
+		{"gt below fails", OpGT, 900, 0, 800, false},
+		{"gt equal fails", OpGT, 900, 0, 900, false},
+		{"gt above passes", OpGT, 900, 0, 950, true},
+
+		{"gte equal passes", OpGTE, 900, 0, 900, true},
+		{"gte below fails", OpGTE, 900, 0, 800, false},
+
+		{"within_pct exact", OpWithinPct, 5, 12.0, 12.0, true},
+		{"within_pct inside", OpWithinPct, 5, 12.0, 11.7, true},
+		{"within_pct outside low", OpWithinPct, 5, 12.0, 11.0, false},
+		{"within_pct outside high", OpWithinPct, 5, 12.0, 12.7, false},
+		{"within_pct zero nominal passes", OpWithinPct, 5, 0, 99, true},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			rules := []Threshold{{
+				Stage: "*", Kind: "k", Key: "k", Op: tc.op,
+				Value: tc.value, Nominal: tc.nominal, Severity: SeverityCritical,
+			}}
+			res := Evaluate(Sample{Stage: "Any", Kind: "k", Key: "k", Value: tc.observed}, rules)
+			if len(res) != 1 {
+				t.Fatalf("expected 1 match, got %d", len(res))
+			}
+			if res[0].Passed != tc.want {
+				t.Fatalf("op=%s observed=%v want passed=%v got %v", tc.op, tc.observed, tc.want, res[0].Passed)
+			}
+		})
+	}
+}
+
+// TestEvaluate_StageMatching: a Burn-scoped rule ignores samples
+// stamped with other stages. Global "*" catches everything.
+func TestEvaluate_StageMatching(t *testing.T) {
+	rules := []Threshold{
+		{Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 92, Severity: SeverityCritical},
+		{Stage: "Burn", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 88, Severity: SeverityCritical},
+	}
+	// Sample from CPUStress — only the global rule applies.
+	res := Evaluate(Sample{Stage: "CPUStress", Kind: "temp", Key: "cpu/0", Value: 89}, rules)
+	if len(res) != 1 {
+		t.Fatalf("cpustress sample: expected 1 match, got %d", len(res))
+	}
+	if res[0].Threshold.Value != 92 {
+		t.Fatalf("cpustress sample matched wrong rule: %+v", res[0].Threshold)
+	}
+
+	// Sample from Burn — both rules match. The stricter one breaches.
+	res = Evaluate(Sample{Stage: "Burn", Kind: "temp", Key: "cpu/0", Value: 89}, rules)
+	if len(res) != 2 {
+		t.Fatalf("burn sample: expected 2 matches, got %d", len(res))
+	}
+	var globalPassed, burnPassed bool
+	for _, r := range res {
+		switch r.Threshold.Value {
+		case 92:
+			globalPassed = r.Passed
+		case 88:
+			burnPassed = r.Passed
+		}
+	}
+	if !globalPassed {
+		t.Fatalf("global 92C rule should pass at 89C")
+	}
+	if burnPassed {
+		t.Fatalf("burn 88C rule should breach at 89C")
+	}
+}
+
+// TestEvaluate_KeyWildcards covers "*" / "" / "prefix*" / "*suffix" / exact.
+func TestEvaluate_KeyWildcards(t *testing.T) {
+	cases := []struct {
+		pattern string
+		key     string
+		match   bool
+	}{
+		{"*", "anything", true},
+		{"", "anything", true},
+		{"cpu/*", "cpu/0", true},
+		{"cpu/*", "gpu/0", false},
+		{"*/rate", "eth0/rate", true},
+		{"*/rate", "eth0/count", false},
+		{"exact", "exact", true},
+		{"exact", "exactly", false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.pattern+"_vs_"+tc.key, func(t *testing.T) {
+			got := keyMatches(tc.pattern, tc.key)
+			if got != tc.match {
+				t.Fatalf("keyMatches(%q, %q) = %v, want %v", tc.pattern, tc.key, got, tc.match)
+			}
+		})
+	}
+}
+
+// TestEvaluate_SeverityDispatch: only critical breaches flip
+// CriticalBreach; warning breaches still report Breached but stay advisory.
+func TestEvaluate_SeverityDispatch(t *testing.T) {
+	rules := []Threshold{
+		{Stage: "*", Kind: "temp", Key: "cpu", Op: OpLT, Value: 92, Severity: SeverityCritical},
+		{Stage: "*", Kind: "fio", Key: "p99", Op: OpLT, Value: 50000, Severity: SeverityWarning},
+	}
+	res := Evaluate(Sample{Stage: "CPU", Kind: "temp", Key: "cpu", Value: 95}, rules)
+	if len(res) != 1 || !res[0].CriticalBreach() {
+		t.Fatalf("critical breach not detected: %+v", res)
+	}
+	res = Evaluate(Sample{Stage: "Storage", Kind: "fio", Key: "p99", Value: 80000}, rules)
+	if len(res) != 1 {
+		t.Fatalf("expected 1 match, got %d", len(res))
+	}
+	if res[0].CriticalBreach() {
+		t.Fatalf("warning-severity breach should not be critical")
+	}
+	if !res[0].Breached() {
+		t.Fatalf("warning-severity rule should still show breach=true")
+	}
+}
+
+// TestEvaluate_NoMatchingThreshold: a sample whose kind hits no rule
+// produces an empty result slice — callers treat that as "advisory".
+func TestEvaluate_NoMatchingThreshold(t *testing.T) {
+	rules := []Threshold{
+		{Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 92, Severity: SeverityCritical},
+	}
+	res := Evaluate(Sample{Stage: "Network", Kind: "iperf", Key: "throughput", Value: 950}, rules)
+	if len(res) != 0 {
+		t.Fatalf("unmatched sample should yield 0 results, got %d", len(res))
+	}
+}