deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -0,0 +1,152 @@
+package orchestrator
+
+import "testing"
+
+// TestEvaluate_Ops exercises each threshold operator through Evaluate.
+// The ordering operators (lt, lte, gt, gte) are checked at the boundary
+// (observed equal to the threshold) plus one value on either side of it.
+// within_pct is checked at the nominal, just inside the band, and outside
+// it on both sides; the last row pins that a zero nominal passes.
+// Table-driven because the logic is regular.
+func TestEvaluate_Ops(t *testing.T) {
+	cases := []struct {
+		name     string
+		op       ThresholdOp
+		value    float64
+		nominal  float64
+		observed float64
+		want     bool
+	}{
+		{"lt strict below", OpLT, 10, 0, 5, true},
+		{"lt equal fails", OpLT, 10, 0, 10, false},
+		{"lt above fails", OpLT, 10, 0, 15, false},
+
+		{"lte below", OpLTE, 10, 0, 5, true},
+		{"lte equal passes", OpLTE, 10, 0, 10, true},
+		{"lte above fails", OpLTE, 10, 0, 11, false},
+
+		{"gt below fails", OpGT, 900, 0, 800, false},
+		{"gt equal fails", OpGT, 900, 0, 900, false},
+		{"gt above passes", OpGT, 900, 0, 950, true},
+
+		{"gte below fails", OpGTE, 900, 0, 800, false},
+		{"gte equal passes", OpGTE, 900, 0, 900, true},
+		{"gte above passes", OpGTE, 900, 0, 950, true},
+
+		{"within_pct exact", OpWithinPct, 5, 12.0, 12.0, true},
+		{"within_pct inside", OpWithinPct, 5, 12.0, 11.7, true},
+		{"within_pct outside low", OpWithinPct, 5, 12.0, 11.0, false},
+		{"within_pct outside high", OpWithinPct, 5, 12.0, 12.7, false},
+		{"within_pct zero nominal passes", OpWithinPct, 5, 0, 99, true},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			// A single wildcard-stage rule whose kind/key equal the
+			// sample's, so exactly one result is expected back.
+			rules := []Threshold{{
+				Stage: "*", Kind: "k", Key: "k", Op: tc.op,
+				Value: tc.value, Nominal: tc.nominal, Severity: SeverityCritical,
+			}}
+			res := Evaluate(Sample{Stage: "Any", Kind: "k", Key: "k", Value: tc.observed}, rules)
+			if len(res) != 1 {
+				t.Fatalf("expected 1 match, got %d", len(res))
+			}
+			if res[0].Passed != tc.want {
+				t.Fatalf("op=%s observed=%v want passed=%v got %v", tc.op, tc.observed, tc.want, res[0].Passed)
+			}
+		})
+	}
+}
+
+// TestEvaluate_StageMatching: a Burn-scoped rule ignores samples
+// stamped with other stages. Global "*" catches everything.
+func TestEvaluate_StageMatching(t *testing.T) {
+	rules := []Threshold{
+		{Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 92, Severity: SeverityCritical},
+		{Stage: "Burn", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 88, Severity: SeverityCritical},
+	}
+	// Sample from CPUStress — only the global rule applies.
+	res := Evaluate(Sample{Stage: "CPUStress", Kind: "temp", Key: "cpu/0", Value: 89}, rules)
+	if len(res) != 1 {
+		t.Fatalf("cpustress sample: expected 1 match, got %d", len(res))
+	}
+	if res[0].Threshold.Value != 92 {
+		t.Fatalf("cpustress sample matched wrong rule: %+v", res[0].Threshold)
+	}
+
+	// Sample from Burn — both rules match. The stricter one breaches.
+	res = Evaluate(Sample{Stage: "Burn", Kind: "temp", Key: "cpu/0", Value: 89}, rules)
+	if len(res) != 2 {
+		t.Fatalf("burn sample: expected 2 matches, got %d", len(res))
+	}
+	// Rules are told apart by their threshold value. Track whether each
+	// one was actually present: without the seen flags, a result set
+	// lacking the Burn rule would leave burnPassed at its zero value and
+	// the breach check below would pass vacuously.
+	var globalSeen, burnSeen, globalPassed, burnPassed bool
+	for _, r := range res {
+		switch r.Threshold.Value {
+		case 92:
+			globalSeen, globalPassed = true, r.Passed
+		case 88:
+			burnSeen, burnPassed = true, r.Passed
+		}
+	}
+	if !globalSeen || !burnSeen {
+		t.Fatalf("burn sample: expected both the global and Burn rules in results, got %+v", res)
+	}
+	if !globalPassed {
+		t.Fatalf("global 92C rule should pass at 89C")
+	}
+	if burnPassed {
+		t.Fatalf("burn 88C rule should breach at 89C")
+	}
+}
+
+// TestEvaluate_KeyWildcards covers "*" / "prefix*" / "*suffix".
+func TestEvaluate_KeyWildcards(t *testing.T) {
+	cases := []struct {
+		pattern string
+		key     string
+		match   bool
+	}{
+		{"*", "anything", true},
+		{"", "anything", true},
+		{"cpu/*", "cpu/0", true},
+		{"cpu/*", "gpu/0", false},
+		{"*/rate", "eth0/rate", true},
+		{"*/rate", "eth0/count", false},
+		{"exact", "exact", true},
+		{"exact", "exactly", false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.pattern+"_vs_"+tc.key, func(t *testing.T) {
+			got := keyMatches(tc.pattern, tc.key)
+			if got != tc.match {
+				t.Fatalf("keyMatches(%q, %q) = %v, want %v", tc.pattern, tc.key, got, tc.match)
+			}
+		})
+	}
+}
+
+// TestEvaluate_SeverityDispatch checks that a breached critical rule
+// reports CriticalBreach, while a breached warning rule reports
+// Breached but not CriticalBreach.
+func TestEvaluate_SeverityDispatch(t *testing.T) {
+	rules := []Threshold{
+		{Stage: "*", Kind: "temp", Key: "cpu", Op: OpLT, Value: 92, Severity: SeverityCritical},
+		{Stage: "*", Kind: "fio", Key: "p99", Op: OpLT, Value: 50000, Severity: SeverityWarning},
+	}
+
+	// 95 is over the critical temp limit of 92.
+	hot := Sample{Stage: "CPU", Kind: "temp", Key: "cpu", Value: 95}
+	crit := Evaluate(hot, rules)
+	if !(len(crit) == 1 && crit[0].CriticalBreach()) {
+		t.Fatalf("critical breach not detected: %+v", crit)
+	}
+
+	// 80000 is over the warning-severity fio p99 limit of 50000.
+	slow := Sample{Stage: "Storage", Kind: "fio", Key: "p99", Value: 80000}
+	warn := Evaluate(slow, rules)
+	switch {
+	case len(warn) != 1:
+		t.Fatalf("expected 1 match, got %d", len(warn))
+	case warn[0].CriticalBreach():
+		t.Fatalf("warning-severity breach should not be critical")
+	case !warn[0].Breached():
+		t.Fatalf("warning-severity rule should still show breach=true")
+	}
+}
+
+// TestEvaluate_NoMatchingThreshold: when no rule matches the sample,
+// Evaluate is expected to return zero results — callers treat that as
+// "advisory".
+func TestEvaluate_NoMatchingThreshold(t *testing.T) {
+	// The only rule is a temp rule; the sample below is an iperf reading.
+	tempOnly := []Threshold{
+		{Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 92, Severity: SeverityCritical},
+	}
+	iperf := Sample{Stage: "Network", Kind: "iperf", Key: "throughput", Value: 950}
+	if n := len(Evaluate(iperf, tempOnly)); n != 0 {
+		t.Fatalf("unmatched sample should yield 0 results, got %d", n)
+	}
+}