deep profile + threshold gating + firmware stage + Burn super-stage
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled

Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
+152
View File
@@ -0,0 +1,152 @@
package orchestrator
import "testing"
// TestEvaluate_Ops drives Evaluate through each threshold operator using a
// single wildcard-stage rule per case. The ordering operators (lt, lte, gt,
// gte) are each checked at the boundary (observed == threshold) and on both
// sides of it. within_pct is checked at the nominal value, inside the band,
// outside it on both sides, and with a zero nominal. Table-driven because
// the logic is regular.
func TestEvaluate_Ops(t *testing.T) {
	cases := []struct {
		name     string
		op       ThresholdOp
		value    float64 // rule Value; the within_pct cases use it as a percent band
		nominal  float64 // rule Nominal; non-zero only in the within_pct cases
		observed float64 // sample value handed to Evaluate
		want     bool    // expected Passed flag on the single result
	}{
		{"lt strict below", OpLT, 10, 0, 5, true},
		{"lt equal fails", OpLT, 10, 0, 10, false},
		{"lt above fails", OpLT, 10, 0, 15, false},
		{"lte below", OpLTE, 10, 0, 5, true},
		{"lte equal passes", OpLTE, 10, 0, 10, true},
		{"lte above fails", OpLTE, 10, 0, 11, false},
		{"gt below fails", OpGT, 900, 0, 800, false},
		{"gt equal fails", OpGT, 900, 0, 900, false},
		{"gt above passes", OpGT, 900, 0, 950, true},
		{"gte below fails", OpGTE, 900, 0, 800, false},
		{"gte equal passes", OpGTE, 900, 0, 900, true},
		// Clearly-inside case for gte, so it has the same three-point
		// coverage as the other ordering operators.
		{"gte above passes", OpGTE, 900, 0, 950, true},
		{"within_pct exact", OpWithinPct, 5, 12.0, 12.0, true},
		{"within_pct inside", OpWithinPct, 5, 12.0, 11.7, true},
		{"within_pct outside low", OpWithinPct, 5, 12.0, 11.0, false},
		{"within_pct outside high", OpWithinPct, 5, 12.0, 12.7, false},
		{"within_pct zero nominal passes", OpWithinPct, 5, 0, 99, true},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// One rule with a wildcard stage and a kind/key identical to the
			// sample's, so exactly one result is expected back.
			rules := []Threshold{{
				Stage: "*", Kind: "k", Key: "k", Op: tc.op,
				Value: tc.value, Nominal: tc.nominal, Severity: SeverityCritical,
			}}
			res := Evaluate(Sample{Stage: "Any", Kind: "k", Key: "k", Value: tc.observed}, rules)
			if len(res) != 1 {
				t.Fatalf("expected 1 match, got %d", len(res))
			}
			if res[0].Passed != tc.want {
				t.Fatalf("op=%s observed=%v want passed=%v got %v", tc.op, tc.observed, tc.want, res[0].Passed)
			}
		})
	}
}
// TestEvaluate_StageMatching: a Burn-scoped rule ignores samples stamped
// with other stages, while the global "*" rule catches every stage.
func TestEvaluate_StageMatching(t *testing.T) {
	rules := []Threshold{
		{Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 92, Severity: SeverityCritical},
		{Stage: "Burn", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 88, Severity: SeverityCritical},
	}

	// Sample from CPUStress — only the global rule applies.
	res := Evaluate(Sample{Stage: "CPUStress", Kind: "temp", Key: "cpu/0", Value: 89}, rules)
	if len(res) != 1 {
		t.Fatalf("cpustress sample: expected 1 match, got %d", len(res))
	}
	if res[0].Threshold.Value != 92 {
		t.Fatalf("cpustress sample matched wrong rule: %+v", res[0].Threshold)
	}

	// Sample from Burn — both rules match. The stricter one breaches.
	res = Evaluate(Sample{Stage: "Burn", Kind: "temp", Key: "cpu/0", Value: 89}, rules)
	if len(res) != 2 {
		t.Fatalf("burn sample: expected 2 matches, got %d", len(res))
	}

	// Rules are identified by their threshold value. Track whether each was
	// actually seen: burnPassed's zero value (false) is also the expected
	// outcome, so without the seen flag a missing Burn rule would go
	// unnoticed.
	var globalSeen, burnSeen, globalPassed, burnPassed bool
	for _, r := range res {
		switch r.Threshold.Value {
		case 92:
			globalSeen, globalPassed = true, r.Passed
		case 88:
			burnSeen, burnPassed = true, r.Passed
		}
	}
	if !globalSeen || !burnSeen {
		t.Fatalf("burn sample: expected both the 92C and 88C rules in results, got %+v", res)
	}
	if !globalPassed {
		t.Fatalf("global 92C rule should pass at 89C")
	}
	if burnPassed {
		t.Fatalf("burn 88C rule should breach at 89C")
	}
}
// TestEvaluate_KeyWildcards checks keyMatches against a table of
// pattern/key pairs: the bare "*" and empty patterns, "prefix*" and
// "*suffix" wildcards, and literal patterns.
func TestEvaluate_KeyWildcards(t *testing.T) {
	type wildcardCase struct {
		pattern string
		key     string
		match   bool
	}
	table := []wildcardCase{
		{pattern: "*", key: "anything", match: true},
		{pattern: "", key: "anything", match: true},
		{pattern: "cpu/*", key: "cpu/0", match: true},
		{pattern: "cpu/*", key: "gpu/0", match: false},
		{pattern: "*/rate", key: "eth0/rate", match: true},
		{pattern: "*/rate", key: "eth0/count", match: false},
		{pattern: "exact", key: "exact", match: true},
		{pattern: "exact", key: "exactly", match: false},
	}
	for _, c := range table {
		// Subtest name pairs the pattern with the key it is matched against.
		name := c.pattern + "_vs_" + c.key
		t.Run(name, func(t *testing.T) {
			if got := keyMatches(c.pattern, c.key); got != c.match {
				t.Fatalf("keyMatches(%q, %q) = %v, want %v", c.pattern, c.key, got, c.match)
			}
		})
	}
}
// TestEvaluate_SeverityDispatch: a breached critical-severity rule reports
// CriticalBreach, whereas a breached warning-severity rule reports Breached
// without CriticalBreach.
func TestEvaluate_SeverityDispatch(t *testing.T) {
	criticalTemp := Threshold{Stage: "*", Kind: "temp", Key: "cpu", Op: OpLT, Value: 92, Severity: SeverityCritical}
	warningLatency := Threshold{Stage: "*", Kind: "fio", Key: "p99", Op: OpLT, Value: 50000, Severity: SeverityWarning}
	rules := []Threshold{criticalTemp, warningLatency}

	// 95 violates the critical "< 92" temperature rule.
	hotCPU := Sample{Stage: "CPU", Kind: "temp", Key: "cpu", Value: 95}
	got := Evaluate(hotCPU, rules)
	if !(len(got) == 1 && got[0].CriticalBreach()) {
		t.Fatalf("critical breach not detected: %+v", got)
	}

	// 80000 violates the warning "< 50000" fio p99 rule.
	slowDisk := Sample{Stage: "Storage", Kind: "fio", Key: "p99", Value: 80000}
	got = Evaluate(slowDisk, rules)
	switch {
	case len(got) != 1:
		t.Fatalf("expected 1 match, got %d", len(got))
	case got[0].CriticalBreach():
		t.Fatalf("warning-severity breach should not be critical")
	case !got[0].Breached():
		t.Fatalf("warning-severity rule should still show breach=true")
	}
}
// TestEvaluate_NoMatchingThreshold: Evaluate returns an empty slice for a
// sample that no rule applies to. Callers reportedly treat an empty result
// as "advisory"; this test only checks the empty slice.
func TestEvaluate_NoMatchingThreshold(t *testing.T) {
	cpuTempOnly := []Threshold{
		{Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 92, Severity: SeverityCritical},
	}
	// Kind and key differ from the only rule, so nothing should match.
	iperfSample := Sample{Stage: "Network", Kind: "iperf", Key: "throughput", Value: 950}

	if n := len(Evaluate(iperfSample, cpuTempOnly)); n != 0 {
		t.Fatalf("unmatched sample should yield 0 results, got %d", n)
	}
}