deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -119,9 +119,9 @@ func (d *Dispatcher) pickNext(ctx context.Context) {
|
||||
queued = &runs[i]
|
||||
}
|
||||
case model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting,
|
||||
model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART,
|
||||
model.StateInventoryCheck, model.StateFirmware, model.StateSpecValidate, model.StateSMART,
|
||||
model.StateCPUStress, model.StateStorage, model.StateNetwork,
|
||||
model.StateGPU, model.StatePSU, model.StateReporting:
|
||||
model.StateBurn, model.StateGPU, model.StatePSU, model.StateReporting:
|
||||
inFlight++
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,11 +30,13 @@ const (
|
||||
// "InventoryCheck". Later stages share a name with their state.
|
||||
var stageStates = map[string]model.RunState{
|
||||
"Inventory": model.StateInventoryCheck,
|
||||
"Firmware": model.StateFirmware,
|
||||
"SpecValidate": model.StateSpecValidate,
|
||||
"SMART": model.StateSMART,
|
||||
"CPUStress": model.StateCPUStress,
|
||||
"Storage": model.StateStorage,
|
||||
"Network": model.StateNetwork,
|
||||
"Burn": model.StateBurn,
|
||||
"GPU": model.StateGPU,
|
||||
"PSU": model.StatePSU,
|
||||
"Reporting": model.StateReporting,
|
||||
@@ -44,11 +46,13 @@ var stageStates = map[string]model.RunState{
|
||||
// first stage to Completed. Kept in sync with store.DefaultStageOrder.
|
||||
var stageOrder = []model.RunState{
|
||||
model.StateInventoryCheck,
|
||||
model.StateFirmware,
|
||||
model.StateSpecValidate,
|
||||
model.StateSMART,
|
||||
model.StateCPUStress,
|
||||
model.StateStorage,
|
||||
model.StateNetwork,
|
||||
model.StateBurn,
|
||||
model.StateGPU,
|
||||
model.StatePSU,
|
||||
model.StateReporting,
|
||||
@@ -143,9 +147,9 @@ func nextStageState(current model.RunState) (model.RunState, error) {
|
||||
func allActiveStates() []model.RunState {
|
||||
return []model.RunState{
|
||||
model.StateQueued, model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting,
|
||||
model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART,
|
||||
model.StateInventoryCheck, model.StateFirmware, model.StateSpecValidate, model.StateSMART,
|
||||
model.StateCPUStress, model.StateStorage, model.StateNetwork,
|
||||
model.StateGPU, model.StatePSU, model.StateReporting,
|
||||
model.StateBurn, model.StateGPU, model.StatePSU, model.StateReporting,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -80,11 +80,13 @@ func TestTriggerAgentClaimedFromWaitingReboot(t *testing.T) {
|
||||
func TestTriggerStageMismatch(t *testing.T) {
|
||||
stageStates := []model.RunState{
|
||||
model.StateInventoryCheck,
|
||||
model.StateFirmware,
|
||||
model.StateSpecValidate,
|
||||
model.StateSMART,
|
||||
model.StateCPUStress,
|
||||
model.StateStorage,
|
||||
model.StateNetwork,
|
||||
model.StateBurn,
|
||||
model.StateGPU,
|
||||
model.StatePSU,
|
||||
model.StateReporting,
|
||||
@@ -114,11 +116,13 @@ func TestTriggerStageMismatch(t *testing.T) {
|
||||
func TestStageNameForState(t *testing.T) {
|
||||
pairs := map[string]model.RunState{
|
||||
"Inventory": model.StateInventoryCheck,
|
||||
"Firmware": model.StateFirmware,
|
||||
"SpecValidate": model.StateSpecValidate,
|
||||
"SMART": model.StateSMART,
|
||||
"CPUStress": model.StateCPUStress,
|
||||
"Storage": model.StateStorage,
|
||||
"Network": model.StateNetwork,
|
||||
"Burn": model.StateBurn,
|
||||
"GPU": model.StateGPU,
|
||||
"PSU": model.StatePSU,
|
||||
"Reporting": model.StateReporting,
|
||||
@@ -143,11 +147,13 @@ func TestNextStageWalk(t *testing.T) {
|
||||
// one in the canonical order, and from Reporting onto Completed.
|
||||
chain := []model.RunState{
|
||||
model.StateInventoryCheck,
|
||||
model.StateFirmware,
|
||||
model.StateSpecValidate,
|
||||
model.StateSMART,
|
||||
model.StateCPUStress,
|
||||
model.StateStorage,
|
||||
model.StateNetwork,
|
||||
model.StateBurn,
|
||||
model.StateGPU,
|
||||
model.StatePSU,
|
||||
model.StateReporting,
|
||||
|
||||
@@ -0,0 +1,182 @@
|
||||
package orchestrator
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ThresholdOp identifies the comparison a threshold applies to an
// observed value. The four ordering operators compare the observation
// against the threshold's Value alone; within_pct additionally needs
// the threshold's Nominal (PSU rails: "+12V within 5% of 12.0").
type ThresholdOp string

// Supported operators. A sample passes when the stated relation holds.
const (
	OpLT        = ThresholdOp("lt")         // observed <  Value
	OpLTE       = ThresholdOp("lte")        // observed <= Value
	OpGT        = ThresholdOp("gt")         // observed >  Value
	OpGTE       = ThresholdOp("gte")        // observed >= Value
	OpWithinPct = ThresholdOp("within_pct") // observed within Value percent of Nominal
)
|
||||
|
||||
// ThresholdSeverity says what a breach should lead to: a critical
// breach is the "fail the run" case, a warning is only surfaced in the
// report. The severity travels on the Threshold field of each
// EvalResult, next to the Passed flag, so the caller decides whether to
// transition the run.
type ThresholdSeverity string

// Severities a threshold can carry.
const (
	SeverityCritical = ThresholdSeverity("critical")
	SeverityWarning  = ThresholdSeverity("warning")
)
|
||||
|
||||
// Threshold is the evaluator's view of a stored threshold row. It's a
|
||||
// flat, already-parsed value-object — the evaluator doesn't look at
|
||||
// the DB and the store doesn't look at the evaluator.
|
||||
type Threshold struct {
|
||||
ID int64
|
||||
Stage string // "*" matches any stage
|
||||
Kind string
|
||||
Key string // glob-ish: "*" / "prefix*" / "*suffix" / exact
|
||||
Op ThresholdOp
|
||||
Value float64
|
||||
Nominal float64 // for within_pct (nominal voltage/frequency)
|
||||
Severity ThresholdSeverity
|
||||
}
|
||||
|
||||
// Sample is one observation to be tested against the thresholds that
// match it.
//
// Stage names the stage that produced the observation. It may be left
// empty when the agent doesn't know which stage posted it (e.g. the
// thermal sidecar running across stages); such samples only match
// thresholds whose Stage selector is "*" or empty. Kind and Key locate
// the metric, and Value is the observed reading.
type Sample struct {
	Stage, Kind, Key string
	Value            float64
}
|
||||
|
||||
// EvalResult is the per-sample outcome of a threshold evaluation:
|
||||
// which threshold was consulted, whether the sample passed, and the
|
||||
// severity so the caller can fast-fail on critical breaches.
|
||||
type EvalResult struct {
|
||||
Threshold Threshold
|
||||
Passed bool
|
||||
Observed float64
|
||||
}
|
||||
|
||||
// Breached returns true when the sample violated the threshold.
|
||||
func (r EvalResult) Breached() bool { return !r.Passed }
|
||||
|
||||
// CriticalBreach returns true only for critical-severity breaches —
|
||||
// the "fail the run right now" case.
|
||||
func (r EvalResult) CriticalBreach() bool {
|
||||
return r.Breached() && r.Threshold.Severity == SeverityCritical
|
||||
}
|
||||
|
||||
// Evaluate runs a single sample through every threshold that applies
|
||||
// to it. A sample may match more than one threshold (a generic "*"
|
||||
// rule + a stage-specific override); each match produces its own
|
||||
// EvalResult in the returned slice so both get persisted.
|
||||
func Evaluate(sample Sample, thresholds []Threshold) []EvalResult {
|
||||
out := make([]EvalResult, 0, 1)
|
||||
for _, t := range thresholds {
|
||||
if !thresholdMatchesSample(t, sample) {
|
||||
continue
|
||||
}
|
||||
passed, err := evaluateOp(t.Op, sample.Value, t.Value, t.Nominal)
|
||||
if err != nil {
|
||||
// Unknown operator — skip. The caller could validate on
|
||||
// insert; here we prefer to drop the threshold than to
|
||||
// return an error that forces every Sensor write to 500.
|
||||
continue
|
||||
}
|
||||
out = append(out, EvalResult{
|
||||
Threshold: t,
|
||||
Passed: passed,
|
||||
Observed: sample.Value,
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// thresholdMatchesSample applies the stage + kind + key filter. Kind
|
||||
// is always literal — there's no "any kind" threshold and if there
|
||||
// ever is we'll add a `kind: *` escape hatch. Stage and key both
|
||||
// support glob-ish matching.
|
||||
func thresholdMatchesSample(t Threshold, s Sample) bool {
|
||||
if t.Kind != s.Kind {
|
||||
return false
|
||||
}
|
||||
if !stageMatches(t.Stage, s.Stage) {
|
||||
return false
|
||||
}
|
||||
if !keyMatches(t.Key, s.Key) {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// stageMatches reports whether a threshold's stage selector covers the
// stage a sample was stamped with.
//
// A selector of "*" covers every stage. An empty selector is handled
// the same way, so a threshold declared without a stage key isn't
// accidentally inert. Any other selector must equal the sample's stage
// exactly, which means a sample with no stage is only covered by the
// wildcard forms — we don't guess.
func stageMatches(selector, sampleStage string) bool {
	switch selector {
	case "", "*":
		return true
	default:
		return sampleStage == selector
	}
}
|
||||
|
||||
// keyMatches reports whether key satisfies the glob-ish pattern.
//
// Recognised forms:
//
//	"" or "*"   matches every key
//	"*infix*"   key contains infix
//	"prefix*"   key starts with prefix
//	"*suffix"   key ends with suffix
//	anything else must equal key exactly
//
// Only a single leading and/or trailing "*" is special; a "*" anywhere
// else in the pattern is compared literally. filepath.Match is avoided
// on purpose so Windows `\`-vs-`/` rules don't leak into the sample
// namespace (key "eth0/rx_errors" is not a path).
func keyMatches(pattern, key string) bool {
	if pattern == "" || pattern == "*" {
		return true
	}
	leading := strings.HasPrefix(pattern, "*")
	trailing := strings.HasSuffix(pattern, "*")
	if leading && trailing {
		// The lone "*" was handled above, so the pattern has at least
		// two bytes here and the slice bounds are safe.
		return strings.Contains(key, pattern[1:len(pattern)-1])
	}
	if trailing {
		return strings.HasPrefix(key, pattern[:len(pattern)-1])
	}
	if leading {
		return strings.HasSuffix(key, pattern[1:])
	}
	return key == pattern
}
|
||||
|
||||
// evaluateOp does the numeric comparison. within_pct is the oddball:
|
||||
// it tests |observed - nominal| <= (pct / 100) * nominal. Returns an
|
||||
// error for unknown operators so the caller can log + drop.
|
||||
func evaluateOp(op ThresholdOp, observed, threshold, nominal float64) (bool, error) {
|
||||
switch op {
|
||||
case OpLT:
|
||||
return observed < threshold, nil
|
||||
case OpLTE:
|
||||
return observed <= threshold, nil
|
||||
case OpGT:
|
||||
return observed > threshold, nil
|
||||
case OpGTE:
|
||||
return observed >= threshold, nil
|
||||
case OpWithinPct:
|
||||
if nominal == 0 {
|
||||
// within_pct against a 0 nominal is meaningless. Treat as
|
||||
// pass so a misconfigured rule doesn't spuriously fail.
|
||||
return true, nil
|
||||
}
|
||||
allowed := (threshold / 100.0) * nominal
|
||||
if allowed < 0 {
|
||||
allowed = -allowed
|
||||
}
|
||||
diff := observed - nominal
|
||||
if diff < 0 {
|
||||
diff = -diff
|
||||
}
|
||||
return diff <= allowed, nil
|
||||
default:
|
||||
return false, fmt.Errorf("unknown op %q", op)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,152 @@
|
||||
package orchestrator
|
||||
|
||||
import "testing"
|
||||
|
||||
// TestEvaluate_Ops covers every operator against the boundary case
|
||||
// (equal to threshold) plus one clearly-inside and one clearly-outside
|
||||
// value. Table-driven because the logic is regular.
|
||||
func TestEvaluate_Ops(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
op ThresholdOp
|
||||
value float64
|
||||
nominal float64
|
||||
observed float64
|
||||
want bool
|
||||
}{
|
||||
{"lt strict below", OpLT, 10, 0, 5, true},
|
||||
{"lt equal fails", OpLT, 10, 0, 10, false},
|
||||
{"lt above fails", OpLT, 10, 0, 15, false},
|
||||
|
||||
{"lte below", OpLTE, 10, 0, 5, true},
|
||||
{"lte equal passes", OpLTE, 10, 0, 10, true},
|
||||
{"lte above fails", OpLTE, 10, 0, 11, false},
|
||||
|
||||
{"gt below fails", OpGT, 900, 0, 800, false},
|
||||
{"gt equal fails", OpGT, 900, 0, 900, false},
|
||||
{"gt above passes", OpGT, 900, 0, 950, true},
|
||||
|
||||
{"gte equal passes", OpGTE, 900, 0, 900, true},
|
||||
{"gte below fails", OpGTE, 900, 0, 800, false},
|
||||
|
||||
{"within_pct exact", OpWithinPct, 5, 12.0, 12.0, true},
|
||||
{"within_pct inside", OpWithinPct, 5, 12.0, 11.7, true},
|
||||
{"within_pct outside low", OpWithinPct, 5, 12.0, 11.0, false},
|
||||
{"within_pct outside high", OpWithinPct, 5, 12.0, 12.7, false},
|
||||
{"within_pct zero nominal passes", OpWithinPct, 5, 0, 99, true},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
rules := []Threshold{{
|
||||
Stage: "*", Kind: "k", Key: "k", Op: tc.op,
|
||||
Value: tc.value, Nominal: tc.nominal, Severity: SeverityCritical,
|
||||
}}
|
||||
res := Evaluate(Sample{Stage: "Any", Kind: "k", Key: "k", Value: tc.observed}, rules)
|
||||
if len(res) != 1 {
|
||||
t.Fatalf("expected 1 match, got %d", len(res))
|
||||
}
|
||||
if res[0].Passed != tc.want {
|
||||
t.Fatalf("op=%s observed=%v want passed=%v got %v", tc.op, tc.observed, tc.want, res[0].Passed)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestEvaluate_StageMatching: a Network-scoped rule ignores samples
|
||||
// stamped with other stages. Global "*" catches everything.
|
||||
func TestEvaluate_StageMatching(t *testing.T) {
|
||||
rules := []Threshold{
|
||||
{Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 92, Severity: SeverityCritical},
|
||||
{Stage: "Burn", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 88, Severity: SeverityCritical},
|
||||
}
|
||||
// Sample from CPUStress — only the global rule applies.
|
||||
res := Evaluate(Sample{Stage: "CPUStress", Kind: "temp", Key: "cpu/0", Value: 89}, rules)
|
||||
if len(res) != 1 {
|
||||
t.Fatalf("cpustress sample: expected 1 match, got %d", len(res))
|
||||
}
|
||||
if res[0].Threshold.Value != 92 {
|
||||
t.Fatalf("cpustress sample matched wrong rule: %+v", res[0].Threshold)
|
||||
}
|
||||
|
||||
// Sample from Burn — both rules match. The stricter one breaches.
|
||||
res = Evaluate(Sample{Stage: "Burn", Kind: "temp", Key: "cpu/0", Value: 89}, rules)
|
||||
if len(res) != 2 {
|
||||
t.Fatalf("burn sample: expected 2 matches, got %d", len(res))
|
||||
}
|
||||
var globalPassed, burnPassed bool
|
||||
for _, r := range res {
|
||||
switch r.Threshold.Value {
|
||||
case 92:
|
||||
globalPassed = r.Passed
|
||||
case 88:
|
||||
burnPassed = r.Passed
|
||||
}
|
||||
}
|
||||
if !globalPassed {
|
||||
t.Fatalf("global 92C rule should pass at 89C")
|
||||
}
|
||||
if burnPassed {
|
||||
t.Fatalf("burn 88C rule should breach at 89C")
|
||||
}
|
||||
}
|
||||
|
||||
// TestEvaluate_KeyWildcards covers "*" / "prefix*" / "*suffix".
|
||||
func TestEvaluate_KeyWildcards(t *testing.T) {
|
||||
cases := []struct {
|
||||
pattern string
|
||||
key string
|
||||
match bool
|
||||
}{
|
||||
{"*", "anything", true},
|
||||
{"", "anything", true},
|
||||
{"cpu/*", "cpu/0", true},
|
||||
{"cpu/*", "gpu/0", false},
|
||||
{"*/rate", "eth0/rate", true},
|
||||
{"*/rate", "eth0/count", false},
|
||||
{"exact", "exact", true},
|
||||
{"exact", "exactly", false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.pattern+"_vs_"+tc.key, func(t *testing.T) {
|
||||
got := keyMatches(tc.pattern, tc.key)
|
||||
if got != tc.match {
|
||||
t.Fatalf("keyMatches(%q, %q) = %v, want %v", tc.pattern, tc.key, got, tc.match)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestEvaluate_SeverityDispatch: only critical breaches flip
|
||||
// CriticalBreach; warning-severity breaches stay advisory.
|
||||
func TestEvaluate_SeverityDispatch(t *testing.T) {
|
||||
rules := []Threshold{
|
||||
{Stage: "*", Kind: "temp", Key: "cpu", Op: OpLT, Value: 92, Severity: SeverityCritical},
|
||||
{Stage: "*", Kind: "fio", Key: "p99", Op: OpLT, Value: 50000, Severity: SeverityWarning},
|
||||
}
|
||||
res := Evaluate(Sample{Stage: "CPU", Kind: "temp", Key: "cpu", Value: 95}, rules)
|
||||
if len(res) != 1 || !res[0].CriticalBreach() {
|
||||
t.Fatalf("critical breach not detected: %+v", res)
|
||||
}
|
||||
res = Evaluate(Sample{Stage: "Storage", Kind: "fio", Key: "p99", Value: 80000}, rules)
|
||||
if len(res) != 1 {
|
||||
t.Fatalf("expected 1 match, got %d", len(res))
|
||||
}
|
||||
if res[0].CriticalBreach() {
|
||||
t.Fatalf("warning-severity breach should not be critical")
|
||||
}
|
||||
if !res[0].Breached() {
|
||||
t.Fatalf("warning-severity rule should still show breach=true")
|
||||
}
|
||||
}
|
||||
|
||||
// TestEvaluate_NoMatchingThreshold: a sample that doesn't hit any rule
|
||||
// produces an empty result slice — callers treat that as "advisory".
|
||||
func TestEvaluate_NoMatchingThreshold(t *testing.T) {
|
||||
rules := []Threshold{
|
||||
{Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 92, Severity: SeverityCritical},
|
||||
}
|
||||
res := Evaluate(Sample{Stage: "Network", Kind: "iperf", Key: "throughput", Value: 950}, rules)
|
||||
if len(res) != 0 {
|
||||
t.Fatalf("unmatched sample should yield 0 results, got %d", len(res))
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user