deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -0,0 +1,182 @@
+package orchestrator
+
+import (
+	"fmt"
+	"strings"
+)
+
+// ThresholdOp is one of the comparison operators a threshold supports.
+// within_pct is the only one that cares about a "nominal" value for
+// the key — used for PSU rails ("+12V within 5% of 12.0").
+type ThresholdOp string
+
+// Supported operators. Each trailing comment states the condition
+// under which a sample passes, as implemented by evaluateOp.
+const (
+	OpLT        ThresholdOp = "lt"         // observed < threshold value
+	OpLTE       ThresholdOp = "lte"        // observed <= threshold value
+	OpGT        ThresholdOp = "gt"         // observed > threshold value
+	OpGTE       ThresholdOp = "gte"        // observed >= threshold value
+	OpWithinPct ThresholdOp = "within_pct" // observed within value-percent of the nominal
+)
+
+// ThresholdSeverity routes a breach to either "fail the run" or "just
+// surface a warning in the report". The evaluator hands it back inside
+// EvalResult.Threshold, next to the Passed flag, so the caller can
+// decide whether to transition the run.
+type ThresholdSeverity string
+
+const (
+	// SeverityCritical is the only severity for which
+	// EvalResult.CriticalBreach reports true.
+	SeverityCritical ThresholdSeverity = "critical"
+	// SeverityWarning marks a breach that is surfaced as a warning
+	// rather than treated as a critical breach.
+	SeverityWarning ThresholdSeverity = "warning"
+)
+
+// Threshold is the evaluator's view of a stored threshold row. It's a
+// flat, already-parsed value-object — the evaluator doesn't look at
+// the DB and the store doesn't look at the evaluator.
+//
+// Stage, Kind and Key select which samples the threshold applies to;
+// Op, Value and Nominal define the comparison; Severity says how a
+// breach should be treated by the caller.
+type Threshold struct {
+	ID       int64             // identifier carried over from the stored row
+	Stage    string            // stage selector; "*" or empty matches any stage
+	Kind     string            // sample kind; compared literally, no wildcard
+	Key      string            // glob-ish: "*" / "prefix*" / "*suffix" / exact
+	Op       ThresholdOp       // comparison applied to the observed value
+	Value    float64           // comparison operand; for within_pct, the allowed percentage
+	Nominal  float64           // for within_pct (nominal voltage/frequency)
+	Severity ThresholdSeverity // critical or warning
+}
+
+// Sample is a single observation the evaluator tests against matching
+// thresholds. Stage may be empty when the agent doesn't know which
+// stage posted it (e.g. the thermal sidecar running across stages) —
+// empty-stage samples only match thresholds whose Stage selector is
+// "*" or empty (an empty selector is treated as "*").
+type Sample struct {
+	Stage string  // stage that produced the sample; may be empty
+	Kind  string  // must equal Threshold.Kind exactly for a match
+	Key   string  // matched against Threshold.Key's glob-ish pattern
+	Value float64 // observed value handed to the threshold's operator
+}
+
+// EvalResult is the per-sample outcome of a threshold evaluation:
+// which threshold was consulted, whether the sample passed, and the
+// value that was observed. The severity is not a separate field; it
+// travels inside Threshold (Threshold.Severity) so the caller can
+// fast-fail on critical breaches.
+type EvalResult struct {
+	Threshold Threshold // copy of the threshold the sample was tested against
+	Passed    bool      // true when the sample satisfied the threshold's operator
+	Observed  float64   // the sample's Value at evaluation time
+}
+
+// Breached reports whether the sample violated the threshold, i.e.
+// it is simply the negation of Passed.
+func (r EvalResult) Breached() bool {
+	return !r.Passed
+}
+
+// CriticalBreach reports whether this result is a breach of a
+// critical-severity threshold — the "fail the run right now" case.
+// Passing results and warning-severity breaches both yield false.
+func (r EvalResult) CriticalBreach() bool {
+	if r.Passed {
+		// Not a breach at all, so severity is irrelevant.
+		return false
+	}
+	return r.Threshold.Severity == SeverityCritical
+}
+
+// Evaluate tests one sample against every threshold that applies to
+// it and returns one EvalResult per applicable threshold, in the
+// order the thresholds were given. Several thresholds may apply at
+// once (a generic "*" rule plus a stage-specific override); each one
+// gets its own entry so both can be persisted. The returned slice is
+// non-nil and empty when nothing applies.
+func Evaluate(sample Sample, thresholds []Threshold) []EvalResult {
+	results := make([]EvalResult, 0, 1)
+	for i := range thresholds {
+		rule := thresholds[i]
+		if !thresholdMatchesSample(rule, sample) {
+			continue
+		}
+		ok, err := evaluateOp(rule.Op, sample.Value, rule.Value, rule.Nominal)
+		if err == nil {
+			results = append(results, EvalResult{
+				Threshold: rule,
+				Passed:    ok,
+				Observed:  sample.Value,
+			})
+		}
+		// A non-nil err means the operator is unknown. The rule is
+		// deliberately dropped rather than surfaced: validation can
+		// happen on insert, and returning an error here would force
+		// every Sensor write to 500.
+	}
+	return results
+}
+
+// thresholdMatchesSample reports whether threshold t applies to
+// sample s. All three filters must agree: Kind is compared literally
+// (there is no "any kind" wildcard), while Stage and Key go through
+// their glob-ish matchers. The checks short-circuit in the order
+// kind, stage, key.
+func thresholdMatchesSample(t Threshold, s Sample) bool {
+	return t.Kind == s.Kind &&
+		stageMatches(t.Stage, s.Stage) &&
+		keyMatches(t.Key, s.Key)
+}
+
+// stageMatches reports whether a threshold's stage selector applies
+// to the stage a sample was posted from. The selectors "*" and ""
+// are wildcards — the empty one so that a threshold declared without
+// a stage isn't accidentally inert. Any other selector must equal the
+// sample's stage exactly, which means a sample with no stage is only
+// ever picked up by a wildcard selector.
+func stageMatches(selector, sampleStage string) bool {
+	switch selector {
+	case "", "*":
+		return true
+	default:
+		return selector == sampleStage
+	}
+}
+
+// keyMatches reports whether key satisfies the glob-ish pattern.
+// Supported forms: "" or "*" (anything), "prefix*", "*suffix",
+// "*infix*" (substring), and a plain string (exact match). A "*" in
+// the middle of a pattern is not special. filepath.Match is avoided
+// on purpose so Windows `\`-vs-`/` rules don't leak into the sample
+// namespace (key "eth0/rx_errors" is not a path).
+func keyMatches(pattern, key string) bool {
+	if pattern == "" || pattern == "*" {
+		return true
+	}
+	// pattern is non-empty here, so indexing its ends is safe.
+	last := len(pattern) - 1
+	leadingStar := pattern[0] == '*'
+	trailingStar := pattern[last] == '*'
+	if leadingStar && trailingStar {
+		// pattern is neither "" nor "*", so it has at least two
+		// bytes and the inner slice is well-formed (possibly empty).
+		return strings.Contains(key, pattern[1:last])
+	}
+	if trailingStar {
+		return strings.HasPrefix(key, pattern[:last])
+	}
+	if leadingStar {
+		return strings.HasSuffix(key, pattern[1:])
+	}
+	return key == pattern
+}
+
+// evaluateOp performs the numeric comparison for a single operator
+// and reports whether the observed value passes.
+//
+// The four ordering operators compare observed against threshold
+// directly. within_pct is the oddball: threshold is a percentage and
+// the test is |observed - nominal| <= |(threshold / 100) * nominal|;
+// a zero nominal always passes. An unrecognised operator yields
+// (false, error) so the caller can log + drop.
+func evaluateOp(op ThresholdOp, observed, threshold, nominal float64) (bool, error) {
+	if op == OpWithinPct {
+		if nominal == 0 {
+			// A percentage of zero is meaningless; pass so that a
+			// misconfigured rule doesn't spuriously fail.
+			return true, nil
+		}
+		tolerance := (threshold / 100.0) * nominal
+		if tolerance < 0 {
+			// Negative nominal or negative percentage: use magnitude.
+			tolerance = -tolerance
+		}
+		deviation := observed - nominal
+		if deviation < 0 {
+			deviation = -deviation
+		}
+		return deviation <= tolerance, nil
+	}
+	switch op {
+	case OpLT:
+		return observed < threshold, nil
+	case OpLTE:
+		return observed <= threshold, nil
+	case OpGT:
+		return observed > threshold, nil
+	case OpGTE:
+		return observed >= threshold, nil
+	}
+	return false, fmt.Errorf("unknown op %q", op)
+}