23c689aa5b
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
183 lines
5.6 KiB
Go
183 lines
5.6 KiB
Go
package orchestrator
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
)
|
|
|
|
// ThresholdOp names the comparison a threshold applies to an observed
// value. Every operator except within_pct compares the observation
// directly against the threshold's Value; within_pct additionally needs
// a "nominal" reading for the key, which is how PSU rails are expressed
// ("+12V within 5% of 12.0").
type ThresholdOp string

// Operators understood by the evaluator. The trailing notes give the
// condition under which a sample passes.
const (
	OpLT        ThresholdOp = "lt"         // observed < Value
	OpLTE       ThresholdOp = "lte"        // observed <= Value
	OpGT        ThresholdOp = "gt"         // observed > Value
	OpGTE       ThresholdOp = "gte"        // observed >= Value
	OpWithinPct ThresholdOp = "within_pct" // |observed - Nominal| <= Value percent of Nominal
)
|
|
|
|
// ThresholdSeverity classifies what a breach means for the run: either
// "fail the run" or "just surface a warning in the report". Each
// evaluation result carries the severity of the threshold it consulted
// next to its pass/fail flag, so the caller decides whether the run
// should transition.
type ThresholdSeverity string

// Severities a threshold may declare.
const (
	SeverityCritical ThresholdSeverity = "critical" // a breach should fail the run
	SeverityWarning  ThresholdSeverity = "warning"  // a breach is only reported
)
|
|
|
|
// Threshold is the evaluator's view of a stored threshold row. It's a
|
|
// flat, already-parsed value-object — the evaluator doesn't look at
|
|
// the DB and the store doesn't look at the evaluator.
|
|
type Threshold struct {
|
|
ID int64
|
|
Stage string // "*" matches any stage
|
|
Kind string
|
|
Key string // glob-ish: "*" / "prefix*" / "*suffix" / exact
|
|
Op ThresholdOp
|
|
Value float64
|
|
Nominal float64 // for within_pct (nominal voltage/frequency)
|
|
Severity ThresholdSeverity
|
|
}
|
|
|
|
// Sample is one observation to be tested against the thresholds that
// match it. Stage may be left empty when the agent cannot say which
// stage produced the reading (e.g. the thermal sidecar running across
// stages); such a sample is matched only by wildcard stage selectors
// ("*", or an empty selector, which the matcher treats the same way).
type Sample struct {
	// Stage is the stage that posted the reading, or "" if unknown.
	Stage string

	// Kind is compared literally against Threshold.Kind.
	Kind string

	// Key names the reading within its kind, e.g. "eth0/rx_errors".
	Key string

	// Value is the observed number.
	Value float64
}
|
|
|
|
// EvalResult is the per-sample outcome of a threshold evaluation:
|
|
// which threshold was consulted, whether the sample passed, and the
|
|
// severity so the caller can fast-fail on critical breaches.
|
|
type EvalResult struct {
|
|
Threshold Threshold
|
|
Passed bool
|
|
Observed float64
|
|
}
|
|
|
|
// Breached returns true when the sample violated the threshold.
|
|
func (r EvalResult) Breached() bool { return !r.Passed }
|
|
|
|
// CriticalBreach returns true only for critical-severity breaches —
|
|
// the "fail the run right now" case.
|
|
func (r EvalResult) CriticalBreach() bool {
|
|
return r.Breached() && r.Threshold.Severity == SeverityCritical
|
|
}
|
|
|
|
// Evaluate runs a single sample through every threshold that applies
|
|
// to it. A sample may match more than one threshold (a generic "*"
|
|
// rule + a stage-specific override); each match produces its own
|
|
// EvalResult in the returned slice so both get persisted.
|
|
func Evaluate(sample Sample, thresholds []Threshold) []EvalResult {
|
|
out := make([]EvalResult, 0, 1)
|
|
for _, t := range thresholds {
|
|
if !thresholdMatchesSample(t, sample) {
|
|
continue
|
|
}
|
|
passed, err := evaluateOp(t.Op, sample.Value, t.Value, t.Nominal)
|
|
if err != nil {
|
|
// Unknown operator — skip. The caller could validate on
|
|
// insert; here we prefer to drop the threshold than to
|
|
// return an error that forces every Sensor write to 500.
|
|
continue
|
|
}
|
|
out = append(out, EvalResult{
|
|
Threshold: t,
|
|
Passed: passed,
|
|
Observed: sample.Value,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// thresholdMatchesSample applies the stage + kind + key filter. Kind
|
|
// is always literal — there's no "any kind" threshold and if there
|
|
// ever is we'll add a `kind: *` escape hatch. Stage and key both
|
|
// support glob-ish matching.
|
|
func thresholdMatchesSample(t Threshold, s Sample) bool {
|
|
if t.Kind != s.Kind {
|
|
return false
|
|
}
|
|
if !stageMatches(t.Stage, s.Stage) {
|
|
return false
|
|
}
|
|
if !keyMatches(t.Key, s.Key) {
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
// stageMatches reports whether a threshold's stage selector covers the
// sample's stage. The selector "*" covers every stage, and so does an
// empty selector, so that a threshold declared without a stage is not
// silently inert. Any other selector must equal the sample's stage
// exactly, which means a sample with no stage is covered only by the
// wildcard forms — no guessing.
func stageMatches(selector, sampleStage string) bool {
	switch selector {
	case "", "*":
		return true
	}
	return selector == sampleStage
}
|
|
|
|
// keyMatches reports whether key satisfies the glob-ish pattern. The
// supported forms are:
//
//	"" or "*"   matches every key
//	"*infix*"   key contains infix
//	"prefix*"   key starts with prefix
//	"*suffix"   key ends with suffix
//	anything else is compared for exact equality
//
// Only a single leading and/or trailing "*" is special; a "*" elsewhere
// in the pattern is an ordinary character. filepath.Match is avoided on
// purpose so Windows `\`-vs-`/` rules don't leak into the sample
// namespace (key "eth0/rx_errors" is not a path).
func keyMatches(pattern, key string) bool {
	if pattern == "" || pattern == "*" {
		return true
	}

	// The bare "*" pattern was handled above, so a pattern that both
	// starts and ends with "*" is at least two bytes long and the
	// slicing below cannot go out of range.
	leading := strings.HasPrefix(pattern, "*")
	trailing := strings.HasSuffix(pattern, "*")
	last := len(pattern) - 1

	if leading && trailing {
		return strings.Contains(key, pattern[1:last])
	}
	if trailing {
		return strings.HasPrefix(key, pattern[:last])
	}
	if leading {
		return strings.HasSuffix(key, pattern[1:])
	}
	return key == pattern
}
|
|
|
|
// evaluateOp does the numeric comparison. within_pct is the oddball:
|
|
// it tests |observed - nominal| <= (pct / 100) * nominal. Returns an
|
|
// error for unknown operators so the caller can log + drop.
|
|
func evaluateOp(op ThresholdOp, observed, threshold, nominal float64) (bool, error) {
|
|
switch op {
|
|
case OpLT:
|
|
return observed < threshold, nil
|
|
case OpLTE:
|
|
return observed <= threshold, nil
|
|
case OpGT:
|
|
return observed > threshold, nil
|
|
case OpGTE:
|
|
return observed >= threshold, nil
|
|
case OpWithinPct:
|
|
if nominal == 0 {
|
|
// within_pct against a 0 nominal is meaningless. Treat as
|
|
// pass so a misconfigured rule doesn't spuriously fail.
|
|
return true, nil
|
|
}
|
|
allowed := (threshold / 100.0) * nominal
|
|
if allowed < 0 {
|
|
allowed = -allowed
|
|
}
|
|
diff := observed - nominal
|
|
if diff < 0 {
|
|
diff = -diff
|
|
}
|
|
return diff <= allowed, nil
|
|
default:
|
|
return false, fmt.Errorf("unknown op %q", op)
|
|
}
|
|
}
|