deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,280 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Threshold is the storage-layer form of a single per-run threshold
// rule. It parallels the orchestrator.Threshold value-object, but Op
// and Severity stay plain strings so this package never has to import
// orchestrator.
type Threshold struct {
	// ID is the row id assigned on insert; RunID is the run the rule
	// was seeded for.
	ID, RunID int64

	// Stage (persisted as stage_name), Kind and Key say which samples
	// the rule applies to. Op is the comparison operator in string form.
	Stage, Kind, Key, Op string

	// Threshold is the bound the observed value is compared against.
	// Nominal is stored next to it; presumably the expected reading —
	// confirm against orchestrator.Threshold.
	Threshold, Nominal float64

	// Unit labels the numeric values. Severity is the breach level as
	// a string; 'critical' is the value the breach query filters on.
	Unit, Severity string

	// Source records where the rule came from.
	Source string // profile|host_override
}
|
||||
|
||||
// ThresholdEvaluation is one persisted comparison of an observed
// sample against a threshold. Both passing and breaching comparisons
// are represented (see Passed), which lets a report explain why a run
// failed or was only flagged as a warning.
type ThresholdEvaluation struct {
	// ID is the evaluation row id. RunID and ThresholdID point at the
	// owning run and the threshold row that was evaluated.
	ID, RunID, ThresholdID int64

	// Stage (persisted as stage_name), Kind and Key are stored on the
	// evaluation row itself alongside the threshold reference.
	Stage, Kind, Key string

	// TS is when the sample was taken. The record paths substitute the
	// current UTC time when it is left zero.
	TS time.Time

	// Observed is the sampled value that was compared.
	Observed float64

	// Passed is true when the sample satisfied the threshold; it is
	// stored as 1/0 in the passed column.
	Passed bool
}
|
||||
|
||||
// Thresholds is the persistence seam for threshold rules and their
// evaluations. The surface is deliberately small: seed rules when a
// run is created, list them for evaluation, record evaluation
// outcomes, and read them back for reporting.
type Thresholds struct {
	// DB is the handle every method issues its statements against.
	// Methods dereference it unconditionally, so it must be non-nil.
	DB *sql.DB
}
|
||||
|
||||
// ThresholdSpec is the flat, caller-supplied description of one rule
// to seed. It carries the rule together with its Source so that
// ProfileRegistry-driven seeds and per-host overrides share a single
// insert path. It is declared in this package (rather than in config)
// so the store layer does not need to import config.
type ThresholdSpec struct {
	// Stage, Kind and Key say which samples the rule applies to; Op is
	// the comparison operator in string form.
	Stage, Kind, Key, Op string

	// Value is the bound; SeedForRun copies it into Threshold.Threshold.
	// Nominal is copied through unchanged.
	Value, Nominal float64

	// Unit, Severity and Source are copied verbatim onto the seeded row.
	Unit, Severity, Source string
}
|
||||
|
||||
// SeedForRun converts the caller's specs into Threshold rows for the
|
||||
// given run and bulk-inserts them. Returns the inserted rows with IDs
|
||||
// populated so the evaluator can pin evaluations without a re-read.
|
||||
func (t *Thresholds) SeedForRun(ctx context.Context, runID int64, specs []ThresholdSpec) ([]Threshold, error) {
|
||||
rows := make([]Threshold, 0, len(specs))
|
||||
for _, s := range specs {
|
||||
rows = append(rows, Threshold{
|
||||
RunID: runID,
|
||||
Stage: s.Stage,
|
||||
Kind: s.Kind,
|
||||
Key: s.Key,
|
||||
Op: s.Op,
|
||||
Threshold: s.Value,
|
||||
Nominal: s.Nominal,
|
||||
Unit: s.Unit,
|
||||
Severity: s.Severity,
|
||||
Source: s.Source,
|
||||
})
|
||||
}
|
||||
return t.CreateBatch(ctx, rows)
|
||||
}
|
||||
|
||||
// Create inserts a single threshold row — used by the seed path when
|
||||
// the orchestrator materializes per-run rules from the ProfileRegistry.
|
||||
// Returns the row's ID so the evaluator can pin evaluations to it.
|
||||
func (t *Thresholds) Create(ctx context.Context, th Threshold) (int64, error) {
|
||||
res, err := t.DB.ExecContext(ctx, `
|
||||
INSERT INTO thresholds(run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source)
|
||||
VALUES(?,?,?,?,?,?,?,?,?,?)
|
||||
`, th.RunID, th.Stage, th.Kind, th.Key, th.Op, th.Threshold, th.Nominal, th.Unit, th.Severity, th.Source)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("insert threshold: %w", err)
|
||||
}
|
||||
return res.LastInsertId()
|
||||
}
|
||||
|
||||
// CreateBatch is the fast path for run seeding — one transaction per
|
||||
// run, one row per threshold. Returns the inserted rows with IDs set
|
||||
// so the caller can drop them into the in-memory evaluator without a
|
||||
// follow-up read.
|
||||
func (t *Thresholds) CreateBatch(ctx context.Context, rows []Threshold) ([]Threshold, error) {
|
||||
if len(rows) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
tx, err := t.DB.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
stmt, err := tx.PrepareContext(ctx, `
|
||||
INSERT INTO thresholds(run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source)
|
||||
VALUES(?,?,?,?,?,?,?,?,?,?)
|
||||
`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("prepare threshold insert: %w", err)
|
||||
}
|
||||
defer func() { _ = stmt.Close() }()
|
||||
out := make([]Threshold, 0, len(rows))
|
||||
for _, th := range rows {
|
||||
res, err := stmt.ExecContext(ctx, th.RunID, th.Stage, th.Kind, th.Key, th.Op,
|
||||
th.Threshold, th.Nominal, th.Unit, th.Severity, th.Source)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("insert threshold %s/%s: %w", th.Stage, th.Key, err)
|
||||
}
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
th.ID = id
|
||||
out = append(out, th)
|
||||
}
|
||||
if err := tx.Commit(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// ListForRun returns every threshold seeded for a run, in stable ID
|
||||
// order. Evaluator expects this to be cheap (few tens of rows per run)
|
||||
// and pulls it on each /sensor batch.
|
||||
func (t *Thresholds) ListForRun(ctx context.Context, runID int64) ([]Threshold, error) {
|
||||
rows, err := t.DB.QueryContext(ctx, `
|
||||
SELECT id, run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source
|
||||
FROM thresholds WHERE run_id = ? ORDER BY id
|
||||
`, runID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var out []Threshold
|
||||
for rows.Next() {
|
||||
var th Threshold
|
||||
if err := rows.Scan(&th.ID, &th.RunID, &th.Stage, &th.Kind, &th.Key,
|
||||
&th.Op, &th.Threshold, &th.Nominal, &th.Unit, &th.Severity, &th.Source); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, th)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// RecordEvaluation persists a single evaluation outcome. Called per
|
||||
// matching sample so the run's report has a full audit trail ("temp
|
||||
// hit 95 at 14:22:03" rather than just "temp failed").
|
||||
func (t *Thresholds) RecordEvaluation(ctx context.Context, ev ThresholdEvaluation) error {
|
||||
passed := 0
|
||||
if ev.Passed {
|
||||
passed = 1
|
||||
}
|
||||
if ev.TS.IsZero() {
|
||||
ev.TS = time.Now().UTC()
|
||||
}
|
||||
_, err := t.DB.ExecContext(ctx, `
|
||||
INSERT INTO threshold_evaluations(run_id, threshold_id, stage_name, kind, key, ts, observed, passed)
|
||||
VALUES(?,?,?,?,?,?,?,?)
|
||||
`, ev.RunID, ev.ThresholdID, ev.Stage, ev.Kind, ev.Key, ev.TS, ev.Observed, passed)
|
||||
if err != nil {
|
||||
return fmt.Errorf("record evaluation: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// RecordBatch persists a slice of evaluations in one transaction. The
|
||||
// agent-handler hot path builds these one per sample and batches them
|
||||
// under the same Sensor POST so we take one round-trip rather than N.
|
||||
func (t *Thresholds) RecordBatch(ctx context.Context, evals []ThresholdEvaluation) error {
|
||||
if len(evals) == 0 {
|
||||
return nil
|
||||
}
|
||||
tx, err := t.DB.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
stmt, err := tx.PrepareContext(ctx, `
|
||||
INSERT INTO threshold_evaluations(run_id, threshold_id, stage_name, kind, key, ts, observed, passed)
|
||||
VALUES(?,?,?,?,?,?,?,?)
|
||||
`)
|
||||
if err != nil {
|
||||
return fmt.Errorf("prepare eval insert: %w", err)
|
||||
}
|
||||
defer func() { _ = stmt.Close() }()
|
||||
for _, ev := range evals {
|
||||
passed := 0
|
||||
if ev.Passed {
|
||||
passed = 1
|
||||
}
|
||||
if ev.TS.IsZero() {
|
||||
ev.TS = time.Now().UTC()
|
||||
}
|
||||
if _, err := stmt.ExecContext(ctx, ev.RunID, ev.ThresholdID, ev.Stage, ev.Kind, ev.Key, ev.TS, ev.Observed, passed); err != nil {
|
||||
return fmt.Errorf("insert eval: %w", err)
|
||||
}
|
||||
}
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
// ListEvaluations returns the evaluation history for a run, newest
|
||||
// last. Bounded at a sane cap so a pathological run with a sample-per-
|
||||
// second sidecar doesn't blow up the report page.
|
||||
func (t *Thresholds) ListEvaluations(ctx context.Context, runID int64) ([]ThresholdEvaluation, error) {
|
||||
rows, err := t.DB.QueryContext(ctx, `
|
||||
SELECT id, run_id, threshold_id, stage_name, kind, key, ts, observed, passed
|
||||
FROM threshold_evaluations WHERE run_id = ?
|
||||
ORDER BY id LIMIT 5000
|
||||
`, runID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var out []ThresholdEvaluation
|
||||
for rows.Next() {
|
||||
var ev ThresholdEvaluation
|
||||
var passed int
|
||||
if err := rows.Scan(&ev.ID, &ev.RunID, &ev.ThresholdID, &ev.Stage, &ev.Kind,
|
||||
&ev.Key, &ev.TS, &ev.Observed, &passed); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ev.Passed = passed == 1
|
||||
out = append(out, ev)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// CriticalBreaches returns the evaluations that fire the "fail the
|
||||
// run" gate — critical-severity thresholds with passed=0. The
|
||||
// agent-handler calls this at /result close so an aggregate breach
|
||||
// (p99 latency > bound) still flips the run to FailedHolding even if
|
||||
// no single sample tripped the fast-fail path.
|
||||
func (t *Thresholds) CriticalBreaches(ctx context.Context, runID int64) ([]ThresholdEvaluation, error) {
|
||||
rows, err := t.DB.QueryContext(ctx, `
|
||||
SELECT e.id, e.run_id, e.threshold_id, e.stage_name, e.kind, e.key, e.ts, e.observed, e.passed
|
||||
FROM threshold_evaluations e
|
||||
JOIN thresholds t ON t.id = e.threshold_id
|
||||
WHERE e.run_id = ? AND e.passed = 0 AND t.severity = 'critical'
|
||||
ORDER BY e.id
|
||||
`, runID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var out []ThresholdEvaluation
|
||||
for rows.Next() {
|
||||
var ev ThresholdEvaluation
|
||||
var passed int
|
||||
if err := rows.Scan(&ev.ID, &ev.RunID, &ev.ThresholdID, &ev.Stage, &ev.Kind,
|
||||
&ev.Key, &ev.TS, &ev.Observed, &passed); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ev.Passed = passed == 1
|
||||
out = append(out, ev)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
Reference in New Issue
Block a user