deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -0,0 +1,97 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+)
+
+// FirmwareSnapshot is one row in firmware_snapshots. A run captures
+// many (one per BIOS/BMC/NIC/HBA/microcode/NVMe component) so
+// SpecValidate can diff them against the host's expected spec in Phase 4.
+type FirmwareSnapshot struct {
+	ID         int64  // id column; filled in by ListForRun, not part of the INSERT
+	RunID      int64  // run_id column; ListForRun filters on it
+	Component  string // bios|bmc|nic|hba|microcode|nvme_fw
+	Identifier string // slot/serial/device path
+	Version    string // version column
+	Vendor     string // vendor column
+	RawJSON    string // raw_json column; Create/CreateBatch store "{}" when this is empty
+}
+
+// Firmware is the CRUD seam over the firmware_snapshots table. The agent's
+// Phase-4 probe POSTs rows; the orchestrator stores them, SpecValidate reads.
+type Firmware struct {
+	DB *sql.DB // handle that every method on this type runs its SQL against
+}
+
+// Create inserts a single firmware snapshot. One call per (run, component,
+// identifier) — the agent probe owns dedup/formatting.
+func (f *Firmware) Create(ctx context.Context, s FirmwareSnapshot) (int64, error) {
+	raw := s.RawJSON
+	if raw == "" {
+		raw = "{}"
+	}
+	res, err := f.DB.ExecContext(ctx, `
+		INSERT INTO firmware_snapshots(run_id, component, identifier, version, vendor, raw_json)
+		VALUES(?,?,?,?,?,?)
+	`, s.RunID, s.Component, s.Identifier, s.Version, s.Vendor, raw)
+	if err != nil {
+		return 0, fmt.Errorf("insert firmware: %w", err)
+	}
+	return res.LastInsertId()
+}
+
+// CreateBatch persists a slice of snapshots under one transaction, so a
+// failure part-way through leaves none of the batch behind. The agent
+// probe enumerates all components in one pass, so batching wins. An
+// empty slice is a no-op: nil is returned without opening a transaction.
+// A row whose RawJSON is empty is stored as "{}".
+//
+// Every returned error is wrapped with %w and names the step that failed
+// (begin, prepare, per-row insert, commit), so errors.Is and errors.As
+// still reach the underlying driver or context error.
+func (f *Firmware) CreateBatch(ctx context.Context, rows []FirmwareSnapshot) error {
+	if len(rows) == 0 {
+		return nil
+	}
+	tx, err := f.DB.BeginTx(ctx, nil)
+	if err != nil {
+		return fmt.Errorf("begin firmware batch: %w", err)
+	}
+	// Safety net for every early return. After a successful Commit the
+	// Rollback reports sql.ErrTxDone, which is deliberately discarded.
+	defer func() { _ = tx.Rollback() }()
+	stmt, err := tx.PrepareContext(ctx, `
+		INSERT INTO firmware_snapshots(run_id, component, identifier, version, vendor, raw_json)
+		VALUES(?,?,?,?,?,?)
+	`)
+	if err != nil {
+		return fmt.Errorf("prepare firmware insert: %w", err)
+	}
+	defer func() { _ = stmt.Close() }()
+	for _, s := range rows {
+		raw := s.RawJSON
+		if raw == "" {
+			raw = "{}"
+		}
+		if _, err := stmt.ExecContext(ctx, s.RunID, s.Component, s.Identifier, s.Version, s.Vendor, raw); err != nil {
+			return fmt.Errorf("insert firmware %s/%s: %w", s.Component, s.Identifier, err)
+		}
+	}
+	if err := tx.Commit(); err != nil {
+		return fmt.Errorf("commit firmware batch: %w", err)
+	}
+	return nil
+}
+
+// ListForRun fetches all firmware snapshots recorded for runID, ordered by
+// ascending row ID. The result is a nil slice when the run has no rows.
+// Both the report page and SpecValidate read this listing.
+func (f *Firmware) ListForRun(ctx context.Context, runID int64) ([]FirmwareSnapshot, error) {
+	const selectSQL = `
+		SELECT id, run_id, component, identifier, version, vendor, raw_json
+		FROM firmware_snapshots WHERE run_id = ? ORDER BY id
+	`
+	cursor, err := f.DB.QueryContext(ctx, selectSQL, runID)
+	if err != nil {
+		return nil, err
+	}
+	defer cursor.Close()
+	var snapshots []FirmwareSnapshot
+	for cursor.Next() {
+		snap := FirmwareSnapshot{}
+		scanErr := cursor.Scan(&snap.ID, &snap.RunID, &snap.Component,
+			&snap.Identifier, &snap.Version, &snap.Vendor, &snap.RawJSON)
+		if scanErr != nil {
+			return nil, scanErr
+		}
+		snapshots = append(snapshots, snap)
+	}
+	// An iteration error is reported together with whatever was read so far.
+	return snapshots, cursor.Err()
+}
@@ -14,16 +14,30 @@ type Runs struct {
 	DB *sql.DB
 }

+// Create inserts a new run with the "quick" profile by delegating to
+// CreateWithProfile. Older call sites (and most tests) target this form —
+// the profile column's DEFAULT 'quick' on runs takes care of the backfill.
 func (r *Runs) Create(ctx context.Context, hostID int64, tokenHash string, nonDestructive bool) (int64, error) {
+	return r.CreateWithProfile(ctx, hostID, tokenHash, nonDestructive, "quick")
+}
+
+// CreateWithProfile inserts a new run with an explicit profile
+// ("quick"|"deep"|"soak"). The UI handler is the authoritative caller;
+// empty profile falls back to "quick" so a misconfigured form doesn't
+// leave a row with a blank profile column.
+func (r *Runs) CreateWithProfile(ctx context.Context, hostID int64, tokenHash string, nonDestructive bool, profile string) (int64, error) {
+	if profile == "" {
+		profile = "quick"
+	}
 	now := time.Now().UTC()
 	nd := 0
 	if nonDestructive {
 		nd = 1
 	}
 	res, err := r.DB.ExecContext(ctx, `
-		INSERT INTO runs(host_id, state, agent_token_hash, next_boot_target, started_at, non_destructive)
-		VALUES(?,?,?,?,?,?)
-	`, hostID, string(model.StateQueued), tokenHash, "linux", now, nd)
+		INSERT INTO runs(host_id, state, agent_token_hash, next_boot_target, started_at, non_destructive, profile)
+		VALUES(?,?,?,?,?,?,?)
+	`, hostID, string(model.StateQueued), tokenHash, "linux", now, nd, profile)
 	if err != nil {
 		return 0, fmt.Errorf("insert run: %w", err)
 	}
@@ -107,14 +121,15 @@ func (r *Runs) Get(ctx context.Context, id int64) (*model.Run, error) {
 		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
 		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
 		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
-		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0)
+		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0),
+		       COALESCE(profile,'quick')
 		FROM runs WHERE id = ?
 	`, id)
 	var run model.Run
 	var completedAt sql.NullTime
 	err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
 		&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
-		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive)
+		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile)
 	if errors.Is(err, sql.ErrNoRows) {
 		return nil, ErrNotFound
 	}
@@ -133,7 +148,8 @@ func (r *Runs) LatestForHost(ctx context.Context, hostID int64) (*model.Run, err
 		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
 		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
 		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
-		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0)
+		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0),
+		       COALESCE(profile,'quick')
 		FROM runs WHERE host_id = ?
 		ORDER BY id DESC LIMIT 1
 	`, hostID)
@@ -141,7 +157,7 @@ func (r *Runs) LatestForHost(ctx context.Context, hostID int64) (*model.Run, err
 	var completedAt sql.NullTime
 	err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
 		&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
-		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive)
+		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile)
 	if errors.Is(err, sql.ErrNoRows) {
 		return nil, nil
 	}
@@ -165,7 +181,8 @@ func (r *Runs) ListForHost(ctx context.Context, hostID int64, limit int) ([]mode
 		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
 		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
 		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
-		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0)
+		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0),
+		       COALESCE(profile,'quick')
 		FROM runs
 		WHERE host_id = ?
 		ORDER BY id DESC
@@ -181,7 +198,7 @@ func (r *Runs) ListForHost(ctx context.Context, hostID int64, limit int) ([]mode
 		var completedAt sql.NullTime
 		if err := rows.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
 			&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
-			&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive); err != nil {
+			&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile); err != nil {
 			return nil, err
 		}
 		if completedAt.Valid {
@@ -206,7 +223,8 @@ func (r *Runs) Active(ctx context.Context) ([]model.Run, error) {
 		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
 		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
 		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
-		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0)
+		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0),
+		       COALESCE(profile,'quick')
 		FROM runs
 		WHERE state NOT IN ('Completed','Released','Cancelled')
 		ORDER BY id
@@ -221,7 +239,7 @@ func (r *Runs) Active(ctx context.Context) ([]model.Run, error) {
 		var completedAt sql.NullTime
 		if err := rows.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
 			&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
-			&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive); err != nil {
+			&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile); err != nil {
 			return nil, err
 		}
 		if completedAt.Valid {
@@ -275,7 +293,7 @@ func (r *Runs) FindActiveByMAC(ctx context.Context, mac string) (*model.Run, err
 	var completedAt sql.NullTime
 	err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
 		&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
-		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive)
+		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile)
 	if errors.Is(err, sql.ErrNoRows) {
 		return nil, nil
 	}
@@ -17,11 +17,13 @@ type Stages struct {
 // reaches Inventory; later phases add more executors but the list is fixed.
 var DefaultStageOrder = []string{
 	"Inventory",
+	"Firmware",
 	"SpecValidate",
 	"SMART",
 	"CPUStress",
 	"Storage",
 	"Network",
+	"Burn",
 	"GPU",
 	"PSU",
 	"Reporting",
@@ -0,0 +1,280 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"time"
+)
+
+// Threshold is the DB view of a per-run threshold row. Mirrors the
+// orchestrator.Threshold value-object but keeps Severity/Op as strings
+// so callers higher up don't force this package to import orchestrator.
+type Threshold struct {
+	ID        int64   // id column; CreateBatch returns rows with this filled in
+	RunID     int64   // run_id column; ListForRun filters on it
+	Stage     string  // stage_name column
+	Kind      string  // kind column
+	Key       string  // key column
+	Op        string  // op column; kept as a plain string (see type comment)
+	Threshold float64 // threshold column; SeedForRun copies ThresholdSpec.Value here
+	Nominal   float64 // nominal column
+	Unit      string  // unit column
+	Severity  string  // severity column; CriticalBreaches matches the value 'critical'
+	Source    string  // profile|host_override
+}
+
+// ThresholdEvaluation is one recorded comparison — the evaluator records
+// one for every sample that matched a threshold, whether it passed
+// or breached. The report page aggregates these to show the operator
+// why a run failed (or was flagged as warning-only).
+type ThresholdEvaluation struct {
+	ID          int64     // id column; populated when rows are read back
+	RunID       int64     // run_id column
+	ThresholdID int64     // threshold_id column; CriticalBreaches joins it to thresholds.id
+	Stage       string    // stage_name column
+	Kind        string    // kind column
+	Key         string    // key column
+	TS          time.Time // ts column; RecordEvaluation/RecordBatch use time.Now().UTC() when zero
+	Observed    float64   // observed column
+	Passed      bool      // passed column; written as 1/0, read back as passed == 1
+}
+
+// Thresholds is the CRUD seam over thresholds and threshold_evaluations.
+// Kept intentionally narrow: seed at run creation, list for evaluation on
+// each sensor batch, record eval results, aggregate for the report.
+type Thresholds struct {
+	DB *sql.DB // handle that every method on this type runs its SQL against
+}
+
+// ThresholdSpec is the caller-supplied shape for seeding — a flat
+// value-object that carries the threshold rule plus its source so
+// the ProfileRegistry-driven seed and per-host overrides converge
+// on one insert path. Kept here (not in config) so the store layer
+// doesn't have to import config.
+type ThresholdSpec struct {
+	Stage    string  // copied to Threshold.Stage by SeedForRun
+	Kind     string  // copied to Threshold.Kind
+	Key      string  // copied to Threshold.Key
+	Op       string  // copied to Threshold.Op
+	Value    float64 // copied to Threshold.Threshold (note the different field name)
+	Nominal  float64 // copied to Threshold.Nominal
+	Unit     string  // copied to Threshold.Unit
+	Severity string  // copied to Threshold.Severity
+	Source   string  // copied to Threshold.Source
+}
+
+// SeedForRun maps each caller-supplied spec onto a Threshold row bound to
+// runID and hands the whole set to CreateBatch. Whatever CreateBatch returns
+// is passed straight through: on success, the inserted rows with their IDs
+// populated, so the evaluator can pin evaluations without a re-read.
+func (t *Thresholds) SeedForRun(ctx context.Context, runID int64, specs []ThresholdSpec) ([]Threshold, error) {
+	pending := make([]Threshold, len(specs))
+	for i := range specs {
+		spec := &specs[i]
+		pending[i] = Threshold{
+			RunID:     runID,
+			Stage:     spec.Stage,
+			Kind:      spec.Kind,
+			Key:       spec.Key,
+			Op:        spec.Op,
+			Threshold: spec.Value, // the spec calls the bound "Value"
+			Nominal:   spec.Nominal,
+			Unit:      spec.Unit,
+			Severity:  spec.Severity,
+			Source:    spec.Source,
+		}
+	}
+	return t.CreateBatch(ctx, pending)
+}
+
+// Create inserts a single threshold row — used by the seed path when
+// the orchestrator materializes per-run rules from the ProfileRegistry.
+// Returns the row's ID so the evaluator can pin evaluations to it.
+func (t *Thresholds) Create(ctx context.Context, th Threshold) (int64, error) {
+	res, err := t.DB.ExecContext(ctx, `
+		INSERT INTO thresholds(run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source)
+		VALUES(?,?,?,?,?,?,?,?,?,?)
+	`, th.RunID, th.Stage, th.Kind, th.Key, th.Op, th.Threshold, th.Nominal, th.Unit, th.Severity, th.Source)
+	if err != nil {
+		return 0, fmt.Errorf("insert threshold: %w", err)
+	}
+	return res.LastInsertId()
+}
+
+// CreateBatch is the fast path for run seeding — one transaction per
+// run, one row per threshold. On success it returns copies of the input
+// rows, in input order, with ID set to the value the database assigned,
+// so the caller can drop them into the in-memory evaluator without a
+// follow-up read. The input slice itself is not modified.
+//
+// An empty slice is a no-op that returns (nil, nil) without opening a
+// transaction. If any step fails nothing is committed and the returned
+// slice is nil. Every error is wrapped with %w and names the failing step
+// (begin, prepare, per-row insert, ID lookup, commit), so errors.Is and
+// errors.As still reach the underlying driver or context error.
+func (t *Thresholds) CreateBatch(ctx context.Context, rows []Threshold) ([]Threshold, error) {
+	if len(rows) == 0 {
+		return nil, nil
+	}
+	tx, err := t.DB.BeginTx(ctx, nil)
+	if err != nil {
+		return nil, fmt.Errorf("begin threshold batch: %w", err)
+	}
+	// Safety net for every early return. After a successful Commit the
+	// Rollback reports sql.ErrTxDone, which is deliberately discarded.
+	defer func() { _ = tx.Rollback() }()
+	stmt, err := tx.PrepareContext(ctx, `
+		INSERT INTO thresholds(run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source)
+		VALUES(?,?,?,?,?,?,?,?,?,?)
+	`)
+	if err != nil {
+		return nil, fmt.Errorf("prepare threshold insert: %w", err)
+	}
+	defer func() { _ = stmt.Close() }()
+	out := make([]Threshold, 0, len(rows))
+	for _, th := range rows {
+		res, err := stmt.ExecContext(ctx, th.RunID, th.Stage, th.Kind, th.Key, th.Op,
+			th.Threshold, th.Nominal, th.Unit, th.Severity, th.Source)
+		if err != nil {
+			return nil, fmt.Errorf("insert threshold %s/%s: %w", th.Stage, th.Key, err)
+		}
+		id, err := res.LastInsertId()
+		if err != nil {
+			return nil, fmt.Errorf("threshold %s/%s insert id: %w", th.Stage, th.Key, err)
+		}
+		// th is the loop's copy, so setting ID here leaves rows untouched.
+		th.ID = id
+		out = append(out, th)
+	}
+	if err := tx.Commit(); err != nil {
+		return nil, fmt.Errorf("commit threshold batch: %w", err)
+	}
+	return out, nil
+}
+
+// ListForRun fetches every threshold seeded for runID, ordered by ascending
+// row ID; the result is a nil slice when the run has none. The evaluator
+// pulls this on each /sensor batch and expects it to stay cheap (a few tens
+// of rows per run).
+func (t *Thresholds) ListForRun(ctx context.Context, runID int64) ([]Threshold, error) {
+	const selectSQL = `
+		SELECT id, run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source
+		FROM thresholds WHERE run_id = ? ORDER BY id
+	`
+	cursor, err := t.DB.QueryContext(ctx, selectSQL, runID)
+	if err != nil {
+		return nil, err
+	}
+	defer cursor.Close()
+	var thresholds []Threshold
+	for cursor.Next() {
+		row := Threshold{}
+		scanErr := cursor.Scan(&row.ID, &row.RunID, &row.Stage, &row.Kind, &row.Key,
+			&row.Op, &row.Threshold, &row.Nominal, &row.Unit, &row.Severity, &row.Source)
+		if scanErr != nil {
+			return nil, scanErr
+		}
+		thresholds = append(thresholds, row)
+	}
+	// An iteration error is reported together with whatever was read so far.
+	return thresholds, cursor.Err()
+}
+
+// RecordEvaluation persists a single evaluation outcome. Called per
+// matching sample so the run's report has a full audit trail ("temp
+// hit 95 at 14:22:03" rather than just "temp failed").
+func (t *Thresholds) RecordEvaluation(ctx context.Context, ev ThresholdEvaluation) error {
+	passed := 0
+	if ev.Passed {
+		passed = 1
+	}
+	if ev.TS.IsZero() {
+		ev.TS = time.Now().UTC()
+	}
+	_, err := t.DB.ExecContext(ctx, `
+		INSERT INTO threshold_evaluations(run_id, threshold_id, stage_name, kind, key, ts, observed, passed)
+		VALUES(?,?,?,?,?,?,?,?)
+	`, ev.RunID, ev.ThresholdID, ev.Stage, ev.Kind, ev.Key, ev.TS, ev.Observed, passed)
+	if err != nil {
+		return fmt.Errorf("record evaluation: %w", err)
+	}
+	return nil
+}
+
+// RecordBatch persists a slice of evaluations in one transaction, so a
+// failure part-way through leaves none of the batch behind. The
+// agent-handler hot path builds these one per sample and batches them
+// under the same Sensor POST so we take one round-trip rather than N.
+//
+// An empty slice is a no-op that returns nil without opening a
+// transaction. An evaluation with a zero TS is stamped with the current
+// UTC time at the moment its row is written; Passed is stored as 1 or 0.
+// Every error is wrapped with %w and names the failing step (begin,
+// prepare, insert, commit), so errors.Is and errors.As still reach the
+// underlying driver or context error.
+func (t *Thresholds) RecordBatch(ctx context.Context, evals []ThresholdEvaluation) error {
+	if len(evals) == 0 {
+		return nil
+	}
+	tx, err := t.DB.BeginTx(ctx, nil)
+	if err != nil {
+		return fmt.Errorf("begin eval batch: %w", err)
+	}
+	// Safety net for every early return. After a successful Commit the
+	// Rollback reports sql.ErrTxDone, which is deliberately discarded.
+	defer func() { _ = tx.Rollback() }()
+	stmt, err := tx.PrepareContext(ctx, `
+		INSERT INTO threshold_evaluations(run_id, threshold_id, stage_name, kind, key, ts, observed, passed)
+		VALUES(?,?,?,?,?,?,?,?)
+	`)
+	if err != nil {
+		return fmt.Errorf("prepare eval insert: %w", err)
+	}
+	defer func() { _ = stmt.Close() }()
+	for _, ev := range evals {
+		passed := 0
+		if ev.Passed {
+			passed = 1
+		}
+		// ev is the loop's copy, so defaulting TS leaves evals untouched.
+		if ev.TS.IsZero() {
+			ev.TS = time.Now().UTC()
+		}
+		if _, err := stmt.ExecContext(ctx, ev.RunID, ev.ThresholdID, ev.Stage, ev.Kind, ev.Key, ev.TS, ev.Observed, passed); err != nil {
+			return fmt.Errorf("insert eval: %w", err)
+		}
+	}
+	if err := tx.Commit(); err != nil {
+		return fmt.Errorf("commit eval batch: %w", err)
+	}
+	return nil
+}
+
+// ListEvaluations fetches the evaluation history for runID in ascending
+// row-ID order (newest last). The query is capped at 5000 rows so a
+// pathological run with a sample-per-second sidecar doesn't blow up the
+// report page; because the cap is applied after ordering by id, it is the
+// lowest-ID rows that are kept. The result is a nil slice when the run has
+// no evaluations.
+func (t *Thresholds) ListEvaluations(ctx context.Context, runID int64) ([]ThresholdEvaluation, error) {
+	const selectSQL = `
+		SELECT id, run_id, threshold_id, stage_name, kind, key, ts, observed, passed
+		FROM threshold_evaluations WHERE run_id = ?
+		ORDER BY id LIMIT 5000
+	`
+	cursor, err := t.DB.QueryContext(ctx, selectSQL, runID)
+	if err != nil {
+		return nil, err
+	}
+	defer cursor.Close()
+	var history []ThresholdEvaluation
+	for cursor.Next() {
+		var (
+			entry      ThresholdEvaluation
+			passedFlag int
+		)
+		scanErr := cursor.Scan(&entry.ID, &entry.RunID, &entry.ThresholdID, &entry.Stage,
+			&entry.Kind, &entry.Key, &entry.TS, &entry.Observed, &passedFlag)
+		if scanErr != nil {
+			return nil, scanErr
+		}
+		// The column holds an integer flag; only the value 1 counts as passed.
+		entry.Passed = passedFlag == 1
+		history = append(history, entry)
+	}
+	// An iteration error is reported together with whatever was read so far.
+	return history, cursor.Err()
+}
+
+// CriticalBreaches fetches the evaluations that trip the "fail the run"
+// gate for runID: rows with passed = 0 whose joined threshold has severity
+// 'critical', in ascending row-ID order. The agent-handler calls this at
+// /result close so an aggregate breach (p99 latency > bound) still flips
+// the run to FailedHolding even if no single sample tripped the fast-fail
+// path. The result is a nil slice when nothing breached.
+func (t *Thresholds) CriticalBreaches(ctx context.Context, runID int64) ([]ThresholdEvaluation, error) {
+	const selectSQL = `
+		SELECT e.id, e.run_id, e.threshold_id, e.stage_name, e.kind, e.key, e.ts, e.observed, e.passed
+		FROM threshold_evaluations e
+		JOIN thresholds t ON t.id = e.threshold_id
+		WHERE e.run_id = ? AND e.passed = 0 AND t.severity = 'critical'
+		ORDER BY e.id
+	`
+	cursor, err := t.DB.QueryContext(ctx, selectSQL, runID)
+	if err != nil {
+		return nil, err
+	}
+	defer cursor.Close()
+	var breaches []ThresholdEvaluation
+	for cursor.Next() {
+		var (
+			breach     ThresholdEvaluation
+			passedFlag int
+		)
+		scanErr := cursor.Scan(&breach.ID, &breach.RunID, &breach.ThresholdID, &breach.Stage,
+			&breach.Kind, &breach.Key, &breach.TS, &breach.Observed, &passedFlag)
+		if scanErr != nil {
+			return nil, scanErr
+		}
+		// Decoded the same way as in ListEvaluations; the WHERE clause means
+		// this is false for every row returned here.
+		breach.Passed = passedFlag == 1
+		breaches = append(breaches, breach)
+	}
+	// An iteration error is reported together with whatever was read so far.
+	return breaches, cursor.Err()
+}