deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -19,6 +19,7 @@ import (
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"vetting/internal/config"
|
||||
"vetting/internal/events"
|
||||
"vetting/internal/hold"
|
||||
"vetting/internal/logs"
|
||||
@@ -41,6 +42,9 @@ type Agent struct {
|
||||
Artifacts *store.Artifacts
|
||||
SpecDiffs *store.SpecDiffs
|
||||
Measurements *store.Measurements
|
||||
Thresholds *store.Thresholds // Phase 1: seeded per run; consulted on each /sensor batch
|
||||
Firmware *store.Firmware // Phase 4: firmware snapshots (unused before then)
|
||||
Profiles *config.ProfileRegistry // Phase 2: /claim resolves the run's profile → stage knobs
|
||||
Runner *orchestrator.Runner
|
||||
EventHub *events.Hub
|
||||
Logs *logs.Hub
|
||||
@@ -216,6 +220,21 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
|
||||
if iperfPort == 0 {
|
||||
iperfPort = 5201
|
||||
}
|
||||
|
||||
// Resolve the run's profile → agent-visible stage knobs. The agent
|
||||
// reads these to size CPUStress / Storage / Network work. An empty
|
||||
// profile (legacy runs seeded before Phase 1) falls back to "quick".
|
||||
profileName := run.Profile
|
||||
if profileName == "" {
|
||||
profileName = config.ProfileQuick
|
||||
}
|
||||
var stageCfg config.StageConfig
|
||||
if a.Profiles != nil {
|
||||
stageCfg = a.Profiles.ResolveStageConfig(profileName)
|
||||
} else {
|
||||
stageCfg = config.StageConfig{Profile: profileName}
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, map[string]any{
|
||||
"ok": true,
|
||||
"run_id": runID,
|
||||
@@ -224,6 +243,7 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
|
||||
"iperf_port": iperfPort,
|
||||
"non_destructive": run.NonDestructive,
|
||||
"current_state": string(currentState),
|
||||
"stage_config": stageCfg,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -398,10 +418,24 @@ type StageResult struct {
|
||||
Passed bool `json:"passed"`
|
||||
Summary json.RawMessage `json:"summary,omitempty"`
|
||||
Inventory *spec.Inventory `json:"inventory,omitempty"`
|
||||
Firmware []FirmwareLine `json:"firmware,omitempty"`
|
||||
Message string `json:"message,omitempty"`
|
||||
SubSteps []SubStepResultLine `json:"sub_steps,omitempty"`
|
||||
}
|
||||
|
||||
// FirmwareLine is a single firmware snapshot POSTed alongside the
|
||||
// Firmware stage's /result body. Mirrors agent/probes.FirmwareSnapshot.
|
||||
// The server converts each line to a store.FirmwareSnapshot and persists
|
||||
// it under the run — SpecValidate reads these back to diff against the
|
||||
// host's expected_firmware.
|
||||
type FirmwareLine struct {
|
||||
Component string `json:"component"`
|
||||
Identifier string `json:"identifier"`
|
||||
Version string `json:"version"`
|
||||
Vendor string `json:"vendor,omitempty"`
|
||||
Raw map[string]string `json:"raw,omitempty"`
|
||||
}
|
||||
|
||||
// SubStepResultLine is one entry in StageResult.SubSteps. Ordinal is
|
||||
// assigned from slice index server-side; the agent doesn't set it.
|
||||
type SubStepResultLine struct {
|
||||
@@ -476,6 +510,20 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
// Aggregate threshold gate: flip Passed=false server-side when any
|
||||
// critical breach landed for this stage. The agent's verdict is
|
||||
// advisory — a stage-executor can miss a runaway sample that the
|
||||
// sidecar caught. We check this *before* writing the stage state
|
||||
// so the DB reflects the server-side decision.
|
||||
thresholdDetail := ""
|
||||
if body.Passed {
|
||||
if breached, detail := a.stageHadCriticalBreach(r.Context(), runID, body.Stage); breached {
|
||||
body.Passed = false
|
||||
thresholdDetail = detail
|
||||
a.appendLog(runID, "error", fmt.Sprintf("%s reported passed but %s — flipping to failed", body.Stage, detail))
|
||||
}
|
||||
}
|
||||
|
||||
stageState := model.StagePassed
|
||||
if !body.Passed {
|
||||
stageState = model.StageFailed
|
||||
@@ -488,6 +536,9 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "complete stage: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
if thresholdDetail != "" && body.Message == "" {
|
||||
body.Message = thresholdDetail
|
||||
}
|
||||
|
||||
// Agent-authored sub-steps: persist in slice order (ordinal = index)
|
||||
// and fan out a per-row SSE event each so the detail pane shows them
|
||||
@@ -502,6 +553,14 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
// Firmware-specific: persist each snapshot into firmware_snapshots.
|
||||
// SpecValidate reads them back to diff against expected_firmware.
|
||||
if body.Stage == "Firmware" && len(body.Firmware) > 0 {
|
||||
if err := a.persistFirmware(r.Context(), runID, body.Firmware); err != nil {
|
||||
log.Printf("persist firmware run %d: %v", runID, err)
|
||||
}
|
||||
}
|
||||
|
||||
if !body.Passed {
|
||||
if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
|
||||
log.Printf("set failed stage: %v", err)
|
||||
@@ -615,6 +674,34 @@ func parseResultTime(s string) *time.Time {
|
||||
return nil
|
||||
}
|
||||
|
||||
// persistFirmware writes the reported snapshots. A nil/unset a.Firmware
|
||||
// store is a no-op so tests that don't wire it up stay green; a mid-run
|
||||
// persist error is logged but doesn't fail the stage (Firmware is
|
||||
// advisory — SpecValidate is the gate).
|
||||
func (a *Agent) persistFirmware(ctx context.Context, runID int64, lines []FirmwareLine) error {
|
||||
if a.Firmware == nil || len(lines) == 0 {
|
||||
return nil
|
||||
}
|
||||
rows := make([]store.FirmwareSnapshot, 0, len(lines))
|
||||
for _, l := range lines {
|
||||
raw := "{}"
|
||||
if len(l.Raw) > 0 {
|
||||
if b, err := json.Marshal(l.Raw); err == nil {
|
||||
raw = string(b)
|
||||
}
|
||||
}
|
||||
rows = append(rows, store.FirmwareSnapshot{
|
||||
RunID: runID,
|
||||
Component: l.Component,
|
||||
Identifier: l.Identifier,
|
||||
Version: l.Version,
|
||||
Vendor: l.Vendor,
|
||||
RawJSON: raw,
|
||||
})
|
||||
}
|
||||
return a.Firmware.CreateBatch(ctx, rows)
|
||||
}
|
||||
|
||||
func (a *Agent) persistInventory(r *http.Request, run *model.Run, inv *spec.Inventory) error {
|
||||
dir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", run.ID))
|
||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||
@@ -667,6 +754,22 @@ func (a *Agent) resolveSpecValidate(r *http.Request, runID int64) {
|
||||
return
|
||||
}
|
||||
diffs := spec.Diff(expected, inv)
|
||||
if a.Firmware != nil && len(expected.Firmware) > 0 {
|
||||
snaps, err := a.Firmware.ListForRun(r.Context(), runID)
|
||||
if err != nil {
|
||||
log.Printf("specvalidate: list firmware: %v", err)
|
||||
} else {
|
||||
observed := make([]spec.FirmwareObserved, 0, len(snaps))
|
||||
for _, s := range snaps {
|
||||
observed = append(observed, spec.FirmwareObserved{
|
||||
Component: s.Component,
|
||||
Identifier: s.Identifier,
|
||||
Version: s.Version,
|
||||
})
|
||||
}
|
||||
diffs = append(diffs, spec.DiffFirmware(expected.Firmware, observed)...)
|
||||
}
|
||||
}
|
||||
if err := a.SpecDiffs.ReplaceForRun(r.Context(), runID, diffs); err != nil {
|
||||
log.Printf("specvalidate: write diffs: %v", err)
|
||||
}
|
||||
@@ -884,13 +987,17 @@ type SensorSample struct {
|
||||
}
|
||||
|
||||
// Sensor persists a batch of numeric samples. The thermal sidecar hits
|
||||
// this on a tick; stage executors (iperf, fio) also drop here.
|
||||
// this on a tick; stage executors (iperf, fio) also drop here. Each
|
||||
// sample is evaluated against the run's seeded thresholds — critical
|
||||
// breaches fail the run immediately (thermal runaway, EDAC UE, voltage
|
||||
// sag); warning breaches are recorded for the report only.
|
||||
func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
|
||||
runID, ok := runIDFromURL(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if _, ok := a.authenticate(w, r, runID); !ok {
|
||||
run, ok := a.authenticate(w, r, runID)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if a.Measurements == nil {
|
||||
@@ -903,8 +1010,12 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
rows := make([]model.Measurement, 0, len(body.Samples))
|
||||
sampleStages := make([]string, 0, len(body.Samples))
|
||||
for _, s := range body.Samples {
|
||||
ts, _ := time.Parse(time.RFC3339Nano, s.TS)
|
||||
if ts.IsZero() {
|
||||
ts = time.Now().UTC()
|
||||
}
|
||||
rows = append(rows, model.Measurement{
|
||||
RunID: runID,
|
||||
TS: ts,
|
||||
@@ -913,12 +1024,139 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
|
||||
Value: s.Value,
|
||||
Unit: s.Unit,
|
||||
})
|
||||
// Stage the sample belongs to drives threshold selector
|
||||
// matching. We use the run's current state — the agent does
|
||||
// not tag samples with a stage.
|
||||
sampleStages = append(sampleStages, orchestrator.StageNameForState(run.State))
|
||||
}
|
||||
if err := a.Measurements.CreateBatch(r.Context(), rows); err != nil {
|
||||
http.Error(w, "write samples: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "written": len(rows)})
|
||||
critical := a.evaluateSensorBatch(r.Context(), runID, rows, sampleStages)
|
||||
writeJSON(w, http.StatusOK, map[string]any{
|
||||
"ok": true,
|
||||
"written": len(rows),
|
||||
"breach": critical != "",
|
||||
"breach_kind": critical,
|
||||
})
|
||||
if critical != "" {
|
||||
a.failRunOnCriticalBreach(r, run, critical)
|
||||
}
|
||||
}
|
||||
|
||||
// evaluateSensorBatch runs each sample through the run's thresholds,
|
||||
// persists evaluations, and returns a short human-readable label for
|
||||
// the first critical breach it sees (empty when all samples pass or
|
||||
// only hit warning-severity rules).
|
||||
func (a *Agent) evaluateSensorBatch(ctx context.Context, runID int64, rows []model.Measurement, sampleStages []string) string {
|
||||
if a.Thresholds == nil || len(rows) == 0 {
|
||||
return ""
|
||||
}
|
||||
rules, err := a.Thresholds.ListForRun(ctx, runID)
|
||||
if err != nil {
|
||||
log.Printf("sensor: list thresholds run %d: %v", runID, err)
|
||||
return ""
|
||||
}
|
||||
if len(rules) == 0 {
|
||||
return ""
|
||||
}
|
||||
evalRules := make([]orchestrator.Threshold, 0, len(rules))
|
||||
for _, r := range rules {
|
||||
evalRules = append(evalRules, orchestrator.Threshold{
|
||||
ID: r.ID,
|
||||
Stage: r.Stage,
|
||||
Kind: r.Kind,
|
||||
Key: r.Key,
|
||||
Op: orchestrator.ThresholdOp(r.Op),
|
||||
Value: r.Threshold,
|
||||
Nominal: r.Nominal,
|
||||
Severity: orchestrator.ThresholdSeverity(r.Severity),
|
||||
})
|
||||
}
|
||||
evals := make([]store.ThresholdEvaluation, 0, len(rows))
|
||||
critical := ""
|
||||
for i, m := range rows {
|
||||
sample := orchestrator.Sample{
|
||||
Stage: sampleStages[i],
|
||||
Kind: m.Kind,
|
||||
Key: m.Key,
|
||||
Value: m.Value,
|
||||
}
|
||||
for _, res := range orchestrator.Evaluate(sample, evalRules) {
|
||||
evals = append(evals, store.ThresholdEvaluation{
|
||||
RunID: runID,
|
||||
ThresholdID: res.Threshold.ID,
|
||||
Stage: sample.Stage,
|
||||
Kind: sample.Kind,
|
||||
Key: sample.Key,
|
||||
TS: m.TS,
|
||||
Observed: res.Observed,
|
||||
Passed: res.Passed,
|
||||
})
|
||||
if critical == "" && res.CriticalBreach() {
|
||||
critical = fmt.Sprintf("%s %s=%g breached %s %g",
|
||||
res.Threshold.Kind, sample.Key, res.Observed, res.Threshold.Op, res.Threshold.Value)
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := a.Thresholds.RecordBatch(ctx, evals); err != nil {
|
||||
log.Printf("sensor: record evals run %d: %v", runID, err)
|
||||
}
|
||||
return critical
|
||||
}
|
||||
|
||||
// stageHadCriticalBreach returns true if any critical-severity
|
||||
// threshold evaluation for this run matched samples attributed to the
|
||||
// given stage (stage selector "*" or exact). Called at /result close
|
||||
// so even an agent that reports Passed=true gets overridden when the
|
||||
// aggregate view says the stage tripped a gate.
|
||||
func (a *Agent) stageHadCriticalBreach(ctx context.Context, runID int64, stage string) (bool, string) {
|
||||
if a.Thresholds == nil {
|
||||
return false, ""
|
||||
}
|
||||
breaches, err := a.Thresholds.CriticalBreaches(ctx, runID)
|
||||
if err != nil {
|
||||
log.Printf("result: list breaches run %d: %v", runID, err)
|
||||
return false, ""
|
||||
}
|
||||
for _, b := range breaches {
|
||||
if b.Stage == stage || b.Stage == "" || b.Stage == "*" {
|
||||
return true, fmt.Sprintf("critical threshold breach: %s %s=%g", b.Kind, b.Key, b.Observed)
|
||||
}
|
||||
}
|
||||
return false, ""
|
||||
}
|
||||
|
||||
// failRunOnCriticalBreach flips the run to FailedHolding in response
|
||||
// to a live threshold breach (thermal runaway, EDAC UE, rail sag).
|
||||
// The agent's pending /result for the current stage may still arrive —
|
||||
// the silent-skip guard handles that by refusing to double-transition.
|
||||
func (a *Agent) failRunOnCriticalBreach(r *http.Request, run *model.Run, detail string) {
|
||||
stage := orchestrator.StageNameForState(run.State)
|
||||
if stage == "" {
|
||||
stage = "threshold"
|
||||
}
|
||||
if err := a.Runs.SetFailedStage(r.Context(), run.ID, stage+" (threshold)"); err != nil {
|
||||
log.Printf("sensor: set failed stage run %d: %v", run.ID, err)
|
||||
}
|
||||
if _, err := a.Runner.Transition(r.Context(), run.ID, orchestrator.TriggerStageFailed); err != nil {
|
||||
// If we're already in FailedHolding the transition errors —
|
||||
// that's fine, the first breach wins.
|
||||
log.Printf("sensor: fail-transition run %d: %v", run.ID, err)
|
||||
return
|
||||
}
|
||||
hostName := a.hostNameFor(r.Context(), run.HostID)
|
||||
a.dispatchEvent(notify.Event{
|
||||
Kind: notify.KindStageFailed,
|
||||
Severity: notify.SeverityCritical,
|
||||
RunID: run.ID,
|
||||
HostName: hostName,
|
||||
Title: fmt.Sprintf("[vetting] %s FAILED: %s (threshold)", hostName, stage),
|
||||
Body: fmt.Sprintf("Run %d on %s tripped a critical threshold during %s: %s", run.ID, hostName, stage, detail),
|
||||
URL: a.runLinkURL(run.ID),
|
||||
})
|
||||
a.appendLog(run.ID, "error", fmt.Sprintf("threshold breach during %s: %s — run parked in FailedHolding", stage, detail))
|
||||
}
|
||||
|
||||
// resolveReporting runs when the pipeline advances into StateReporting.
|
||||
@@ -956,12 +1194,20 @@ func (a *Agent) resolveReporting(r *http.Request, runID int64) {
|
||||
log.Printf("reporting: list measurements: %v", err)
|
||||
}
|
||||
}
|
||||
var firmware []store.FirmwareSnapshot
|
||||
if a.Firmware != nil {
|
||||
firmware, err = a.Firmware.ListForRun(ctx, runID)
|
||||
if err != nil {
|
||||
log.Printf("reporting: list firmware: %v", err)
|
||||
}
|
||||
}
|
||||
bundle := map[string]any{
|
||||
"run": run,
|
||||
"host": host,
|
||||
"stages": stages,
|
||||
"spec_diffs": diffs,
|
||||
"measurements": measurements,
|
||||
"firmware": firmware,
|
||||
"generated_at": time.Now().UTC().Format(time.RFC3339),
|
||||
}
|
||||
buf, err := json.MarshalIndent(bundle, "", " ")
|
||||
@@ -993,6 +1239,15 @@ func (a *Agent) resolveReporting(r *http.Request, runID int64) {
|
||||
// Also render the operator-facing HTML summary alongside the JSON.
|
||||
// Failures here are non-fatal — the JSON is the source of truth.
|
||||
if host != nil {
|
||||
fwRows := make([]report.FirmwareSnapshot, 0, len(firmware))
|
||||
for _, f := range firmware {
|
||||
fwRows = append(fwRows, report.FirmwareSnapshot{
|
||||
Component: f.Component,
|
||||
Identifier: f.Identifier,
|
||||
Version: f.Version,
|
||||
Vendor: f.Vendor,
|
||||
})
|
||||
}
|
||||
htmlData := report.Data{
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
Run: *run,
|
||||
@@ -1000,6 +1255,7 @@ func (a *Agent) resolveReporting(r *http.Request, runID int64) {
|
||||
Stages: stages,
|
||||
SpecDiffs: diffs,
|
||||
Aggregates: report.AggregateMeasurements(measurements),
|
||||
Firmware: fwRows,
|
||||
}
|
||||
if htmlBuf, err := report.RenderHTML(htmlData); err != nil {
|
||||
log.Printf("reporting: render html: %v", err)
|
||||
|
||||
@@ -108,7 +108,7 @@ func TestRunPage_DefaultStep_Running(t *testing.T) {
|
||||
})
|
||||
runID, _ := runs.Create(ctx, id, "rr", false)
|
||||
_ = ui.Stages.Seed(ctx, runID)
|
||||
for _, name := range []string{"Inventory", "SpecValidate"} {
|
||||
for _, name := range []string{"Inventory", "Firmware", "SpecValidate"} {
|
||||
_ = ui.Stages.StartByName(ctx, runID, name)
|
||||
_ = ui.Stages.CompleteByName(ctx, runID, name, model.StagePassed, "")
|
||||
}
|
||||
@@ -135,7 +135,7 @@ func TestRunPage_DefaultStep_Failed(t *testing.T) {
|
||||
})
|
||||
runID, _ := runs.Create(ctx, id, "rf", false)
|
||||
_ = ui.Stages.Seed(ctx, runID)
|
||||
for _, name := range []string{"Inventory", "SpecValidate", "SMART"} {
|
||||
for _, name := range []string{"Inventory", "Firmware", "SpecValidate", "SMART"} {
|
||||
_ = ui.Stages.StartByName(ctx, runID, name)
|
||||
_ = ui.Stages.CompleteByName(ctx, runID, name, model.StagePassed, "")
|
||||
}
|
||||
|
||||
@@ -0,0 +1,169 @@
|
||||
package api_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"testing"
|
||||
|
||||
"vetting/internal/api"
|
||||
"vetting/internal/db"
|
||||
"vetting/internal/events"
|
||||
"vetting/internal/model"
|
||||
"vetting/internal/orchestrator"
|
||||
"vetting/internal/store"
|
||||
)
|
||||
|
||||
// setupAgentWithThresholds builds an Agent wired up to the thresholds
|
||||
// store + a Runner so the /sensor handler can drive the state machine.
|
||||
// Seeds one critical thermal threshold and parks the run in CPUStress
|
||||
// so the handler will stamp a stage-relevant failed_stage.
|
||||
func setupAgentWithThresholds(t *testing.T) (*api.Agent, int64, string) {
|
||||
t.Helper()
|
||||
path := filepath.Join(t.TempDir(), "vetting.db")
|
||||
conn, err := db.Open(path)
|
||||
if err != nil {
|
||||
t.Fatalf("open db: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { _ = conn.Close() })
|
||||
|
||||
hosts := &store.Hosts{DB: conn}
|
||||
runs := &store.Runs{DB: conn}
|
||||
stages := &store.Stages{DB: conn}
|
||||
meas := &store.Measurements{DB: conn}
|
||||
thresholds := &store.Thresholds{DB: conn}
|
||||
hub := events.NewHub()
|
||||
runner := &orchestrator.Runner{Runs: runs, Hosts: hosts, Stages: stages, EventHub: hub}
|
||||
|
||||
hostID, err := hosts.Create(context.Background(), model.Host{
|
||||
Name: "thresh-host",
|
||||
MAC: "aa:bb:cc:dd:ee:aa",
|
||||
WoLBroadcastIP: "10.0.0.255",
|
||||
WoLPort: 9,
|
||||
ExpectedSpecYAML: "memory:\n total_gib: 16\n",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
}
|
||||
plain, hash, err := orchestrator.IssueRunToken()
|
||||
if err != nil {
|
||||
t.Fatalf("issue token: %v", err)
|
||||
}
|
||||
runID, err := runs.Create(context.Background(), hostID, hash, false)
|
||||
if err != nil {
|
||||
t.Fatalf("create run: %v", err)
|
||||
}
|
||||
if err := stages.Seed(context.Background(), runID); err != nil {
|
||||
t.Fatalf("seed stages: %v", err)
|
||||
}
|
||||
// Park the run where a real thermal sidecar would be posting samples.
|
||||
if err := runs.SetState(context.Background(), runID, model.StateCPUStress); err != nil {
|
||||
t.Fatalf("set state: %v", err)
|
||||
}
|
||||
// Seed one critical thermal threshold.
|
||||
if _, err := thresholds.SeedForRun(context.Background(), runID, []store.ThresholdSpec{
|
||||
{Stage: "*", Kind: "temp", Key: "cpu/*", Op: "lt", Value: 92, Unit: "C", Severity: "critical", Source: "profile"},
|
||||
}); err != nil {
|
||||
t.Fatalf("seed thresholds: %v", err)
|
||||
}
|
||||
return &api.Agent{
|
||||
Hosts: hosts,
|
||||
Runs: runs,
|
||||
Stages: stages,
|
||||
Measurements: meas,
|
||||
Thresholds: thresholds,
|
||||
Runner: runner,
|
||||
}, runID, plain
|
||||
}
|
||||
|
||||
// TestSensor_ThermalRunawayFailsRun: a sample that breaches a critical
|
||||
// threshold lands in threshold_evaluations (passed=0) and flips the
|
||||
// run into FailedHolding with failed_stage naming the current stage.
|
||||
// This is the Phase-1 behavior gate — without the evaluator, the sample
|
||||
// would just sit in measurements and the run would happily march on.
|
||||
func TestSensor_ThermalRunawayFailsRun(t *testing.T) {
|
||||
a, runID, token := setupAgentWithThresholds(t)
|
||||
batch := api.SensorBatch{Samples: []api.SensorSample{
|
||||
{Kind: "temp", Key: "cpu/0", Value: 95.3, Unit: "C"},
|
||||
}}
|
||||
buf, _ := json.Marshal(batch)
|
||||
req := routedRequest(runID, http.MethodPost,
|
||||
"/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/sensor", buf)
|
||||
req.Header.Set("Authorization", "Bearer "+token)
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
rr := httptest.NewRecorder()
|
||||
a.Sensor(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %q", rr.Code, rr.Body.String())
|
||||
}
|
||||
var resp struct {
|
||||
OK bool `json:"ok"`
|
||||
Breach bool `json:"breach"`
|
||||
Kind string `json:"breach_kind"`
|
||||
}
|
||||
if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
if !resp.Breach {
|
||||
t.Fatalf("expected breach=true, got %+v", resp)
|
||||
}
|
||||
run, err := a.Runs.Get(context.Background(), runID)
|
||||
if err != nil {
|
||||
t.Fatalf("get run: %v", err)
|
||||
}
|
||||
if run.State != model.StateFailedHolding {
|
||||
t.Fatalf("state = %s, want FailedHolding", run.State)
|
||||
}
|
||||
if run.FailedStage == "" {
|
||||
t.Fatalf("failed_stage empty; want stage-named breach")
|
||||
}
|
||||
evals, err := a.Thresholds.ListEvaluations(context.Background(), runID)
|
||||
if err != nil {
|
||||
t.Fatalf("list evaluations: %v", err)
|
||||
}
|
||||
if len(evals) != 1 {
|
||||
t.Fatalf("want 1 evaluation recorded, got %d", len(evals))
|
||||
}
|
||||
if evals[0].Passed {
|
||||
t.Fatalf("evaluation recorded as passed for 95.3C sample against <92C rule")
|
||||
}
|
||||
}
|
||||
|
||||
// TestSensor_WithinThresholdPasses: a sample comfortably inside the
|
||||
// threshold writes an evaluation row with passed=1 and leaves the run
|
||||
// state untouched.
|
||||
func TestSensor_WithinThresholdPasses(t *testing.T) {
|
||||
a, runID, token := setupAgentWithThresholds(t)
|
||||
batch := api.SensorBatch{Samples: []api.SensorSample{
|
||||
{Kind: "temp", Key: "cpu/0", Value: 55.0, Unit: "C"},
|
||||
}}
|
||||
buf, _ := json.Marshal(batch)
|
||||
req := routedRequest(runID, http.MethodPost,
|
||||
"/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/sensor", buf)
|
||||
req.Header.Set("Authorization", "Bearer "+token)
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
rr := httptest.NewRecorder()
|
||||
a.Sensor(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %q", rr.Code, rr.Body.String())
|
||||
}
|
||||
run, err := a.Runs.Get(context.Background(), runID)
|
||||
if err != nil {
|
||||
t.Fatalf("get run: %v", err)
|
||||
}
|
||||
if run.State != model.StateCPUStress {
|
||||
t.Fatalf("state = %s, want CPUStress unchanged", run.State)
|
||||
}
|
||||
evals, err := a.Thresholds.ListEvaluations(context.Background(), runID)
|
||||
if err != nil {
|
||||
t.Fatalf("list evaluations: %v", err)
|
||||
}
|
||||
if len(evals) != 1 || !evals[0].Passed {
|
||||
t.Fatalf("want 1 passed evaluation, got %+v", evals)
|
||||
}
|
||||
}
|
||||
@@ -75,6 +75,12 @@ func newCaptureRegistry(c *captureNotifier) *notify.Registry {
|
||||
// (agent, runID, plainTokenForBearer). Caller is responsible for
|
||||
// transitioning the run out of Queued.
|
||||
func fullAgent(t *testing.T) (*api.Agent, int64, string) {
|
||||
return fullAgentWithSpec(t, "")
|
||||
}
|
||||
|
||||
// fullAgentWithSpec is the same as fullAgent but seeds the host with
|
||||
// an ExpectedSpecYAML so SpecValidate can pick up diffs in the test.
|
||||
func fullAgentWithSpec(t *testing.T, expectedSpecYAML string) (*api.Agent, int64, string) {
|
||||
t.Helper()
|
||||
tmp := t.TempDir()
|
||||
conn, err := db.Open(filepath.Join(tmp, "vetting.db"))
|
||||
@@ -89,6 +95,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
|
||||
artifactStore := &store.Artifacts{DB: conn}
|
||||
specDiffStore := &store.SpecDiffs{DB: conn}
|
||||
measurementStore := &store.Measurements{DB: conn}
|
||||
firmwareStore := &store.Firmware{DB: conn}
|
||||
|
||||
hub := events.NewHub()
|
||||
logHub, err := logs.NewHub(filepath.Join(tmp, "logs"), hub)
|
||||
@@ -109,7 +116,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
|
||||
MAC: "aa:bb:cc:dd:ee:10",
|
||||
WoLBroadcastIP: "10.0.0.255",
|
||||
WoLPort: 9,
|
||||
ExpectedSpecYAML: "", // empty spec → no diffs
|
||||
ExpectedSpecYAML: expectedSpecYAML,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
@@ -132,6 +139,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
|
||||
Artifacts: artifactStore,
|
||||
SpecDiffs: specDiffStore,
|
||||
Measurements: measurementStore,
|
||||
Firmware: firmwareStore,
|
||||
Runner: runner,
|
||||
EventHub: hub,
|
||||
Logs: logHub,
|
||||
@@ -195,20 +203,24 @@ func TestFullPipelineToCompleted(t *testing.T) {
|
||||
Memory: spec.MemorySpec{TotalGiB: 16},
|
||||
}
|
||||
next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv})
|
||||
// After Inventory → SpecValidate resolves inline → SMART
|
||||
if next != "SMART" {
|
||||
t.Fatalf("after Inventory, next_state = %q, want SMART", next)
|
||||
// After Inventory → Firmware
|
||||
if next != "Firmware" {
|
||||
t.Fatalf("after Inventory, next_state = %q, want Firmware", next)
|
||||
}
|
||||
|
||||
// The remaining stages advance one-for-one in order.
|
||||
// The remaining stages advance one-for-one in order. After Firmware
|
||||
// the inline SpecValidate resolver advances through SpecValidate to
|
||||
// SMART without a dedicated /result POST for SpecValidate.
|
||||
walkPlan := []struct {
|
||||
stage string
|
||||
expected string
|
||||
}{
|
||||
{"Firmware", "SMART"},
|
||||
{"SMART", "CPUStress"},
|
||||
{"CPUStress", "Storage"},
|
||||
{"Storage", "Network"},
|
||||
{"Network", "GPU"},
|
||||
{"Network", "Burn"},
|
||||
{"Burn", "GPU"},
|
||||
{"GPU", "PSU"},
|
||||
{"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed
|
||||
}
|
||||
@@ -287,8 +299,11 @@ func TestFaultInjectionSMART(t *testing.T) {
|
||||
}
|
||||
|
||||
inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
|
||||
if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "SMART" {
|
||||
t.Fatalf("after Inventory, next = %q want SMART", next)
|
||||
if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "Firmware" {
|
||||
t.Fatalf("after Inventory, next = %q want Firmware", next)
|
||||
}
|
||||
if next := walkStage(t, a, runID, token, "Firmware", true, nil); next != "SMART" {
|
||||
t.Fatalf("after Firmware, next = %q want SMART (inline SpecValidate)", next)
|
||||
}
|
||||
|
||||
// Fake SMART failure → expect FailedHolding.
|
||||
@@ -316,3 +331,76 @@ func TestFaultInjectionSMART(t *testing.T) {
|
||||
t.Errorf("StageFailed severity = %q, want critical", ev.Severity)
|
||||
}
|
||||
}
|
||||
|
||||
// TestFirmwarePersistAndSpecMismatch exercises the Phase 4 firmware
|
||||
// integration: the agent POSTs Firmware snapshots; server persists; the
|
||||
// following SpecValidate diff picks up a firmware mismatch and parks
|
||||
// the run in FailedHolding with FailedStage=SpecValidate.
|
||||
func TestFirmwarePersistAndSpecMismatch(t *testing.T) {
|
||||
// Host demands BIOS 3.3; agent will POST 3.2 → one critical firmware diff.
|
||||
yaml := "firmware:\n - component: bios\n version: \"3.3\"\n"
|
||||
a, runID, token := fullAgentWithSpec(t, yaml)
|
||||
a.Notify = newCaptureRegistry(&captureNotifier{name: "capture"})
|
||||
|
||||
if err := a.Runs.SetState(context.Background(), runID, model.StateInventoryCheck); err != nil {
|
||||
t.Fatalf("set state: %v", err)
|
||||
}
|
||||
|
||||
inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
|
||||
if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "Firmware" {
|
||||
t.Fatalf("after Inventory, next = %q want Firmware", next)
|
||||
}
|
||||
|
||||
// Firmware stage: agent reports actual BIOS 3.2 → one row persisted.
|
||||
fw := []map[string]any{
|
||||
{"component": "bios", "identifier": "system", "version": "3.2", "vendor": "AMI"},
|
||||
}
|
||||
next := walkStage(t, a, runID, token, "Firmware", true, map[string]any{"firmware": fw})
|
||||
// Inline SpecValidate should detect the firmware mismatch and send
|
||||
// the run to FailedHolding without the agent posting SpecValidate.
|
||||
if next != "FailedHolding" {
|
||||
t.Fatalf("after Firmware mismatch, next = %q want FailedHolding", next)
|
||||
}
|
||||
|
||||
run, err := a.Runs.Get(context.Background(), runID)
|
||||
if err != nil {
|
||||
t.Fatalf("get run: %v", err)
|
||||
}
|
||||
if run.State != model.StateFailedHolding {
|
||||
t.Fatalf("run.State = %q, want FailedHolding", run.State)
|
||||
}
|
||||
if run.FailedStage != "SpecValidate" {
|
||||
t.Fatalf("run.FailedStage = %q, want SpecValidate", run.FailedStage)
|
||||
}
|
||||
|
||||
// Persistence: row landed in firmware_snapshots.
|
||||
snaps, err := a.Firmware.ListForRun(context.Background(), runID)
|
||||
if err != nil {
|
||||
t.Fatalf("ListForRun firmware: %v", err)
|
||||
}
|
||||
if len(snaps) != 1 {
|
||||
t.Fatalf("firmware rows = %d, want 1: %+v", len(snaps), snaps)
|
||||
}
|
||||
if snaps[0].Component != "bios" || snaps[0].Version != "3.2" {
|
||||
t.Errorf("persisted snapshot = %+v", snaps[0])
|
||||
}
|
||||
|
||||
// Diff row: SpecDiffs has a firmware-specific entry (rather than
|
||||
// only CPU/memory/disk rows) and is critical.
|
||||
diffs, err := a.SpecDiffs.ListForRun(context.Background(), runID)
|
||||
if err != nil {
|
||||
t.Fatalf("ListForRun specdiffs: %v", err)
|
||||
}
|
||||
found := false
|
||||
for _, d := range diffs {
|
||||
if strings.HasPrefix(d.Field, "firmware[") {
|
||||
found = true
|
||||
if d.Severity != "critical" {
|
||||
t.Errorf("firmware diff severity = %q, want critical", d.Severity)
|
||||
}
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("no firmware[...] entry in spec diffs: %+v", diffs)
|
||||
}
|
||||
}
|
||||
|
||||
+64
-13
@@ -16,6 +16,7 @@ import (
|
||||
"github.com/go-chi/chi/v5"
|
||||
"gopkg.in/yaml.v3"
|
||||
|
||||
"vetting/internal/config"
|
||||
"vetting/internal/events"
|
||||
"vetting/internal/logs"
|
||||
"vetting/internal/model"
|
||||
@@ -26,17 +27,19 @@ import (
|
||||
)
|
||||
|
||||
type UI struct {
|
||||
Hosts *store.Hosts
|
||||
Runs *store.Runs
|
||||
Stages *store.Stages
|
||||
SubSteps *store.SubSteps
|
||||
SpecDiffs *store.SpecDiffs
|
||||
Artifacts *store.Artifacts
|
||||
EventHub *events.Hub
|
||||
Logs *logs.Hub
|
||||
Runner *orchestrator.Runner
|
||||
Tiles *TileEnricher
|
||||
PublicURL string // user-visible base URL baked into the quick-register one-liner
|
||||
Hosts *store.Hosts
|
||||
Runs *store.Runs
|
||||
Stages *store.Stages
|
||||
SubSteps *store.SubSteps
|
||||
SpecDiffs *store.SpecDiffs
|
||||
Artifacts *store.Artifacts
|
||||
Thresholds *store.Thresholds // Phase 1: seeded at StartRun from Profiles
|
||||
Profiles *config.ProfileRegistry
|
||||
EventHub *events.Hub
|
||||
Logs *logs.Hub
|
||||
Runner *orchestrator.Runner
|
||||
Tiles *TileEnricher
|
||||
PublicURL string // user-visible base URL baked into the quick-register one-liner
|
||||
// PXE, when non-nil, gets Reload()ed after host create/delete so
|
||||
// dnsmasq's dhcp-host= allowlist reflects the current registry.
|
||||
// Without this, a newly-registered host PXE-boots and gets
|
||||
@@ -316,23 +319,71 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
nonDestructive := r.PostFormValue("non_destructive") == "1"
|
||||
profile := strings.TrimSpace(r.PostFormValue("profile"))
|
||||
if profile == "" {
|
||||
profile = config.ProfileQuick
|
||||
}
|
||||
if !config.IsValidProfile(profile) {
|
||||
http.Error(w, "unknown profile: "+profile, http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
_, hash, err := orchestrator.IssueRunToken()
|
||||
if err != nil {
|
||||
http.Error(w, "token: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
runID, err := u.Runs.Create(r.Context(), hostID, hash, nonDestructive)
|
||||
runID, err := u.Runs.CreateWithProfile(r.Context(), hostID, hash, nonDestructive, profile)
|
||||
if err != nil {
|
||||
http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
log.Printf("ui: created run %d for host %d (state=Queued)", runID, hostID)
|
||||
if err := u.seedThresholds(r.Context(), runID, host, profile); err != nil {
|
||||
// A threshold-seed failure shouldn't orphan a run row — log
|
||||
// and continue. Samples will just accumulate without a gate
|
||||
// until the operator retries, same as before Phase 1.
|
||||
log.Printf("ui: seed thresholds run %d: %v", runID, err)
|
||||
}
|
||||
log.Printf("ui: created run %d for host %d profile=%s (state=Queued)", runID, hostID, profile)
|
||||
// Send the operator straight to the new run — the button they clicked
|
||||
// was "Start vetting", the thing they want next is to watch it.
|
||||
http.Redirect(w, r, fmt.Sprintf("/runs/%d", runID), http.StatusSeeOther)
|
||||
}
|
||||
|
||||
// seedThresholds materializes the per-run threshold table from the
|
||||
// ProfileRegistry. The shared vetting.thresholds block applies to
|
||||
// every profile; future per-profile overrides will layer on top here,
|
||||
// and per-host overrides (Phase 1 extra) land via ExpectedSpecYAML in
|
||||
// a later iteration. Safe to skip silently when Thresholds or the
|
||||
// registry isn't wired — tests do not always build one.
|
||||
func (u *UI) seedThresholds(ctx context.Context, runID int64, host *model.Host, profile string) error {
|
||||
if u.Thresholds == nil || u.Profiles == nil {
|
||||
return nil
|
||||
}
|
||||
_ = host // reserved for per-host override layer
|
||||
_ = profile // reserved for per-profile override layer
|
||||
defaults := u.Profiles.Vetting.Thresholds
|
||||
if len(defaults) == 0 {
|
||||
return nil
|
||||
}
|
||||
specs := make([]store.ThresholdSpec, 0, len(defaults))
|
||||
for _, d := range defaults {
|
||||
specs = append(specs, store.ThresholdSpec{
|
||||
Stage: d.Stage,
|
||||
Kind: d.Kind,
|
||||
Key: d.Key,
|
||||
Op: d.Op,
|
||||
Value: d.Value,
|
||||
Nominal: d.Nominal,
|
||||
Unit: d.Unit,
|
||||
Severity: d.Severity,
|
||||
Source: "profile",
|
||||
})
|
||||
}
|
||||
_, err := u.Thresholds.SeedForRun(ctx, runID, specs)
|
||||
return err
|
||||
}
|
||||
|
||||
func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) {
|
||||
_ = templates.Registration(templates.RegistrationForm{
|
||||
QuickRegisterURL: u.baseURL(r),
|
||||
|
||||
Reference in New Issue
Block a user