23c689aa5b
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
170 lines
5.4 KiB
Go
170 lines
5.4 KiB
Go
package api_test
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"path/filepath"
|
|
"strconv"
|
|
"testing"
|
|
|
|
"vetting/internal/api"
|
|
"vetting/internal/db"
|
|
"vetting/internal/events"
|
|
"vetting/internal/model"
|
|
"vetting/internal/orchestrator"
|
|
"vetting/internal/store"
|
|
)
|
|
|
|
// setupAgentWithThresholds builds an Agent wired up to the thresholds
|
|
// store + a Runner so the /sensor handler can drive the state machine.
|
|
// Seeds one critical thermal threshold and parks the run in CPUStress
|
|
// so the handler will stamp a stage-relevant failed_stage.
|
|
func setupAgentWithThresholds(t *testing.T) (*api.Agent, int64, string) {
|
|
t.Helper()
|
|
path := filepath.Join(t.TempDir(), "vetting.db")
|
|
conn, err := db.Open(path)
|
|
if err != nil {
|
|
t.Fatalf("open db: %v", err)
|
|
}
|
|
t.Cleanup(func() { _ = conn.Close() })
|
|
|
|
hosts := &store.Hosts{DB: conn}
|
|
runs := &store.Runs{DB: conn}
|
|
stages := &store.Stages{DB: conn}
|
|
meas := &store.Measurements{DB: conn}
|
|
thresholds := &store.Thresholds{DB: conn}
|
|
hub := events.NewHub()
|
|
runner := &orchestrator.Runner{Runs: runs, Hosts: hosts, Stages: stages, EventHub: hub}
|
|
|
|
hostID, err := hosts.Create(context.Background(), model.Host{
|
|
Name: "thresh-host",
|
|
MAC: "aa:bb:cc:dd:ee:aa",
|
|
WoLBroadcastIP: "10.0.0.255",
|
|
WoLPort: 9,
|
|
ExpectedSpecYAML: "memory:\n total_gib: 16\n",
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("create host: %v", err)
|
|
}
|
|
plain, hash, err := orchestrator.IssueRunToken()
|
|
if err != nil {
|
|
t.Fatalf("issue token: %v", err)
|
|
}
|
|
runID, err := runs.Create(context.Background(), hostID, hash, false)
|
|
if err != nil {
|
|
t.Fatalf("create run: %v", err)
|
|
}
|
|
if err := stages.Seed(context.Background(), runID); err != nil {
|
|
t.Fatalf("seed stages: %v", err)
|
|
}
|
|
// Park the run where a real thermal sidecar would be posting samples.
|
|
if err := runs.SetState(context.Background(), runID, model.StateCPUStress); err != nil {
|
|
t.Fatalf("set state: %v", err)
|
|
}
|
|
// Seed one critical thermal threshold.
|
|
if _, err := thresholds.SeedForRun(context.Background(), runID, []store.ThresholdSpec{
|
|
{Stage: "*", Kind: "temp", Key: "cpu/*", Op: "lt", Value: 92, Unit: "C", Severity: "critical", Source: "profile"},
|
|
}); err != nil {
|
|
t.Fatalf("seed thresholds: %v", err)
|
|
}
|
|
return &api.Agent{
|
|
Hosts: hosts,
|
|
Runs: runs,
|
|
Stages: stages,
|
|
Measurements: meas,
|
|
Thresholds: thresholds,
|
|
Runner: runner,
|
|
}, runID, plain
|
|
}
|
|
|
|
// TestSensor_ThermalRunawayFailsRun: a sample that breaches a critical
|
|
// threshold lands in threshold_evaluations (passed=0) and flips the
|
|
// run into FailedHolding with failed_stage naming the current stage.
|
|
// This is the Phase-1 behavior gate — without the evaluator, the sample
|
|
// would just sit in measurements and the run would happily march on.
|
|
func TestSensor_ThermalRunawayFailsRun(t *testing.T) {
|
|
a, runID, token := setupAgentWithThresholds(t)
|
|
batch := api.SensorBatch{Samples: []api.SensorSample{
|
|
{Kind: "temp", Key: "cpu/0", Value: 95.3, Unit: "C"},
|
|
}}
|
|
buf, _ := json.Marshal(batch)
|
|
req := routedRequest(runID, http.MethodPost,
|
|
"/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/sensor", buf)
|
|
req.Header.Set("Authorization", "Bearer "+token)
|
|
req.Header.Set("Content-Type", "application/json")
|
|
|
|
rr := httptest.NewRecorder()
|
|
a.Sensor(rr, req)
|
|
if rr.Code != http.StatusOK {
|
|
t.Fatalf("status = %d, body = %q", rr.Code, rr.Body.String())
|
|
}
|
|
var resp struct {
|
|
OK bool `json:"ok"`
|
|
Breach bool `json:"breach"`
|
|
Kind string `json:"breach_kind"`
|
|
}
|
|
if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
|
|
t.Fatalf("decode: %v", err)
|
|
}
|
|
if !resp.Breach {
|
|
t.Fatalf("expected breach=true, got %+v", resp)
|
|
}
|
|
run, err := a.Runs.Get(context.Background(), runID)
|
|
if err != nil {
|
|
t.Fatalf("get run: %v", err)
|
|
}
|
|
if run.State != model.StateFailedHolding {
|
|
t.Fatalf("state = %s, want FailedHolding", run.State)
|
|
}
|
|
if run.FailedStage == "" {
|
|
t.Fatalf("failed_stage empty; want stage-named breach")
|
|
}
|
|
evals, err := a.Thresholds.ListEvaluations(context.Background(), runID)
|
|
if err != nil {
|
|
t.Fatalf("list evaluations: %v", err)
|
|
}
|
|
if len(evals) != 1 {
|
|
t.Fatalf("want 1 evaluation recorded, got %d", len(evals))
|
|
}
|
|
if evals[0].Passed {
|
|
t.Fatalf("evaluation recorded as passed for 95.3C sample against <92C rule")
|
|
}
|
|
}
|
|
|
|
// TestSensor_WithinThresholdPasses: a sample comfortably inside the
|
|
// threshold writes an evaluation row with passed=1 and leaves the run
|
|
// state untouched.
|
|
func TestSensor_WithinThresholdPasses(t *testing.T) {
|
|
a, runID, token := setupAgentWithThresholds(t)
|
|
batch := api.SensorBatch{Samples: []api.SensorSample{
|
|
{Kind: "temp", Key: "cpu/0", Value: 55.0, Unit: "C"},
|
|
}}
|
|
buf, _ := json.Marshal(batch)
|
|
req := routedRequest(runID, http.MethodPost,
|
|
"/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/sensor", buf)
|
|
req.Header.Set("Authorization", "Bearer "+token)
|
|
req.Header.Set("Content-Type", "application/json")
|
|
|
|
rr := httptest.NewRecorder()
|
|
a.Sensor(rr, req)
|
|
if rr.Code != http.StatusOK {
|
|
t.Fatalf("status = %d, body = %q", rr.Code, rr.Body.String())
|
|
}
|
|
run, err := a.Runs.Get(context.Background(), runID)
|
|
if err != nil {
|
|
t.Fatalf("get run: %v", err)
|
|
}
|
|
if run.State != model.StateCPUStress {
|
|
t.Fatalf("state = %s, want CPUStress unchanged", run.State)
|
|
}
|
|
evals, err := a.Thresholds.ListEvaluations(context.Background(), runID)
|
|
if err != nil {
|
|
t.Fatalf("list evaluations: %v", err)
|
|
}
|
|
if len(evals) != 1 || !evals[0].Passed {
|
|
t.Fatalf("want 1 passed evaluation, got %+v", evals)
|
|
}
|
|
}
|