23c689aa5b
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
407 lines
12 KiB
Go
407 lines
12 KiB
Go
package api_test
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/go-chi/chi/v5"
|
|
|
|
"vetting/internal/api"
|
|
"vetting/internal/db"
|
|
"vetting/internal/events"
|
|
"vetting/internal/logs"
|
|
"vetting/internal/model"
|
|
"vetting/internal/notify"
|
|
"vetting/internal/orchestrator"
|
|
"vetting/internal/spec"
|
|
"vetting/internal/store"
|
|
)
|
|
|
|
// captureNotifier is a testing-only Notifier that records every Event
|
|
// sent to it, under a mutex so concurrent Dispatch goroutines are safe.
|
|
type captureNotifier struct {
|
|
mu sync.Mutex
|
|
name string
|
|
evs []notify.Event
|
|
}
|
|
|
|
// Name reports the name this notifier was constructed with.
func (c *captureNotifier) Name() string {
	return c.name
}
|
|
|
|
func (c *captureNotifier) Send(_ context.Context, ev notify.Event) error {
|
|
c.mu.Lock()
|
|
c.evs = append(c.evs, ev)
|
|
c.mu.Unlock()
|
|
return nil
|
|
}
|
|
|
|
func (c *captureNotifier) awaitKind(t *testing.T, k notify.Kind) notify.Event {
|
|
t.Helper()
|
|
deadline := time.Now().Add(2 * time.Second)
|
|
for {
|
|
c.mu.Lock()
|
|
for _, ev := range c.evs {
|
|
if ev.Kind == k {
|
|
got := ev
|
|
c.mu.Unlock()
|
|
return got
|
|
}
|
|
}
|
|
c.mu.Unlock()
|
|
if time.Now().After(deadline) {
|
|
t.Fatalf("no %q event received within timeout", k)
|
|
}
|
|
time.Sleep(5 * time.Millisecond)
|
|
}
|
|
}
|
|
|
|
func newCaptureRegistry(c *captureNotifier) *notify.Registry {
|
|
reg := notify.NewRegistry(time.Second)
|
|
reg.Register(c)
|
|
reg.AddRoute(notify.Route{Notifier: c.name}) // wildcard
|
|
return reg
|
|
}
|
|
|
|
// Builds a fully-wired Agent against a fresh sqlite DB and returns
|
|
// (agent, runID, plainTokenForBearer). Caller is responsible for
|
|
// transitioning the run out of Queued.
|
|
func fullAgent(t *testing.T) (*api.Agent, int64, string) {
|
|
return fullAgentWithSpec(t, "")
|
|
}
|
|
|
|
// fullAgentWithSpec is the same as fullAgent but seeds the host with
|
|
// an ExpectedSpecYAML so SpecValidate can pick up diffs in the test.
|
|
func fullAgentWithSpec(t *testing.T, expectedSpecYAML string) (*api.Agent, int64, string) {
|
|
t.Helper()
|
|
tmp := t.TempDir()
|
|
conn, err := db.Open(filepath.Join(tmp, "vetting.db"))
|
|
if err != nil {
|
|
t.Fatalf("open db: %v", err)
|
|
}
|
|
t.Cleanup(func() { _ = conn.Close() })
|
|
|
|
hostStore := &store.Hosts{DB: conn}
|
|
runStore := &store.Runs{DB: conn}
|
|
stageStore := &store.Stages{DB: conn}
|
|
artifactStore := &store.Artifacts{DB: conn}
|
|
specDiffStore := &store.SpecDiffs{DB: conn}
|
|
measurementStore := &store.Measurements{DB: conn}
|
|
firmwareStore := &store.Firmware{DB: conn}
|
|
|
|
hub := events.NewHub()
|
|
logHub, err := logs.NewHub(filepath.Join(tmp, "logs"), hub)
|
|
if err != nil {
|
|
t.Fatalf("logs hub: %v", err)
|
|
}
|
|
t.Cleanup(func() { logHub.Close() })
|
|
|
|
runner := &orchestrator.Runner{
|
|
Runs: runStore,
|
|
Hosts: hostStore,
|
|
Stages: stageStore,
|
|
EventHub: hub,
|
|
}
|
|
|
|
hostID, err := hostStore.Create(context.Background(), model.Host{
|
|
Name: "smoke-host",
|
|
MAC: "aa:bb:cc:dd:ee:10",
|
|
WoLBroadcastIP: "10.0.0.255",
|
|
WoLPort: 9,
|
|
ExpectedSpecYAML: expectedSpecYAML,
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("create host: %v", err)
|
|
}
|
|
plain, hash, err := orchestrator.IssueRunToken()
|
|
if err != nil {
|
|
t.Fatalf("issue token: %v", err)
|
|
}
|
|
runID, err := runStore.Create(context.Background(), hostID, hash, false)
|
|
if err != nil {
|
|
t.Fatalf("create run: %v", err)
|
|
}
|
|
if err := stageStore.Seed(context.Background(), runID); err != nil {
|
|
t.Fatalf("seed stages: %v", err)
|
|
}
|
|
return &api.Agent{
|
|
Hosts: hostStore,
|
|
Runs: runStore,
|
|
Stages: stageStore,
|
|
Artifacts: artifactStore,
|
|
SpecDiffs: specDiffStore,
|
|
Measurements: measurementStore,
|
|
Firmware: firmwareStore,
|
|
Runner: runner,
|
|
EventHub: hub,
|
|
Logs: logHub,
|
|
ArtifactsDir: filepath.Join(tmp, "artifacts"),
|
|
PublicURL: "https://vetting.example",
|
|
}, runID, plain
|
|
}
|
|
|
|
// walkStage simulates the agent reporting a single stage's outcome.
|
|
// Returns the next_state the orchestrator decided to advance to.
|
|
func walkStage(t *testing.T, a *api.Agent, runID int64, token, stage string, passed bool, extras map[string]any) string {
|
|
t.Helper()
|
|
body := map[string]any{"stage": stage, "passed": passed}
|
|
if extras != nil {
|
|
for k, v := range extras {
|
|
body[k] = v
|
|
}
|
|
}
|
|
buf, _ := json.Marshal(body)
|
|
req := httptest.NewRequest(http.MethodPost,
|
|
"/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/result",
|
|
bytes.NewReader(buf))
|
|
rctx := chi.NewRouteContext()
|
|
rctx.URLParams.Add("id", strconv.FormatInt(runID, 10))
|
|
req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx))
|
|
req.Header.Set("Authorization", "Bearer "+token)
|
|
req.Header.Set("Content-Type", "application/json")
|
|
rr := httptest.NewRecorder()
|
|
a.Result(rr, req)
|
|
if rr.Code != http.StatusOK {
|
|
t.Fatalf("stage %s: status %d body=%q", stage, rr.Code, rr.Body.String())
|
|
}
|
|
var resp struct {
|
|
OK bool `json:"ok"`
|
|
NextState string `json:"next_state"`
|
|
}
|
|
if err := json.NewDecoder(rr.Body).Decode(&resp); err != nil {
|
|
t.Fatalf("stage %s: decode resp: %v", stage, err)
|
|
}
|
|
return resp.NextState
|
|
}
|
|
|
|
// TestFullPipelineToCompleted walks an agent through all stages of a
|
|
// successful run and asserts the run ends in Completed. Inventory is
|
|
// minimal; the empty expected-spec means SpecValidate produces zero
|
|
// critical diffs and the orchestrator auto-advances past it.
|
|
func TestFullPipelineToCompleted(t *testing.T) {
|
|
a, runID, token := fullAgent(t)
|
|
capture := &captureNotifier{name: "capture"}
|
|
a.Notify = newCaptureRegistry(capture)
|
|
// Claim would normally transition Booting → InventoryCheck; set it
|
|
// directly here since we're not exercising the claim path.
|
|
if err := a.Runs.SetState(context.Background(), runID, model.StateInventoryCheck); err != nil {
|
|
t.Fatalf("set state: %v", err)
|
|
}
|
|
|
|
// Stage 1: Inventory — provide a concrete inventory so SpecValidate
|
|
// has something to compare against.
|
|
inv := spec.Inventory{
|
|
CPU: spec.CPUSpec{Model: "Xeon Gold", LogicalCores: 8},
|
|
Memory: spec.MemorySpec{TotalGiB: 16},
|
|
}
|
|
next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv})
|
|
// After Inventory → Firmware
|
|
if next != "Firmware" {
|
|
t.Fatalf("after Inventory, next_state = %q, want Firmware", next)
|
|
}
|
|
|
|
// The remaining stages advance one-for-one in order. After Firmware
|
|
// the inline SpecValidate resolver advances through SpecValidate to
|
|
// SMART without a dedicated /result POST for SpecValidate.
|
|
walkPlan := []struct {
|
|
stage string
|
|
expected string
|
|
}{
|
|
{"Firmware", "SMART"},
|
|
{"SMART", "CPUStress"},
|
|
{"CPUStress", "Storage"},
|
|
{"Storage", "Network"},
|
|
{"Network", "Burn"},
|
|
{"Burn", "GPU"},
|
|
{"GPU", "PSU"},
|
|
{"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed
|
|
}
|
|
for _, step := range walkPlan {
|
|
got := walkStage(t, a, runID, token, step.stage, true, nil)
|
|
if got != step.expected {
|
|
t.Fatalf("after %s, next_state = %q, want %q", step.stage, got, step.expected)
|
|
}
|
|
}
|
|
|
|
run, err := a.Runs.Get(context.Background(), runID)
|
|
if err != nil {
|
|
t.Fatalf("Get run: %v", err)
|
|
}
|
|
if run.State != model.StateCompleted {
|
|
t.Fatalf("run.State = %q, want Completed", run.State)
|
|
}
|
|
if run.ReportPath == "" {
|
|
t.Fatalf("run.ReportPath not set")
|
|
}
|
|
|
|
// Phase 5 assertions: an HTML report artifact exists on disk, and
|
|
// the capture notifier saw a RunCompleted event.
|
|
arts, err := a.Artifacts.ListForRun(context.Background(), runID)
|
|
if err != nil {
|
|
t.Fatalf("ListForRun: %v", err)
|
|
}
|
|
var htmlPath string
|
|
for _, art := range arts {
|
|
if art.Kind == "report_html" {
|
|
htmlPath = art.Path
|
|
}
|
|
}
|
|
if htmlPath == "" {
|
|
t.Fatalf("no report_html artifact recorded (kinds seen: %v)", artifactKinds(arts))
|
|
}
|
|
data, err := os.ReadFile(htmlPath)
|
|
if err != nil {
|
|
t.Fatalf("read report.html: %v", err)
|
|
}
|
|
if !strings.Contains(string(data), "<html") {
|
|
t.Fatalf("report.html missing <html tag: %s", string(data[:min(200, len(data))]))
|
|
}
|
|
ev := capture.awaitKind(t, notify.KindRunCompleted)
|
|
if ev.HostName != "smoke-host" {
|
|
t.Errorf("RunCompleted host = %q, want smoke-host", ev.HostName)
|
|
}
|
|
if ev.URL == "" || !strings.Contains(ev.URL, "/reports/") {
|
|
t.Errorf("RunCompleted URL = %q, want non-empty with /reports/", ev.URL)
|
|
}
|
|
}
|
|
|
|
func artifactKinds(arts []store.Artifact) []string {
|
|
out := make([]string, 0, len(arts))
|
|
for _, a := range arts {
|
|
out = append(out, a.Kind)
|
|
}
|
|
return out
|
|
}
|
|
|
|
// min returns the smaller of a and b. On Go 1.21 and later this
// package-level declaration shadows the built-in of the same name.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
|
|
|
|
// TestFaultInjectionSMART verifies a failing SMART stage halts the
|
|
// pipeline at FailedHolding with failed_stage recorded.
|
|
func TestFaultInjectionSMART(t *testing.T) {
|
|
a, runID, token := fullAgent(t)
|
|
capture := &captureNotifier{name: "capture"}
|
|
a.Notify = newCaptureRegistry(capture)
|
|
if err := a.Runs.SetState(context.Background(), runID, model.StateInventoryCheck); err != nil {
|
|
t.Fatalf("set state: %v", err)
|
|
}
|
|
|
|
inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
|
|
if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "Firmware" {
|
|
t.Fatalf("after Inventory, next = %q want Firmware", next)
|
|
}
|
|
if next := walkStage(t, a, runID, token, "Firmware", true, nil); next != "SMART" {
|
|
t.Fatalf("after Firmware, next = %q want SMART (inline SpecValidate)", next)
|
|
}
|
|
|
|
// Fake SMART failure → expect FailedHolding.
|
|
if next := walkStage(t, a, runID, token, "SMART", false, nil); next != "FailedHolding" {
|
|
t.Fatalf("after SMART fail, next = %q want FailedHolding", next)
|
|
}
|
|
|
|
run, err := a.Runs.Get(context.Background(), runID)
|
|
if err != nil {
|
|
t.Fatalf("Get run: %v", err)
|
|
}
|
|
if run.State != model.StateFailedHolding {
|
|
t.Fatalf("run.State = %q, want FailedHolding", run.State)
|
|
}
|
|
if run.FailedStage != "SMART" {
|
|
t.Fatalf("run.FailedStage = %q, want SMART", run.FailedStage)
|
|
}
|
|
|
|
// Phase 5 assertion: the fault fires a StageFailed notification.
|
|
ev := capture.awaitKind(t, notify.KindStageFailed)
|
|
if !strings.Contains(ev.Title, "SMART") {
|
|
t.Errorf("StageFailed title = %q, want to mention SMART", ev.Title)
|
|
}
|
|
if ev.Severity != notify.SeverityCritical {
|
|
t.Errorf("StageFailed severity = %q, want critical", ev.Severity)
|
|
}
|
|
}
|
|
|
|
// TestFirmwarePersistAndSpecMismatch exercises the Phase 4 firmware
|
|
// integration: the agent POSTs Firmware snapshots; server persists; the
|
|
// following SpecValidate diff picks up a firmware mismatch and parks
|
|
// the run in FailedHolding with FailedStage=SpecValidate.
|
|
func TestFirmwarePersistAndSpecMismatch(t *testing.T) {
|
|
// Host demands BIOS 3.3; agent will POST 3.2 → one critical firmware diff.
|
|
yaml := "firmware:\n - component: bios\n version: \"3.3\"\n"
|
|
a, runID, token := fullAgentWithSpec(t, yaml)
|
|
a.Notify = newCaptureRegistry(&captureNotifier{name: "capture"})
|
|
|
|
if err := a.Runs.SetState(context.Background(), runID, model.StateInventoryCheck); err != nil {
|
|
t.Fatalf("set state: %v", err)
|
|
}
|
|
|
|
inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
|
|
if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "Firmware" {
|
|
t.Fatalf("after Inventory, next = %q want Firmware", next)
|
|
}
|
|
|
|
// Firmware stage: agent reports actual BIOS 3.2 → one row persisted.
|
|
fw := []map[string]any{
|
|
{"component": "bios", "identifier": "system", "version": "3.2", "vendor": "AMI"},
|
|
}
|
|
next := walkStage(t, a, runID, token, "Firmware", true, map[string]any{"firmware": fw})
|
|
// Inline SpecValidate should detect the firmware mismatch and send
|
|
// the run to FailedHolding without the agent posting SpecValidate.
|
|
if next != "FailedHolding" {
|
|
t.Fatalf("after Firmware mismatch, next = %q want FailedHolding", next)
|
|
}
|
|
|
|
run, err := a.Runs.Get(context.Background(), runID)
|
|
if err != nil {
|
|
t.Fatalf("get run: %v", err)
|
|
}
|
|
if run.State != model.StateFailedHolding {
|
|
t.Fatalf("run.State = %q, want FailedHolding", run.State)
|
|
}
|
|
if run.FailedStage != "SpecValidate" {
|
|
t.Fatalf("run.FailedStage = %q, want SpecValidate", run.FailedStage)
|
|
}
|
|
|
|
// Persistence: row landed in firmware_snapshots.
|
|
snaps, err := a.Firmware.ListForRun(context.Background(), runID)
|
|
if err != nil {
|
|
t.Fatalf("ListForRun firmware: %v", err)
|
|
}
|
|
if len(snaps) != 1 {
|
|
t.Fatalf("firmware rows = %d, want 1: %+v", len(snaps), snaps)
|
|
}
|
|
if snaps[0].Component != "bios" || snaps[0].Version != "3.2" {
|
|
t.Errorf("persisted snapshot = %+v", snaps[0])
|
|
}
|
|
|
|
// Diff row: SpecDiffs has a firmware-specific entry (rather than
|
|
// only CPU/memory/disk rows) and is critical.
|
|
diffs, err := a.SpecDiffs.ListForRun(context.Background(), runID)
|
|
if err != nil {
|
|
t.Fatalf("ListForRun specdiffs: %v", err)
|
|
}
|
|
found := false
|
|
for _, d := range diffs {
|
|
if strings.HasPrefix(d.Field, "firmware[") {
|
|
found = true
|
|
if d.Severity != "critical" {
|
|
t.Errorf("firmware diff severity = %q, want critical", d.Severity)
|
|
}
|
|
}
|
|
}
|
|
if !found {
|
|
t.Fatalf("no firmware[...] entry in spec diffs: %+v", diffs)
|
|
}
|
|
}
|