Initial commit: full Phases 1-6 implementation

Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
@@ -0,0 +1,318 @@
+package api_test
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"vetting/internal/api"
+	"vetting/internal/db"
+	"vetting/internal/events"
+	"vetting/internal/logs"
+	"vetting/internal/model"
+	"vetting/internal/notify"
+	"vetting/internal/orchestrator"
+	"vetting/internal/spec"
+	"vetting/internal/store"
+)
+
+// captureNotifier is a testing-only Notifier that records every Event
+// sent to it, under a mutex so concurrent Dispatch goroutines are safe.
+type captureNotifier struct {
+	mu   sync.Mutex
+	name string
+	evs  []notify.Event
+}
+
+func (c *captureNotifier) Name() string { return c.name }
+
+func (c *captureNotifier) Send(_ context.Context, ev notify.Event) error {
+	c.mu.Lock()
+	c.evs = append(c.evs, ev)
+	c.mu.Unlock()
+	return nil
+}
+
+func (c *captureNotifier) awaitKind(t *testing.T, k notify.Kind) notify.Event {
+	t.Helper()
+	deadline := time.Now().Add(2 * time.Second)
+	for {
+		c.mu.Lock()
+		for _, ev := range c.evs {
+			if ev.Kind == k {
+				got := ev
+				c.mu.Unlock()
+				return got
+			}
+		}
+		c.mu.Unlock()
+		if time.Now().After(deadline) {
+			t.Fatalf("no %q event received within timeout", k)
+		}
+		time.Sleep(5 * time.Millisecond)
+	}
+}
+
+func newCaptureRegistry(c *captureNotifier) *notify.Registry {
+	reg := notify.NewRegistry(time.Second)
+	reg.Register(c)
+	reg.AddRoute(notify.Route{Notifier: c.name}) // wildcard
+	return reg
+}
+
+// Builds a fully-wired Agent against a fresh sqlite DB and returns
+// (agent, runID, plainTokenForBearer). Caller is responsible for
+// transitioning the run out of Queued.
+func fullAgent(t *testing.T) (*api.Agent, int64, string) {
+	t.Helper()
+	tmp := t.TempDir()
+	conn, err := db.Open(filepath.Join(tmp, "vetting.db"))
+	if err != nil {
+		t.Fatalf("open db: %v", err)
+	}
+	t.Cleanup(func() { _ = conn.Close() })
+
+	hostStore := &store.Hosts{DB: conn}
+	runStore := &store.Runs{DB: conn}
+	stageStore := &store.Stages{DB: conn}
+	artifactStore := &store.Artifacts{DB: conn}
+	specDiffStore := &store.SpecDiffs{DB: conn}
+	measurementStore := &store.Measurements{DB: conn}
+
+	hub := events.NewHub()
+	logHub, err := logs.NewHub(filepath.Join(tmp, "logs"), hub)
+	if err != nil {
+		t.Fatalf("logs hub: %v", err)
+	}
+	t.Cleanup(func() { logHub.Close() })
+
+	runner := &orchestrator.Runner{
+		Runs:     runStore,
+		Hosts:    hostStore,
+		Stages:   stageStore,
+		EventHub: hub,
+	}
+
+	hostID, err := hostStore.Create(context.Background(), model.Host{
+		Name:             "smoke-host",
+		MAC:              "aa:bb:cc:dd:ee:10",
+		WoLBroadcastIP:   "10.0.0.255",
+		WoLPort:          9,
+		ExpectedSpecYAML: "", // empty spec → no diffs
+	})
+	if err != nil {
+		t.Fatalf("create host: %v", err)
+	}
+	plain, hash, err := orchestrator.IssueRunToken()
+	if err != nil {
+		t.Fatalf("issue token: %v", err)
+	}
+	runID, err := runStore.Create(context.Background(), hostID, hash)
+	if err != nil {
+		t.Fatalf("create run: %v", err)
+	}
+	if err := stageStore.Seed(context.Background(), runID); err != nil {
+		t.Fatalf("seed stages: %v", err)
+	}
+	return &api.Agent{
+		Hosts:        hostStore,
+		Runs:         runStore,
+		Stages:       stageStore,
+		Artifacts:    artifactStore,
+		SpecDiffs:    specDiffStore,
+		Measurements: measurementStore,
+		Runner:       runner,
+		EventHub:     hub,
+		Logs:         logHub,
+		ArtifactsDir: filepath.Join(tmp, "artifacts"),
+		PublicURL:    "https://vetting.example",
+	}, runID, plain
+}
+
+// walkStage simulates the agent reporting a single stage's outcome.
+// Returns the next_state the orchestrator decided to advance to.
+func walkStage(t *testing.T, a *api.Agent, runID int64, token, stage string, passed bool, extras map[string]any) string {
+	t.Helper()
+	body := map[string]any{"stage": stage, "passed": passed}
+	if extras != nil {
+		for k, v := range extras {
+			body[k] = v
+		}
+	}
+	buf, _ := json.Marshal(body)
+	req := httptest.NewRequest(http.MethodPost,
+		"/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/result",
+		bytes.NewReader(buf))
+	rctx := chi.NewRouteContext()
+	rctx.URLParams.Add("id", strconv.FormatInt(runID, 10))
+	req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx))
+	req.Header.Set("Authorization", "Bearer "+token)
+	req.Header.Set("Content-Type", "application/json")
+	rr := httptest.NewRecorder()
+	a.Result(rr, req)
+	if rr.Code != http.StatusOK {
+		t.Fatalf("stage %s: status %d body=%q", stage, rr.Code, rr.Body.String())
+	}
+	var resp struct {
+		OK        bool   `json:"ok"`
+		NextState string `json:"next_state"`
+	}
+	if err := json.NewDecoder(rr.Body).Decode(&resp); err != nil {
+		t.Fatalf("stage %s: decode resp: %v", stage, err)
+	}
+	return resp.NextState
+}
+
+// TestFullPipelineToCompleted walks an agent through all stages of a
+// successful run and asserts the run ends in Completed. Inventory is
+// minimal; the empty expected-spec means SpecValidate produces zero
+// critical diffs and the orchestrator auto-advances past it.
+func TestFullPipelineToCompleted(t *testing.T) {
+	a, runID, token := fullAgent(t)
+	capture := &captureNotifier{name: "capture"}
+	a.Notify = newCaptureRegistry(capture)
+	// Claim would normally transition Booting → InventoryCheck; set it
+	// directly here since we're not exercising the claim path.
+	if err := a.Runs.SetState(context.Background(), runID, model.StateInventoryCheck); err != nil {
+		t.Fatalf("set state: %v", err)
+	}
+
+	// Stage 1: Inventory — provide a concrete inventory so SpecValidate
+	// has something to compare against.
+	inv := spec.Inventory{
+		CPU:    spec.CPUSpec{Model: "Xeon Gold", LogicalCores: 8},
+		Memory: spec.MemorySpec{TotalGiB: 16},
+	}
+	next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv})
+	// After Inventory → SpecValidate resolves inline → SMART
+	if next != "SMART" {
+		t.Fatalf("after Inventory, next_state = %q, want SMART", next)
+	}
+
+	// The remaining stages advance one-for-one in order.
+	walkPlan := []struct {
+		stage    string
+		expected string
+	}{
+		{"SMART", "CPUStress"},
+		{"CPUStress", "Storage"},
+		{"Storage", "Network"},
+		{"Network", "GPU"},
+		{"GPU", "PSU"},
+		{"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed
+	}
+	for _, step := range walkPlan {
+		got := walkStage(t, a, runID, token, step.stage, true, nil)
+		if got != step.expected {
+			t.Fatalf("after %s, next_state = %q, want %q", step.stage, got, step.expected)
+		}
+	}
+
+	run, err := a.Runs.Get(context.Background(), runID)
+	if err != nil {
+		t.Fatalf("Get run: %v", err)
+	}
+	if run.State != model.StateCompleted {
+		t.Fatalf("run.State = %q, want Completed", run.State)
+	}
+	if run.ReportPath == "" {
+		t.Fatalf("run.ReportPath not set")
+	}
+
+	// Phase 5 assertions: an HTML report artifact exists on disk, and
+	// the capture notifier saw a RunCompleted event.
+	arts, err := a.Artifacts.ListForRun(context.Background(), runID)
+	if err != nil {
+		t.Fatalf("ListForRun: %v", err)
+	}
+	var htmlPath string
+	for _, art := range arts {
+		if art.Kind == "report_html" {
+			htmlPath = art.Path
+		}
+	}
+	if htmlPath == "" {
+		t.Fatalf("no report_html artifact recorded (kinds seen: %v)", artifactKinds(arts))
+	}
+	data, err := os.ReadFile(htmlPath)
+	if err != nil {
+		t.Fatalf("read report.html: %v", err)
+	}
+	if !strings.Contains(string(data), "<html") {
+		t.Fatalf("report.html missing <html tag: %s", string(data[:min(200, len(data))]))
+	}
+	ev := capture.awaitKind(t, notify.KindRunCompleted)
+	if ev.HostName != "smoke-host" {
+		t.Errorf("RunCompleted host = %q, want smoke-host", ev.HostName)
+	}
+	if ev.URL == "" || !strings.Contains(ev.URL, "/reports/") {
+		t.Errorf("RunCompleted URL = %q, want non-empty with /reports/", ev.URL)
+	}
+}
+
+func artifactKinds(arts []store.Artifact) []string {
+	out := make([]string, 0, len(arts))
+	for _, a := range arts {
+		out = append(out, a.Kind)
+	}
+	return out
+}
+
+func min(a, b int) int {
+	if a < b {
+		return a
+	}
+	return b
+}
+
+// TestFaultInjectionSMART verifies a failing SMART stage halts the
+// pipeline at FailedHolding with failed_stage recorded.
+func TestFaultInjectionSMART(t *testing.T) {
+	a, runID, token := fullAgent(t)
+	capture := &captureNotifier{name: "capture"}
+	a.Notify = newCaptureRegistry(capture)
+	if err := a.Runs.SetState(context.Background(), runID, model.StateInventoryCheck); err != nil {
+		t.Fatalf("set state: %v", err)
+	}
+
+	inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
+	if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "SMART" {
+		t.Fatalf("after Inventory, next = %q want SMART", next)
+	}
+
+	// Fake SMART failure → expect FailedHolding.
+	if next := walkStage(t, a, runID, token, "SMART", false, nil); next != "FailedHolding" {
+		t.Fatalf("after SMART fail, next = %q want FailedHolding", next)
+	}
+
+	run, err := a.Runs.Get(context.Background(), runID)
+	if err != nil {
+		t.Fatalf("Get run: %v", err)
+	}
+	if run.State != model.StateFailedHolding {
+		t.Fatalf("run.State = %q, want FailedHolding", run.State)
+	}
+	if run.FailedStage != "SMART" {
+		t.Fatalf("run.FailedStage = %q, want SMART", run.FailedStage)
+	}
+
+	// Phase 5 assertion: the fault fires a StageFailed notification.
+	ev := capture.awaitKind(t, notify.KindStageFailed)
+	if !strings.Contains(ev.Title, "SMART") {
+		t.Errorf("StageFailed title = %q, want to mention SMART", ev.Title)
+	}
+	if ev.Severity != notify.SeverityCritical {
+		t.Errorf("StageFailed severity = %q, want critical", ev.Severity)
+	}
+}