Initial commit: full Phases 1-6 implementation

Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
@@ -0,0 +1,918 @@
+package api
+
+import (
+	"context"
+	"crypto/sha256"
+	"crypto/subtle"
+	"encoding/hex"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"log"
+	"net"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"vetting/internal/events"
+	"vetting/internal/hold"
+	"vetting/internal/logs"
+	"vetting/internal/model"
+	"vetting/internal/notify"
+	"vetting/internal/orchestrator"
+	"vetting/internal/pxe"
+	"vetting/internal/report"
+	"vetting/internal/spec"
+	"vetting/internal/store"
+)
+
+// Agent collects the collaborators used by agent-facing HTTP routes:
+// the iPXE chainload endpoint and the /api/v1/runs/:id/* endpoints.
+// Measurements and Notify are optional (their users check for nil);
+// appendLog also tolerates a nil Logs hub.
+type Agent struct {
+	Hosts           *store.Hosts         // host lookups (name, expected spec YAML)
+	Runs            *store.Runs          // run rows: state, token hash, failed stage, hold IP
+	Stages          *store.Stages        // per-run stage rows
+	Artifacts       *store.Artifacts     // records of files written under ArtifactsDir
+	SpecDiffs       *store.SpecDiffs     // expected-vs-actual diff rows per run
+	Measurements    *store.Measurements  // sensor samples; nil makes Sensor answer 500
+	Runner          *orchestrator.Runner // state-machine transitions and heartbeat tracking
+	EventHub        *events.Hub          // publishes tile-<hostID> events
+	Logs            *logs.Hub            // per-run log writers; appendLog is a no-op when nil
+	Notify          *notify.Registry     // notification dispatch; skipped when nil
+	ArtifactsDir    string               // ./var/artifacts
+	OrchestratorURL string               // baked into iPXE cmdline
+	PublicURL       string               // user-visible URL base for notification click-throughs
+	LiveKernelURL   string               // passed to pxe.BuildScript
+	LiveInitrdURL   string               // passed to pxe.BuildScript
+	TLSCertFPR      string               // optional; empty = skip pinning
+	IperfPort       int                  // orchestrator-supervised iperf3 port; 0 = 5201
+}
+
+// IPXEScript serves a per-MAC iPXE script. Called by iPXE itself after
+// dnsmasq hands it the chainload URL. The response depends on the MAC:
+//
+//   - MAC that fails macRe: the pxe.NotRegisteredScript output.
+//   - MAC with no active run: the pxe.NoActiveRunScript output.
+//   - MAC with an active run: the real boot script from pxe.BuildScript;
+//     the fetch also fires TriggerPXEObserved.
+//
+// Run-lookup, token-issue and token-rotation failures answer 500 and
+// are each logged so an operator can tell them apart.
+func (a *Agent) IPXEScript(w http.ResponseWriter, r *http.Request) {
+	mac := strings.ToLower(strings.TrimSpace(chi.URLParam(r, "mac")))
+	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+	w.Header().Set("Cache-Control", "no-store")
+
+	if !macRe.MatchString(mac) {
+		log.Printf("ipxe: rejected malformed mac %q from %s", mac, r.RemoteAddr)
+		_, _ = w.Write([]byte(pxe.NotRegisteredScript(mac)))
+		return
+	}
+
+	run, err := a.Runs.FindActiveByMAC(r.Context(), mac)
+	if err != nil {
+		log.Printf("ipxe: find run by mac %s: %v", mac, err)
+		http.Error(w, "internal error", http.StatusInternalServerError)
+		return
+	}
+	if run == nil {
+		_, _ = w.Write([]byte(pxe.NoActiveRunScript(mac)))
+		return
+	}
+
+	// The token hash in the DB is the sha256 of the plaintext. The
+	// plaintext itself cannot be recovered from the hash — we issued it
+	// once when the run was created. For iPXE we re-issue a fresh token
+	// on every PXE fetch: this is safe because the hash in the DB is
+	// rewritten to match and only the most recent PXE can be claimed.
+	plain, hash, err := orchestrator.IssueRunToken()
+	if err != nil {
+		// Log the cause: the client only ever sees the opaque "token" body.
+		log.Printf("ipxe: issue token run %d: %v", run.ID, err)
+		http.Error(w, "token", http.StatusInternalServerError)
+		return
+	}
+	if err := a.Runs.RotateTokenHash(r.Context(), run.ID, hash); err != nil {
+		log.Printf("ipxe: rotate token run %d: %v", run.ID, err)
+		http.Error(w, "token", http.StatusInternalServerError)
+		return
+	}
+
+	script := pxe.BuildScript(pxe.IPXEParams{
+		OrchestratorURL: a.OrchestratorURL,
+		LiveKernelURL:   a.LiveKernelURL,
+		LiveInitrdURL:   a.LiveInitrdURL,
+		TLSCertFPR:      a.TLSCertFPR,
+		RunID:           run.ID,
+		MAC:             mac,
+		Token:           plain,
+	})
+	// Write errors are ignored: iPXE has gone away and there is nobody
+	// left to report them to.
+	_, _ = w.Write([]byte(script))
+
+	// iPXE has now fetched the script — treat this as PXEObserved. If we
+	// were already in Booting the transition table allows staying.
+	if _, err := a.Runner.Transition(r.Context(), run.ID, orchestrator.TriggerPXEObserved); err != nil {
+		// Non-fatal: the agent may still claim via /claim.
+		log.Printf("ipxe: PXEObserved for run %d: %v", run.ID, err)
+	}
+}
+
+// Hello is the agent's first call once userspace is up. It changes no
+// state: after the usual run-id and bearer-token checks it writes one
+// log line and answers {"ok": true, "run_id": N}. The authoritative
+// transition comes from /claim; Hello just gives operators an early
+// sign of life.
+func (a *Agent) Hello(w http.ResponseWriter, r *http.Request) {
+	id, valid := runIDFromURL(w, r)
+	if !valid {
+		return
+	}
+	_, authed := a.authenticate(w, r, id)
+	if !authed {
+		return
+	}
+	log.Printf("agent hello: run=%d remote=%s", id, r.RemoteAddr)
+	reply := map[string]any{"ok": true, "run_id": id}
+	writeJSON(w, http.StatusOK, reply)
+}
+
+// Claim binds an agent to its run. The caller must present the run's
+// plaintext token; on success the stage rows are seeded (first claim
+// only), the run is advanced with TriggerAgentClaimed when it is still
+// in WaitingWoL or Booting, and the per-run agent config is returned.
+// All destructive actions the agent takes later require a prior
+// successful claim.
+func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
+	runID, ok := runIDFromURL(w, r)
+	if !ok {
+		return
+	}
+	run, ok := a.authenticate(w, r, runID)
+	if !ok {
+		return
+	}
+	ctx := r.Context()
+
+	// agent_ip is informational, so a missing or unparsable body is
+	// tolerated: the peer address from RemoteAddr stands in for it.
+	var payload struct {
+		AgentIP string `json:"agent_ip"`
+	}
+	if r.Body != nil {
+		_ = json.NewDecoder(r.Body).Decode(&payload)
+	}
+	agentIP := strings.TrimSpace(payload.AgentIP)
+	if agentIP == "" {
+		agentIP = r.RemoteAddr
+		if peer, _, err := net.SplitHostPort(r.RemoteAddr); err == nil {
+			agentIP = peer
+		}
+	}
+
+	// Only the first claim seeds stage rows; a retried claim finds them
+	// already present and skips this step.
+	if existing := mustListStages(a.Stages, r, runID); len(existing) == 0 {
+		if err := a.Stages.Seed(ctx, runID); err != nil {
+			log.Printf("claim: seed stages run %d: %v", runID, err)
+			http.Error(w, "seed stages", http.StatusInternalServerError)
+			return
+		}
+	}
+
+	// Runs already past Booting are treated as "already claimed": no
+	// transition is attempted and the call still reports OK.
+	switch run.State {
+	case model.StateWaitingWoL, model.StateBooting:
+		if _, err := a.Runner.Transition(ctx, runID, orchestrator.TriggerAgentClaimed); err != nil {
+			log.Printf("claim: transition run %d: %v", runID, err)
+			http.Error(w, "transition", http.StatusConflict)
+			return
+		}
+	}
+
+	log.Printf("agent claimed: run=%d agent_ip=%s", runID, agentIP)
+
+	// Per-run config for the stage-driven agent: the disk allowlist
+	// (serial + expected size) comes from the host's expected spec so
+	// the agent never has to read YAML. Lookup or parse problems simply
+	// leave the list empty.
+	expectedDisks := []map[string]any{}
+	host, hostErr := a.Hosts.Get(ctx, run.HostID)
+	if hostErr == nil && host != nil {
+		parsed, parseErr := spec.Parse(host.ExpectedSpecYAML)
+		if parseErr == nil && parsed != nil {
+			for _, disk := range parsed.Disks {
+				entry := map[string]any{
+					"serial":  disk.Serial,
+					"size_gb": disk.SizeGB,
+				}
+				expectedDisks = append(expectedDisks, entry)
+			}
+		}
+	}
+
+	// An unset port falls back to 5201.
+	port := a.IperfPort
+	if port == 0 {
+		port = 5201
+	}
+
+	reply := map[string]any{
+		"ok":             true,
+		"run_id":         runID,
+		"stages":         store.DefaultStageOrder,
+		"expected_disks": expectedDisks,
+		"iperf_port":     port,
+	}
+	writeJSON(w, http.StatusOK, reply)
+}
+
+// Heartbeat is the agent's periodic liveness ping. Besides refreshing
+// the run's heartbeat, the response doubles as a control channel via
+// its "cmd" field:
+//
+//   - "shutdown"    — the run is Completed.
+//   - "abort"       — the run is FailedHolding or Released.
+//   - "retry_stage" — the failed stage is Storage and the operator has
+//     set the wipe override; "stage" and "override_flags" are included.
+//   - "continue"    — everything else.
+//
+// The conditions are checked in that order; the first match wins.
+func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) {
+	runID, ok := runIDFromURL(w, r)
+	if !ok {
+		return
+	}
+	run, ok := a.authenticate(w, r, runID)
+	if !ok {
+		return
+	}
+	a.Runner.TouchHeartbeat(runID)
+
+	reply := map[string]any{
+		"state": run.State,
+		"cmd":   "continue",
+	}
+	if run.State == model.StateCompleted {
+		// Pipeline succeeded — agent should power the host down.
+		reply["cmd"] = "shutdown"
+	} else if run.State == model.StateFailedHolding || run.State == model.StateReleased {
+		reply["cmd"] = "abort"
+	} else if run.FailedStage == "Storage" && overrideWipeSet(run.OverrideFlagsJSON) {
+		// Operator pressed "Override wipe & retry": the agent re-enters
+		// Storage with the wipe-probe bypass armed.
+		reply["cmd"] = "retry_stage"
+		reply["stage"] = "Storage"
+		reply["override_flags"] = json.RawMessage(run.OverrideFlagsJSON)
+	}
+	writeJSON(w, http.StatusOK, reply)
+}
+
+// overrideWipeSet reports whether the "wipe" flag is true in a
+// Run.OverrideFlagsJSON blob. An empty blob yields false. Decode errors
+// are deliberately dropped — whatever was decoded before the error is
+// what gets returned, so an operator whose override did not round-trip
+// has to reapply it.
+func overrideWipeSet(blob string) bool {
+	var parsed struct {
+		Wipe bool `json:"wipe"`
+	}
+	if blob != "" {
+		_ = json.Unmarshal([]byte(blob), &parsed)
+	}
+	return parsed.Wipe
+}
+
+// authenticate loads the run and checks the request's Bearer token
+// against the run's stored token hash using a constant-time compare.
+// On success it returns the Run and true. On failure it has already
+// written the response — 404 for an unknown run, 500 for any other
+// lookup error, 401 for a missing or wrong token — and returns
+// (nil, false) so the caller can bail early.
+func (a *Agent) authenticate(w http.ResponseWriter, r *http.Request, runID int64) (*model.Run, bool) {
+	run, err := a.Runs.Get(r.Context(), runID)
+	switch {
+	case err == nil:
+		// Run found; fall through to the token check.
+	case errors.Is(err, store.ErrNotFound):
+		http.Error(w, "run not found", http.StatusNotFound)
+		return nil, false
+	default:
+		http.Error(w, "internal error", http.StatusInternalServerError)
+		return nil, false
+	}
+
+	supplied := bearerToken(r)
+	if supplied == "" {
+		http.Error(w, "missing bearer", http.StatusUnauthorized)
+		return nil, false
+	}
+
+	got := []byte(orchestrator.HashRunToken(supplied))
+	want := []byte(run.AgentTokenHash)
+	if subtle.ConstantTimeCompare(got, want) == 1 {
+		return run, true
+	}
+	http.Error(w, "bad token", http.StatusUnauthorized)
+	return nil, false
+}
+
+// bearerToken extracts the credential from an "Authorization: Bearer
+// <token>" header. The scheme name is matched case-insensitively, as
+// HTTP authentication schemes are defined to be, and surrounding
+// whitespace is trimmed from the token. It returns "" when the header
+// is absent, uses a different scheme, or carries no token.
+func bearerToken(r *http.Request) string {
+	scheme, credential, found := strings.Cut(r.Header.Get("Authorization"), " ")
+	if !found || !strings.EqualFold(scheme, "Bearer") {
+		return ""
+	}
+	return strings.TrimSpace(credential)
+}
+
+// runIDFromURL parses the chi "id" URL parameter as a positive base-10
+// int64. On anything else it writes a 400 "bad run id" response and
+// returns (0, false).
+func runIDFromURL(w http.ResponseWriter, r *http.Request) (int64, bool) {
+	raw := chi.URLParam(r, "id")
+	if id, err := strconv.ParseInt(raw, 10, 64); err == nil && id > 0 {
+		return id, true
+	}
+	http.Error(w, "bad run id", http.StatusBadRequest)
+	return 0, false
+}
+
+// writeJSON sends body as a JSON response with the given status code.
+// The encode error is dropped: the status line is already on the wire
+// by then, so there is no way left to signal failure to the client.
+func writeJSON(w http.ResponseWriter, status int, body any) {
+	hdr := w.Header()
+	hdr.Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	enc := json.NewEncoder(w)
+	_ = enc.Encode(body)
+}
+
+// mustListStages returns the stage rows for a run, or nil when the
+// read fails. It exists so /claim can skip the error path: a failed
+// read looks like "no stages yet", and the Seed call that follows is
+// then expected to surface the underlying problem.
+func mustListStages(s *store.Stages, r *http.Request, runID int64) []model.Stage {
+	if rows, err := s.ListForRun(r.Context(), runID); err == nil {
+		return rows
+	}
+	return nil
+}
+
+// ===== Phase 3 endpoints =================================================
+
+// LogBatch is what the agent POSTs to /log: zero or more lines, each
+// with timestamp + level + text. The Log handler appends the lines to
+// the run's log writer in the order given.
+type LogBatch struct {
+	Lines []LogLine `json:"lines"`
+}
+
+// LogLine is a single entry of a LogBatch.
+type LogLine struct {
+	// TS is RFC3339Nano. An empty or unparsable value reaches the log
+	// writer as the zero time.
+	// NOTE(review): presumably the writer substitutes the server clock
+	// for a zero TS — confirm in the logs package.
+	TS    string `json:"ts,omitempty"`
+	Level string `json:"level,omitempty"` // info|warn|error|debug
+	Text  string `json:"text"`
+}
+
+// Log appends a batch of agent log lines to the run's log writer and
+// answers with the number of lines written. An empty batch is legal
+// (useful as an agent-side flush ping). A body that is not valid JSON
+// gets a 400; failing to open the run's log writer gets a 500.
+func (a *Agent) Log(w http.ResponseWriter, r *http.Request) {
+	runID, ok := runIDFromURL(w, r)
+	if !ok {
+		return
+	}
+	if _, authed := a.authenticate(w, r, runID); !authed {
+		return
+	}
+
+	var req LogBatch
+	if decodeErr := json.NewDecoder(r.Body).Decode(&req); decodeErr != nil {
+		http.Error(w, "bad json", http.StatusBadRequest)
+		return
+	}
+
+	sink, err := a.Logs.WriterFor(runID)
+	if err != nil {
+		http.Error(w, "open log: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	for i := range req.Lines {
+		entry := req.Lines[i]
+		// A missing or unparsable timestamp leaves stamp at the zero time.
+		stamp, _ := time.Parse(time.RFC3339Nano, entry.TS)
+		sink.Append(logs.Line{TS: stamp, Level: entry.Level, Text: entry.Text})
+	}
+
+	reply := map[string]any{"ok": true, "written": len(req.Lines)}
+	writeJSON(w, http.StatusOK, reply)
+}
+
+// StageResult is the body of /result. Stage is the stage name and must
+// be one that orchestrator.StateForStage recognizes; Passed drives
+// StageCompleted vs StageFailed. Inventory is optional and only used
+// when Stage == "Inventory" — the orchestrator persists it as an
+// artifact and later feeds it to spec.Diff. Message is the failure
+// detail used in the notification body when Passed is false.
+type StageResult struct {
+	Stage     string          `json:"stage"`
+	Passed    bool            `json:"passed"`
+	Summary   json.RawMessage `json:"summary,omitempty"`
+	Inventory *spec.Inventory `json:"inventory,omitempty"`
+	Message   string          `json:"message,omitempty"`
+}
+
+// Result receives a stage's outcome. Flow:
+//  1. Mark the stage row passed/failed + record summary JSON.
+//  2. For Inventory: persist the inventory artifact (best-effort; a
+//     failure is only logged).
+//  3. On failure: record the failed stage, fire StageFailed, send a
+//     notification and answer next_state=FailedHolding.
+//  4. On pass: fire StageCompleted. If that lands in SpecValidate or
+//     Reporting, resolve those orchestrator-owned stages right here —
+//     the agent isn't involved in them at all — and answer with the
+//     state the run ends up in.
+//
+// Responses: 400 for bad JSON or an unknown stage, 500 when the stage
+// row cannot be updated, 409 when a state transition is rejected.
+func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
+	runID, ok := runIDFromURL(w, r)
+	if !ok {
+		return
+	}
+	run, ok := a.authenticate(w, r, runID)
+	if !ok {
+		return
+	}
+	var body StageResult
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+		http.Error(w, "bad json", http.StatusBadRequest)
+		return
+	}
+	body.Stage = strings.TrimSpace(body.Stage)
+	// Only the existence of the stage name is checked here; the mapped
+	// state itself is not used.
+	if _, ok := orchestrator.StateForStage(body.Stage); !ok {
+		http.Error(w, "unknown stage: "+body.Stage, http.StatusBadRequest)
+		return
+	}
+
+	stageState := model.StagePassed
+	if !body.Passed {
+		stageState = model.StageFailed
+	}
+	// An absent summary is stored as the empty string.
+	summaryJSON := ""
+	if len(body.Summary) > 0 {
+		summaryJSON = string(body.Summary)
+	}
+	if err := a.Stages.CompleteByName(r.Context(), runID, body.Stage, stageState, summaryJSON); err != nil {
+		http.Error(w, "complete stage: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	// Inventory-specific: persist the artifact that SpecValidate reads
+	// back later. This happens for failed Inventory results too.
+	if body.Stage == "Inventory" && body.Inventory != nil {
+		if err := a.persistInventory(r, run, body.Inventory); err != nil {
+			log.Printf("persist inventory run %d: %v", runID, err)
+		}
+	}
+
+	if !body.Passed {
+		// Recording the failed stage is best-effort; the transition
+		// below is what decides the response.
+		if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
+			log.Printf("set failed stage: %v", err)
+		}
+		if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
+			log.Printf("result: failed-transition run %d: %v", runID, err)
+			http.Error(w, "transition", http.StatusConflict)
+			return
+		}
+		hostName := a.hostNameFor(r.Context(), run.HostID)
+		detail := body.Message
+		if detail == "" {
+			detail = "stage reported failure"
+		}
+		a.dispatchEvent(notify.Event{
+			Kind:     notify.KindStageFailed,
+			Severity: notify.SeverityCritical,
+			RunID:    runID,
+			HostName: hostName,
+			Title:    fmt.Sprintf("[vetting] %s FAILED: %s", hostName, body.Stage),
+			Body:     fmt.Sprintf("Run %d on %s failed at stage %s.\n%s", runID, hostName, body.Stage, detail),
+			URL:      a.runLinkURL(runID),
+		})
+		// The state name is reported as a literal rather than read back
+		// from the transition result.
+		writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": "FailedHolding"})
+		return
+	}
+
+	// Passed: advance to the next stage in the pipeline.
+	next, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageCompleted)
+	if err != nil {
+		http.Error(w, "advance: "+err.Error(), http.StatusConflict)
+		return
+	}
+	log.Printf("result: run %d stage %s passed → %s", runID, body.Stage, next)
+
+	// If the just-advanced-into state is SpecValidate or Reporting, the
+	// orchestrator owns those stages entirely. The resolve function may
+	// transition further (→ next stage on pass, → FailedHolding on fail,
+	// → Completed for Reporting), so we re-read the run after each. If
+	// the re-read fails, next keeps its previous value.
+	if next == model.StateSpecValidate {
+		a.resolveSpecValidate(r, runID)
+		if after, err := a.Runs.Get(r.Context(), runID); err == nil {
+			next = after.State
+		}
+	}
+	if next == model.StateReporting {
+		a.resolveReporting(r, runID)
+		if after, err := a.Runs.Get(r.Context(), runID); err == nil {
+			next = after.State
+		}
+	}
+	writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": string(next)})
+}
+
+// persistInventory writes the agent-reported inventory as indented
+// JSON to <ArtifactsDir>/run-<id>/inventory.json (creating the run
+// directory if needed) and records an "inventory" artifact row with the
+// file's SHA-256 and size. The first error encountered is returned.
+func (a *Agent) persistInventory(r *http.Request, run *model.Run, inv *spec.Inventory) error {
+	runDir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", run.ID))
+	if err := os.MkdirAll(runDir, 0o755); err != nil {
+		return err
+	}
+
+	encoded, err := json.MarshalIndent(inv, "", "  ")
+	if err != nil {
+		return err
+	}
+	target := filepath.Join(runDir, "inventory.json")
+	if err := os.WriteFile(target, encoded, 0o644); err != nil {
+		return err
+	}
+
+	digest := sha256.Sum256(encoded)
+	record := store.Artifact{
+		RunID:     run.ID,
+		Kind:      "inventory",
+		Path:      target,
+		SHA256:    hex.EncodeToString(digest[:]),
+		SizeBytes: int64(len(encoded)),
+	}
+	_, err = a.Artifacts.Create(r.Context(), record)
+	return err
+}
+
+// resolveSpecValidate runs the expected-vs-actual diff against the
+// just-stored inventory artifact, persists spec_diffs rows, and drives
+// the state machine — all on the server. The agent does nothing for
+// this stage.
+//
+// Outcomes:
+//   - run or host lookup fails (or the host is missing): logged, no
+//     state change.
+//   - expected spec does not parse, or no inventory artifact can be
+//     read: the stage is failed via failStage.
+//   - one or more non-ignored critical diffs: the stage fails, the run
+//     gets TriggerStageFailed and a spec-mismatch notification goes out.
+//   - otherwise: the stage passes and the run gets TriggerStageCompleted.
+func (a *Agent) resolveSpecValidate(r *http.Request, runID int64) {
+	ctx := r.Context()
+	run, err := a.Runs.Get(ctx, runID)
+	if err != nil {
+		log.Printf("specvalidate: get run: %v", err)
+		return
+	}
+	host, err := a.Hosts.Get(ctx, run.HostID)
+	if err != nil {
+		log.Printf("specvalidate: get host: %v", err)
+		return
+	}
+	// Other callers in this file treat a nil host with a nil error as
+	// possible, so guard before dereferencing instead of panicking.
+	if host == nil {
+		log.Printf("specvalidate: host %d not found for run %d", run.HostID, runID)
+		return
+	}
+	expected, err := spec.Parse(host.ExpectedSpecYAML)
+	if err != nil {
+		log.Printf("specvalidate: parse expected yaml: %v", err)
+		a.failStage(r, runID, "SpecValidate", "malformed expected spec: "+err.Error())
+		return
+	}
+	inv, err := a.readInventoryArtifact(r, runID)
+	if err != nil {
+		log.Printf("specvalidate: read inventory: %v", err)
+		a.failStage(r, runID, "SpecValidate", "missing inventory artifact")
+		return
+	}
+	diffs := spec.Diff(expected, inv)
+	// Persisting the diff rows and marking the stage started are both
+	// best-effort: the verdict below is computed from the in-memory diff.
+	if err := a.SpecDiffs.ReplaceForRun(ctx, runID, diffs); err != nil {
+		log.Printf("specvalidate: write diffs: %v", err)
+	}
+	if err := a.Stages.StartByName(ctx, runID, "SpecValidate"); err != nil {
+		log.Printf("specvalidate: start stage: %v", err)
+	}
+
+	// Only critical diffs that have not been marked ignored block the run.
+	critical := 0
+	for _, d := range diffs {
+		if d.Severity == "critical" && !d.Ignored {
+			critical++
+		}
+	}
+	// Marshalling a map of two ints cannot fail, so the error is dropped.
+	summaryBuf, _ := json.Marshal(map[string]any{
+		"diffs":    len(diffs),
+		"critical": critical,
+	})
+	if critical > 0 {
+		// Stage bookkeeping is best-effort; the transition is what
+		// actually holds the host.
+		_ = a.Stages.CompleteByName(ctx, runID, "SpecValidate", model.StageFailed, string(summaryBuf))
+		_ = a.Runs.SetFailedStage(ctx, runID, "SpecValidate")
+		if _, err := a.Runner.Transition(ctx, runID, orchestrator.TriggerStageFailed); err != nil {
+			log.Printf("specvalidate: failed-transition: %v", err)
+		}
+		a.appendLog(runID, "error", fmt.Sprintf("SpecValidate: %d critical diff(s) — holding host", critical))
+		hostName := a.hostNameFor(ctx, run.HostID)
+		a.dispatchEvent(notify.Event{
+			Kind:     notify.KindSpecMismatch,
+			Severity: notify.SeverityCritical,
+			RunID:    runID,
+			HostName: hostName,
+			Title:    fmt.Sprintf("[vetting] %s spec mismatch (%d critical)", hostName, critical),
+			Body:     fmt.Sprintf("SpecValidate found %d critical diff(s) on %s. Host is held for inspection.", critical, hostName),
+			URL:      a.runLinkURL(runID),
+		})
+		return
+	}
+
+	// Stage bookkeeping is best-effort here as well.
+	_ = a.Stages.CompleteByName(ctx, runID, "SpecValidate", model.StagePassed, string(summaryBuf))
+	if _, err := a.Runner.Transition(ctx, runID, orchestrator.TriggerStageCompleted); err != nil {
+		log.Printf("specvalidate: advance: %v", err)
+	}
+	a.appendLog(runID, "info", "SpecValidate: all fields match expected spec")
+}
+
+func (a *Agent) readInventoryArtifact(r *http.Request, runID int64) (*spec.Inventory, error) {
+	arts, err := a.Artifacts.ListForRun(r.Context(), runID)
+	if err != nil {
+		return nil, err
+	}
+	for i := len(arts) - 1; i >= 0; i-- {
+		if arts[i].Kind == "inventory" {
+			buf, err := os.ReadFile(arts[i].Path)
+			if err != nil {
+				return nil, err
+			}
+			var inv spec.Inventory
+			if err := json.Unmarshal(buf, &inv); err != nil {
+				return nil, err
+			}
+			return &inv, nil
+		}
+	}
+	return nil, errors.New("no inventory artifact")
+}
+
+// failStage marks an orchestrator-owned stage as failed: it completes
+// the stage row with a {"error": message} summary, records the failed
+// stage on the run, fires TriggerStageFailed and appends an error line
+// to the run log. Every step is best-effort — failures are logged and
+// the remaining steps still run.
+func (a *Agent) failStage(r *http.Request, runID int64, stage, message string) {
+	ctx := r.Context()
+	// Build the summary with encoding/json rather than %q: %q emits Go
+	// string escapes (\x.., \a, \v) that are not valid JSON. Marshalling
+	// a map[string]string cannot fail, so the error is dropped.
+	summary, _ := json.Marshal(map[string]string{"error": message})
+	if err := a.Stages.CompleteByName(ctx, runID, stage, model.StageFailed, string(summary)); err != nil {
+		log.Printf("failStage: complete stage %s run %d: %v", stage, runID, err)
+	}
+	if err := a.Runs.SetFailedStage(ctx, runID, stage); err != nil {
+		log.Printf("failStage: set failed stage run %d: %v", runID, err)
+	}
+	if _, err := a.Runner.Transition(ctx, runID, orchestrator.TriggerStageFailed); err != nil {
+		log.Printf("failStage: transition run %d: %v", runID, err)
+	}
+	a.appendLog(runID, "error", stage+": "+message)
+}
+
+// appendLog writes one orchestrator-generated line to the run's log.
+// It is a no-op when no log hub is wired; a failure to open the run's
+// writer is logged and otherwise ignored. No timestamp is set on the
+// line.
+func (a *Agent) appendLog(runID int64, level, text string) {
+	hub := a.Logs
+	if hub == nil {
+		return
+	}
+	sink, err := hub.WriterFor(runID)
+	if err != nil {
+		log.Printf("appendLog: %v", err)
+		return
+	}
+	entry := logs.Line{Level: level, Text: text}
+	sink.Append(entry)
+}
+
+// HoldRequest is the body of /hold. AgentIP is the address the agent
+// reports for itself; the Hold handler falls back to the request's
+// remote address when it is empty, records the result as the run's
+// hold IP and uses it in the ssh invocation shown to the operator.
+type HoldRequest struct {
+	AgentIP string `json:"agent_ip"`
+}
+
+// HoldResponse is what /hold returns: the authorized_keys line for the
+// freshly issued per-run key, plus the run it belongs to. The private
+// half of the key is written to disk by the Hold handler and is not
+// part of the response.
+type HoldResponse struct {
+	AuthorizedKey string `json:"authorized_key"`
+	RunID         int64  `json:"run_id"`
+}
+
+// Hold issues the per-run hold keypair: the agent gets the
+// authorized_keys line in the response, the orchestrator keeps the
+// private key at <ArtifactsDir>/run-<id>/hold.key and records it as a
+// "hold_key" artifact. Hold also stores the agent's IP on the run,
+// logs the ssh invocation, sends a holding-opened notification and
+// republishes the host tile so the operator sees the ssh command.
+//
+// Key generation or key write failures answer 500; everything after
+// that point is best-effort and only logged.
+func (a *Agent) Hold(w http.ResponseWriter, r *http.Request) {
+	runID, ok := runIDFromURL(w, r)
+	if !ok {
+		return
+	}
+	run, ok := a.authenticate(w, r, runID)
+	if !ok {
+		return
+	}
+	ctx := r.Context()
+
+	// agent_ip is optional, so a missing or unparsable body is fine:
+	// the peer address stands in for it.
+	var body HoldRequest
+	_ = json.NewDecoder(r.Body).Decode(&body)
+	agentIP := strings.TrimSpace(body.AgentIP)
+	if agentIP == "" {
+		if peer, _, err := net.SplitHostPort(r.RemoteAddr); err == nil {
+			agentIP = peer
+		}
+	}
+	if agentIP != "" {
+		if err := a.Runs.SetHoldIP(ctx, runID, agentIP); err != nil {
+			log.Printf("hold: set hold_ip: %v", err)
+		}
+	}
+
+	kp, err := hold.Issue(runID)
+	if err != nil {
+		http.Error(w, "generate key: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	keyPath := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", runID), "hold.key")
+	abs, err := kp.WritePrivateTo(keyPath)
+	if err != nil {
+		http.Error(w, "write key: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	sum := sha256.Sum256(kp.PrivatePEM)
+	if _, err := a.Artifacts.Create(ctx, store.Artifact{
+		RunID:     runID,
+		Kind:      "hold_key",
+		Path:      abs,
+		SHA256:    hex.EncodeToString(sum[:]),
+		SizeBytes: int64(len(kp.PrivatePEM)),
+	}); err != nil {
+		log.Printf("hold: record artifact: %v", err)
+	}
+	a.appendLog(runID, "info", fmt.Sprintf("Hold key issued. SSH in with: ssh -i %s root@%s", abs, agentIP))
+
+	// The authenticated run already carries the host ID, so one host
+	// lookup serves both the notification and the tile refresh.
+	hostID := run.HostID
+	host, hostErr := a.Hosts.Get(ctx, hostID)
+	if hostID != 0 {
+		// Same fallback as hostNameFor: never drop a notification over
+		// a missing name.
+		hostName := fmt.Sprintf("host-%d", hostID)
+		if hostErr == nil && host != nil {
+			hostName = host.Name
+		}
+		a.dispatchEvent(notify.Event{
+			Kind:     notify.KindHoldingOpened,
+			Severity: notify.SeverityCritical,
+			RunID:    runID,
+			HostName: hostName,
+			Title:    fmt.Sprintf("[vetting] %s holding — SSH ready", hostName),
+			Body:     fmt.Sprintf("Host %s is holding at %s.\nssh -i %s root@%s", hostName, agentIP, abs, agentIP),
+			URL:      a.runLinkURL(runID),
+		})
+	}
+
+	// Refresh the tile so the operator sees the ssh command. The run is
+	// re-read because SetHoldIP changed it after authentication.
+	if host != nil && orchestrator.TileRenderer != nil && a.EventHub != nil {
+		latest, _ := a.Runs.Get(ctx, runID)
+		payload := orchestrator.TileRenderer(ctx, *host, latest)
+		a.EventHub.Publish(events.Event{Name: fmt.Sprintf("tile-%d", host.ID), Payload: payload})
+	}
+	writeJSON(w, http.StatusOK, HoldResponse{AuthorizedKey: kp.AuthorizedKey, RunID: runID})
+}
+
+// dispatchEvent forwards a fully populated Event to the notify
+// Registry. With no Registry wired it does nothing, which lets call
+// sites fire events unconditionally.
+func (a *Agent) dispatchEvent(ev notify.Event) {
+	if registry := a.Notify; registry != nil {
+		registry.Dispatch(ev)
+	}
+}
+
+// hostNameFor returns the stored name of the host with the given ID.
+// When the lookup fails or finds nothing it falls back to "host-N" —
+// notifications should never fail silently over a missing name.
+func (a *Agent) hostNameFor(ctx context.Context, hostID int64) string {
+	host, err := a.Hosts.Get(ctx, hostID)
+	if err != nil || host == nil {
+		return fmt.Sprintf("host-%d", hostID)
+	}
+	return host.Name
+}
+
+// runLinkURL builds the operator-facing link for a run,
+// <PublicURL>/reports/<runID>, with trailing slashes trimmed from the
+// base. It returns "" when no PublicURL is configured.
+func (a *Agent) runLinkURL(runID int64) string {
+	base := a.PublicURL
+	if base == "" {
+		return ""
+	}
+	return fmt.Sprintf("%s/reports/%d", strings.TrimRight(base, "/"), runID)
+}
+
+// mustHostID looks up the host ID of a run. Despite the name it never
+// panics: any lookup problem yields 0, which callers treat as "no
+// host".
+func mustHostID(a *Agent, r *http.Request, runID int64) int64 {
+	if run, err := a.Runs.Get(r.Context(), runID); err == nil && run != nil {
+		return run.HostID
+	}
+	return 0
+}
+
+// ===== Phase 4 endpoints =================================================
+
+// SensorBatch is what the agent POSTs to /sensor: a stream of numeric
+// samples (temps, fan rpm, PSU rails, iperf throughput). Each sample is
+// (kind, key, value, unit) plus an optional timestamp, so the thermal
+// sidecar doesn't have to carry a clock.
+// NOTE(review): the Sensor handler passes an empty timestamp on as the
+// zero time; presumably the measurements store substitutes server-now —
+// confirm there.
+type SensorBatch struct {
+	Samples []SensorSample `json:"samples"`
+}
+
+// SensorSample is one numeric reading in a SensorBatch. The Sensor
+// handler stores Kind, Key, Value and Unit as given, without
+// validating them.
+type SensorSample struct {
+	TS    string  `json:"ts,omitempty"` // RFC3339Nano; empty or unparsable becomes the zero time
+	Kind  string  `json:"kind"` // temp|fan|psu_volt|iperf|fio|smart_attr
+	Key   string  `json:"key"`
+	Value float64 `json:"value"`
+	Unit  string  `json:"unit,omitempty"`
+}
+
+// Sensor stores a batch of numeric samples for a run and answers with
+// the number written. The thermal sidecar hits this on a tick; stage
+// executors (iperf, fio) also drop here. Responses: 500 when no
+// measurements store is wired or the write fails, 400 for a body that
+// is not valid JSON.
+func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
+	runID, ok := runIDFromURL(w, r)
+	if !ok {
+		return
+	}
+	if _, authed := a.authenticate(w, r, runID); !authed {
+		return
+	}
+	if a.Measurements == nil {
+		http.Error(w, "measurements store not wired", http.StatusInternalServerError)
+		return
+	}
+
+	var batch SensorBatch
+	if decodeErr := json.NewDecoder(r.Body).Decode(&batch); decodeErr != nil {
+		http.Error(w, "bad json", http.StatusBadRequest)
+		return
+	}
+
+	rows := make([]model.Measurement, len(batch.Samples))
+	for i, sample := range batch.Samples {
+		// A missing or unparsable timestamp leaves stamp at the zero time.
+		stamp, _ := time.Parse(time.RFC3339Nano, sample.TS)
+		rows[i] = model.Measurement{
+			RunID: runID,
+			TS:    stamp,
+			Kind:  sample.Kind,
+			Key:   sample.Key,
+			Value: sample.Value,
+			Unit:  sample.Unit,
+		}
+	}
+
+	if err := a.Measurements.CreateBatch(r.Context(), rows); err != nil {
+		http.Error(w, "write samples: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	reply := map[string]any{"ok": true, "written": len(rows)}
+	writeJSON(w, http.StatusOK, reply)
+}
+
+// resolveReporting runs when the pipeline advances into StateReporting.
+// It's an orchestrator-owned stage like SpecValidate: no agent action.
+// Writes a JSON report bundling run + host + stages + diffs +
+// measurements, renders an HTML summary next to it, then marks the run
+// Completed. Once the run is Completed, Heartbeat answers cmd=shutdown
+// so the agent can power the host off.
+//
+// Error handling: a failed run or host lookup aborts with only a log
+// line; failing to marshal, mkdir or write the JSON report fails the
+// stage via failStage; all remaining errors are logged and processing
+// continues.
+func (a *Agent) resolveReporting(r *http.Request, runID int64) {
+	ctx := r.Context()
+	if err := a.Stages.StartByName(ctx, runID, "Reporting"); err != nil {
+		log.Printf("reporting: start stage: %v", err)
+	}
+	run, err := a.Runs.Get(ctx, runID)
+	if err != nil {
+		log.Printf("reporting: get run: %v", err)
+		return
+	}
+	host, err := a.Hosts.Get(ctx, run.HostID)
+	if err != nil {
+		log.Printf("reporting: get host: %v", err)
+		return
+	}
+	// The list reads below are best-effort: on error the report is
+	// still written, using whatever value the failed call returned.
+	stages, err := a.Stages.ListForRun(ctx, runID)
+	if err != nil {
+		log.Printf("reporting: list stages: %v", err)
+	}
+	diffs, err := a.SpecDiffs.ListForRun(ctx, runID)
+	if err != nil {
+		log.Printf("reporting: list diffs: %v", err)
+	}
+	// Measurements is an optional collaborator; without it the report
+	// carries a nil measurements list.
+	var measurements []model.Measurement
+	if a.Measurements != nil {
+		measurements, err = a.Measurements.ListForRun(ctx, runID)
+		if err != nil {
+			log.Printf("reporting: list measurements: %v", err)
+		}
+	}
+	bundle := map[string]any{
+		"run":          run,
+		"host":         host,
+		"stages":       stages,
+		"spec_diffs":   diffs,
+		"measurements": measurements,
+		"generated_at": time.Now().UTC().Format(time.RFC3339),
+	}
+	buf, err := json.MarshalIndent(bundle, "", "  ")
+	if err != nil {
+		log.Printf("reporting: marshal: %v", err)
+		a.failStage(r, runID, "Reporting", "marshal report: "+err.Error())
+		return
+	}
+	dir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", runID))
+	if err := os.MkdirAll(dir, 0o755); err != nil {
+		a.failStage(r, runID, "Reporting", "mkdir: "+err.Error())
+		return
+	}
+	path := filepath.Join(dir, "report.json")
+	if err := os.WriteFile(path, buf, 0o644); err != nil {
+		a.failStage(r, runID, "Reporting", "write: "+err.Error())
+		return
+	}
+	// The artifact row records the SHA-256 and size of the exact bytes
+	// written to report.json.
+	sum := sha256.Sum256(buf)
+	if _, err := a.Artifacts.Create(ctx, store.Artifact{
+		RunID:     runID,
+		Kind:      "report",
+		Path:      path,
+		SHA256:    hex.EncodeToString(sum[:]),
+		SizeBytes: int64(len(buf)),
+	}); err != nil {
+		log.Printf("reporting: record artifact: %v", err)
+	}
+	// Also render the operator-facing HTML summary alongside the JSON.
+	// Failures here are non-fatal — the JSON is the source of truth.
+	if host != nil {
+		htmlData := report.Data{
+			GeneratedAt: time.Now().UTC(),
+			Run:         *run,
+			Host:        *host,
+			Stages:      stages,
+			SpecDiffs:   diffs,
+			Aggregates:  report.AggregateMeasurements(measurements),
+		}
+		if htmlBuf, err := report.RenderHTML(htmlData); err != nil {
+			log.Printf("reporting: render html: %v", err)
+		} else {
+			htmlPath := filepath.Join(dir, "report.html")
+			if err := os.WriteFile(htmlPath, htmlBuf, 0o644); err != nil {
+				log.Printf("reporting: write html: %v", err)
+			} else {
+				htmlSum := sha256.Sum256(htmlBuf)
+				if _, err := a.Artifacts.Create(ctx, store.Artifact{
+					RunID:     runID,
+					Kind:      "report_html",
+					Path:      htmlPath,
+					SHA256:    hex.EncodeToString(htmlSum[:]),
+					SizeBytes: int64(len(htmlBuf)),
+				}); err != nil {
+					log.Printf("reporting: record html artifact: %v", err)
+				}
+			}
+		}
+	}
+	// The stage summary points at the JSON report and carries counts.
+	summaryBuf, _ := json.Marshal(map[string]any{
+		"report_path": path,
+		"stages":      len(stages),
+		"diffs":       len(diffs),
+	})
+	if err := a.Stages.CompleteByName(ctx, runID, "Reporting", model.StagePassed, string(summaryBuf)); err != nil {
+		log.Printf("reporting: complete stage: %v", err)
+	}
+	// Completion is recorded directly on the run row via MarkCompleted
+	// rather than through Runner.Transition.
+	if err := a.Runs.MarkCompleted(ctx, runID, path); err != nil {
+		log.Printf("reporting: mark completed: %v", err)
+	}
+	a.appendLog(runID, "info", "Reporting: wrote "+path+"; run completed.")
+	// Publish a final tile update so the dashboard flips to pass mood.
+	if host != nil && orchestrator.TileRenderer != nil {
+		latest, _ := a.Runs.Get(ctx, runID)
+		payload := orchestrator.TileRenderer(ctx, *host, latest)
+		a.EventHub.Publish(events.Event{Name: fmt.Sprintf("tile-%d", host.ID), Payload: payload})
+	}
+	// Fall back to a generic name so the notification still goes out
+	// when the host record is missing.
+	hostName := "host"
+	if host != nil {
+		hostName = host.Name
+	}
+	a.dispatchEvent(notify.Event{
+		Kind:     notify.KindRunCompleted,
+		Severity: notify.SeverityInfo,
+		RunID:    runID,
+		HostName: hostName,
+		Title:    fmt.Sprintf("[vetting] %s passed vetting", hostName),
+		Body:     fmt.Sprintf("Run %d on %s completed all stages. Report: %s", runID, hostName, path),
+		URL:      a.runLinkURL(runID),
+	})
+}
@@ -0,0 +1,128 @@
+package api_test
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"path/filepath"
+	"strconv"
+	"testing"
+
+	"github.com/go-chi/chi/v5"
+
+	"vetting/internal/api"
+	"vetting/internal/db"
+	"vetting/internal/model"
+	"vetting/internal/orchestrator"
+	"vetting/internal/store"
+)
+
+// setupAgent opens a fresh SQLite database under t.TempDir, creates one host
+// and one run for it, and returns a partially wired Agent (Hosts, Runs and
+// Measurements only) together with the run ID and the plaintext run token
+// that tests send as the Bearer credential.
+func setupAgent(t *testing.T) (*api.Agent, int64, string) {
+	t.Helper()
+	ctx := context.Background()
+
+	conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
+	if err != nil {
+		t.Fatalf("open db: %v", err)
+	}
+	t.Cleanup(func() { _ = conn.Close() })
+
+	agent := &api.Agent{
+		Hosts:        &store.Hosts{DB: conn},
+		Runs:         &store.Runs{DB: conn},
+		Measurements: &store.Measurements{DB: conn},
+	}
+
+	fixture := model.Host{
+		Name:             "t-host",
+		MAC:              "aa:bb:cc:dd:ee:01",
+		WoLBroadcastIP:   "10.0.0.255",
+		WoLPort:          9,
+		ExpectedSpecYAML: "memory:\n  total_gib: 16\n",
+	}
+	hostID, err := agent.Hosts.Create(ctx, fixture)
+	if err != nil {
+		t.Fatalf("create host: %v", err)
+	}
+
+	// Only the hash is stored with the run; the plaintext goes back to the
+	// caller so it can authenticate requests.
+	token, tokenHash, err := orchestrator.IssueRunToken()
+	if err != nil {
+		t.Fatalf("issue token: %v", err)
+	}
+	runID, err := agent.Runs.Create(ctx, hostID, tokenHash)
+	if err != nil {
+		t.Fatalf("create run: %v", err)
+	}
+	return agent, runID, token
+}
+
+// routedRequest builds an httptest request whose context carries a chi route
+// context with the "id" URL parameter set to runID, so handlers that read
+// chi.URLParam work without being mounted on a real router.
+func routedRequest(runID int64, method, path string, body []byte) *http.Request {
+	routeCtx := chi.NewRouteContext()
+	routeCtx.URLParams.Add("id", strconv.FormatInt(runID, 10))
+
+	req := httptest.NewRequest(method, path, bytes.NewReader(body))
+	ctx := context.WithValue(req.Context(), chi.RouteCtxKey, routeCtx)
+	return req.WithContext(ctx)
+}
+
+// TestSensorPersistsBatch posts a two-sample batch to the Sensor handler with
+// a valid bearer token and checks that both samples come back from
+// Measurements.ListForRun.
+func TestSensorPersistsBatch(t *testing.T) {
+	agent, runID, token := setupAgent(t)
+
+	samples := []api.SensorSample{
+		{Kind: "thermal", Key: "cpu", Value: 47.5, Unit: "C"},
+		{Kind: "iperf", Key: "throughput_mbps", Value: 938.2, Unit: "Mbps"},
+	}
+	payload, _ := json.Marshal(api.SensorBatch{Samples: samples})
+
+	target := "/api/v1/runs/" + strconv.FormatInt(runID, 10) + "/sensor"
+	req := routedRequest(runID, http.MethodPost, target, payload)
+	req.Header.Set("Authorization", "Bearer "+token)
+	req.Header.Set("Content-Type", "application/json")
+
+	rec := httptest.NewRecorder()
+	agent.Sensor(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, body = %q", rec.Code, rec.Body.String())
+	}
+
+	stored, err := agent.Measurements.ListForRun(context.Background(), runID)
+	if err != nil {
+		t.Fatalf("ListForRun: %v", err)
+	}
+	if got := len(stored); got != 2 {
+		t.Fatalf("expected 2 measurements, got %d", got)
+	}
+}
+
+// TestSensorRejectsBadToken sends an empty batch with a bearer token that
+// does not belong to the run and expects the Sensor handler to answer 401.
+func TestSensorRejectsBadToken(t *testing.T) {
+	agent, runID, _ := setupAgent(t)
+
+	payload, _ := json.Marshal(api.SensorBatch{})
+	target := "/api/v1/runs/" + strconv.FormatInt(runID, 10) + "/sensor"
+	req := routedRequest(runID, http.MethodPost, target, payload)
+	req.Header.Set("Authorization", "Bearer wrong-token")
+
+	rec := httptest.NewRecorder()
+	agent.Sensor(rec, req)
+	if got := rec.Code; got != http.StatusUnauthorized {
+		t.Fatalf("status = %d, want 401", got)
+	}
+}
+
+// TestHeartbeatShutdownWhenCompleted checks that a heartbeat sent for a run
+// already in the Completed state is answered with cmd=shutdown, which is the
+// signal for the agent to power the host down.
+func TestHeartbeatShutdownWhenCompleted(t *testing.T) {
+	agent, runID, token := setupAgent(t)
+	ctx := context.Background()
+
+	// Heartbeat calls into the runner (TouchHeartbeat); give it one so the
+	// handler doesn't dereference nil.
+	agent.Runner = &orchestrator.Runner{
+		Runs:   agent.Runs,
+		Hosts:  agent.Hosts,
+		Stages: &store.Stages{DB: agent.Runs.DB},
+	}
+	if err := agent.Runs.SetState(ctx, runID, model.StateCompleted); err != nil {
+		t.Fatalf("set state: %v", err)
+	}
+
+	target := "/api/v1/runs/" + strconv.FormatInt(runID, 10) + "/heartbeat"
+	req := routedRequest(runID, http.MethodPost, target, nil)
+	req.Header.Set("Authorization", "Bearer "+token)
+
+	rec := httptest.NewRecorder()
+	agent.Heartbeat(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
+	}
+
+	var reply map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &reply); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if cmd := reply["cmd"]; cmd != "shutdown" {
+		t.Fatalf("cmd = %v, want shutdown", cmd)
+	}
+}
@@ -0,0 +1,318 @@
+package api_test
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"vetting/internal/api"
+	"vetting/internal/db"
+	"vetting/internal/events"
+	"vetting/internal/logs"
+	"vetting/internal/model"
+	"vetting/internal/notify"
+	"vetting/internal/orchestrator"
+	"vetting/internal/spec"
+	"vetting/internal/store"
+)
+
+// captureNotifier is a test double for a notify Notifier: it appends every
+// Event passed to Send to an in-memory slice. The mutex guards that slice so
+// Send may be called from concurrent dispatch goroutines while a test reads
+// it.
+type captureNotifier struct {
+	mu   sync.Mutex
+	name string         // value reported by Name; also used as the route target
+	evs  []notify.Event // events in the order Send received them
+}
+
+// Name returns the notifier's configured name.
+func (c *captureNotifier) Name() string {
+	return c.name
+}
+
+// Send records ev and always reports success.
+func (c *captureNotifier) Send(_ context.Context, ev notify.Event) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.evs = append(c.evs, ev)
+	return nil
+}
+
+// awaitKind polls the recorded events every 5ms until one with Kind k shows
+// up and returns the first such event. If none has arrived once two seconds
+// have elapsed, the test is failed with t.Fatalf.
+func (c *captureNotifier) awaitKind(t *testing.T, k notify.Kind) notify.Event {
+	t.Helper()
+
+	// lookup scans the captured events under the lock.
+	lookup := func() (notify.Event, bool) {
+		c.mu.Lock()
+		defer c.mu.Unlock()
+		for i := range c.evs {
+			if c.evs[i].Kind == k {
+				return c.evs[i], true
+			}
+		}
+		return notify.Event{}, false
+	}
+
+	giveUpAt := time.Now().Add(2 * time.Second)
+	for {
+		if found, ok := lookup(); ok {
+			return found
+		}
+		if time.Now().After(giveUpAt) {
+			t.Fatalf("no %q event received within timeout", k)
+		}
+		time.Sleep(5 * time.Millisecond)
+	}
+}
+
+// newCaptureRegistry returns a notify.Registry with c registered and a single
+// route that names only the notifier (no other match fields), i.e. a wildcard
+// route so every event reaches c.
+func newCaptureRegistry(c *captureNotifier) *notify.Registry {
+	registry := notify.NewRegistry(time.Second)
+	registry.Register(c)
+
+	catchAll := notify.Route{Notifier: c.name}
+	registry.AddRoute(catchAll)
+	return registry
+}
+
+// fullAgent builds an Agent with every store, the event hub, the log hub and
+// a Runner wired against a fresh SQLite database in a temp dir. It creates
+// one host (empty expected spec) and one run with seeded stages, and returns
+// (agent, runID, plaintext bearer token). Notify is left unset. The caller
+// is responsible for transitioning the run out of Queued.
+func fullAgent(t *testing.T) (*api.Agent, int64, string) {
+	t.Helper()
+	ctx := context.Background()
+	root := t.TempDir()
+
+	conn, err := db.Open(filepath.Join(root, "vetting.db"))
+	if err != nil {
+		t.Fatalf("open db: %v", err)
+	}
+	t.Cleanup(func() { _ = conn.Close() })
+
+	eventHub := events.NewHub()
+	logHub, err := logs.NewHub(filepath.Join(root, "logs"), eventHub)
+	if err != nil {
+		t.Fatalf("logs hub: %v", err)
+	}
+	t.Cleanup(func() { logHub.Close() })
+
+	agent := &api.Agent{
+		Hosts:        &store.Hosts{DB: conn},
+		Runs:         &store.Runs{DB: conn},
+		Stages:       &store.Stages{DB: conn},
+		Artifacts:    &store.Artifacts{DB: conn},
+		SpecDiffs:    &store.SpecDiffs{DB: conn},
+		Measurements: &store.Measurements{DB: conn},
+		EventHub:     eventHub,
+		Logs:         logHub,
+		ArtifactsDir: filepath.Join(root, "artifacts"),
+		PublicURL:    "https://vetting.example",
+	}
+	agent.Runner = &orchestrator.Runner{
+		Runs:     agent.Runs,
+		Hosts:    agent.Hosts,
+		Stages:   agent.Stages,
+		EventHub: eventHub,
+	}
+
+	hostID, err := agent.Hosts.Create(ctx, model.Host{
+		Name:             "smoke-host",
+		MAC:              "aa:bb:cc:dd:ee:10",
+		WoLBroadcastIP:   "10.0.0.255",
+		WoLPort:          9,
+		ExpectedSpecYAML: "", // empty spec → no diffs
+	})
+	if err != nil {
+		t.Fatalf("create host: %v", err)
+	}
+
+	token, tokenHash, err := orchestrator.IssueRunToken()
+	if err != nil {
+		t.Fatalf("issue token: %v", err)
+	}
+	runID, err := agent.Runs.Create(ctx, hostID, tokenHash)
+	if err != nil {
+		t.Fatalf("create run: %v", err)
+	}
+	if err := agent.Stages.Seed(ctx, runID); err != nil {
+		t.Fatalf("seed stages: %v", err)
+	}
+	return agent, runID, token
+}
+
+// walkStage plays the agent's part for one stage: it POSTs a result body
+// ({"stage", "passed"} plus any extras merged on top) to the Result handler
+// with the run's bearer token, requires a 200, and returns the next_state
+// field of the JSON response.
+func walkStage(t *testing.T, a *api.Agent, runID int64, token, stage string, passed bool, extras map[string]any) string {
+	t.Helper()
+	id := strconv.FormatInt(runID, 10)
+
+	payload := map[string]any{"stage": stage, "passed": passed}
+	// Ranging over a nil map is a no-op, so extras needs no nil check.
+	for key, val := range extras {
+		payload[key] = val
+	}
+	raw, _ := json.Marshal(payload)
+
+	// Fake chi's routing so the handler can read the "id" URL parameter.
+	routeCtx := chi.NewRouteContext()
+	routeCtx.URLParams.Add("id", id)
+	req := httptest.NewRequest(http.MethodPost, "/api/v1/runs/"+id+"/result", bytes.NewReader(raw))
+	req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, routeCtx))
+	req.Header.Set("Authorization", "Bearer "+token)
+	req.Header.Set("Content-Type", "application/json")
+
+	rec := httptest.NewRecorder()
+	a.Result(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("stage %s: status %d body=%q", stage, rec.Code, rec.Body.String())
+	}
+
+	var reply struct {
+		OK        bool   `json:"ok"`
+		NextState string `json:"next_state"`
+	}
+	if err := json.NewDecoder(rec.Body).Decode(&reply); err != nil {
+		t.Fatalf("stage %s: decode resp: %v", stage, err)
+	}
+	return reply.NextState
+}
+
+// TestFullPipelineToCompleted walks an agent through all stages of a
+// successful run and asserts the run ends in Completed. Inventory is
+// minimal; the empty expected-spec means SpecValidate produces zero
+// critical diffs and the orchestrator auto-advances past it.
+func TestFullPipelineToCompleted(t *testing.T) {
+	a, runID, token := fullAgent(t)
+	capture := &captureNotifier{name: "capture"}
+	a.Notify = newCaptureRegistry(capture)
+	// Claim would normally transition Booting → InventoryCheck; set it
+	// directly here since we're not exercising the claim path.
+	if err := a.Runs.SetState(context.Background(), runID, model.StateInventoryCheck); err != nil {
+		t.Fatalf("set state: %v", err)
+	}
+
+	// Stage 1: Inventory — provide a concrete inventory so SpecValidate
+	// has something to compare against.
+	inv := spec.Inventory{
+		CPU:    spec.CPUSpec{Model: "Xeon Gold", LogicalCores: 8},
+		Memory: spec.MemorySpec{TotalGiB: 16},
+	}
+	next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv})
+	// After Inventory → SpecValidate resolves inline → SMART
+	if next != "SMART" {
+		t.Fatalf("after Inventory, next_state = %q, want SMART", next)
+	}
+
+	// The remaining stages advance one-for-one in order.
+	walkPlan := []struct {
+		stage    string
+		expected string
+	}{
+		{"SMART", "CPUStress"},
+		{"CPUStress", "Storage"},
+		{"Storage", "Network"},
+		{"Network", "GPU"},
+		{"GPU", "PSU"},
+		{"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed
+	}
+	for _, step := range walkPlan {
+		got := walkStage(t, a, runID, token, step.stage, true, nil)
+		if got != step.expected {
+			t.Fatalf("after %s, next_state = %q, want %q", step.stage, got, step.expected)
+		}
+	}
+
+	run, err := a.Runs.Get(context.Background(), runID)
+	if err != nil {
+		t.Fatalf("Get run: %v", err)
+	}
+	if run.State != model.StateCompleted {
+		t.Fatalf("run.State = %q, want Completed", run.State)
+	}
+	if run.ReportPath == "" {
+		t.Fatalf("run.ReportPath not set")
+	}
+
+	// Phase 5 assertions: an HTML report artifact exists on disk, and
+	// the capture notifier saw a RunCompleted event.
+	arts, err := a.Artifacts.ListForRun(context.Background(), runID)
+	if err != nil {
+		t.Fatalf("ListForRun: %v", err)
+	}
+	var htmlPath string
+	for _, art := range arts {
+		if art.Kind == "report_html" {
+			htmlPath = art.Path
+		}
+	}
+	if htmlPath == "" {
+		t.Fatalf("no report_html artifact recorded (kinds seen: %v)", artifactKinds(arts))
+	}
+	data, err := os.ReadFile(htmlPath)
+	if err != nil {
+		t.Fatalf("read report.html: %v", err)
+	}
+	if !strings.Contains(string(data), "<html") {
+		t.Fatalf("report.html missing <html tag: %s", string(data[:min(200, len(data))]))
+	}
+	ev := capture.awaitKind(t, notify.KindRunCompleted)
+	if ev.HostName != "smoke-host" {
+		t.Errorf("RunCompleted host = %q, want smoke-host", ev.HostName)
+	}
+	if ev.URL == "" || !strings.Contains(ev.URL, "/reports/") {
+		t.Errorf("RunCompleted URL = %q, want non-empty with /reports/", ev.URL)
+	}
+}
+
+// artifactKinds returns the Kind of every artifact, in input order. Used to
+// make assertion failures show which artifact kinds were actually recorded.
+func artifactKinds(arts []store.Artifact) []string {
+	kinds := make([]string, len(arts))
+	for i := range arts {
+		kinds[i] = arts[i].Kind
+	}
+	return kinds
+}
+
+// min returns the smaller of a and b.
+func min(a, b int) int {
+	if b < a {
+		return b
+	}
+	return a
+}
+
+// TestFaultInjectionSMART reports a passing Inventory followed by a failing
+// SMART stage and asserts the run stops in FailedHolding with
+// failed_stage=SMART and that a critical StageFailed notification mentioning
+// SMART is dispatched.
+func TestFaultInjectionSMART(t *testing.T) {
+	ctx := context.Background()
+	agent, runID, token := fullAgent(t)
+	sink := &captureNotifier{name: "capture"}
+	agent.Notify = newCaptureRegistry(sink)
+	if err := agent.Runs.SetState(ctx, runID, model.StateInventoryCheck); err != nil {
+		t.Fatalf("set state: %v", err)
+	}
+
+	inventory := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
+	afterInventory := walkStage(t, agent, runID, token, "Inventory", true, map[string]any{"inventory": inventory})
+	if afterInventory != "SMART" {
+		t.Fatalf("after Inventory, next = %q want SMART", afterInventory)
+	}
+
+	// Fake SMART failure → expect FailedHolding.
+	afterSMART := walkStage(t, agent, runID, token, "SMART", false, nil)
+	if afterSMART != "FailedHolding" {
+		t.Fatalf("after SMART fail, next = %q want FailedHolding", afterSMART)
+	}
+
+	held, err := agent.Runs.Get(ctx, runID)
+	if err != nil {
+		t.Fatalf("Get run: %v", err)
+	}
+	if held.State != model.StateFailedHolding {
+		t.Fatalf("run.State = %q, want FailedHolding", held.State)
+	}
+	if held.FailedStage != "SMART" {
+		t.Fatalf("run.FailedStage = %q, want SMART", held.FailedStage)
+	}
+
+	// Phase 5 assertion: the fault fires a StageFailed notification.
+	failure := sink.awaitKind(t, notify.KindStageFailed)
+	if !strings.Contains(failure.Title, "SMART") {
+		t.Errorf("StageFailed title = %q, want to mention SMART", failure.Title)
+	}
+	if failure.Severity != notify.SeverityCritical {
+		t.Errorf("StageFailed severity = %q, want critical", failure.Severity)
+	}
+}
@@ -0,0 +1,69 @@
+package api
+
+import (
+	"context"
+	"log"
+
+	"vetting/internal/model"
+	"vetting/internal/store"
+	"vetting/internal/web/templates"
+)
+
+// TileEnricher builds a fully-populated TileData for a host. It looks
+// up the latest run's spec-diff count and hold-key artifact path so the
+// tile can render the "n critical diffs" badge and the ssh invocation
+// without the template package needing DB access.
+//
+// Used by both the Dashboard handler (initial render) and the SSE tile-
+// refresh path (agent_handlers.Hold, orchestrator runner) so every
+// place that renders a tile shows the same data.
+//
+// Every store is optional: Build and BuildByHost nil-check each one and
+// simply skip the corresponding lookup when it is unset.
+type TileEnricher struct {
+	// Runs is used by BuildByHost to find the host's latest run.
+	Runs *store.Runs
+	// Artifacts is used by Build to find the "hold_key" artifact path.
+	Artifacts *store.Artifacts
+	// SpecDiffs is used by Build to count critical, non-ignored diffs.
+	SpecDiffs *store.SpecDiffs
+}
+
+// Build assembles the TileData for host and its latest run. With a nil
+// latest it returns just the host. Otherwise it adds the number of critical,
+// non-ignored spec diffs and the path of the run's "hold_key" artifact (the
+// last one listed wins). Lookups fail soft: a store error is logged and the
+// corresponding field is left at its zero value, so one bad query doesn't
+// break the whole dashboard.
+func (e *TileEnricher) Build(ctx context.Context, host model.Host, latest *model.Run) templates.TileData {
+	tile := templates.TileData{Host: host, Latest: latest}
+	if latest == nil {
+		return tile
+	}
+
+	if e.SpecDiffs != nil {
+		diffs, err := e.SpecDiffs.ListForRun(ctx, latest.ID)
+		if err != nil {
+			log.Printf("tile: list spec_diffs run %d: %v", latest.ID, err)
+		} else {
+			for i := range diffs {
+				if diffs[i].Ignored || diffs[i].Severity != "critical" {
+					continue
+				}
+				tile.SpecDiffCritical++
+			}
+		}
+	}
+
+	if e.Artifacts != nil {
+		artifacts, err := e.Artifacts.ListForRun(ctx, latest.ID)
+		if err != nil {
+			log.Printf("tile: list artifacts run %d: %v", latest.ID, err)
+		} else {
+			for i := range artifacts {
+				if artifacts[i].Kind == "hold_key" {
+					tile.HoldKeyPath = artifacts[i].Path
+				}
+			}
+		}
+	}
+	return tile
+}
+
+// BuildByHost looks up the host's latest run itself and then delegates to
+// Build — convenient for SSE tile publishers that only know the host.
+//
+// Like Build, it fails soft: when Runs is nil or the lookup errors, the tile
+// is built as if the host had no run. The lookup error is logged (as Build
+// does for its own lookups) so a failing database doesn't silently show up
+// as "no run yet" on the dashboard.
+func (e *TileEnricher) BuildByHost(ctx context.Context, host model.Host) templates.TileData {
+	var latest *model.Run
+	if e.Runs != nil {
+		r, err := e.Runs.LatestForHost(ctx, host.ID)
+		if err != nil {
+			log.Printf("tile: latest run host %d: %v", host.ID, err)
+		} else {
+			latest = r
+		}
+	}
+	return e.Build(ctx, host, latest)
+}
@@ -0,0 +1,295 @@
+package api
+
+import (
+	"errors"
+	"log"
+	"net/http"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/go-chi/chi/v5"
+	"gopkg.in/yaml.v3"
+
+	"vetting/internal/auth"
+	"vetting/internal/events"
+	"vetting/internal/model"
+	"vetting/internal/orchestrator"
+	"vetting/internal/store"
+	"vetting/internal/web/templates"
+)
+
+// UI collects the collaborators used by the operator-facing HTTP handlers
+// defined on it (dashboard, login/logout, host registration and deletion,
+// run start, storage-wipe override, SSE stream, report download).
+type UI struct {
+	Hosts     *store.Hosts         // host lookup, creation and deletion
+	Runs      *store.Runs          // latest-run lookup and run creation
+	Artifacts *store.Artifacts     // used by Report to find the report_html artifact
+	Auth      *auth.Manager        // password check and session issue/clear for login/logout
+	EventHub  *events.Hub          // serves the SSE stream
+	Runner    *orchestrator.Runner // used by OverrideWipeStorage
+	Tiles     *TileEnricher        // builds dashboard tile data
+}
+
+// macRe matches a MAC address written as six colon-separated pairs of
+// lowercase hex digits (aa:bb:cc:dd:ee:ff). Uppercase digits do not match;
+// CreateHost lowercases the submitted value before validating it.
+var macRe = regexp.MustCompile(`^[0-9a-f]{2}(:[0-9a-f]{2}){5}$`)
+
+// Dashboard renders one tile per registered host, each built from the host
+// and its latest run. Any store error while listing hosts or looking up a
+// latest run aborts the request with a 500. A render error is ignored.
+func (u *UI) Dashboard(w http.ResponseWriter, r *http.Request) {
+	ctx := r.Context()
+
+	hosts, err := u.Hosts.List(ctx)
+	if err != nil {
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	tiles := make([]templates.TileData, 0, len(hosts))
+	for i := range hosts {
+		host := hosts[i]
+		latest, lookupErr := u.Runs.LatestForHost(ctx, host.ID)
+		if lookupErr != nil {
+			http.Error(w, lookupErr.Error(), http.StatusInternalServerError)
+			return
+		}
+		tiles = append(tiles, u.Tiles.Build(ctx, host, latest))
+	}
+
+	_ = templates.Dashboard(tiles).Render(ctx, w)
+}
+
+// StartRun creates a new Run for the host, issues an agent token, and
+// transitions Registered→Queued. The dispatcher goroutine picks it up
+// and fires WoL.
+//
+// Responses: 400 for a non-numeric host id, 404 for an unknown host, 409
+// when the host's latest run is still active (any state other than
+// Completed, Released or FailedHolding), 500 on store or token errors, and
+// a 303 redirect to "/" on success.
+func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
+	idStr := chi.URLParam(r, "id")
+	hostID, err := strconv.ParseInt(idStr, 10, 64)
+	if err != nil {
+		http.Error(w, "bad host id", http.StatusBadRequest)
+		return
+	}
+	if _, err := u.Hosts.Get(r.Context(), hostID); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			http.NotFound(w, r)
+			return
+		}
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	// Guard: refuse to start a second run while one is still active. A
+	// failed lookup must not be read as "no run": that would let a
+	// transient store error bypass the guard and queue a duplicate run, so
+	// it is reported as a 500, as Dashboard and OverrideWipeStorage do.
+	// ErrNotFound is tolerated in case the store reports "no run yet" that
+	// way rather than as (nil, nil).
+	latest, err := u.Runs.LatestForHost(r.Context(), hostID)
+	if err != nil && !errors.Is(err, store.ErrNotFound) {
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+		return
+	}
+	if err == nil && latest != nil {
+		switch latest.State {
+		case model.StateCompleted, model.StateReleased, model.StateFailedHolding:
+			// ok to start fresh
+		default:
+			http.Error(w, "host already has an active run", http.StatusConflict)
+			return
+		}
+	}
+
+	// Only the token hash is stored with the run; the plaintext is not
+	// needed by this handler.
+	_, hash, err := orchestrator.IssueRunToken()
+	if err != nil {
+		http.Error(w, "token: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	runID, err := u.Runs.Create(r.Context(), hostID, hash)
+	if err != nil {
+		http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	log.Printf("ui: created run %d for host %d (state=Queued)", runID, hostID)
+	http.Redirect(w, r, "/", http.StatusSeeOther)
+}
+
+// LoginForm renders the login page with no error message. The "next" query
+// parameter is passed through to the template, falling back to "/" when it
+// is absent or empty.
+func (u *UI) LoginForm(w http.ResponseWriter, r *http.Request) {
+	next := "/"
+	if requested := r.URL.Query().Get("next"); requested != "" {
+		next = requested
+	}
+	_ = templates.Login("", next).Render(r.Context(), w)
+}
+
+// LoginSubmit handles the login form POST. A malformed form yields 400. A
+// wrong password re-renders the login page with a 401. On success it calls
+// u.Auth.Issue and redirects (303) to the sanitised "next" target.
+//
+// "next" is attacker-controllable (it round-trips through the login URL and
+// a hidden form field), so it is reduced to a same-site path before it is
+// used either in the re-rendered form or in the redirect.
+func (u *UI) LoginSubmit(w http.ResponseWriter, r *http.Request) {
+	if err := r.ParseForm(); err != nil {
+		http.Error(w, "bad form", http.StatusBadRequest)
+		return
+	}
+	password := r.PostForm.Get("password")
+	next := safeRedirectTarget(r.PostForm.Get("next"))
+	if !u.Auth.VerifyPassword(password) {
+		w.WriteHeader(http.StatusUnauthorized)
+		_ = templates.Login("Invalid password.", next).Render(r.Context(), w)
+		return
+	}
+	u.Auth.Issue(w, r)
+	http.Redirect(w, r, next, http.StatusSeeOther)
+}
+
+// safeRedirectTarget returns next when it is a plain site-local path and "/"
+// otherwise. Besides requiring a leading "/", it rejects:
+//   - a "//" prefix, which browsers resolve as a scheme-relative URL to
+//     another host;
+//   - any backslash, which browsers treat like "/" (so "/\host" behaves
+//     like "//host");
+//   - ASCII control characters, which browsers strip from URLs and which
+//     could otherwise hide one of the forms above.
+func safeRedirectTarget(next string) string {
+	if !strings.HasPrefix(next, "/") {
+		return "/"
+	}
+	if strings.HasPrefix(next, "//") || strings.ContainsRune(next, '\\') {
+		return "/"
+	}
+	for i := 0; i < len(next); i++ {
+		if c := next[i]; c < 0x20 || c == 0x7f {
+			return "/"
+		}
+	}
+	return next
+}
+
+// Logout calls u.Auth.Clear on the response and redirects (303) to /login.
+func (u *UI) Logout(w http.ResponseWriter, r *http.Request) {
+	u.Auth.Clear(w)
+	http.Redirect(w, r, "/login", http.StatusSeeOther)
+}
+
+// NewHostForm renders the host registration page with an empty form. A
+// render error is ignored.
+func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) {
+	_ = templates.Registration(templates.RegistrationForm{}).Render(r.Context(), w)
+}
+
+// CreateHost handles the registration form POST. Text fields are trimmed
+// (the MAC is also lowercased); the WoL port and spec YAML are taken as
+// submitted. A validation failure re-renders the form with the message and
+// a 400; a store error re-renders it with a 409. On success the host is
+// created and the client is redirected (303) to "/". An empty WoL port
+// defaults to 9.
+func (u *UI) CreateHost(w http.ResponseWriter, r *http.Request) {
+	if err := r.ParseForm(); err != nil {
+		http.Error(w, "bad form", http.StatusBadRequest)
+		return
+	}
+
+	trimmed := func(key string) string {
+		return strings.TrimSpace(r.PostForm.Get(key))
+	}
+	form := templates.RegistrationForm{
+		Name:             trimmed("name"),
+		MAC:              strings.ToLower(trimmed("mac")),
+		WoLBroadcastIP:   trimmed("wol_broadcast_ip"),
+		WoLPort:          r.PostForm.Get("wol_port"),
+		ExpectedSpecYAML: r.PostForm.Get("expected_spec_yaml"),
+		Notes:            trimmed("notes"),
+	}
+
+	// reject re-renders the form, keeping what the user typed, with the
+	// given status and error message.
+	reject := func(status int, msg string) {
+		form.Error = msg
+		w.WriteHeader(status)
+		_ = templates.Registration(form).Render(r.Context(), w)
+	}
+
+	if problem := validateHostForm(&form); problem != "" {
+		reject(http.StatusBadRequest, problem)
+		return
+	}
+
+	// Validation has already accepted the port as empty or 1–65535, so the
+	// only way Atoi yields 0 here is the empty string: use the default.
+	wolPort := 9
+	if parsed, _ := strconv.Atoi(form.WoLPort); parsed != 0 {
+		wolPort = parsed
+	}
+
+	host := model.Host{
+		Name:             form.Name,
+		MAC:              form.MAC,
+		WoLBroadcastIP:   form.WoLBroadcastIP,
+		WoLPort:          wolPort,
+		ExpectedSpecYAML: form.ExpectedSpecYAML,
+		Notes:            form.Notes,
+	}
+	if _, err := u.Hosts.Create(r.Context(), host); err != nil {
+		reject(http.StatusConflict, friendlyDBError(err))
+		return
+	}
+	http.Redirect(w, r, "/", http.StatusSeeOther)
+}
+
+// OverrideWipeStorage is the operator's explicit "yes, wipe the disk even
+// though we found filesystem signatures" action. It is accepted only when
+// the host's latest run is FailedHolding with failed_stage=Storage; in that
+// case it calls Runner.Override with {"wipe":true} — the agent's next
+// heartbeat will receive retry_stage with wipe=true and re-enter the Storage
+// stage bypassing the wipe-probe guard.
+//
+// Responses: 400 for a bad host id, 409 when there is no run or the run is
+// not holding on Storage, 500 on store/override errors, 303 to "/" on
+// success.
+func (u *UI) OverrideWipeStorage(w http.ResponseWriter, r *http.Request) {
+	hostID, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
+	if err != nil {
+		http.Error(w, "bad host id", http.StatusBadRequest)
+		return
+	}
+
+	latest, err := u.Runs.LatestForHost(r.Context(), hostID)
+	switch {
+	case err != nil:
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+		return
+	case latest == nil:
+		http.Error(w, "no run for host", http.StatusConflict)
+		return
+	case latest.State != model.StateFailedHolding, latest.FailedStage != "Storage":
+		http.Error(w, "override only valid when holding on Storage", http.StatusConflict)
+		return
+	}
+
+	_, err = u.Runner.Override(r.Context(), latest.ID, `{"wipe":true}`)
+	if err != nil {
+		http.Error(w, "override: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	http.Redirect(w, r, "/", http.StatusSeeOther)
+}
+
+// DeleteHost deletes the host named by the "id" URL parameter. It answers
+// 400 for a non-numeric id, 404 when the store reports ErrNotFound, 500 for
+// any other store error, and redirects (303) to "/" on success.
+func (u *UI) DeleteHost(w http.ResponseWriter, r *http.Request) {
+	hostID, parseErr := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
+	if parseErr != nil {
+		http.Error(w, "bad id", http.StatusBadRequest)
+		return
+	}
+
+	switch err := u.Hosts.Delete(r.Context(), hostID); {
+	case err == nil:
+		http.Redirect(w, r, "/", http.StatusSeeOther)
+	case errors.Is(err, store.ErrNotFound):
+		http.NotFound(w, r)
+	default:
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+	}
+}
+
+// SSE hands the request to the event hub's ServeSSE, which owns the
+// response from there on.
+func (u *UI) SSE(w http.ResponseWriter, r *http.Request) {
+	u.EventHub.ServeSSE(w, r)
+}
+
+// Report serves the HTML report artifact for a run. It lists the run's
+// artifacts, takes the path of the report_html row (the last one listed
+// wins if there are several), and serves that file with http.ServeFile.
+// 400 for a non-numeric run id, 500 on a store error, 404 when the run
+// hasn't produced a report yet.
+//
+// NOTE(review): the path is used exactly as stored in the artifacts table.
+// Nothing here checks that it lives under the artifacts dir, and UI has no
+// artifacts-dir field to check against, so the only thing preventing an
+// arbitrary file from being served is that artifact rows are trusted.
+// Consider adding that containment check as defence-in-depth.
+func (u *UI) Report(w http.ResponseWriter, r *http.Request) {
+	idStr := chi.URLParam(r, "runID")
+	runID, err := strconv.ParseInt(idStr, 10, 64)
+	if err != nil {
+		http.Error(w, "bad run id", http.StatusBadRequest)
+		return
+	}
+	arts, err := u.Artifacts.ListForRun(r.Context(), runID)
+	if err != nil {
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+		return
+	}
+	var path string
+	for _, a := range arts {
+		if a.Kind == "report_html" {
+			path = a.Path
+		}
+	}
+	if path == "" {
+		http.NotFound(w, r)
+		return
+	}
+	// Set explicitly so the type doesn't depend on ServeFile's detection.
+	w.Header().Set("Content-Type", "text/html; charset=utf-8")
+	http.ServeFile(w, r, path)
+}
+
+// validateHostForm checks a registration form and returns a user-facing
+// error message for the first problem found, or "" when the form is
+// acceptable. Checks run in this order: name present, MAC matches macRe,
+// WoL broadcast IP present, spec YAML present and parseable, and — only
+// when a WoL port was supplied — port is an integer in 1–65535.
+func validateHostForm(form *templates.RegistrationForm) string {
+	switch {
+	case form.Name == "":
+		return "Name is required."
+	case !macRe.MatchString(form.MAC):
+		return "MAC address must be in the form aa:bb:cc:dd:ee:ff."
+	case form.WoLBroadcastIP == "":
+		return "WoL broadcast IP is required."
+	case form.ExpectedSpecYAML == "":
+		return "Expected spec YAML is required."
+	}
+
+	// Only well-formedness is checked here; the decoded value is discarded.
+	var parsed any
+	if err := yaml.Unmarshal([]byte(form.ExpectedSpecYAML), &parsed); err != nil {
+		return "Expected spec YAML is not valid YAML: " + err.Error()
+	}
+
+	// An empty port is allowed.
+	if form.WoLPort == "" {
+		return ""
+	}
+	port, err := strconv.Atoi(form.WoLPort)
+	if err != nil || port < 1 || port > 65535 {
+		return "WoL port must be 1–65535."
+	}
+	return ""
+}
+
+// friendlyDBError turns a store error into a message suitable for the
+// registration form. UNIQUE-constraint violations on hosts.name and
+// hosts.mac get a readable sentence; anything else is returned as the raw
+// error text.
+func friendlyDBError(err error) string {
+	msg := err.Error()
+	if strings.Contains(msg, "UNIQUE constraint failed: hosts.name") {
+		return "A host with that name already exists."
+	}
+	if strings.Contains(msg, "UNIQUE constraint failed: hosts.mac") {
+		return "A host with that MAC already exists."
+	}
+	return msg
+}