Initial commit: full Phases 1-6 implementation

Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
@@ -0,0 +1,918 @@
+package api
+
+import (
+	"context"
+	"crypto/sha256"
+	"crypto/subtle"
+	"encoding/hex"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"log"
+	"net"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"vetting/internal/events"
+	"vetting/internal/hold"
+	"vetting/internal/logs"
+	"vetting/internal/model"
+	"vetting/internal/notify"
+	"vetting/internal/orchestrator"
+	"vetting/internal/pxe"
+	"vetting/internal/report"
+	"vetting/internal/spec"
+	"vetting/internal/store"
+)
+
+// Agent collects the collaborators used by agent-facing HTTP routes:
+// the iPXE chainload endpoint and the /api/v1/runs/:id/* endpoints.
+//
+// Logs and Notify may be left nil: appendLog and dispatchEvent turn into
+// no-ops in that case. A nil Measurements makes Sensor answer 500 and is
+// skipped when the report is assembled.
+type Agent struct {
+	Hosts           *store.Hosts
+	Runs            *store.Runs
+	Stages          *store.Stages
+	Artifacts       *store.Artifacts
+	SpecDiffs       *store.SpecDiffs
+	Measurements    *store.Measurements
+	Runner          *orchestrator.Runner
+	EventHub        *events.Hub
+	Logs            *logs.Hub
+	Notify          *notify.Registry
+	ArtifactsDir    string // ./var/artifacts
+	OrchestratorURL string // baked into iPXE cmdline
+	PublicURL       string // user-visible URL base for notification click-throughs
+	LiveKernelURL   string
+	LiveInitrdURL   string
+	TLSCertFPR      string // optional; empty = skip pinning
+	IperfPort       int    // orchestrator-supervised iperf3 port; 0 = 5201
+}
+
+// IPXEScript serves a per-MAC iPXE script. Called by iPXE itself after
+// dnsmasq hands it the chainload URL. The response depends on the MAC
+// taken from the URL (lower-cased and trimmed first):
+//
+//   - malformed MAC → pxe.NotRegisteredScript (the halt script)
+//   - no active run for the MAC → pxe.NoActiveRunScript (the poweroff
+//     script)
+//   - active run → real boot script; the fetch triggers PXEObserved
+//
+// Store and token failures are logged and answered with a 500.
+func (a *Agent) IPXEScript(w http.ResponseWriter, r *http.Request) {
+	mac := strings.ToLower(strings.TrimSpace(chi.URLParam(r, "mac")))
+	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+	w.Header().Set("Cache-Control", "no-store")
+
+	if !macRe.MatchString(mac) {
+		log.Printf("ipxe: rejected malformed mac %q from %s", mac, r.RemoteAddr)
+		_, _ = w.Write([]byte(pxe.NotRegisteredScript(mac)))
+		return
+	}
+
+	run, err := a.Runs.FindActiveByMAC(r.Context(), mac)
+	if err != nil {
+		log.Printf("ipxe: find run by mac %s: %v", mac, err)
+		http.Error(w, "internal error", http.StatusInternalServerError)
+		return
+	}
+	if run == nil {
+		_, _ = w.Write([]byte(pxe.NoActiveRunScript(mac)))
+		return
+	}
+
+	// The token hash in the DB is the sha256 of the plaintext. The
+	// plaintext itself cannot be recovered from the hash — we issued it
+	// once when the run was created. For iPXE we re-issue a fresh token
+	// on every PXE fetch: this is safe because the hash in the DB is
+	// rewritten to match and only the most recent PXE can be claimed.
+	plain, hash, err := orchestrator.IssueRunToken()
+	if err != nil {
+		// Log the cause like every other failure path in this handler;
+		// the client only ever sees the opaque "token" body.
+		log.Printf("ipxe: issue token run %d: %v", run.ID, err)
+		http.Error(w, "token", http.StatusInternalServerError)
+		return
+	}
+	if err := a.Runs.RotateTokenHash(r.Context(), run.ID, hash); err != nil {
+		log.Printf("ipxe: rotate token run %d: %v", run.ID, err)
+		http.Error(w, "token", http.StatusInternalServerError)
+		return
+	}
+
+	script := pxe.BuildScript(pxe.IPXEParams{
+		OrchestratorURL: a.OrchestratorURL,
+		LiveKernelURL:   a.LiveKernelURL,
+		LiveInitrdURL:   a.LiveInitrdURL,
+		TLSCertFPR:      a.TLSCertFPR,
+		RunID:           run.ID,
+		MAC:             mac,
+		Token:           plain,
+	})
+	// A failed write means iPXE went away; there is nobody left to tell.
+	_, _ = w.Write([]byte(script))
+
+	// iPXE has now fetched the script — treat this as PXEObserved. If we
+	// were already in Booting the transition table allows staying.
+	if _, err := a.Runner.Transition(r.Context(), run.ID, orchestrator.TriggerPXEObserved); err != nil {
+		// Non-fatal: the agent may still claim via /claim.
+		log.Printf("ipxe: PXEObserved for run %d: %v", run.ID, err)
+	}
+}
+
+// Hello is the agent's first call once its userspace is running. The
+// handler authenticates the request, writes a single log line and
+// answers {"ok": true, "run_id": N}; it never touches run state — the
+// authoritative transition comes from /claim. Sending it early gives
+// operators a signal before the claim happens.
+func (a *Agent) Hello(w http.ResponseWriter, r *http.Request) {
+	id, valid := runIDFromURL(w, r)
+	if !valid {
+		return
+	}
+	_, authed := a.authenticate(w, r, id)
+	if !authed {
+		return
+	}
+	log.Printf("agent hello: run=%d remote=%s", id, r.RemoteAddr)
+	reply := map[string]any{"ok": true, "run_id": id}
+	writeJSON(w, http.StatusOK, reply)
+}
+
+// Claim is the binding call: the agent presents the plaintext token for
+// the run and, when the run is still in WaitingWoL or Booting, the
+// orchestrator fires TriggerAgentClaimed. The first claim also seeds the
+// stage rows. The response carries the per-run configuration the agent
+// needs: the stage order, the expected disks parsed from the host's
+// spec, and the iperf3 port.
+func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
+	ctx := r.Context()
+	runID, ok := runIDFromURL(w, r)
+	if !ok {
+		return
+	}
+	run, ok := a.authenticate(w, r, runID)
+	if !ok {
+		return
+	}
+
+	// agent_ip is informational, so a missing or undecodable body is
+	// tolerated and the connection's remote address is used instead.
+	var payload struct {
+		AgentIP string `json:"agent_ip"`
+	}
+	if r.Body != nil {
+		_ = json.NewDecoder(r.Body).Decode(&payload)
+	}
+	agentIP := strings.TrimSpace(payload.AgentIP)
+	if agentIP == "" {
+		agentIP = r.RemoteAddr
+		if hostPart, _, err := net.SplitHostPort(r.RemoteAddr); err == nil {
+			agentIP = hostPart
+		}
+	}
+
+	// Only seed when no stage rows are listed yet; a repeated claim
+	// after a transient network failure must not seed again.
+	existing := mustListStages(a.Stages, r, runID)
+	if len(existing) == 0 {
+		if err := a.Stages.Seed(ctx, runID); err != nil {
+			log.Printf("claim: seed stages run %d: %v", runID, err)
+			http.Error(w, "seed stages", http.StatusInternalServerError)
+			return
+		}
+	}
+
+	// The transition is attempted only from the two pre-claim states; a
+	// run in any other state is treated as already claimed and gets the
+	// normal OK response below.
+	switch run.State {
+	case model.StateWaitingWoL, model.StateBooting:
+		if _, err := a.Runner.Transition(ctx, runID, orchestrator.TriggerAgentClaimed); err != nil {
+			log.Printf("claim: transition run %d: %v", runID, err)
+			http.Error(w, "transition", http.StatusConflict)
+			return
+		}
+	}
+
+	log.Printf("agent claimed: run=%d agent_ip=%s", runID, agentIP)
+
+	// Build the Storage device allowlist (serial + expected size) from
+	// the host's expected spec so the agent never has to read YAML. A
+	// lookup or parse failure just leaves the list empty. The slice is
+	// initialised non-nil so it encodes as [] rather than null.
+	disks := []map[string]any{}
+	host, hostErr := a.Hosts.Get(ctx, run.HostID)
+	if hostErr == nil && host != nil {
+		parsed, parseErr := spec.Parse(host.ExpectedSpecYAML)
+		if parseErr == nil && parsed != nil {
+			for _, disk := range parsed.Disks {
+				entry := map[string]any{
+					"serial":  disk.Serial,
+					"size_gb": disk.SizeGB,
+				}
+				disks = append(disks, entry)
+			}
+		}
+	}
+
+	// 5201 is the fallback iperf3 port when none is configured.
+	port := 5201
+	if a.IperfPort != 0 {
+		port = a.IperfPort
+	}
+
+	writeJSON(w, http.StatusOK, map[string]any{
+		"ok":             true,
+		"run_id":         runID,
+		"stages":         store.DefaultStageOrder,
+		"expected_disks": disks,
+		"iperf_port":     port,
+	})
+}
+
+// Heartbeat is the agent's periodic liveness ping. Besides refreshing
+// the runner's heartbeat timestamp, the response doubles as a control
+// channel through its "cmd" field:
+//
+//   - "shutdown" once the run is Completed
+//   - "abort" once the run is FailedHolding or Released
+//   - "retry_stage" when Storage failed and the wipe override is set;
+//     "stage" and "override_flags" are included as well
+//   - "continue" otherwise
+//
+// The current run state is always returned under "state".
+func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) {
+	runID, ok := runIDFromURL(w, r)
+	if !ok {
+		return
+	}
+	run, ok := a.authenticate(w, r, runID)
+	if !ok {
+		return
+	}
+	a.Runner.TouchHeartbeat(runID)
+
+	reply := map[string]any{"state": run.State}
+	command := "continue"
+	switch run.State {
+	case model.StateCompleted:
+		// Pipeline succeeded — agent should power the host down.
+		command = "shutdown"
+	case model.StateFailedHolding, model.StateReleased:
+		command = "abort"
+	default:
+		if run.FailedStage == "Storage" && overrideWipeSet(run.OverrideFlagsJSON) {
+			// Operator pressed "Override wipe & retry": the agent
+			// re-enters Storage with the wipe-probe bypass armed.
+			command = "retry_stage"
+			reply["stage"] = "Storage"
+			reply["override_flags"] = json.RawMessage(run.OverrideFlagsJSON)
+		}
+	}
+	reply["cmd"] = command
+	writeJSON(w, http.StatusOK, reply)
+}
+
+// overrideWipeSet reports whether the "wipe" flag is true in a
+// Run.OverrideFlagsJSON blob. An empty blob yields false. The decode
+// error is deliberately ignored: a blob that does not decode leaves the
+// flag at false, so the operator has to reapply the override.
+func overrideWipeSet(blob string) bool {
+	var parsed struct {
+		Wipe bool `json:"wipe"`
+	}
+	if blob != "" {
+		_ = json.Unmarshal([]byte(blob), &parsed)
+	}
+	return parsed.Wipe
+}
+
+// authenticate verifies the Bearer token against the run's stored hash
+// and returns the Run for downstream handlers. On failure it writes the
+// response itself and returns ok=false so the caller can bail early:
+//
+//   - 404 when the run does not exist
+//   - 500 when the run lookup fails for any other reason (logged)
+//   - 401 when the bearer token is missing or does not match
+func (a *Agent) authenticate(w http.ResponseWriter, r *http.Request, runID int64) (*model.Run, bool) {
+	run, err := a.Runs.Get(r.Context(), runID)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			http.Error(w, "run not found", http.StatusNotFound)
+			return nil, false
+		}
+		// The client only sees "internal error"; keep the cause in the
+		// server log so the failure can be diagnosed.
+		log.Printf("authenticate: get run %d: %v", runID, err)
+		http.Error(w, "internal error", http.StatusInternalServerError)
+		return nil, false
+	}
+	if run == nil {
+		// Other callers in this file (mustHostID) treat a nil run with a
+		// nil error as possible; without this guard the hash comparison
+		// below would dereference a nil pointer.
+		http.Error(w, "run not found", http.StatusNotFound)
+		return nil, false
+	}
+	token := bearerToken(r)
+	if token == "" {
+		http.Error(w, "missing bearer", http.StatusUnauthorized)
+		return nil, false
+	}
+	// Compare hashes in constant time so the comparison does not leak
+	// how many leading bytes matched.
+	presented := orchestrator.HashRunToken(token)
+	if subtle.ConstantTimeCompare([]byte(presented), []byte(run.AgentTokenHash)) != 1 {
+		http.Error(w, "bad token", http.StatusUnauthorized)
+		return nil, false
+	}
+	return run, true
+}
+
+func bearerToken(r *http.Request) string {
+	h := r.Header.Get("Authorization")
+	if !strings.HasPrefix(h, "Bearer ") {
+		return ""
+	}
+	return strings.TrimSpace(strings.TrimPrefix(h, "Bearer "))
+}
+
+// runIDFromURL parses the "id" route parameter as a positive base-10
+// int64. When the parameter is missing, non-numeric, or not greater than
+// zero it writes a 400 "bad run id" response and returns ok=false.
+func runIDFromURL(w http.ResponseWriter, r *http.Request) (int64, bool) {
+	raw := chi.URLParam(r, "id")
+	if parsed, err := strconv.ParseInt(raw, 10, 64); err == nil && parsed > 0 {
+		return parsed, true
+	}
+	http.Error(w, "bad run id", http.StatusBadRequest)
+	return 0, false
+}
+
+// writeJSON sends body as a JSON response with the given status code.
+// The encode error is ignored: the status line has already been written
+// by then, so there is nothing useful left to report to the client.
+func writeJSON(w http.ResponseWriter, status int, body any) {
+	headers := w.Header()
+	headers.Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	encoder := json.NewEncoder(w)
+	_ = encoder.Encode(body)
+}
+
+// mustListStages lists the stage rows of a run for /claim and swallows
+// the error path: when the read fails it returns nil, which the caller
+// sees as "no stages yet", and the Seed call that follows is left to
+// surface the real error.
+func mustListStages(s *store.Stages, r *http.Request, runID int64) []model.Stage {
+	if stages, err := s.ListForRun(r.Context(), runID); err == nil {
+		return stages
+	}
+	return nil
+}
+
+// ===== Phase 3 endpoints =================================================
+
+// LogBatch is what the agent POSTs to /log: zero or more lines with
+// timestamp + level + text. Lines are written in order to the per-run
+// file and fanned out on the SSE hub.
+type LogBatch struct {
+	// Lines may be empty; Log accepts an empty batch and reports
+	// "written": 0.
+	Lines []LogLine `json:"lines"`
+}
+
+// LogLine is one entry of a LogBatch. Log parses TS as RFC3339Nano and
+// ignores the parse error, so an empty or malformed TS is handed to the
+// log writer as the zero time.
+type LogLine struct {
+	TS    string `json:"ts,omitempty"`    // RFC3339Nano; server clock used if empty
+	Level string `json:"level,omitempty"` // info|warn|error|debug
+	Text  string `json:"text"`
+}
+
+// Log appends a batch of agent log lines to the run's log writer, in
+// the order received. An empty batch is legal (the agent can use it as
+// a flush ping). The response reports how many lines were in the batch.
+// A body that is not valid JSON yields 400; a writer that cannot be
+// opened yields 500.
+func (a *Agent) Log(w http.ResponseWriter, r *http.Request) {
+	runID, ok := runIDFromURL(w, r)
+	if !ok {
+		return
+	}
+	if _, authed := a.authenticate(w, r, runID); !authed {
+		return
+	}
+
+	var payload LogBatch
+	decoder := json.NewDecoder(r.Body)
+	if err := decoder.Decode(&payload); err != nil {
+		http.Error(w, "bad json", http.StatusBadRequest)
+		return
+	}
+
+	sink, err := a.Logs.WriterFor(runID)
+	if err != nil {
+		http.Error(w, "open log: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	for i := range payload.Lines {
+		entry := payload.Lines[i]
+		// A timestamp that does not parse (including an empty one) is
+		// passed on as the zero time.
+		stamp, _ := time.Parse(time.RFC3339Nano, entry.TS)
+		sink.Append(logs.Line{TS: stamp, Level: entry.Level, Text: entry.Text})
+	}
+
+	reply := map[string]any{"ok": true, "written": len(payload.Lines)}
+	writeJSON(w, http.StatusOK, reply)
+}
+
+// StageResult is the body of /result. Stage is the stage name (from
+// DefaultStageOrder); Passed drives StageCompleted vs StageFailed.
+// Inventory is optional and only used when Stage == "Inventory" — the
+// orchestrator persists it as an artifact and feeds it to spec.Diff.
+// Summary is stored verbatim on the stage row; Message is used as the
+// detail text of the failure notification when Passed is false.
+type StageResult struct {
+	Stage     string          `json:"stage"`
+	Passed    bool            `json:"passed"`
+	Summary   json.RawMessage `json:"summary,omitempty"`
+	Inventory *spec.Inventory `json:"inventory,omitempty"`
+	Message   string          `json:"message,omitempty"`
+}
+
+// Result receives a stage's outcome. Flow:
+//  1. Mark the stage row passed/failed + record summary JSON.
+//  2. For Inventory: persist the inventory artifact.
+//  3. For Inventory (on pass): run spec diff server-side, persist rows,
+//     bump the run into SpecValidate and immediately resolve SpecValidate
+//     from that diff — the agent isn't involved in SpecValidate at all.
+//  4. Transition the run via StageCompleted/StageFailed.
+//
+// Responses: 400 for a malformed body or unknown stage name, 500 when
+// the stage row cannot be updated, 409 when the state machine rejects
+// the transition, otherwise 200 with {"ok": true, "next_state": ...}.
+func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
+	runID, ok := runIDFromURL(w, r)
+	if !ok {
+		return
+	}
+	run, ok := a.authenticate(w, r, runID)
+	if !ok {
+		return
+	}
+	var body StageResult
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+		http.Error(w, "bad json", http.StatusBadRequest)
+		return
+	}
+	body.Stage = strings.TrimSpace(body.Stage)
+	// Only the stage NAME is validated here; the state it maps to is
+	// discarded, so the reported stage is not compared with run.State.
+	// NOTE(review): confirm Runner.Transition rejects out-of-order
+	// results, otherwise a result for any known stage advances the run.
+	if _, ok := orchestrator.StateForStage(body.Stage); !ok {
+		http.Error(w, "unknown stage: "+body.Stage, http.StatusBadRequest)
+		return
+	}
+
+	stageState := model.StagePassed
+	if !body.Passed {
+		stageState = model.StageFailed
+	}
+	summaryJSON := ""
+	if len(body.Summary) > 0 {
+		summaryJSON = string(body.Summary)
+	}
+	if err := a.Stages.CompleteByName(r.Context(), runID, body.Stage, stageState, summaryJSON); err != nil {
+		http.Error(w, "complete stage: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	// Inventory-specific: persist artifact + compute spec diff.
+	// A persist failure is only logged here; resolveSpecValidate fails
+	// the SpecValidate stage later when it cannot read the artifact.
+	if body.Stage == "Inventory" && body.Inventory != nil {
+		if err := a.persistInventory(r, run, body.Inventory); err != nil {
+			log.Printf("persist inventory run %d: %v", runID, err)
+		}
+	}
+
+	if !body.Passed {
+		if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
+			log.Printf("set failed stage: %v", err)
+		}
+		if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
+			log.Printf("result: failed-transition run %d: %v", runID, err)
+			http.Error(w, "transition", http.StatusConflict)
+			return
+		}
+		hostName := a.hostNameFor(r.Context(), run.HostID)
+		detail := body.Message
+		if detail == "" {
+			detail = "stage reported failure"
+		}
+		a.dispatchEvent(notify.Event{
+			Kind:     notify.KindStageFailed,
+			Severity: notify.SeverityCritical,
+			RunID:    runID,
+			HostName: hostName,
+			Title:    fmt.Sprintf("[vetting] %s FAILED: %s", hostName, body.Stage),
+			Body:     fmt.Sprintf("Run %d on %s failed at stage %s.\n%s", runID, hostName, body.Stage, detail),
+			URL:      a.runLinkURL(runID),
+		})
+		// next_state is the literal string here; the state returned by
+		// Transition above is not used.
+		writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": "FailedHolding"})
+		return
+	}
+
+	// Passed: advance to the next stage in the pipeline.
+	next, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageCompleted)
+	if err != nil {
+		http.Error(w, "advance: "+err.Error(), http.StatusConflict)
+		return
+	}
+	log.Printf("result: run %d stage %s passed → %s", runID, body.Stage, next)
+
+	// If the just-advanced-into state is SpecValidate or Reporting, the
+	// orchestrator owns those stages entirely. The resolve function may
+	// transition further (→ next stage on pass, → FailedHolding on fail,
+	// → Completed for Reporting), so we re-read the run after each.
+	// If the re-read fails, next keeps the value from before the resolve.
+	if next == model.StateSpecValidate {
+		a.resolveSpecValidate(r, runID)
+		if after, err := a.Runs.Get(r.Context(), runID); err == nil {
+			next = after.State
+		}
+	}
+	if next == model.StateReporting {
+		a.resolveReporting(r, runID)
+		if after, err := a.Runs.Get(r.Context(), runID); err == nil {
+			next = after.State
+		}
+	}
+	writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": string(next)})
+}
+
+// persistInventory writes the agent-reported inventory as indented JSON
+// to <ArtifactsDir>/run-<id>/inventory.json (creating the directory if
+// needed) and records an "inventory" artifact row with the file's
+// SHA-256 and size. The first error encountered is returned.
+func (a *Agent) persistInventory(r *http.Request, run *model.Run, inv *spec.Inventory) error {
+	runDir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", run.ID))
+	if err := os.MkdirAll(runDir, 0o755); err != nil {
+		return err
+	}
+
+	encoded, err := json.MarshalIndent(inv, "", "  ")
+	if err != nil {
+		return err
+	}
+	target := filepath.Join(runDir, "inventory.json")
+	if err := os.WriteFile(target, encoded, 0o644); err != nil {
+		return err
+	}
+
+	digest := sha256.Sum256(encoded)
+	record := store.Artifact{
+		RunID:     run.ID,
+		Kind:      "inventory",
+		Path:      target,
+		SHA256:    hex.EncodeToString(digest[:]),
+		SizeBytes: int64(len(encoded)),
+	}
+	if _, err := a.Artifacts.Create(r.Context(), record); err != nil {
+		return err
+	}
+	return nil
+}
+
+// resolveSpecValidate runs the expected-vs-actual diff against the
+// just-stored inventory artifact, persists spec_diffs rows, and drives
+// the state machine — all on the server. The agent does nothing for
+// this stage.
+//
+// Any condition that prevents the diff from being computed (run or host
+// cannot be loaded, expected spec does not parse, inventory artifact is
+// missing) fails the stage through failStage, so the run is never left
+// sitting in SpecValidate with nothing to move it on. One or more
+// non-ignored critical diffs fail the stage and dispatch a SpecMismatch
+// notification; otherwise the stage passes and the run advances.
+func (a *Agent) resolveSpecValidate(r *http.Request, runID int64) {
+	ctx := r.Context()
+	run, err := a.Runs.Get(ctx, runID)
+	if err != nil || run == nil {
+		log.Printf("specvalidate: get run %d: %v", runID, err)
+		a.failStage(r, runID, "SpecValidate", "could not load run record")
+		return
+	}
+	host, err := a.Hosts.Get(ctx, run.HostID)
+	if err != nil || host == nil {
+		// Other call sites in this file guard against a nil host with a
+		// nil error; do the same before dereferencing it below.
+		log.Printf("specvalidate: get host %d: %v", run.HostID, err)
+		a.failStage(r, runID, "SpecValidate", "could not load host record")
+		return
+	}
+	expected, err := spec.Parse(host.ExpectedSpecYAML)
+	if err != nil {
+		log.Printf("specvalidate: parse expected yaml: %v", err)
+		a.failStage(r, runID, "SpecValidate", "malformed expected spec: "+err.Error())
+		return
+	}
+	inv, err := a.readInventoryArtifact(r, runID)
+	if err != nil {
+		log.Printf("specvalidate: read inventory: %v", err)
+		a.failStage(r, runID, "SpecValidate", "missing inventory artifact")
+		return
+	}
+	diffs := spec.Diff(expected, inv)
+	// Persisting the rows is best-effort: the verdict below is computed
+	// from the in-memory diff, so a write failure is logged, not fatal.
+	if err := a.SpecDiffs.ReplaceForRun(ctx, runID, diffs); err != nil {
+		log.Printf("specvalidate: write diffs: %v", err)
+	}
+	if err := a.Stages.StartByName(ctx, runID, "SpecValidate"); err != nil {
+		log.Printf("specvalidate: start stage: %v", err)
+	}
+
+	// Only critical diffs that have not been marked ignored hold the host.
+	critical := 0
+	for _, d := range diffs {
+		if d.Severity == "critical" && !d.Ignored {
+			critical++
+		}
+	}
+	// Marshal of a map holding two ints cannot fail.
+	summaryBuf, _ := json.Marshal(map[string]any{
+		"diffs":    len(diffs),
+		"critical": critical,
+	})
+
+	if critical == 0 {
+		if err := a.Stages.CompleteByName(ctx, runID, "SpecValidate", model.StagePassed, string(summaryBuf)); err != nil {
+			log.Printf("specvalidate: complete stage: %v", err)
+		}
+		if _, err := a.Runner.Transition(ctx, runID, orchestrator.TriggerStageCompleted); err != nil {
+			log.Printf("specvalidate: advance: %v", err)
+		}
+		a.appendLog(runID, "info", "SpecValidate: all fields match expected spec")
+		return
+	}
+
+	if err := a.Stages.CompleteByName(ctx, runID, "SpecValidate", model.StageFailed, string(summaryBuf)); err != nil {
+		log.Printf("specvalidate: complete stage: %v", err)
+	}
+	if err := a.Runs.SetFailedStage(ctx, runID, "SpecValidate"); err != nil {
+		log.Printf("specvalidate: set failed stage: %v", err)
+	}
+	if _, err := a.Runner.Transition(ctx, runID, orchestrator.TriggerStageFailed); err != nil {
+		log.Printf("specvalidate: failed-transition: %v", err)
+	}
+	a.appendLog(runID, "error", fmt.Sprintf("SpecValidate: %d critical diff(s) — holding host", critical))
+	hostName := a.hostNameFor(ctx, run.HostID)
+	a.dispatchEvent(notify.Event{
+		Kind:     notify.KindSpecMismatch,
+		Severity: notify.SeverityCritical,
+		RunID:    runID,
+		HostName: hostName,
+		Title:    fmt.Sprintf("[vetting] %s spec mismatch (%d critical)", hostName, critical),
+		Body:     fmt.Sprintf("SpecValidate found %d critical diff(s) on %s. Host is held for inspection.", critical, hostName),
+		URL:      a.runLinkURL(runID),
+	})
+}
+
+func (a *Agent) readInventoryArtifact(r *http.Request, runID int64) (*spec.Inventory, error) {
+	arts, err := a.Artifacts.ListForRun(r.Context(), runID)
+	if err != nil {
+		return nil, err
+	}
+	for i := len(arts) - 1; i >= 0; i-- {
+		if arts[i].Kind == "inventory" {
+			buf, err := os.ReadFile(arts[i].Path)
+			if err != nil {
+				return nil, err
+			}
+			var inv spec.Inventory
+			if err := json.Unmarshal(buf, &inv); err != nil {
+				return nil, err
+			}
+			return &inv, nil
+		}
+	}
+	return nil, errors.New("no inventory artifact")
+}
+
+// failStage marks an orchestrator-owned stage as failed: it records
+// {"error": message} as the stage summary, stores the failed stage name
+// on the run, fires TriggerStageFailed and appends an error line to the
+// run log. Every step is best-effort — a failing step is logged and the
+// remaining steps still run.
+func (a *Agent) failStage(r *http.Request, runID int64, stage, message string) {
+	ctx := r.Context()
+	// Encode with encoding/json rather than fmt's %q: %q produces Go
+	// string syntax, whose escapes (\x.., \a, \v, \U........) are not
+	// valid JSON, and message often embeds arbitrary error text.
+	// Marshal of a struct with one string field cannot fail.
+	summary, _ := json.Marshal(struct {
+		Error string `json:"error"`
+	}{Error: message})
+	if err := a.Stages.CompleteByName(ctx, runID, stage, model.StageFailed, string(summary)); err != nil {
+		log.Printf("failStage: complete stage %s run %d: %v", stage, runID, err)
+	}
+	if err := a.Runs.SetFailedStage(ctx, runID, stage); err != nil {
+		log.Printf("failStage: set failed stage run %d: %v", runID, err)
+	}
+	if _, err := a.Runner.Transition(ctx, runID, orchestrator.TriggerStageFailed); err != nil {
+		log.Printf("failStage: transition run %d: %v", runID, err)
+	}
+	a.appendLog(runID, "error", stage+": "+message)
+}
+
+// appendLog writes one orchestrator-originated line to the run's log.
+// It is a no-op when no log hub is wired; a writer that cannot be
+// opened is reported on the server log and the line is dropped.
+func (a *Agent) appendLog(runID int64, level, text string) {
+	hub := a.Logs
+	if hub == nil {
+		return
+	}
+	sink, err := hub.WriterFor(runID)
+	if err == nil {
+		sink.Append(logs.Line{Level: level, Text: text})
+		return
+	}
+	log.Printf("appendLog: %v", err)
+}
+
+// HoldRequest is the JSON body decoded by the Hold handler. Hold issues
+// the per-run ephemeral ed25519 keypair: the agent gets the
+// authorized_keys line, the orchestrator keeps the privkey on disk.
+// Hold also records the agent's reported IP so the tile can print the
+// ssh invocation; when AgentIP is empty Hold falls back to the host part
+// of the request's remote address.
+type HoldRequest struct {
+	AgentIP string `json:"agent_ip"`
+}
+
+// HoldResponse is the JSON body returned by the Hold handler:
+// AuthorizedKey is the public half of the keypair issued for the run
+// and RunID echoes the run the key belongs to.
+type HoldResponse struct {
+	AuthorizedKey string `json:"authorized_key"`
+	RunID         int64  `json:"run_id"`
+}
+
+// Hold issues the per-run hold keypair for an authenticated agent. It
+// records the agent's IP on the run, writes the private key to
+// <ArtifactsDir>/run-<id>/hold.key, registers it as a "hold_key"
+// artifact, logs the ssh invocation, dispatches a HoldingOpened
+// notification and republishes the host tile. The response carries the
+// authorized_keys line for the agent to install.
+//
+// Only key generation and the key write are fatal (500); every other
+// step is best-effort and merely logged on failure.
+func (a *Agent) Hold(w http.ResponseWriter, r *http.Request) {
+	ctx := r.Context()
+	runID, ok := runIDFromURL(w, r)
+	if !ok {
+		return
+	}
+	if _, ok := a.authenticate(w, r, runID); !ok {
+		return
+	}
+	var body HoldRequest
+	// agent_ip is optional, so an empty or undecodable body is tolerated
+	// and the remote address is used instead.
+	_ = json.NewDecoder(r.Body).Decode(&body)
+	agentIP := strings.TrimSpace(body.AgentIP)
+	if agentIP == "" {
+		if host, _, err := net.SplitHostPort(r.RemoteAddr); err == nil {
+			agentIP = host
+		}
+	}
+	if agentIP != "" {
+		if err := a.Runs.SetHoldIP(ctx, runID, agentIP); err != nil {
+			log.Printf("hold: set hold_ip: %v", err)
+		}
+	}
+
+	kp, err := hold.Issue(runID)
+	if err != nil {
+		http.Error(w, "generate key: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	keyPath := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", runID), "hold.key")
+	abs, err := kp.WritePrivateTo(keyPath)
+	if err != nil {
+		http.Error(w, "write key: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	sum := sha256.Sum256(kp.PrivatePEM)
+	if _, err := a.Artifacts.Create(ctx, store.Artifact{
+		RunID:     runID,
+		Kind:      "hold_key",
+		Path:      abs,
+		SHA256:    hex.EncodeToString(sum[:]),
+		SizeBytes: int64(len(kp.PrivatePEM)),
+	}); err != nil {
+		log.Printf("hold: record artifact: %v", err)
+	}
+	a.appendLog(runID, "info", fmt.Sprintf("Hold key issued. SSH in with: ssh -i %s root@%s", abs, agentIP))
+
+	// Resolve the host ID once and reuse it for both the notification
+	// and the tile refresh. Zero means the run could not be re-read, in
+	// which case both are skipped.
+	if hostID := mustHostID(a, r, runID); hostID != 0 {
+		hostName := a.hostNameFor(ctx, hostID)
+		a.dispatchEvent(notify.Event{
+			Kind:     notify.KindHoldingOpened,
+			Severity: notify.SeverityCritical,
+			RunID:    runID,
+			HostName: hostName,
+			Title:    fmt.Sprintf("[vetting] %s holding — SSH ready", hostName),
+			Body:     fmt.Sprintf("Host %s is holding at %s.\nssh -i %s root@%s", hostName, agentIP, abs, agentIP),
+			URL:      a.runLinkURL(runID),
+		})
+
+		// Refresh the tile so the operator sees the ssh command. The
+		// event hub is optional, like Logs and Notify, so guard it.
+		host, _ := a.Hosts.Get(ctx, hostID)
+		if host != nil && a.EventHub != nil && orchestrator.TileRenderer != nil {
+			latest, _ := a.Runs.Get(ctx, runID)
+			payload := orchestrator.TileRenderer(ctx, *host, latest)
+			a.EventHub.Publish(events.Event{Name: fmt.Sprintf("tile-%d", host.ID), Payload: payload})
+		}
+	}
+	writeJSON(w, http.StatusOK, HoldResponse{AuthorizedKey: kp.AuthorizedKey, RunID: runID})
+}
+
+// dispatchEvent forwards a fully populated Event to the notify Registry
+// when one is wired, and does nothing otherwise. Callers resolve the
+// host name with hostNameFor before building the event, which keeps the
+// call sites terse.
+func (a *Agent) dispatchEvent(ev notify.Event) {
+	if registry := a.Notify; registry != nil {
+		registry.Dispatch(ev)
+	}
+}
+
+// hostNameFor resolves a host ID to the host's name. When the lookup
+// fails or yields no host it falls back to "host-N", so a notification
+// never goes out without a name.
+func (a *Agent) hostNameFor(ctx context.Context, hostID int64) string {
+	host, err := a.Hosts.Get(ctx, hostID)
+	if err != nil || host == nil {
+		return "host-" + strconv.FormatInt(hostID, 10)
+	}
+	return host.Name
+}
+
+// runLinkURL builds the click-through link for a run's report:
+// <PublicURL>/reports/<runID>, with trailing slashes on PublicURL
+// removed first. It returns "" when PublicURL is not configured.
+func (a *Agent) runLinkURL(runID int64) string {
+	base := a.PublicURL
+	if base == "" {
+		return ""
+	}
+	return strings.TrimRight(base, "/") + "/reports/" + strconv.FormatInt(runID, 10)
+}
+
+// mustHostID returns the host ID of the given run, or 0 when the run
+// cannot be loaded. Despite the name it never panics.
+func mustHostID(a *Agent, r *http.Request, runID int64) int64 {
+	if run, err := a.Runs.Get(r.Context(), runID); err == nil && run != nil {
+		return run.HostID
+	}
+	return 0
+}
+
+// ===== Phase 4 endpoints =================================================
+
+// SensorBatch is what the agent POSTs to /sensor: a stream of numeric
+// samples (temps, fan rpm, PSU rails, iperf throughput). Each sample is
+// (kind, key, value, unit). Timestamps default to server-now when empty
+// so the thermal sidecar doesn't have to carry a clock.
+// NOTE(review): Sensor itself forwards an unparsable or empty TS as the
+// zero time; the server-now default is presumably applied by the
+// measurements store — confirm.
+type SensorBatch struct {
+	Samples []SensorSample `json:"samples"`
+}
+
+// SensorSample is one numeric measurement inside a SensorBatch. Sensor
+// copies Kind, Key, Value and Unit into the measurement row unchanged;
+// Kind is not validated against the list in the field comment.
+type SensorSample struct {
+	TS    string  `json:"ts,omitempty"`
+	Kind  string  `json:"kind"` // temp|fan|psu_volt|iperf|fio|smart_attr
+	Key   string  `json:"key"`
+	Value float64 `json:"value"`
+	Unit  string  `json:"unit,omitempty"`
+}
+
+// Sensor stores a batch of numeric samples for the run. The thermal
+// sidecar posts here on a tick, and stage executors (iperf, fio) drop
+// their results here too. It answers 500 when no measurements store is
+// wired or the write fails, 400 on a malformed body, and otherwise
+// reports the number of rows written.
+func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
+	runID, ok := runIDFromURL(w, r)
+	if !ok {
+		return
+	}
+	if _, authed := a.authenticate(w, r, runID); !authed {
+		return
+	}
+	if a.Measurements == nil {
+		http.Error(w, "measurements store not wired", http.StatusInternalServerError)
+		return
+	}
+
+	var payload SensorBatch
+	if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
+		http.Error(w, "bad json", http.StatusBadRequest)
+		return
+	}
+
+	rows := make([]model.Measurement, len(payload.Samples))
+	for i, sample := range payload.Samples {
+		// A timestamp that does not parse (including an empty one) is
+		// stored as the zero time.
+		stamp, _ := time.Parse(time.RFC3339Nano, sample.TS)
+		rows[i] = model.Measurement{
+			RunID: runID,
+			TS:    stamp,
+			Kind:  sample.Kind,
+			Key:   sample.Key,
+			Value: sample.Value,
+			Unit:  sample.Unit,
+		}
+	}
+
+	if err := a.Measurements.CreateBatch(r.Context(), rows); err != nil {
+		http.Error(w, "write samples: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	writeJSON(w, http.StatusOK, map[string]any{"ok": true, "written": len(rows)})
+}
+
+// resolveReporting runs when the pipeline advances into StateReporting.
+// It's an orchestrator-owned stage like SpecValidate: no agent action.
+// It writes a JSON report bundling run + host + stages + diffs +
+// measurements, renders an HTML summary next to it, and then marks the
+// run Completed. Once the run is Completed, Heartbeat answers
+// cmd=shutdown so the agent powers the host off.
+//
+// Failures that prevent the report from being produced or the run from
+// being marked Completed fail the stage through failStage; in that case
+// no "run completed" log line or RunCompleted notification is emitted.
+// Listing failures (stages, diffs, measurements), the HTML render and
+// the artifact rows are best-effort and only logged.
+func (a *Agent) resolveReporting(r *http.Request, runID int64) {
+	ctx := r.Context()
+	if err := a.Stages.StartByName(ctx, runID, "Reporting"); err != nil {
+		log.Printf("reporting: start stage: %v", err)
+	}
+	run, err := a.Runs.Get(ctx, runID)
+	if err != nil || run == nil {
+		// Without this the run would stay in Reporting with a started
+		// stage row and nothing left to resolve it.
+		log.Printf("reporting: get run %d: %v", runID, err)
+		a.failStage(r, runID, "Reporting", "could not load run record")
+		return
+	}
+	host, err := a.Hosts.Get(ctx, run.HostID)
+	if err != nil {
+		log.Printf("reporting: get host: %v", err)
+		a.failStage(r, runID, "Reporting", "could not load host record")
+		return
+	}
+	stages, err := a.Stages.ListForRun(ctx, runID)
+	if err != nil {
+		log.Printf("reporting: list stages: %v", err)
+	}
+	diffs, err := a.SpecDiffs.ListForRun(ctx, runID)
+	if err != nil {
+		log.Printf("reporting: list diffs: %v", err)
+	}
+	var measurements []model.Measurement
+	if a.Measurements != nil {
+		measurements, err = a.Measurements.ListForRun(ctx, runID)
+		if err != nil {
+			log.Printf("reporting: list measurements: %v", err)
+		}
+	}
+	bundle := map[string]any{
+		"run":          run,
+		"host":         host,
+		"stages":       stages,
+		"spec_diffs":   diffs,
+		"measurements": measurements,
+		"generated_at": time.Now().UTC().Format(time.RFC3339),
+	}
+	buf, err := json.MarshalIndent(bundle, "", "  ")
+	if err != nil {
+		log.Printf("reporting: marshal: %v", err)
+		a.failStage(r, runID, "Reporting", "marshal report: "+err.Error())
+		return
+	}
+	dir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", runID))
+	if err := os.MkdirAll(dir, 0o755); err != nil {
+		a.failStage(r, runID, "Reporting", "mkdir: "+err.Error())
+		return
+	}
+	path := filepath.Join(dir, "report.json")
+	if err := os.WriteFile(path, buf, 0o644); err != nil {
+		a.failStage(r, runID, "Reporting", "write: "+err.Error())
+		return
+	}
+	sum := sha256.Sum256(buf)
+	if _, err := a.Artifacts.Create(ctx, store.Artifact{
+		RunID:     runID,
+		Kind:      "report",
+		Path:      path,
+		SHA256:    hex.EncodeToString(sum[:]),
+		SizeBytes: int64(len(buf)),
+	}); err != nil {
+		log.Printf("reporting: record artifact: %v", err)
+	}
+	// Also render the operator-facing HTML summary alongside the JSON.
+	// Failures here are non-fatal — the JSON is the source of truth.
+	if host != nil {
+		htmlData := report.Data{
+			GeneratedAt: time.Now().UTC(),
+			Run:         *run,
+			Host:        *host,
+			Stages:      stages,
+			SpecDiffs:   diffs,
+			Aggregates:  report.AggregateMeasurements(measurements),
+		}
+		if htmlBuf, err := report.RenderHTML(htmlData); err != nil {
+			log.Printf("reporting: render html: %v", err)
+		} else {
+			htmlPath := filepath.Join(dir, "report.html")
+			if err := os.WriteFile(htmlPath, htmlBuf, 0o644); err != nil {
+				log.Printf("reporting: write html: %v", err)
+			} else {
+				htmlSum := sha256.Sum256(htmlBuf)
+				if _, err := a.Artifacts.Create(ctx, store.Artifact{
+					RunID:     runID,
+					Kind:      "report_html",
+					Path:      htmlPath,
+					SHA256:    hex.EncodeToString(htmlSum[:]),
+					SizeBytes: int64(len(htmlBuf)),
+				}); err != nil {
+					log.Printf("reporting: record html artifact: %v", err)
+				}
+			}
+		}
+	}
+	// Marshal of a map holding a string and two ints cannot fail.
+	summaryBuf, _ := json.Marshal(map[string]any{
+		"report_path": path,
+		"stages":      len(stages),
+		"diffs":       len(diffs),
+	})
+	if err := a.Stages.CompleteByName(ctx, runID, "Reporting", model.StagePassed, string(summaryBuf)); err != nil {
+		log.Printf("reporting: complete stage: %v", err)
+	}
+	if err := a.Runs.MarkCompleted(ctx, runID, path); err != nil {
+		// The run is NOT completed: do not log or notify success. Fail
+		// the stage instead so the run leaves Reporting visibly.
+		log.Printf("reporting: mark completed: %v", err)
+		a.failStage(r, runID, "Reporting", "mark completed: "+err.Error())
+		return
+	}
+	a.appendLog(runID, "info", "Reporting: wrote "+path+"; run completed.")
+	// Publish a final tile update so the dashboard flips to pass mood.
+	// The event hub is optional, like Logs and Notify, so guard it.
+	if host != nil && a.EventHub != nil && orchestrator.TileRenderer != nil {
+		latest, _ := a.Runs.Get(ctx, runID)
+		payload := orchestrator.TileRenderer(ctx, *host, latest)
+		a.EventHub.Publish(events.Event{Name: fmt.Sprintf("tile-%d", host.ID), Payload: payload})
+	}
+	hostName := "host"
+	if host != nil {
+		hostName = host.Name
+	}
+	a.dispatchEvent(notify.Event{
+		Kind:     notify.KindRunCompleted,
+		Severity: notify.SeverityInfo,
+		RunID:    runID,
+		HostName: hostName,
+		Title:    fmt.Sprintf("[vetting] %s passed vetting", hostName),
+		Body:     fmt.Sprintf("Run %d on %s completed all stages. Report: %s", runID, hostName, path),
+		URL:      a.runLinkURL(runID),
+	})
+}
@@ -0,0 +1,128 @@
+package api_test
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"path/filepath"
+	"strconv"
+	"testing"
+
+	"github.com/go-chi/chi/v5"
+
+	"vetting/internal/api"
+	"vetting/internal/db"
+	"vetting/internal/model"
+	"vetting/internal/orchestrator"
+	"vetting/internal/store"
+)
+
+// setupAgent opens a fresh SQLite database under the test's temp dir,
+// registers a single host, and creates one run for it. It returns an
+// Agent wired with only the Hosts, Runs and Measurements stores, the
+// new run's ID, and the plaintext token whose hash was handed to
+// Runs.Create.
+func setupAgent(t *testing.T) (*api.Agent, int64, string) {
+	t.Helper()
+	ctx := context.Background()
+
+	conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
+	if err != nil {
+		t.Fatalf("open db: %v", err)
+	}
+	t.Cleanup(func() { _ = conn.Close() })
+
+	agent := &api.Agent{
+		Hosts:        &store.Hosts{DB: conn},
+		Runs:         &store.Runs{DB: conn},
+		Measurements: &store.Measurements{DB: conn},
+	}
+
+	hostID, err := agent.Hosts.Create(ctx, model.Host{
+		Name:             "t-host",
+		MAC:              "aa:bb:cc:dd:ee:01",
+		WoLBroadcastIP:   "10.0.0.255",
+		WoLPort:          9,
+		ExpectedSpecYAML: "memory:\n  total_gib: 16\n",
+	})
+	if err != nil {
+		t.Fatalf("create host: %v", err)
+	}
+
+	token, tokenHash, err := orchestrator.IssueRunToken()
+	if err != nil {
+		t.Fatalf("issue token: %v", err)
+	}
+
+	runID, err := agent.Runs.Create(ctx, hostID, tokenHash)
+	if err != nil {
+		t.Fatalf("create run: %v", err)
+	}
+	return agent, runID, token
+}
+
+// routedRequest builds an httptest request whose context carries a chi
+// route context with the "id" URL parameter set to runID, so a handler
+// can be invoked directly without going through a chi router.
+func routedRequest(runID int64, method, path string, body []byte) *http.Request {
+	// chi.URLParam reads from the chi route context, so install one by hand.
+	routeCtx := chi.NewRouteContext()
+	routeCtx.URLParams.Add("id", strconv.FormatInt(runID, 10))
+
+	req := httptest.NewRequest(method, path, bytes.NewReader(body))
+	ctx := context.WithValue(req.Context(), chi.RouteCtxKey, routeCtx)
+	return req.WithContext(ctx)
+}
+
+// TestSensorPersistsBatch posts a two-sample batch to the Sensor
+// handler with a valid bearer token and expects a 200 response and two
+// rows from Measurements.ListForRun.
+func TestSensorPersistsBatch(t *testing.T) {
+	a, runID, token := setupAgent(t)
+
+	samples := []api.SensorSample{
+		{Kind: "thermal", Key: "cpu", Value: 47.5, Unit: "C"},
+		{Kind: "iperf", Key: "throughput_mbps", Value: 938.2, Unit: "Mbps"},
+	}
+	payload, _ := json.Marshal(api.SensorBatch{Samples: samples})
+
+	target := "/api/v1/runs/" + strconv.FormatInt(runID, 10) + "/sensor"
+	req := routedRequest(runID, http.MethodPost, target, payload)
+	req.Header.Set("Authorization", "Bearer "+token)
+	req.Header.Set("Content-Type", "application/json")
+
+	rec := httptest.NewRecorder()
+	a.Sensor(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, body = %q", rec.Code, rec.Body.String())
+	}
+
+	stored, err := a.Measurements.ListForRun(context.Background(), runID)
+	if err != nil {
+		t.Fatalf("ListForRun: %v", err)
+	}
+	if got := len(stored); got != 2 {
+		t.Fatalf("expected 2 measurements, got %d", got)
+	}
+}
+
+// TestSensorRejectsBadToken sends an empty batch with a bearer token
+// that does not belong to the run and expects 401 Unauthorized.
+func TestSensorRejectsBadToken(t *testing.T) {
+	a, runID, _ := setupAgent(t)
+
+	payload, _ := json.Marshal(api.SensorBatch{})
+	target := "/api/v1/runs/" + strconv.FormatInt(runID, 10) + "/sensor"
+	req := routedRequest(runID, http.MethodPost, target, payload)
+	req.Header.Set("Authorization", "Bearer wrong-token")
+
+	rec := httptest.NewRecorder()
+	a.Sensor(rec, req)
+	if got := rec.Code; got != http.StatusUnauthorized {
+		t.Fatalf("status = %d, want 401", got)
+	}
+}
+
+// TestHeartbeatShutdownWhenCompleted checks that a heartbeat sent for a
+// run already in the Completed state is answered with 200 and a JSON
+// body whose "cmd" field is "shutdown", which tells the agent to power
+// the host down.
+func TestHeartbeatShutdownWhenCompleted(t *testing.T) {
+	a, runID, token := setupAgent(t)
+	ctx := context.Background()
+
+	// Heartbeat calls into the runner (TouchHeartbeat); without one wired
+	// up the handler would dereference nil.
+	a.Runner = &orchestrator.Runner{
+		Runs:   a.Runs,
+		Hosts:  a.Hosts,
+		Stages: &store.Stages{DB: a.Runs.DB},
+	}
+	if err := a.Runs.SetState(ctx, runID, model.StateCompleted); err != nil {
+		t.Fatalf("set state: %v", err)
+	}
+
+	target := "/api/v1/runs/" + strconv.FormatInt(runID, 10) + "/heartbeat"
+	req := routedRequest(runID, http.MethodPost, target, nil)
+	req.Header.Set("Authorization", "Bearer "+token)
+
+	rec := httptest.NewRecorder()
+	a.Heartbeat(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
+	}
+
+	var resp map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if cmd := resp["cmd"]; cmd != "shutdown" {
+		t.Fatalf("cmd = %v, want shutdown", cmd)
+	}
+}
@@ -0,0 +1,318 @@
+package api_test
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"vetting/internal/api"
+	"vetting/internal/db"
+	"vetting/internal/events"
+	"vetting/internal/logs"
+	"vetting/internal/model"
+	"vetting/internal/notify"
+	"vetting/internal/orchestrator"
+	"vetting/internal/spec"
+	"vetting/internal/store"
+)
+
+// captureNotifier is a testing-only Notifier that records every Event
+// sent to it, under a mutex so concurrent Dispatch goroutines are safe.
+//
+// mu guards evs; name is what Name returns and what routes refer to;
+// evs accumulates every event passed to Send, in arrival order.
+type captureNotifier struct {
+	mu   sync.Mutex
+	name string
+	evs  []notify.Event
+}
+
+// Name reports the notifier's configured name.
+func (c *captureNotifier) Name() string {
+	return c.name
+}
+
+// Send appends ev to the recorded events while holding the mutex. It
+// ignores the context and always returns nil.
+func (c *captureNotifier) Send(_ context.Context, ev notify.Event) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.evs = append(c.evs, ev)
+	return nil
+}
+
+// awaitKind polls the recorded events every 5ms until one of kind k
+// shows up and returns the first such event. If none has arrived once
+// two seconds have elapsed, the test is failed with t.Fatalf.
+func (c *captureNotifier) awaitKind(t *testing.T, k notify.Kind) notify.Event {
+	t.Helper()
+	deadline := time.Now().Add(2 * time.Second)
+
+	// firstOfKind scans the events under the lock and reports the first match.
+	firstOfKind := func() (notify.Event, bool) {
+		c.mu.Lock()
+		defer c.mu.Unlock()
+		for i := range c.evs {
+			if c.evs[i].Kind == k {
+				return c.evs[i], true
+			}
+		}
+		return notify.Event{}, false
+	}
+
+	for {
+		if found, ok := firstOfKind(); ok {
+			return found
+		}
+		if time.Now().After(deadline) {
+			t.Fatalf("no %q event received within timeout", k)
+		}
+		time.Sleep(5 * time.Millisecond)
+	}
+}
+
+// newCaptureRegistry returns a notify.Registry (constructed with a
+// one-second duration) that has c registered and a single route naming
+// c with no match filters, i.e. a wildcard route.
+func newCaptureRegistry(c *captureNotifier) *notify.Registry {
+	registry := notify.NewRegistry(time.Second)
+	registry.Register(c)
+	wildcard := notify.Route{Notifier: c.name}
+	registry.AddRoute(wildcard)
+	return registry
+}
+
+// fullAgent builds an Agent with every store, the event hub, the log
+// hub and a Runner wired against a fresh SQLite database in the test's
+// temp dir. It registers one host with an empty expected spec, creates
+// a run for it, seeds the run's stages, and returns the agent, the run
+// ID and the plaintext bearer token. The run is left in whatever state
+// Runs.Create gives it; the caller moves it on from there.
+func fullAgent(t *testing.T) (*api.Agent, int64, string) {
+	t.Helper()
+	ctx := context.Background()
+	root := t.TempDir()
+
+	conn, err := db.Open(filepath.Join(root, "vetting.db"))
+	if err != nil {
+		t.Fatalf("open db: %v", err)
+	}
+	t.Cleanup(func() { _ = conn.Close() })
+
+	eventHub := events.NewHub()
+	logHub, err := logs.NewHub(filepath.Join(root, "logs"), eventHub)
+	if err != nil {
+		t.Fatalf("logs hub: %v", err)
+	}
+	t.Cleanup(func() { logHub.Close() })
+
+	agent := &api.Agent{
+		Hosts:        &store.Hosts{DB: conn},
+		Runs:         &store.Runs{DB: conn},
+		Stages:       &store.Stages{DB: conn},
+		Artifacts:    &store.Artifacts{DB: conn},
+		SpecDiffs:    &store.SpecDiffs{DB: conn},
+		Measurements: &store.Measurements{DB: conn},
+		EventHub:     eventHub,
+		Logs:         logHub,
+		ArtifactsDir: filepath.Join(root, "artifacts"),
+		PublicURL:    "https://vetting.example",
+	}
+	agent.Runner = &orchestrator.Runner{
+		Runs:     agent.Runs,
+		Hosts:    agent.Hosts,
+		Stages:   agent.Stages,
+		EventHub: eventHub,
+	}
+
+	hostID, err := agent.Hosts.Create(ctx, model.Host{
+		Name:             "smoke-host",
+		MAC:              "aa:bb:cc:dd:ee:10",
+		WoLBroadcastIP:   "10.0.0.255",
+		WoLPort:          9,
+		ExpectedSpecYAML: "", // empty spec → no diffs
+	})
+	if err != nil {
+		t.Fatalf("create host: %v", err)
+	}
+
+	token, tokenHash, err := orchestrator.IssueRunToken()
+	if err != nil {
+		t.Fatalf("issue token: %v", err)
+	}
+
+	runID, err := agent.Runs.Create(ctx, hostID, tokenHash)
+	if err != nil {
+		t.Fatalf("create run: %v", err)
+	}
+	if err := agent.Stages.Seed(ctx, runID); err != nil {
+		t.Fatalf("seed stages: %v", err)
+	}
+	return agent, runID, token
+}
+
+// walkStage plays the agent's part for one stage: it POSTs a result
+// body of {"stage": stage, "passed": passed} merged with extras (extras
+// win on key clashes) to the Result handler using the given bearer
+// token, fails the test on any non-200 response, and returns the
+// next_state field of the JSON reply.
+func walkStage(t *testing.T, a *api.Agent, runID int64, token, stage string, passed bool, extras map[string]any) string {
+	t.Helper()
+	id := strconv.FormatInt(runID, 10)
+
+	payload := map[string]any{"stage": stage, "passed": passed}
+	for key, val := range extras {
+		payload[key] = val
+	}
+	encoded, _ := json.Marshal(payload)
+
+	// The handler reads the run ID through chi, so hand it a route context.
+	routeCtx := chi.NewRouteContext()
+	routeCtx.URLParams.Add("id", id)
+
+	req := httptest.NewRequest(http.MethodPost, "/api/v1/runs/"+id+"/result", bytes.NewReader(encoded))
+	req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, routeCtx))
+	req.Header.Set("Authorization", "Bearer "+token)
+	req.Header.Set("Content-Type", "application/json")
+
+	rec := httptest.NewRecorder()
+	a.Result(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("stage %s: status %d body=%q", stage, rec.Code, rec.Body.String())
+	}
+
+	var reply struct {
+		OK        bool   `json:"ok"`
+		NextState string `json:"next_state"`
+	}
+	if err := json.NewDecoder(rec.Body).Decode(&reply); err != nil {
+		t.Fatalf("stage %s: decode resp: %v", stage, err)
+	}
+	return reply.NextState
+}
+
+// TestFullPipelineToCompleted walks an agent through all stages of a
+// successful run and asserts the run ends in Completed. Inventory is
+// minimal; the empty expected-spec means SpecValidate produces zero
+// critical diffs and the orchestrator auto-advances past it.
+//
+// After the walk it also checks that a report path was recorded on the
+// run, that a report_html artifact exists on disk and contains "<html",
+// and that a RunCompleted notification reached the capture notifier.
+func TestFullPipelineToCompleted(t *testing.T) {
+	a, runID, token := fullAgent(t)
+	capture := &captureNotifier{name: "capture"}
+	a.Notify = newCaptureRegistry(capture)
+	// Claim would normally transition Booting → InventoryCheck; set it
+	// directly here since we're not exercising the claim path.
+	if err := a.Runs.SetState(context.Background(), runID, model.StateInventoryCheck); err != nil {
+		t.Fatalf("set state: %v", err)
+	}
+
+	// Stage 1: Inventory — provide a concrete inventory so SpecValidate
+	// has something to compare against.
+	inv := spec.Inventory{
+		CPU:    spec.CPUSpec{Model: "Xeon Gold", LogicalCores: 8},
+		Memory: spec.MemorySpec{TotalGiB: 16},
+	}
+	next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv})
+	// After Inventory → SpecValidate resolves inline → SMART
+	if next != "SMART" {
+		t.Fatalf("after Inventory, next_state = %q, want SMART", next)
+	}
+
+	// The remaining stages advance one-for-one in order.
+	walkPlan := []struct {
+		stage    string
+		expected string
+	}{
+		{"SMART", "CPUStress"},
+		{"CPUStress", "Storage"},
+		{"Storage", "Network"},
+		{"Network", "GPU"},
+		{"GPU", "PSU"},
+		{"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed
+	}
+	for _, step := range walkPlan {
+		got := walkStage(t, a, runID, token, step.stage, true, nil)
+		if got != step.expected {
+			t.Fatalf("after %s, next_state = %q, want %q", step.stage, got, step.expected)
+		}
+	}
+
+	// Re-read the run from the store to confirm the persisted end state.
+	run, err := a.Runs.Get(context.Background(), runID)
+	if err != nil {
+		t.Fatalf("Get run: %v", err)
+	}
+	if run.State != model.StateCompleted {
+		t.Fatalf("run.State = %q, want Completed", run.State)
+	}
+	if run.ReportPath == "" {
+		t.Fatalf("run.ReportPath not set")
+	}
+
+	// Phase 5 assertions: an HTML report artifact exists on disk, and
+	// the capture notifier saw a RunCompleted event.
+	arts, err := a.Artifacts.ListForRun(context.Background(), runID)
+	if err != nil {
+		t.Fatalf("ListForRun: %v", err)
+	}
+	// If several report_html rows exist, the last one listed wins.
+	var htmlPath string
+	for _, art := range arts {
+		if art.Kind == "report_html" {
+			htmlPath = art.Path
+		}
+	}
+	if htmlPath == "" {
+		t.Fatalf("no report_html artifact recorded (kinds seen: %v)", artifactKinds(arts))
+	}
+	data, err := os.ReadFile(htmlPath)
+	if err != nil {
+		t.Fatalf("read report.html: %v", err)
+	}
+	if !strings.Contains(string(data), "<html") {
+		t.Fatalf("report.html missing <html tag: %s", string(data[:min(200, len(data))]))
+	}
+	// awaitKind polls for up to two seconds before failing the test.
+	ev := capture.awaitKind(t, notify.KindRunCompleted)
+	if ev.HostName != "smoke-host" {
+		t.Errorf("RunCompleted host = %q, want smoke-host", ev.HostName)
+	}
+	if ev.URL == "" || !strings.Contains(ev.URL, "/reports/") {
+		t.Errorf("RunCompleted URL = %q, want non-empty with /reports/", ev.URL)
+	}
+}
+
+// artifactKinds returns the Kind of every artifact in arts, in order.
+// It exists to make test failure messages readable.
+func artifactKinds(arts []store.Artifact) []string {
+	kinds := make([]string, len(arts))
+	for i := range arts {
+		kinds[i] = arts[i].Kind
+	}
+	return kinds
+}
+
+// min returns the smaller of a and b.
+func min(a, b int) int {
+	if b < a {
+		return b
+	}
+	return a
+}
+
+// TestFaultInjectionSMART reports a passing Inventory stage followed by
+// a failing SMART stage and checks that the run lands in FailedHolding
+// with FailedStage set to SMART, and that a critical StageFailed
+// notification mentioning SMART is delivered.
+func TestFaultInjectionSMART(t *testing.T) {
+	a, runID, token := fullAgent(t)
+	ctx := context.Background()
+
+	capture := &captureNotifier{name: "capture"}
+	a.Notify = newCaptureRegistry(capture)
+	if err := a.Runs.SetState(ctx, runID, model.StateInventoryCheck); err != nil {
+		t.Fatalf("set state: %v", err)
+	}
+
+	inventory := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
+	afterInventory := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inventory})
+	if afterInventory != "SMART" {
+		t.Fatalf("after Inventory, next = %q want SMART", afterInventory)
+	}
+
+	// Inject the fault: SMART reports passed=false, which must park the run.
+	afterSMART := walkStage(t, a, runID, token, "SMART", false, nil)
+	if afterSMART != "FailedHolding" {
+		t.Fatalf("after SMART fail, next = %q want FailedHolding", afterSMART)
+	}
+
+	run, err := a.Runs.Get(ctx, runID)
+	if err != nil {
+		t.Fatalf("Get run: %v", err)
+	}
+	if run.State != model.StateFailedHolding {
+		t.Fatalf("run.State = %q, want FailedHolding", run.State)
+	}
+	if run.FailedStage != "SMART" {
+		t.Fatalf("run.FailedStage = %q, want SMART", run.FailedStage)
+	}
+
+	// Phase 5 assertion: the failure must have produced a notification.
+	failed := capture.awaitKind(t, notify.KindStageFailed)
+	if !strings.Contains(failed.Title, "SMART") {
+		t.Errorf("StageFailed title = %q, want to mention SMART", failed.Title)
+	}
+	if failed.Severity != notify.SeverityCritical {
+		t.Errorf("StageFailed severity = %q, want critical", failed.Severity)
+	}
+}
@@ -0,0 +1,69 @@
+package api
+
+import (
+	"context"
+	"log"
+
+	"vetting/internal/model"
+	"vetting/internal/store"
+	"vetting/internal/web/templates"
+)
+
+// TileEnricher builds a fully-populated TileData for a host. It looks
+// up the latest run's spec-diff count and hold-key artifact path so the
+// tile can render the "n critical diffs" badge and the ssh invocation
+// without the template package needing DB access.
+//
+// Used by both the Dashboard handler (initial render) and the SSE tile-
+// refresh path (agent_handlers.Hold, orchestrator runner) so every
+// place that renders a tile shows the same data.
+//
+// Each store is optional: Build skips the spec-diff and artifact
+// lookups when the matching field is nil, and BuildByHost skips the
+// latest-run lookup when Runs is nil.
+type TileEnricher struct {
+	Runs      *store.Runs
+	Artifacts *store.Artifacts
+	SpecDiffs *store.SpecDiffs
+}
+
+// Build assembles the TileData for host and its latest run. With a nil
+// latest it returns just the host. Otherwise it counts the run's
+// non-ignored critical spec diffs and records the path of the run's
+// hold_key artifact (the last one listed, if there are several).
+//
+// Lookups fail soft: a store error is logged and the corresponding
+// field is left at its zero value, so one bad query cannot break the
+// whole dashboard.
+func (e *TileEnricher) Build(ctx context.Context, host model.Host, latest *model.Run) templates.TileData {
+	tile := templates.TileData{Host: host, Latest: latest}
+	if latest == nil {
+		return tile
+	}
+
+	if e.SpecDiffs != nil {
+		diffs, err := e.SpecDiffs.ListForRun(ctx, latest.ID)
+		if err != nil {
+			log.Printf("tile: list spec_diffs run %d: %v", latest.ID, err)
+		} else {
+			for _, diff := range diffs {
+				if diff.Ignored || diff.Severity != "critical" {
+					continue
+				}
+				tile.SpecDiffCritical++
+			}
+		}
+	}
+
+	if e.Artifacts != nil {
+		artifacts, err := e.Artifacts.ListForRun(ctx, latest.ID)
+		if err != nil {
+			log.Printf("tile: list artifacts run %d: %v", latest.ID, err)
+		} else {
+			for _, artifact := range artifacts {
+				if artifact.Kind == "hold_key" {
+					tile.HoldKeyPath = artifact.Path
+				}
+			}
+		}
+	}
+	return tile
+}
+
+// BuildByHost looks up the latest run itself — convenient for SSE tile
+// publishers that only know the host ID — and then delegates to Build.
+//
+// Like Build it fails soft: when Runs is nil or the lookup errors, the
+// tile is built with a nil latest run. The error is logged rather than
+// silently dropped so a broken store is visible to operators, matching
+// how Build reports its own lookup failures.
+func (e *TileEnricher) BuildByHost(ctx context.Context, host model.Host) templates.TileData {
+	var latest *model.Run
+	if e.Runs != nil {
+		r, err := e.Runs.LatestForHost(ctx, host.ID)
+		if err != nil {
+			log.Printf("tile: latest run host %d: %v", host.ID, err)
+		} else {
+			latest = r
+		}
+	}
+	return e.Build(ctx, host, latest)
+}
@@ -0,0 +1,295 @@
+package api
+
+import (
+	"errors"
+	"log"
+	"net/http"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/go-chi/chi/v5"
+	"gopkg.in/yaml.v3"
+
+	"vetting/internal/auth"
+	"vetting/internal/events"
+	"vetting/internal/model"
+	"vetting/internal/orchestrator"
+	"vetting/internal/store"
+	"vetting/internal/web/templates"
+)
+
+// UI collects the collaborators used by the browser-facing HTTP
+// handlers in this file: the dashboard, login/logout, host
+// registration and deletion, run start, the storage wipe override, the
+// SSE stream and the HTML report download.
+type UI struct {
+	Hosts     *store.Hosts
+	Runs      *store.Runs
+	Artifacts *store.Artifacts
+	Auth      *auth.Manager
+	EventHub  *events.Hub
+	Runner    *orchestrator.Runner
+	Tiles     *TileEnricher
+}
+
+// macRe matches a MAC address written as six colon-separated pairs of
+// lowercase hex digits. CreateHost lowercases the submitted value
+// before validateHostForm applies this pattern.
+var macRe = regexp.MustCompile(`^[0-9a-f]{2}(:[0-9a-f]{2}){5}$`)
+
+// Dashboard renders one tile per registered host, each enriched with
+// that host's latest run. Any store error while listing hosts or
+// looking up a latest run aborts the page with a 500.
+func (u *UI) Dashboard(w http.ResponseWriter, r *http.Request) {
+	ctx := r.Context()
+
+	hosts, err := u.Hosts.List(ctx)
+	if err != nil {
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	tiles := make([]templates.TileData, 0, len(hosts))
+	for _, host := range hosts {
+		latest, lookupErr := u.Runs.LatestForHost(ctx, host.ID)
+		if lookupErr != nil {
+			http.Error(w, lookupErr.Error(), http.StatusInternalServerError)
+			return
+		}
+		tiles = append(tiles, u.Tiles.Build(ctx, host, latest))
+	}
+
+	// A render error is deliberately ignored here.
+	_ = templates.Dashboard(tiles).Render(ctx, w)
+}
+
+// StartRun creates a new Run for the host, issues an agent token, and
+// transitions Registered→Queued. The dispatcher goroutine picks it up
+// and fires WoL.
+//
+// Responses: 400 for a malformed host id, 404 for an unknown host, 409
+// when the host's latest run is still active, 500 on store or token
+// errors, and a 303 redirect to "/" on success.
+func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
+	idStr := chi.URLParam(r, "id")
+	hostID, err := strconv.ParseInt(idStr, 10, 64)
+	if err != nil {
+		http.Error(w, "bad host id", http.StatusBadRequest)
+		return
+	}
+	if _, err := u.Hosts.Get(r.Context(), hostID); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			http.NotFound(w, r)
+			return
+		}
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	// Guard: refuse to start a second run while one is still active.
+	// The guard fails closed: if the latest run cannot be determined we
+	// must not create another one, otherwise a transient store error
+	// would let two runs race on the same host. A not-found result is
+	// tolerated and treated the same as "no previous run".
+	latest, err := u.Runs.LatestForHost(r.Context(), hostID)
+	if err != nil && !errors.Is(err, store.ErrNotFound) {
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+		return
+	}
+	if err == nil && latest != nil {
+		switch latest.State {
+		case model.StateCompleted, model.StateReleased, model.StateFailedHolding:
+			// ok to start fresh
+		default:
+			http.Error(w, "host already has an active run", http.StatusConflict)
+			return
+		}
+	}
+
+	// Only the token hash is stored with the run; the plaintext half is
+	// discarded here.
+	_, hash, err := orchestrator.IssueRunToken()
+	if err != nil {
+		http.Error(w, "token: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	runID, err := u.Runs.Create(r.Context(), hostID, hash)
+	if err != nil {
+		http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	log.Printf("ui: created run %d for host %d (state=Queued)", runID, hostID)
+	http.Redirect(w, r, "/", http.StatusSeeOther)
+}
+
+// LoginForm renders the login page with no error message. The "next"
+// query parameter is passed through to the template, defaulting to "/"
+// when absent or empty.
+func (u *UI) LoginForm(w http.ResponseWriter, r *http.Request) {
+	next := "/"
+	if requested := r.URL.Query().Get("next"); requested != "" {
+		next = requested
+	}
+	_ = templates.Login("", next).Render(r.Context(), w)
+}
+
+// LoginSubmit handles the login form POST. A wrong password re-renders
+// the form with a 401; a correct one issues a session cookie and
+// redirects (303) to the posted "next" target.
+//
+// "next" is attacker-controllable, so only same-site absolute paths
+// are honoured; anything else falls back to "/".
+func (u *UI) LoginSubmit(w http.ResponseWriter, r *http.Request) {
+	if err := r.ParseForm(); err != nil {
+		http.Error(w, "bad form", http.StatusBadRequest)
+		return
+	}
+	password := r.PostForm.Get("password")
+	next := r.PostForm.Get("next")
+	// A bare "/" prefix check is not enough: browsers treat "//host" and
+	// "/\host" as protocol-relative URLs, which would turn this handler
+	// into an open redirect to another origin.
+	if !strings.HasPrefix(next, "/") || strings.HasPrefix(next, "//") || strings.HasPrefix(next, `/\`) {
+		next = "/"
+	}
+	if !u.Auth.VerifyPassword(password) {
+		w.WriteHeader(http.StatusUnauthorized)
+		_ = templates.Login("Invalid password.", next).Render(r.Context(), w)
+		return
+	}
+	u.Auth.Issue(w, r)
+	http.Redirect(w, r, next, http.StatusSeeOther)
+}
+
+func (u *UI) Logout(w http.ResponseWriter, r *http.Request) {
+	u.Auth.Clear(w)
+	http.Redirect(w, r, "/login", http.StatusSeeOther)
+}
+
+func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) {
+	_ = templates.Registration(templates.RegistrationForm{}).Render(r.Context(), w)
+}
+
+// CreateHost handles the registration form POST. It normalises the
+// submitted fields (trimming whitespace, lowercasing the MAC),
+// validates them, and stores the host. Validation failures re-render
+// the form with a 400; a store error re-renders it with a 409 and a
+// friendlier message where one is known. On success it redirects (303)
+// to "/".
+func (u *UI) CreateHost(w http.ResponseWriter, r *http.Request) {
+	if err := r.ParseForm(); err != nil {
+		http.Error(w, "bad form", http.StatusBadRequest)
+		return
+	}
+
+	// trimmed reads one posted field with surrounding whitespace removed.
+	trimmed := func(field string) string {
+		return strings.TrimSpace(r.PostForm.Get(field))
+	}
+	// The port and the spec YAML are taken verbatim, untrimmed.
+	form := templates.RegistrationForm{
+		Name:             trimmed("name"),
+		MAC:              strings.ToLower(trimmed("mac")),
+		WoLBroadcastIP:   trimmed("wol_broadcast_ip"),
+		WoLPort:          r.PostForm.Get("wol_port"),
+		ExpectedSpecYAML: r.PostForm.Get("expected_spec_yaml"),
+		Notes:            trimmed("notes"),
+	}
+
+	// rerender shows the form again with an error message and status.
+	rerender := func(status int, message string) {
+		form.Error = message
+		w.WriteHeader(status)
+		_ = templates.Registration(form).Render(r.Context(), w)
+	}
+
+	if problem := validateHostForm(&form); problem != "" {
+		rerender(http.StatusBadRequest, problem)
+		return
+	}
+
+	// An empty (or zero) port falls back to 9.
+	port, _ := strconv.Atoi(form.WoLPort)
+	if port == 0 {
+		port = 9
+	}
+
+	host := model.Host{
+		Name:             form.Name,
+		MAC:              form.MAC,
+		WoLBroadcastIP:   form.WoLBroadcastIP,
+		WoLPort:          port,
+		ExpectedSpecYAML: form.ExpectedSpecYAML,
+		Notes:            form.Notes,
+	}
+	if _, err := u.Hosts.Create(r.Context(), host); err != nil {
+		rerender(http.StatusConflict, friendlyDBError(err))
+		return
+	}
+	http.Redirect(w, r, "/", http.StatusSeeOther)
+}
+
+// OverrideWipeStorage is the operator's explicit "yes, wipe the disk
+// even though we found filesystem signatures" button. It is accepted
+// only when the host's latest run is FailedHolding with
+// failed_stage=Storage; the agent's next heartbeat will then receive
+// retry_stage with wipe=true and re-enter the Storage stage bypassing
+// the wipe-probe guard.
+//
+// Responses: 400 for a malformed host id, 409 when there is no run or
+// the run is not holding on Storage, 500 on store or runner errors, and
+// a 303 redirect to "/" on success.
+func (u *UI) OverrideWipeStorage(w http.ResponseWriter, r *http.Request) {
+	hostID, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
+	if err != nil {
+		http.Error(w, "bad host id", http.StatusBadRequest)
+		return
+	}
+
+	ctx := r.Context()
+	latest, err := u.Runs.LatestForHost(ctx, hostID)
+	switch {
+	case err != nil:
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+		return
+	case latest == nil:
+		http.Error(w, "no run for host", http.StatusConflict)
+		return
+	case latest.State != model.StateFailedHolding || latest.FailedStage != "Storage":
+		http.Error(w, "override only valid when holding on Storage", http.StatusConflict)
+		return
+	}
+
+	if _, err := u.Runner.Override(ctx, latest.ID, `{"wipe":true}`); err != nil {
+		http.Error(w, "override: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	http.Redirect(w, r, "/", http.StatusSeeOther)
+}
+
+// DeleteHost removes the host named by the "id" URL parameter.
+// Responses: 400 for a malformed id, 404 when the store reports the
+// host as not found, 500 on any other store error, and a 303 redirect
+// to "/" on success.
+func (u *UI) DeleteHost(w http.ResponseWriter, r *http.Request) {
+	hostID, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
+	if err != nil {
+		http.Error(w, "bad id", http.StatusBadRequest)
+		return
+	}
+
+	err = u.Hosts.Delete(r.Context(), hostID)
+	switch {
+	case err == nil:
+		http.Redirect(w, r, "/", http.StatusSeeOther)
+	case errors.Is(err, store.ErrNotFound):
+		http.NotFound(w, r)
+	default:
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+	}
+}
+
+// SSE hands the request over to the event hub's server-sent-events
+// endpoint.
+func (u *UI) SSE(w http.ResponseWriter, r *http.Request) {
+	hub := u.EventHub
+	hub.ServeSSE(w, r)
+}
+
+// Report serves the HTML report artifact for a run. It looks up the
+// run's artifact rows, takes the path of the report_html row (the last
+// one listed if there are several), and streams that file back as
+// text/html. Responses: 400 for a malformed run id, 500 on a store
+// error, 404 when the run has no report_html artifact yet.
+//
+// NOTE(review): the path is served exactly as stored in the artifact
+// row. This handler does NOT check that it lives under the artifacts
+// directory, so it relies on artifact rows only ever being written by
+// trusted code. Add a containment check here if that ever changes.
+func (u *UI) Report(w http.ResponseWriter, r *http.Request) {
+	idStr := chi.URLParam(r, "runID")
+	runID, err := strconv.ParseInt(idStr, 10, 64)
+	if err != nil {
+		http.Error(w, "bad run id", http.StatusBadRequest)
+		return
+	}
+	arts, err := u.Artifacts.ListForRun(r.Context(), runID)
+	if err != nil {
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+		return
+	}
+	var path string
+	for _, a := range arts {
+		if a.Kind == "report_html" {
+			path = a.Path
+		}
+	}
+	if path == "" {
+		http.NotFound(w, r)
+		return
+	}
+	// Set the type up front; ServeFile keeps a Content-Type that is
+	// already present instead of sniffing one.
+	w.Header().Set("Content-Type", "text/html; charset=utf-8")
+	http.ServeFile(w, r, path)
+}
+
+// validateHostForm checks a registration form and returns a
+// user-facing message for the first problem found, or "" when the form
+// is acceptable. Checks run in this order: name present, MAC matches
+// macRe, broadcast IP present, spec YAML present and parseable, and —
+// only when a port was supplied — port is an integer in 1–65535.
+func validateHostForm(form *templates.RegistrationForm) string {
+	switch {
+	case form.Name == "":
+		return "Name is required."
+	case !macRe.MatchString(form.MAC):
+		return "MAC address must be in the form aa:bb:cc:dd:ee:ff."
+	case form.WoLBroadcastIP == "":
+		return "WoL broadcast IP is required."
+	case form.ExpectedSpecYAML == "":
+		return "Expected spec YAML is required."
+	}
+
+	// Only well-formedness is checked; the decoded value is thrown away.
+	var parsed any
+	if err := yaml.Unmarshal([]byte(form.ExpectedSpecYAML), &parsed); err != nil {
+		return "Expected spec YAML is not valid YAML: " + err.Error()
+	}
+
+	if form.WoLPort == "" {
+		return ""
+	}
+	if port, err := strconv.Atoi(form.WoLPort); err != nil || port < 1 || port > 65535 {
+		return "WoL port must be 1–65535."
+	}
+	return ""
+}
+
+// friendlyDBError turns a store error into text suitable for the
+// registration form. Unique-constraint violations on hosts.name and
+// hosts.mac get a readable sentence; any other error is returned as its
+// raw message.
+func friendlyDBError(err error) string {
+	msg := err.Error()
+	if strings.Contains(msg, "UNIQUE constraint failed: hosts.name") {
+		return "A host with that name already exists."
+	}
+	if strings.Contains(msg, "UNIQUE constraint failed: hosts.mac") {
+		return "A host with that MAC already exists."
+	}
+	return msg
+}
@@ -0,0 +1,64 @@
+package auth
+
+import (
+	"net/http"
+	"net/url"
+)
+
+// RequireSession is middleware that lets a request through to next
+// only when it carries a valid session cookie. Unauthenticated clients
+// that accept HTML are redirected (303) to /login with the original
+// request URI in the "next" query parameter; all others get a plain
+// 401.
+func (m *Manager) RequireSession(next http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if err := m.Validate(r); err != nil {
+			if acceptsHTML(r) {
+				// The request URI may itself contain '?', '&' or '#'. It must be
+				// query-escaped, otherwise everything after the first '&' is
+				// parsed as separate parameters of /login and the return
+				// target is truncated.
+				target := "/login?next=" + url.QueryEscape(r.URL.RequestURI())
+				http.Redirect(w, r, target, http.StatusSeeOther)
+				return
+			}
+			http.Error(w, "unauthorized", http.StatusUnauthorized)
+			return
+		}
+		next.ServeHTTP(w, r)
+	})
+}
+
+// acceptsHTML reports whether the request's Accept header allows an
+// HTML response: true when the header is missing or empty, or when one
+// of its media types (parameters stripped) is exactly "text/html" or
+// "*/*".
+func acceptsHTML(r *http.Request) bool {
+	header := r.Header.Get("Accept")
+	if header == "" {
+		return true
+	}
+	for _, mediaType := range splitComma(header) {
+		switch mediaType {
+		case "text/html", "*/*":
+			return true
+		}
+	}
+	return false
+}
+
+// splitComma splits a comma-separated header value into its items,
+// trimming spaces and tabs around each one. Everything from a ';' up
+// to the next comma (the item's parameters, e.g. ";q=0.8") is dropped.
+// A trailing empty item is not emitted.
+func splitComma(s string) []string {
+	var items []string
+	begin := 0
+	for pos := 0; pos < len(s); pos++ {
+		switch s[pos] {
+		case ',':
+			items = append(items, trimSpace(s[begin:pos]))
+			begin = pos + 1
+		case ';':
+			items = append(items, trimSpace(s[begin:pos]))
+			// Skip the parameters; the comma that ends them is consumed too.
+			for pos < len(s) && s[pos] != ',' {
+				pos++
+			}
+			begin = pos + 1
+		}
+	}
+	if begin < len(s) {
+		items = append(items, trimSpace(s[begin:]))
+	}
+	return items
+}
+
+// trimSpace strips leading and trailing spaces and tabs from s. No
+// other whitespace (such as newlines) is removed.
+func trimSpace(s string) string {
+	lo, hi := 0, len(s)
+	for lo < hi && (s[lo] == ' ' || s[lo] == '\t') {
+		lo++
+	}
+	for hi > lo && (s[hi-1] == ' ' || s[hi-1] == '\t') {
+		hi--
+	}
+	return s[lo:hi]
+}
@@ -0,0 +1,100 @@
+package auth
+
+import (
+	"crypto/hmac"
+	"crypto/sha256"
+	"encoding/base64"
+	"errors"
+	"fmt"
+	"net/http"
+	"strconv"
+	"strings"
+	"time"
+
+	"golang.org/x/crypto/bcrypt"
+)
+
+// cookieName is the name of the session cookie written by Issue,
+// expired by Clear and read by Validate.
+const cookieName = "vetting_session"
+
+// Manager checks the admin password and issues and validates signed
+// session cookies.
+//
+// PasswordHash is the bcrypt hash VerifyPassword compares against; an
+// empty hash rejects every password. Secret is the HMAC-SHA256 key used
+// to sign cookie payloads. TTL is how long a cookie written by Issue
+// stays valid.
+type Manager struct {
+	PasswordHash string
+	Secret       []byte
+	TTL          time.Duration
+}
+
+// VerifyPassword reports whether password matches the configured
+// bcrypt hash. With no hash configured it always returns false.
+func (m *Manager) VerifyPassword(password string) bool {
+	if m.PasswordHash == "" {
+		return false
+	}
+	err := bcrypt.CompareHashAndPassword([]byte(m.PasswordHash), []byte(password))
+	return err == nil
+}
+
+// Issue sets a signed session cookie that expires m.TTL from now. The
+// cookie value is "<unix-expiry>.<signature>", where the signature
+// comes from sign over the expiry string. The cookie is HttpOnly,
+// SameSite=Lax, and marked Secure only when the request arrived over
+// TLS.
+func (m *Manager) Issue(w http.ResponseWriter, r *http.Request) {
+	expiresAt := time.Now().Add(m.TTL).Unix()
+	payload := strconv.FormatInt(expiresAt, 10)
+
+	cookie := &http.Cookie{
+		Name:     cookieName,
+		Value:    payload + "." + m.sign(payload),
+		Path:     "/",
+		HttpOnly: true,
+		Secure:   r.TLS != nil,
+		SameSite: http.SameSiteLaxMode,
+		Expires:  time.Unix(expiresAt, 0),
+	}
+	http.SetCookie(w, cookie)
+}
+
+// Clear overwrites the session cookie with an empty value and a
+// negative MaxAge, which tells the browser to delete it.
+func (m *Manager) Clear(w http.ResponseWriter) {
+	expired := http.Cookie{
+		Name:     cookieName,
+		Value:    "",
+		Path:     "/",
+		HttpOnly: true,
+		MaxAge:   -1,
+	}
+	http.SetCookie(w, &expired)
+}
+
+// errInvalidSession is the single error Validate returns for every
+// failure, so callers cannot tell which check rejected the cookie.
+var errInvalidSession = errors.New("invalid session")
+
+// Validate returns nil when the request carries a session cookie of
+// the form "<unix-expiry>.<signature>" whose signature matches and
+// whose expiry lies in the future; otherwise it returns
+// errInvalidSession.
+func (m *Manager) Validate(r *http.Request) error {
+	cookie, err := r.Cookie(cookieName)
+	if err != nil {
+		return errInvalidSession
+	}
+
+	payload, signature, found := strings.Cut(cookie.Value, ".")
+	if !found {
+		return errInvalidSession
+	}
+
+	// Check the signature before looking at the payload; hmac.Equal
+	// compares in constant time.
+	if want := m.sign(payload); !hmac.Equal([]byte(signature), []byte(want)) {
+		return errInvalidSession
+	}
+
+	expiresAt, err := strconv.ParseInt(payload, 10, 64)
+	if err != nil || time.Now().Unix() >= expiresAt {
+		return errInvalidSession
+	}
+	return nil
+}
+
+// sign returns the HMAC-SHA256 of payload keyed with m.Secret, encoded
+// as unpadded URL-safe base64.
+func (m *Manager) sign(payload string) string {
+	h := hmac.New(sha256.New, m.Secret)
+	_, _ = h.Write([]byte(payload))
+	digest := h.Sum(nil)
+	return base64.RawURLEncoding.EncodeToString(digest)
+}
+
+// BcryptHash hashes password with bcrypt at the default cost and
+// returns the hash as a string. It is a helper for the
+// gen-admin-password tool.
+func BcryptHash(password string) (string, error) {
+	hashed, err := bcrypt.GenerateFromPassword([]byte(password), bcrypt.DefaultCost)
+	if err == nil {
+		return string(hashed), nil
+	}
+	return "", fmt.Errorf("bcrypt: %w", err)
+}
@@ -0,0 +1,142 @@
+package config
+
+import (
+	"encoding/hex"
+	"fmt"
+	"os"
+
+	"gopkg.in/yaml.v3"
+)
+
+// Config is the root of the YAML configuration file decoded by Load.
+// Each field maps to the top-level key named in its yaml tag.
+type Config struct {
+	Server     Server     `yaml:"server"`
+	Database   Database   `yaml:"database"`
+	Artifacts  Artifacts  `yaml:"artifacts"`
+	Logs       Logs       `yaml:"logs"`
+	Auth       Auth       `yaml:"auth"`
+	Dispatcher Dispatcher `yaml:"dispatcher"`
+	Janitor    Janitor    `yaml:"janitor"`
+	PXE        PXE        `yaml:"pxe"`
+	Network    Network    `yaml:"network"`
+	Notifiers  []Notifier `yaml:"notifiers"`
+	Routes     []Route    `yaml:"routes"`
+}
+
+// Server holds the HTTP listener settings. Load defaults Bind to
+// "127.0.0.1:8080" when it is empty.
+type Server struct {
+	Bind      string `yaml:"bind"`
+	PublicURL string `yaml:"public_url"` // user-visible base URL, e.g. https://vetting.lan:8443; used in notification click-throughs
+	TLS       TLS    `yaml:"tls"`
+}
+
+// TLS holds the server's TLS switch together with the certificate and
+// key file paths.
+type TLS struct {
+	Enabled  bool   `yaml:"enabled"`
+	CertFile string `yaml:"cert_file"`
+	KeyFile  string `yaml:"key_file"`
+}
+
+// Database locates the database file. Load defaults Path to
+// "./var/vetting.db" when it is empty.
+type Database struct {
+	Path string `yaml:"path"`
+}
+
+// Artifacts configures where run artifacts are kept and for how long.
+// Load defaults Dir to "./var/artifacts" when it is empty.
+type Artifacts struct {
+	Dir           string `yaml:"dir"`
+	RetentionDays int    `yaml:"retention_days"` // 0 = keep forever
+}
+
+// Logs configures where run logs are kept and for how long. Load
+// defaults Dir to "./var/logs" when it is empty.
+type Logs struct {
+	Dir           string `yaml:"dir"`
+	RetentionDays int    `yaml:"retention_days"` // 0 = keep forever
+}
+
+// Janitor configures the janitor's run interval. Load does not apply
+// the 60-minute default noted on the field, so it is presumably
+// applied by whatever consumes this value — TODO confirm.
+type Janitor struct {
+	IntervalMinutes int `yaml:"interval_minutes"` // 0 = 60
+}
+
+// Auth holds the admin login settings: the bcrypt hash of the admin
+// password, the hex-encoded session signing secret (decoded and
+// length-checked by SessionSecret), and the session lifetime in hours.
+// Load defaults SessionTTLHours to 24 when it is zero.
+type Auth struct {
+	AdminPasswordBcrypt string `yaml:"admin_password_bcrypt"`
+	SessionSecretHex    string `yaml:"session_secret_hex"`
+	SessionTTLHours     int    `yaml:"session_ttl_hours"`
+}
+
+func (a Auth) SessionSecret() ([]byte, error) {
+	b, err := hex.DecodeString(a.SessionSecretHex)
+	if err != nil {
+		return nil, fmt.Errorf("session_secret_hex: %w", err)
+	}
+	if len(b) < 32 {
+		return nil, fmt.Errorf("session_secret_hex must decode to at least 32 bytes, got %d", len(b))
+	}
+	return b, nil
+}
+
+// Dispatcher holds the concurrency cap for runs. Load defaults
+// MaxConcurrentRuns to 3 when it is zero.
+type Dispatcher struct {
+	MaxConcurrentRuns int `yaml:"max_concurrent_runs"`
+}
+
+// Network holds the iperf port setting. Load applies no default to it.
+type Network struct {
+	IperfPort int `yaml:"iperf_port"`
+}
+
+// PXE / Notifier / Route are declared up front so the config file is
+// forward-compatible across phases. Phase 1 does not act on these.
+
+// PXE holds the settings under the "pxe" key: an on/off switch, the
+// network interface and DHCP range, the orchestrator URL, and the
+// directories holding the boot files described on the fields below.
+type PXE struct {
+	Enabled         bool   `yaml:"enabled"`
+	Interface       string `yaml:"interface"`
+	DHCPRange       string `yaml:"dhcp_range"`
+	OrchestratorURL string `yaml:"orchestrator_url"`
+	TFTPRoot        string `yaml:"tftp_root"` // holds ipxe.efi + undionly.kpxe
+	LiveDir         string `yaml:"live_dir"`  // holds vmlinuz + initrd.img; served at /live
+}
+
+// Notifier is one entry of the "notifiers" list. Name is what a
+// Route's Notifier field refers to. Which of the remaining fields are
+// meaningful presumably depends on Type — TODO confirm against the
+// notifier implementations.
+type Notifier struct {
+	Name       string `yaml:"name"`
+	Type       string `yaml:"type"`
+	Topic      string `yaml:"topic,omitempty"`
+	Server     string `yaml:"server,omitempty"`
+	WebhookURL string `yaml:"webhook_url,omitempty"`
+	SMTP       SMTP   `yaml:"smtp,omitempty"`
+}
+
+// SMTP holds the mail settings nested under a notifier's "smtp" key:
+// server host and port, sender address, and recipient list.
+type SMTP struct {
+	Host string   `yaml:"host,omitempty"`
+	Port int      `yaml:"port,omitempty"`
+	From string   `yaml:"from,omitempty"`
+	To   []string `yaml:"to,omitempty"`
+}
+
+// Route is one entry of the "routes" list. It names a notifier plus
+// the event kinds and severities to match for it. How an empty match
+// list is treated is decided by the routing code, not here.
+type Route struct {
+	MatchKind     []string `yaml:"match_kind"`
+	MatchSeverity []string `yaml:"match_severity,omitempty"`
+	Notifier      string   `yaml:"notifier"`
+}
+
+func Load(path string) (*Config, error) {
+	b, err := os.ReadFile(path)
+	if err != nil {
+		return nil, fmt.Errorf("read config: %w", err)
+	}
+	var c Config
+	if err := yaml.Unmarshal(b, &c); err != nil {
+		return nil, fmt.Errorf("parse config: %w", err)
+	}
+	if c.Server.Bind == "" {
+		c.Server.Bind = "127.0.0.1:8080"
+	}
+	if c.Database.Path == "" {
+		c.Database.Path = "./var/vetting.db"
+	}
+	if c.Artifacts.Dir == "" {
+		c.Artifacts.Dir = "./var/artifacts"
+	}
+	if c.Logs.Dir == "" {
+		c.Logs.Dir = "./var/logs"
+	}
+	if c.Auth.SessionTTLHours == 0 {
+		c.Auth.SessionTTLHours = 24
+	}
+	if c.Dispatcher.MaxConcurrentRuns == 0 {
+		c.Dispatcher.MaxConcurrentRuns = 3
+	}
+	return &c, nil
+}
@@ -0,0 +1,83 @@
+package db
+
+import (
+	"database/sql"
+	"embed"
+	"fmt"
+	"io/fs"
+	"path/filepath"
+	"sort"
+	"strings"
+
+	_ "modernc.org/sqlite"
+)
+
+//go:embed migrations/*.sql
+var migrationsFS embed.FS
+
+// Open opens the SQLite database at path and brings its schema up to
+// date. The DSN turns on foreign keys, WAL journaling and a 5000 ms
+// busy timeout. The connection is pinged and then every embedded
+// migration is applied in filename order; if either step fails the
+// handle is closed and the error returned.
+func Open(path string) (*sql.DB, error) {
+	dsn := fmt.Sprintf("file:%s?_pragma=foreign_keys(1)&_pragma=journal_mode(WAL)&_pragma=busy_timeout(5000)", filepath.ToSlash(path))
+
+	conn, err := sql.Open("sqlite", dsn)
+	if err != nil {
+		return nil, fmt.Errorf("open sqlite: %w", err)
+	}
+
+	if pingErr := conn.Ping(); pingErr != nil {
+		_ = conn.Close()
+		return nil, fmt.Errorf("ping sqlite: %w", pingErr)
+	}
+
+	if migErr := migrate(conn); migErr != nil {
+		_ = conn.Close()
+		return nil, migErr
+	}
+	return conn, nil
+}
+
+// migrate applies every embedded migrations/*.sql file that is not yet
+// recorded in schema_migrations, in sorted filename order. The
+// bookkeeping table is created on first use.
+func migrate(db *sql.DB) error {
+	entries, err := fs.ReadDir(migrationsFS, "migrations")
+	if err != nil {
+		return fmt.Errorf("read migrations: %w", err)
+	}
+
+	files := make([]string, 0, len(entries))
+	for _, entry := range entries {
+		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".sql") {
+			continue
+		}
+		files = append(files, entry.Name())
+	}
+	sort.Strings(files)
+
+	_, err = db.Exec(`CREATE TABLE IF NOT EXISTS schema_migrations (name TEXT PRIMARY KEY, applied_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP)`)
+	if err != nil {
+		return fmt.Errorf("ensure schema_migrations: %w", err)
+	}
+
+	for _, file := range files {
+		if err := applyMigration(db, file); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// applyMigration runs one migration file and records it, both inside a
+// single transaction. A file already listed in schema_migrations is
+// skipped.
+func applyMigration(db *sql.DB, name string) error {
+	var seen int
+	row := db.QueryRow(`SELECT COUNT(1) FROM schema_migrations WHERE name = ?`, name)
+	if err := row.Scan(&seen); err != nil {
+		return fmt.Errorf("check migration %s: %w", name, err)
+	}
+	if seen > 0 {
+		return nil
+	}
+
+	script, err := migrationsFS.ReadFile("migrations/" + name)
+	if err != nil {
+		return fmt.Errorf("read migration %s: %w", name, err)
+	}
+
+	tx, err := db.Begin()
+	if err != nil {
+		return fmt.Errorf("begin migration %s: %w", name, err)
+	}
+	if _, err = tx.Exec(string(script)); err != nil {
+		_ = tx.Rollback()
+		return fmt.Errorf("apply migration %s: %w", name, err)
+	}
+	if _, err = tx.Exec(`INSERT INTO schema_migrations(name) VALUES(?)`, name); err != nil {
+		_ = tx.Rollback()
+		return fmt.Errorf("record migration %s: %w", name, err)
+	}
+	if err = tx.Commit(); err != nil {
+		return fmt.Errorf("commit migration %s: %w", name, err)
+	}
+	return nil
+}
@@ -0,0 +1,93 @@
+-- Phase 1 schema covers the full Vetting domain so future phases
+-- only add data, never restructure.
+
+-- hosts: one row per host. name and mac are each UNIQUE; the PDU and
+-- IPMI JSON columns are nullable.
+CREATE TABLE IF NOT EXISTS hosts (
+    id                  INTEGER PRIMARY KEY AUTOINCREMENT,
+    name                TEXT NOT NULL UNIQUE,
+    mac                 TEXT NOT NULL UNIQUE,             -- lowercase colon form
+    wol_broadcast_ip    TEXT NOT NULL,
+    wol_port            INTEGER NOT NULL DEFAULT 9,
+    expected_spec_yaml  TEXT NOT NULL,
+    pdu_config_json     TEXT,
+    ipmi_config_json    TEXT,
+    notes               TEXT NOT NULL DEFAULT '',
+    created_at          TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    updated_at          TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
+-- runs: one row per run against a host; deleting the host cascades to
+-- its runs. Indexed by host_id and by state.
+CREATE TABLE IF NOT EXISTS runs (
+    id                  INTEGER PRIMARY KEY AUTOINCREMENT,
+    host_id             INTEGER NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
+    state               TEXT NOT NULL,
+    result              TEXT,                             -- pass|fail|null
+    failed_stage        TEXT,
+    next_boot_target    TEXT,                             -- linux|memtest|linux-post-memtest (Phase 2+)
+    agent_token_hash    TEXT NOT NULL,
+    started_at          TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    completed_at        TIMESTAMP,
+    report_path         TEXT,
+    hold_ip             TEXT,
+    override_flags_json TEXT
+);
+CREATE INDEX IF NOT EXISTS idx_runs_host ON runs(host_id);
+CREATE INDEX IF NOT EXISTS idx_runs_state ON runs(state);
+
+-- stages: the stages of a run, ordered by ordinal and deleted together
+-- with the run. Indexed on (run_id, ordinal).
+CREATE TABLE IF NOT EXISTS stages (
+    id            INTEGER PRIMARY KEY AUTOINCREMENT,
+    run_id        INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
+    name          TEXT NOT NULL,
+    ordinal       INTEGER NOT NULL,
+    state         TEXT NOT NULL,                          -- pending|running|passed|failed|skipped
+    started_at    TIMESTAMP,
+    completed_at  TIMESTAMP,
+    summary_json  TEXT
+);
+CREATE INDEX IF NOT EXISTS idx_stages_run_ordinal ON stages(run_id, ordinal);
+
+-- measurements: timestamped numeric samples for a run. Deleting the
+-- run deletes its samples; deleting a stage only nulls stage_id.
+CREATE TABLE IF NOT EXISTS measurements (
+    id       INTEGER PRIMARY KEY AUTOINCREMENT,
+    run_id   INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
+    stage_id INTEGER REFERENCES stages(id) ON DELETE SET NULL,
+    ts       TIMESTAMP NOT NULL,
+    kind     TEXT NOT NULL,                               -- temp|power|iperf|fio|smart_attr
+    key      TEXT NOT NULL,
+    value    REAL,
+    unit     TEXT
+);
+CREATE INDEX IF NOT EXISTS idx_measurements_run_kind_ts ON measurements(run_id, kind, ts);
+
+-- artifacts: files recorded for a run (path, sha256, size in bytes).
+-- Rows go away with the run; a deleted stage only nulls stage_id.
+CREATE TABLE IF NOT EXISTS artifacts (
+    id         INTEGER PRIMARY KEY AUTOINCREMENT,
+    run_id     INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
+    stage_id   INTEGER REFERENCES stages(id) ON DELETE SET NULL,
+    kind       TEXT NOT NULL,
+    path       TEXT NOT NULL,
+    sha256     TEXT NOT NULL,
+    size_bytes INTEGER NOT NULL
+);
+
+-- spec_diffs: expected-versus-actual differences recorded for a run,
+-- one row per field. ignored is an integer flag defaulting to 0.
+CREATE TABLE IF NOT EXISTS spec_diffs (
+    id       INTEGER PRIMARY KEY AUTOINCREMENT,
+    run_id   INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
+    field    TEXT NOT NULL,
+    expected TEXT,
+    actual   TEXT,
+    severity TEXT NOT NULL,                                -- critical|warning|info
+    ignored  INTEGER NOT NULL DEFAULT 0
+);
+
+-- events: timestamped event records. run_id and host_id are both
+-- nullable, so a row may reference a run, a host, both, or neither;
+-- rows are removed when the referenced run or host is deleted.
+CREATE TABLE IF NOT EXISTS events (
+    id        INTEGER PRIMARY KEY AUTOINCREMENT,
+    run_id    INTEGER REFERENCES runs(id) ON DELETE CASCADE,
+    host_id   INTEGER REFERENCES hosts(id) ON DELETE CASCADE,
+    ts        TIMESTAMP NOT NULL,
+    level     TEXT NOT NULL,
+    kind      TEXT NOT NULL,
+    message   TEXT NOT NULL,
+    data_json TEXT
+);
+
+-- settings: free-form key/value pairs, one row per key.
+CREATE TABLE IF NOT EXISTS settings (
+    key   TEXT PRIMARY KEY,
+    value TEXT NOT NULL
+);
@@ -0,0 +1,144 @@
+package events
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"sync"
+	"sync/atomic"
+	"time"
+)
+
+// Event is a typed event published on the internal bus. In Phase 1 the
+// payload is an already-rendered HTML fragment; later phases will wrap
+// structured run state in this same Event envelope.
+//
+// When written to a client, an empty Name omits the SSE "event:" line
+// and Payload is emitted as one "data:" line per line of text.
+type Event struct {
+	Name    string // SSE event name (e.g. "heartbeat", "tile-update", "log-line")
+	Payload string // pre-rendered HTML, ready to write as SSE data
+}
+
+// subscriber pairs a hub-assigned id with the buffered channel that
+// Publish delivers events on.
+type subscriber struct {
+	id int64
+	ch chan Event
+}
+
+// Hub is an in-process fan-out for SSE subscribers. Construct it with
+// NewHub, which also starts the heartbeat goroutine.
+type Hub struct {
+	mu        sync.RWMutex          // guards subs
+	nextID    int64                 // last id handed out; advanced atomically by Subscribe
+	subs      map[int64]*subscriber // live subscribers keyed by id
+	buffer    int                   // capacity of each subscriber channel
+	heartbeat time.Duration         // interval between heartbeat events
+}
+
+// NewHub returns a Hub with no subscribers, a 32-event buffer per
+// subscriber and a 15-second heartbeat, and starts the goroutine that
+// publishes the heartbeats.
+func NewHub() *Hub {
+	hub := new(Hub)
+	hub.subs = make(map[int64]*subscriber)
+	hub.buffer = 32
+	hub.heartbeat = 15 * time.Second
+
+	go hub.heartbeatLoop()
+	return hub
+}
+
+// Publish offers ev to every current subscriber without blocking. The
+// read lock is held for the whole fan-out; a subscriber whose buffer
+// is full simply misses the event.
+func (h *Hub) Publish(ev Event) {
+	h.mu.RLock()
+	defer h.mu.RUnlock()
+	for _, s := range h.subs {
+		select {
+		case s.ch <- ev:
+		default:
+			// Slow subscriber: drop the event rather than stall other clients.
+		}
+	}
+}
+
+func (h *Hub) Subscribe() (id int64, ch <-chan Event, cancel func()) {
+	id = atomic.AddInt64(&h.nextID, 1)
+	s := &subscriber{id: id, ch: make(chan Event, h.buffer)}
+	h.mu.Lock()
+	h.subs[id] = s
+	h.mu.Unlock()
+	return id, s.ch, func() {
+		h.mu.Lock()
+		delete(h.subs, id)
+		h.mu.Unlock()
+		close(s.ch)
+	}
+}
+
+// heartbeatLoop publishes a "heartbeat" event carrying the current
+// Unix time every h.heartbeat. It has no exit condition and runs for
+// the lifetime of the process.
+func (h *Hub) heartbeatLoop() {
+	ticker := time.NewTicker(h.heartbeat)
+	defer ticker.Stop()
+	for {
+		<-ticker.C
+		beat := Event{Name: "heartbeat"}
+		beat.Payload = fmt.Sprintf(`<span data-heartbeat="%d"></span>`, time.Now().Unix())
+		h.Publish(beat)
+	}
+}
+
+// ServeSSE streams hub events to a single HTTP client as server-sent
+// events until the request context is cancelled or the subscription
+// channel is closed. Each Event becomes one SSE message. It responds
+// with 500 when the ResponseWriter cannot flush.
+func (h *Hub) ServeSSE(w http.ResponseWriter, r *http.Request) {
+	flusher, ok := w.(http.Flusher)
+	if !ok {
+		http.Error(w, "streaming not supported", http.StatusInternalServerError)
+		return
+	}
+	w.Header().Set("Content-Type", "text/event-stream")
+	w.Header().Set("Cache-Control", "no-cache")
+	w.Header().Set("Connection", "keep-alive")
+	// NOTE(review): presumably asks a buffering reverse proxy (nginx)
+	// to pass the stream through unbuffered — confirm.
+	w.Header().Set("X-Accel-Buffering", "no")
+
+	_, eventsCh, cancel := h.Subscribe()
+	defer cancel()
+
+	// Greeting message, flushed right away so headers and the first
+	// bytes reach the client before any hub event arrives.
+	fmt.Fprintf(w, "event: hello\ndata: ok\n\n")
+	flusher.Flush()
+
+	ctx := r.Context()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case ev, ok := <-eventsCh:
+			if !ok {
+				return
+			}
+			// Flush per event so each message is delivered immediately.
+			writeSSE(w, ev)
+			flusher.Flush()
+		}
+	}
+}
+
+// writeSSE writes ev to w as one SSE message: an optional "event:"
+// line (skipped when Name is empty), one "data:" line per payload
+// line, and the terminating blank line. Write errors are not reported.
+func writeSSE(w http.ResponseWriter, ev Event) {
+	if name := ev.Name; name != "" {
+		fmt.Fprintf(w, "event: %s\n", name)
+	}
+	lines := splitLines(ev.Payload)
+	for i := range lines {
+		fmt.Fprintf(w, "data: %s\n", lines[i])
+	}
+	fmt.Fprint(w, "\n")
+}
+
+// splitLines breaks s into lines for use as SSE "data:" fields. LF, CR
+// and CRLF each count as one line terminator, matching how SSE clients
+// split a stream; leaving a bare CR inside a data line would let the
+// client end the field early and misparse the rest. The result always
+// has at least one element: an empty input yields a single empty line
+// and a trailing terminator yields a trailing empty line.
+func splitLines(s string) []string {
+	lines := make([]string, 0, 1)
+	start := 0
+	for i := 0; i < len(s); i++ {
+		c := s[i]
+		if c != '\n' && c != '\r' {
+			continue
+		}
+		lines = append(lines, s[start:i])
+		if c == '\r' && i+1 < len(s) && s[i+1] == '\n' {
+			i++ // CRLF is a single terminator
+		}
+		start = i + 1
+	}
+	return append(lines, s[start:])
+}
+
+// Shutdown is a no-op placeholder wired into graceful shutdown: it
+// ignores its context, always returns nil, and neither stops the
+// heartbeat goroutine started by NewHub nor closes subscriber channels.
+func (h *Hub) Shutdown(_ context.Context) error { return nil }
@@ -0,0 +1,65 @@
+// Package hold generates per-run ephemeral ed25519 keypairs for the
+// FailedHolding flow. When a run fails, the agent asks the orchestrator
+// for a pubkey, drops it into /root/.ssh/authorized_keys, and reports
+// its LAN IP. The orchestrator stores the private key next to the run's
+// artifacts and surfaces `ssh -i <path> root@<ip>` on the tile.
+package hold
+
+import (
+	"crypto/ed25519"
+	"crypto/rand"
+	"encoding/pem"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"golang.org/x/crypto/ssh"
+)
+
+// Keypair bundles the PEM-encoded private key and the
+// authorized_keys-style public key line. Both are produced together
+// by Issue; PrivatePEM is what WritePrivateTo persists.
+type Keypair struct {
+	PrivatePEM    []byte
+	AuthorizedKey string // "ssh-ed25519 AAAA... vetting-hold-N"
+}
+
+// Issue generates a fresh ed25519 keypair for the given run. Both the
+// authorized_keys line and the private key carry the comment
+// "vetting-hold-<runID>". Key-generation and encoding failures are
+// returned wrapped.
+func Issue(runID int64) (*Keypair, error) {
+	tag := fmt.Sprintf("vetting-hold-%d", runID)
+
+	pub, priv, err := ed25519.GenerateKey(rand.Reader)
+	if err != nil {
+		return nil, fmt.Errorf("generate ed25519: %w", err)
+	}
+
+	sshPub, err := ssh.NewPublicKey(pub)
+	if err != nil {
+		return nil, fmt.Errorf("ssh public key: %w", err)
+	}
+	// MarshalAuthorizedKey yields "ssh-ed25519 AAAA...\n"; drop the
+	// newline and make sure the line ends with our run tag.
+	authLine := strings.TrimRight(string(ssh.MarshalAuthorizedKey(sshPub)), "\n")
+	if suffix := " " + tag; !strings.HasSuffix(authLine, suffix) {
+		authLine += suffix
+	}
+
+	pemBlock, err := ssh.MarshalPrivateKey(priv, tag)
+	if err != nil {
+		return nil, fmt.Errorf("marshal private key: %w", err)
+	}
+
+	kp := &Keypair{
+		PrivatePEM:    pem.EncodeToMemory(pemBlock),
+		AuthorizedKey: authLine,
+	}
+	return kp, nil
+}
+
+// WritePrivateTo persists the PEM to the given path with 0600 perms
+// and returns the absolute path. The operator's shell reads this file
+// by path, so we keep it on disk per-run.
+//
+// Missing parent directories are created (0755). The mode is enforced
+// even when the file already exists: creation flags only apply the
+// mode to a newly created file, so the handle is chmod-ed explicitly
+// before any key material is written. If the path cannot be made
+// absolute, the path is returned as given.
+func (kp *Keypair) WritePrivateTo(path string) (string, error) {
+	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+		return "", err
+	}
+	f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o600)
+	if err != nil {
+		return "", fmt.Errorf("write hold key: %w", err)
+	}
+	// Tighten a pre-existing file before the secret lands in it.
+	if err := f.Chmod(0o600); err != nil {
+		_ = f.Close()
+		return "", fmt.Errorf("write hold key: %w", err)
+	}
+	if _, err := f.Write(kp.PrivatePEM); err != nil {
+		_ = f.Close()
+		return "", fmt.Errorf("write hold key: %w", err)
+	}
+	if err := f.Close(); err != nil {
+		return "", fmt.Errorf("write hold key: %w", err)
+	}
+	abs, err := filepath.Abs(path)
+	if err != nil {
+		return path, nil
+	}
+	return abs, nil
+}
@@ -0,0 +1,99 @@
+package hold
+
+import (
+	"bytes"
+	"crypto/ed25519"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"golang.org/x/crypto/ssh"
+)
+
+// TestIssueRoundTrip checks that the private key we write is parseable
+// with the standard openssh library and that the public key derived
+// from it matches the key type and base64 blob of the authorized_key
+// line we handed the agent (the trailing comment is compared
+// separately, since it is ours and not part of the key).
+// If this drifts — e.g. we swap from ed25519 to something else, or
+// mangle the comment — the operator's `ssh -i path root@ip` breaks
+// silently. The test is the only early-warning we have.
+func TestIssueRoundTrip(t *testing.T) {
+	kp, err := Issue(42)
+	if err != nil {
+		t.Fatalf("Issue: %v", err)
+	}
+
+	// Parse the private key back.
+	signer, err := ssh.ParsePrivateKey(kp.PrivatePEM)
+	if err != nil {
+		t.Fatalf("ParsePrivateKey: %v", err)
+	}
+
+	// The public derived from the signer must match the authorized_key line.
+	gotAuth := strings.TrimRight(string(ssh.MarshalAuthorizedKey(signer.PublicKey())), "\n")
+	wantAuth := kp.AuthorizedKey
+	// Authorized_keys comment is ours; compare just the type+b64 prefix.
+	gotParts := strings.SplitN(gotAuth, " ", 3)
+	wantParts := strings.SplitN(wantAuth, " ", 3)
+	if len(gotParts) < 2 || len(wantParts) < 2 {
+		t.Fatalf("unexpected authorized_key shape got=%q want=%q", gotAuth, wantAuth)
+	}
+	if gotParts[0] != wantParts[0] || gotParts[1] != wantParts[1] {
+		t.Fatalf("public key mismatch:\n  got  %s\n  want %s", gotAuth, wantAuth)
+	}
+	// The run id passed to Issue must show up in the comment.
+	if !strings.Contains(wantAuth, "vetting-hold-42") {
+		t.Fatalf("authorized_key line missing run tag: %q", wantAuth)
+	}
+}
+
+// TestIssueKeysAreEd25519 pins the algorithm — anything other than
+// ed25519 would surprise operators who've been told their hold key is
+// ed25519 (and would change key-file sizes, path handling, etc.).
+// It checks both the SSH key type string and the underlying crypto key.
+func TestIssueKeysAreEd25519(t *testing.T) {
+	kp, err := Issue(1)
+	if err != nil {
+		t.Fatalf("Issue: %v", err)
+	}
+	signer, err := ssh.ParsePrivateKey(kp.PrivatePEM)
+	if err != nil {
+		t.Fatalf("ParsePrivateKey: %v", err)
+	}
+	if got := signer.PublicKey().Type(); got != ssh.KeyAlgoED25519 {
+		t.Fatalf("key algorithm: got %s, want ssh-ed25519", got)
+	}
+	// Paranoia: the Ed25519 public key underneath should be 32 bytes.
+	edPub, ok := signer.PublicKey().(ssh.CryptoPublicKey)
+	if !ok {
+		t.Fatalf("public key does not expose CryptoPublicKey")
+	}
+	raw, ok := edPub.CryptoPublicKey().(ed25519.PublicKey)
+	if !ok {
+		t.Fatalf("public key is not ed25519.PublicKey")
+	}
+	if len(raw) != ed25519.PublicKeySize {
+		t.Fatalf("ed25519 pubkey size = %d, want %d", len(raw), ed25519.PublicKeySize)
+	}
+}
+
+// TestWritePrivateToSetsPerms writes a key into a not-yet-existing
+// nested directory and checks that the returned path is absolute and
+// that the bytes on disk equal the in-memory PEM.
+// NOTE(review): despite its name, this test never inspects the file
+// mode, so the 0600 requirement is not actually asserted here.
+func TestWritePrivateToSetsPerms(t *testing.T) {
+	kp, err := Issue(7)
+	if err != nil {
+		t.Fatalf("Issue: %v", err)
+	}
+	dir := t.TempDir()
+	// "nested" does not exist yet; WritePrivateTo must create it.
+	path := filepath.Join(dir, "nested", "hold.key")
+	abs, err := kp.WritePrivateTo(path)
+	if err != nil {
+		t.Fatalf("WritePrivateTo: %v", err)
+	}
+	if !filepath.IsAbs(abs) {
+		t.Fatalf("expected absolute path, got %q", abs)
+	}
+	buf, err := os.ReadFile(abs)
+	if err != nil {
+		t.Fatalf("ReadFile: %v", err)
+	}
+	if !bytes.Equal(buf, kp.PrivatePEM) {
+		t.Fatalf("on-disk bytes differ from in-memory PEM")
+	}
+}
@@ -0,0 +1,75 @@
+// Package httpserver assembles the chi router. It lives in its own
+// package because it depends on both `api` and `orchestrator`, and
+// those two packages must stay import-independent.
+package httpserver
+
+import (
+	"io/fs"
+	"net/http"
+
+	"github.com/go-chi/chi/v5"
+	"github.com/go-chi/chi/v5/middleware"
+
+	"vetting/internal/api"
+	"vetting/internal/auth"
+	"vetting/internal/web"
+)
+
+// Deps carries everything NewRouter needs: the session manager that
+// gates the browser UI, the UI and agent handler sets, and the
+// optional directory served under /live.
+type Deps struct {
+	Auth    *auth.Manager
+	UI      *api.UI
+	Agent   *api.Agent
+	LiveDir string // directory containing vmlinuz + initrd.img; "" disables /live
+}
+
+// NewRouter builds the chi router for the whole HTTP surface: embedded
+// static assets, the optional /live file server, the public login
+// routes, the agent/PXE routes, and the session-gated browser UI.
+// It panics if the embedded static sub-filesystem cannot be opened.
+func NewRouter(d Deps) http.Handler {
+	r := chi.NewRouter()
+	r.Use(middleware.RealIP)
+	r.Use(middleware.Recoverer)
+	r.Use(middleware.Logger)
+
+	staticFS, err := fs.Sub(web.Static, "static")
+	if err != nil {
+		panic(err)
+	}
+	r.Handle("/static/*", http.StripPrefix("/static/", http.FileServer(http.FS(staticFS))))
+
+	// /live is only mounted when a directory was configured; it is
+	// registered outside the session-gated group below.
+	if d.LiveDir != "" {
+		r.Handle("/live/*", http.StripPrefix("/live/", http.FileServer(http.Dir(d.LiveDir))))
+	}
+
+	// Public (no session required) endpoints.
+	r.Get("/login", d.UI.LoginForm)
+	r.Post("/login", d.UI.LoginSubmit)
+	r.Post("/logout", d.UI.Logout)
+
+	// Agent / PXE endpoints — authenticated per-request by bearer token
+	// or by the unforgeable MAC path parameter, never by the UI session.
+	// NOTE(review): nothing in this router enforces either check; it is
+	// presumably done inside the Agent handlers — confirm.
+	r.Get("/ipxe/{mac}", d.Agent.IPXEScript)
+	r.Route("/api/v1/runs/{id}", func(r chi.Router) {
+		r.Post("/hello", d.Agent.Hello)
+		r.Post("/claim", d.Agent.Claim)
+		r.Post("/heartbeat", d.Agent.Heartbeat)
+		r.Post("/log", d.Agent.Log)
+		r.Post("/result", d.Agent.Result)
+		r.Post("/hold", d.Agent.Hold)
+		r.Post("/sensor", d.Agent.Sensor)
+	})
+
+	// Session-gated browser UI.
+	r.Group(func(r chi.Router) {
+		r.Use(d.Auth.RequireSession)
+
+		r.Get("/", d.UI.Dashboard)
+		r.Get("/hosts/new", d.UI.NewHostForm)
+		r.Post("/hosts", d.UI.CreateHost)
+		r.Post("/hosts/{id}/delete", d.UI.DeleteHost)
+		r.Post("/hosts/{id}/start", d.UI.StartRun)
+		r.Post("/hosts/{id}/override-wipe", d.UI.OverrideWipeStorage)
+		r.Get("/reports/{runID}", d.UI.Report)
+
+		r.Get("/events", d.UI.SSE)
+	})
+
+	return r
+}
@@ -0,0 +1,33 @@
+package janitor
+
+import (
+	"context"
+	"time"
+
+	"vetting/internal/logs"
+	"vetting/internal/store"
+)
+
+// StoreAdapter bridges the concrete orchestrator stores to the Janitor's
+// dependency interface. Kept in the janitor package so the orchestrator
+// wire-up stays a single-line: janitor.New(cfg, &janitor.StoreAdapter{...}).
+// Runs and Artifacts are dereferenced unconditionally; only Logs may be nil.
+type StoreAdapter struct {
+	Runs      *store.Runs
+	Artifacts *store.Artifacts
+	Logs      *logs.Hub
+}
+
+// CompletedOlderThan delegates to Runs.CompletedOlderThan.
+func (a *StoreAdapter) CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error) {
+	return a.Runs.CompletedOlderThan(ctx, cutoff)
+}
+
+// DeleteArtifactsForRun delegates to Artifacts.DeleteForRun and passes
+// its result through unchanged.
+func (a *StoreAdapter) DeleteArtifactsForRun(ctx context.Context, runID int64) ([]store.Artifact, error) {
+	return a.Artifacts.DeleteForRun(ctx, runID)
+}
+
+// LogPathFor returns the log hub's path for the run, or "" when the
+// adapter was built without a log hub.
+func (a *StoreAdapter) LogPathFor(runID int64) string {
+	if hub := a.Logs; hub != nil {
+		return hub.PathFor(runID)
+	}
+	return ""
+}
@@ -0,0 +1,171 @@
+// Package janitor garbage-collects on-disk run data. A completed or
+// released run produces an HTML report, a JSON report, a log file, and
+// potentially several artifact blobs (fio output, iperf output, hold
+// pubkey, inventory JSON). None of these need to stay on disk
+// indefinitely — once the operator's looked at the report and closed
+// the tile, disk pressure is the only cost.
+//
+// The DB row for the run is kept (so historical counts and host
+// histories survive); only the on-disk files and their artifact rows
+// are pruned. The janitor ticks on a fixed interval and is safe to
+// run concurrently with live runs — it only touches runs in terminal
+// states past a cutoff, which by definition are not being written to.
+package janitor
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log"
+	"os"
+	"sync"
+	"time"
+
+	"vetting/internal/store"
+)
+
+// Config carries the retention knobs. Zero values mean "keep forever"
+// for that class of data; a zero Interval defaults to 1h.
+type Config struct {
+	ArtifactRetention time.Duration // <= 0 disables artifact pruning
+	LogRetention      time.Duration // <= 0 disables log pruning
+	Interval          time.Duration // time between sweeps; New replaces <= 0 with one hour
+}
+
+// Stores is the subset of the store layer the janitor needs. Defined as
+// an interface so tests can fake it without spinning up SQLite.
+type Stores interface {
+	// CompletedOlderThan returns the ids of the runs Sweep should
+	// process for the given cutoff.
+	CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error)
+	// DeleteArtifactsForRun returns the run's artifacts; the janitor
+	// unlinks each returned Path afterwards.
+	DeleteArtifactsForRun(ctx context.Context, runID int64) ([]store.Artifact, error)
+	// LogPathFor returns the run's log file path; "" means there is
+	// nothing to remove.
+	LogPathFor(runID int64) string
+}
+
+// Janitor owns the ticker goroutine. Repeated Start calls while running
+// and repeated Stop calls while stopped are no-ops; Stop waits for the
+// in-flight pass to finish so tests can assert post-state. A Janitor is
+// single-use: the stop channel is created once by New and closed by
+// Stop, so it cannot be restarted afterwards.
+type Janitor struct {
+	cfg     Config         // retention settings; Interval already defaulted by New
+	s       Stores         // store access used by Sweep and the clean helpers
+	stop    chan struct{}  // closed by Stop to end the loop
+	wg      sync.WaitGroup // tracks the loop goroutine
+	mu      sync.Mutex     // guards running
+	running bool           // true between Start and Stop
+}
+
+// New builds a stopped Janitor over the given stores. A non-positive
+// cfg.Interval is replaced with one hour.
+func New(cfg Config, s Stores) *Janitor {
+	j := &Janitor{s: s, stop: make(chan struct{})}
+	if cfg.Interval <= 0 {
+		cfg.Interval = time.Hour
+	}
+	j.cfg = cfg
+	return j
+}
+
+// Start launches the ticker goroutine; calling it while the janitor is
+// already running is a no-op. Retention zeros mean no cleanup is
+// needed; in that case the ticker still runs but each Sweep is a no-op.
+// The goroutine ends when ctx is cancelled or Stop is called.
+func (j *Janitor) Start(ctx context.Context) {
+	j.mu.Lock()
+	defer j.mu.Unlock()
+	if j.running {
+		return
+	}
+	j.running = true
+	// Register with the WaitGroup while still holding the lock: a Stop
+	// racing with Start must not reach wg.Wait before this Add, or it
+	// would return without waiting for the goroutine launched below.
+	j.wg.Add(1)
+	go j.loop(ctx)
+}
+
+// Stop signals the loop goroutine to exit and waits for it, so any
+// in-flight pass has finished when Stop returns. Calling Stop when the
+// janitor is not running is a no-op.
+//
+// NOTE(review): the stop channel is created once in New and closed
+// here, so a second Start/Stop cycle on the same Janitor would close
+// it twice.
+func (j *Janitor) Stop() {
+	j.mu.Lock()
+	if !j.running {
+		j.mu.Unlock()
+		return
+	}
+	j.running = false
+	close(j.stop) // wakes the select in loop
+	j.mu.Unlock()
+	// Wait without holding the lock.
+	j.wg.Wait()
+}
+
+// loop is the janitor goroutine: one sweep immediately, then one per
+// tick of cfg.Interval, until ctx is cancelled or Stop closes the stop
+// channel. Sweep errors are logged and never end the loop.
+func (j *Janitor) loop(ctx context.Context) {
+	defer j.wg.Done()
+
+	// Sweep right away so anything that aged out while the
+	// orchestrator was down is cleaned at startup.
+	if err := j.Sweep(ctx, time.Now().UTC()); err != nil {
+		log.Printf("janitor: initial sweep: %v", err)
+	}
+
+	ticker := time.NewTicker(j.cfg.Interval)
+	defer ticker.Stop()
+	for {
+		select {
+		case tick := <-ticker.C:
+			err := j.Sweep(ctx, tick.UTC())
+			if err != nil {
+				log.Printf("janitor: sweep: %v", err)
+			}
+		case <-j.stop:
+			return
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+// Sweep performs one cleanup pass as of now. It is exported so tests
+// can drive a single pass deterministically.
+//
+// Runs are selected with one query whose cutoff is now minus the
+// *longer* of the two retentions. Every returned run then has its
+// artifacts removed (when ArtifactRetention > 0) and its log removed
+// (when LogRetention > 0). Consequently, when both retentions are set
+// to different values, the class with the shorter retention is only
+// pruned once the longer one has elapsed. A retention <= 0 disables
+// pruning for that class; if both are disabled Sweep does nothing.
+//
+// Per-run cleanup failures are logged by the helpers and do not abort
+// the pass. Sweep returns an error only when listing runs fails or
+// when ctx is cancelled part-way through.
+func (j *Janitor) Sweep(ctx context.Context, now time.Time) error {
+	pruneArtifacts := j.cfg.ArtifactRetention > 0
+	pruneLogs := j.cfg.LogRetention > 0
+	if !pruneArtifacts && !pruneLogs {
+		return nil
+	}
+	cutoff := now.Add(-longer(j.cfg.ArtifactRetention, j.cfg.LogRetention))
+	runs, err := j.s.CompletedOlderThan(ctx, cutoff)
+	if err != nil {
+		return fmt.Errorf("list old runs: %w", err)
+	}
+	for _, runID := range runs {
+		// Stop early on shutdown instead of logging one failure per
+		// remaining run.
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if pruneArtifacts {
+			j.cleanArtifacts(ctx, runID)
+		}
+		if pruneLogs {
+			j.cleanLog(runID)
+		}
+	}
+	return nil
+}
+
+// cleanArtifacts deletes the run's artifact rows through the store and
+// then unlinks each returned file. Failures are logged only; a file
+// that is already gone is not treated as an error.
+func (j *Janitor) cleanArtifacts(ctx context.Context, runID int64) {
+	removed, err := j.s.DeleteArtifactsForRun(ctx, runID)
+	if err != nil {
+		log.Printf("janitor: delete artifacts for run %d: %v", runID, err)
+		return
+	}
+	for i := range removed {
+		path := removed[i].Path
+		if path == "" {
+			continue
+		}
+		rmErr := os.Remove(path)
+		if rmErr == nil || errors.Is(rmErr, os.ErrNotExist) {
+			continue
+		}
+		log.Printf("janitor: unlink %s: %v", path, rmErr)
+	}
+}
+
+// cleanLog unlinks the run's log file, if the store reports a path for
+// it. A missing file is fine; any other failure is logged.
+func (j *Janitor) cleanLog(runID int64) {
+	logPath := j.s.LogPathFor(runID)
+	if logPath == "" {
+		return
+	}
+	err := os.Remove(logPath)
+	if err == nil || errors.Is(err, os.ErrNotExist) {
+		return
+	}
+	log.Printf("janitor: unlink log %s: %v", logPath, err)
+}
+
+// longer returns the greater of the two durations.
+func longer(a, b time.Duration) time.Duration {
+	if b >= a {
+		return b
+	}
+	return a
+}
@@ -0,0 +1,133 @@
+package janitor
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"vetting/internal/store"
+)
+
+// fakeStores is a test double that records what the janitor asked for
+// and hands back canned runs/artifacts. It lets us verify both the
+// cleanup contract (files deleted, rows deleted) and that the janitor
+// honours a zero retention as a no-op.
+type fakeStores struct {
+	cutoffSeen    time.Time                  // cutoff from the most recent CompletedOlderThan call
+	runsOlder     []int64                    // returned for every cutoff
+	artifactsByID map[int64][]store.Artifact // canned artifacts per run
+	deleted       map[int64]bool             // runs passed to DeleteArtifactsForRun
+	logs          map[int64]string           // canned log path per run
+}
+
+// CompletedOlderThan records the cutoff and returns the canned run
+// list without filtering by it.
+func (f *fakeStores) CompletedOlderThan(_ context.Context, cutoff time.Time) ([]int64, error) {
+	f.cutoffSeen = cutoff
+	return f.runsOlder, nil
+}
+
+// DeleteArtifactsForRun marks the run as deleted and returns its
+// canned artifacts; the map is created lazily on first use.
+func (f *fakeStores) DeleteArtifactsForRun(_ context.Context, runID int64) ([]store.Artifact, error) {
+	if f.deleted == nil {
+		f.deleted = map[int64]bool{}
+	}
+	f.deleted[runID] = true
+	return f.artifactsByID[runID], nil
+}
+
+// LogPathFor returns the canned path, or "" for an unknown run.
+func (f *fakeStores) LogPathFor(runID int64) string { return f.logs[runID] }
+
+// writeTempFile creates dir/name containing a single byte and returns
+// its path, failing the test if the write does not succeed.
+func writeTempFile(t *testing.T, dir, name string) string {
+	t.Helper()
+	target := filepath.Join(dir, name)
+	err := os.WriteFile(target, []byte("x"), 0o644)
+	if err != nil {
+		t.Fatalf("write %s: %v", target, err)
+	}
+	return target
+}
+
+// TestSweepDeletesArtifactsAndLogs checks the happy path: with both
+// retentions enabled, a run returned by the store has its artifact
+// rows deleted and its artifact and log files unlinked. The fake
+// returns run 1 for any cutoff, so age selection is not exercised here.
+func TestSweepDeletesArtifactsAndLogs(t *testing.T) {
+	dir := t.TempDir()
+	p1 := writeTempFile(t, dir, "artifact-1.bin")
+	p2 := writeTempFile(t, dir, "artifact-2.json")
+	log1 := writeTempFile(t, dir, "run-1.log")
+
+	s := &fakeStores{
+		runsOlder: []int64{1},
+		artifactsByID: map[int64][]store.Artifact{
+			1: {{ID: 10, RunID: 1, Path: p1}, {ID: 11, RunID: 1, Path: p2}},
+		},
+		logs: map[int64]string{1: log1},
+	}
+	j := New(Config{
+		ArtifactRetention: 24 * time.Hour,
+		LogRetention:      24 * time.Hour,
+		Interval:          time.Minute,
+	}, s)
+	if err := j.Sweep(context.Background(), time.Now().UTC()); err != nil {
+		t.Fatalf("sweep: %v", err)
+	}
+	if !s.deleted[1] {
+		t.Fatalf("run 1 not passed to DeleteArtifactsForRun")
+	}
+	// All three files must be gone from disk.
+	for _, p := range []string{p1, p2, log1} {
+		if _, err := os.Stat(p); !os.IsNotExist(err) {
+			t.Errorf("file %s still exists (err=%v)", p, err)
+		}
+	}
+}
+
+// TestSweepIsNoopWhenRetentionsAreZero checks that a zero Config
+// disables cleanup entirely: no artifact rows are deleted and the file
+// (used as both artifact and log path) stays on disk.
+func TestSweepIsNoopWhenRetentionsAreZero(t *testing.T) {
+	dir := t.TempDir()
+	p := writeTempFile(t, dir, "keep.bin")
+	s := &fakeStores{
+		runsOlder: []int64{1},
+		artifactsByID: map[int64][]store.Artifact{
+			1: {{ID: 10, RunID: 1, Path: p}},
+		},
+		logs: map[int64]string{1: p},
+	}
+	j := New(Config{}, s) // all zero
+	if err := j.Sweep(context.Background(), time.Now().UTC()); err != nil {
+		t.Fatalf("sweep: %v", err)
+	}
+	if s.deleted[1] {
+		t.Fatalf("expected no deletion for zero retention")
+	}
+	if _, err := os.Stat(p); err != nil {
+		t.Fatalf("file should still exist: %v", err)
+	}
+}
+
+// TestSweepSkipsMissingFilesGracefully checks that artifact and log
+// paths that do not exist on disk neither fail the sweep nor stop the
+// run from being processed.
+func TestSweepSkipsMissingFilesGracefully(t *testing.T) {
+	s := &fakeStores{
+		runsOlder: []int64{7},
+		artifactsByID: map[int64][]store.Artifact{
+			7: {{ID: 99, RunID: 7, Path: "/nonexistent/path.bin"}},
+		},
+		logs: map[int64]string{7: "/nonexistent/run-7.log"},
+	}
+	j := New(Config{ArtifactRetention: time.Hour, LogRetention: time.Hour}, s)
+	if err := j.Sweep(context.Background(), time.Now().UTC()); err != nil {
+		t.Fatalf("sweep: %v", err)
+	}
+	if !s.deleted[7] {
+		t.Fatalf("run 7 should have been processed")
+	}
+}
+
+// TestSweepUsesTheLongerCutoff pins the run-selection rule: with two
+// different retentions, the cutoff handed to the store is now minus
+// the longer one (72h here), not the shorter.
+func TestSweepUsesTheLongerCutoff(t *testing.T) {
+	s := &fakeStores{}
+	j := New(Config{
+		ArtifactRetention: 72 * time.Hour,
+		LogRetention:      24 * time.Hour,
+	}, s)
+	// Fixed clock so the expected cutoff is exact.
+	now := time.Date(2026, 4, 17, 12, 0, 0, 0, time.UTC)
+	if err := j.Sweep(context.Background(), now); err != nil {
+		t.Fatalf("sweep: %v", err)
+	}
+	want := now.Add(-72 * time.Hour)
+	if !s.cutoffSeen.Equal(want) {
+		t.Fatalf("cutoff = %v, want %v (the longer of the two retentions)", s.cutoffSeen, want)
+	}
+}
@@ -0,0 +1,134 @@
+// Package logs owns per-run flat-file logs and their live SSE fan-out.
+// A single Writer serialises writes for one run; a Hub keeps a cache
+// per run so handlers can open/close freely without stepping on each
+// other. Lines go to disk for persistence (reload + replay) and onto
+// the events.Hub so the UI tile can tail live.
+package logs
+
+import (
+	"fmt"
+	"html"
+	"log"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"time"
+
+	"vetting/internal/events"
+)
+
+// Line is one log record handed to Writer.Append. Append fills in a
+// zero TS with the current UTC time and an empty Level with "info".
+type Line struct {
+	TS    time.Time
+	Level string // info|warn|error|debug
+	Text  string
+}
+
+// Writer appends lines to one run's log file and publishes each line
+// to the events hub. mu serialises Append and Close; f is nil once the
+// Writer has been closed, and hub may be nil to disable publishing.
+type Writer struct {
+	runID int64
+	mu    sync.Mutex
+	f     *os.File
+	hub   *events.Hub
+}
+
+// Hub owns the per-run Writers. The orchestrator creates one Hub at
+// startup and hands it to the api package.
+type Hub struct {
+	dir     string            // directory holding the run-<id>.log files
+	events  *events.Hub       // handed to every Writer for SSE publishing; may be nil
+	mu      sync.Mutex        // guards writers
+	writers map[int64]*Writer // open writers by run id; set to nil by Close
+}
+
+// NewHub creates dir (and any missing parents) and returns a Hub with
+// no open writers. ev is the events hub each Writer publishes to.
+func NewHub(dir string, ev *events.Hub) (*Hub, error) {
+	err := os.MkdirAll(dir, 0o755)
+	if err != nil {
+		return nil, fmt.Errorf("mkdir log dir: %w", err)
+	}
+	hub := &Hub{
+		dir:     dir,
+		events:  ev,
+		writers: make(map[int64]*Writer),
+	}
+	return hub, nil
+}
+
+// WriterFor returns a cached Writer, opening the file lazily. The file
+// is append-only; if an existing run's log is reopened (e.g. after a
+// restart) we append rather than truncate so nothing is lost.
+//
+// After Close the hub has no writer map any more; WriterFor then
+// returns an error instead of panicking on a nil-map store, so a
+// request that arrives during shutdown fails cleanly.
+func (h *Hub) WriterFor(runID int64) (*Writer, error) {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	if h.writers == nil {
+		return nil, fmt.Errorf("logs: hub closed, no writer for run %d", runID)
+	}
+	if w, ok := h.writers[runID]; ok {
+		return w, nil
+	}
+	// PathFor takes no lock, so it is safe to call while holding h.mu.
+	path := h.PathFor(runID)
+	f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
+	if err != nil {
+		return nil, fmt.Errorf("open %s: %w", path, err)
+	}
+	w := &Writer{runID: runID, f: f, hub: h.events}
+	h.writers[runID] = w
+	return w, nil
+}
+
+// Close closes every open run file and drops the writer cache. It is
+// called from main on shutdown. A failure to close one file is logged
+// and does not stop the others from being closed.
+func (h *Hub) Close() {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	for runID, writer := range h.writers {
+		err := writer.Close()
+		if err == nil {
+			continue
+		}
+		log.Printf("logs: close run-%d: %v", runID, err)
+	}
+	h.writers = nil
+}
+
+// PathFor returns where a run's log lives on disk: run-<id>.log inside
+// the hub directory. It does not check that the file exists. Used by
+// replay handlers and the report generator.
+func (h *Hub) PathFor(runID int64) string {
+	name := fmt.Sprintf("run-%d.log", runID)
+	return filepath.Join(h.dir, name)
+}
+
+// Append writes one line to the run's file and then publishes it as an
+// SSE event named log-<runID>. A zero timestamp becomes the current
+// UTC time and an empty level becomes "info". A disk write failure is
+// logged but does not prevent the publish, so the live tail keeps
+// working when disk IO is degraded.
+func (w *Writer) Append(line Line) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	if line.TS.IsZero() {
+		line.TS = time.Now().UTC()
+	}
+	if line.Level == "" {
+		line.Level = "info"
+	}
+
+	record := fmt.Sprintf("%s %5s %s\n", line.TS.Format(time.RFC3339Nano), strings.ToUpper(line.Level), line.Text)
+	_, err := w.f.WriteString(record)
+	if err != nil {
+		log.Printf("logs: write run-%d: %v", w.runID, err)
+	}
+
+	if w.hub == nil {
+		return
+	}
+	ev := events.Event{Name: fmt.Sprintf("log-%d", w.runID)}
+	ev.Payload = renderLogSSE(line)
+	w.hub.Publish(ev)
+}
+
+// Close closes the underlying file and returns the close error.
+// Calling it again after the file has been released returns nil.
+func (w *Writer) Close() error {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	file := w.f
+	if file == nil {
+		return nil
+	}
+	w.f = nil
+	return file.Close()
+}
+
+// renderLogSSE renders one log line as the HTML fragment carried by a
+// log SSE event: <div class="log-line log-LEVEL">HH:MM:SS text</div>.
+// The level is lower-cased, and level, time and text are each
+// HTML-escaped before being placed in the fragment.
+func renderLogSSE(l Line) string {
+	class := html.EscapeString(strings.ToLower(l.Level))
+	clock := html.EscapeString(l.TS.Format("15:04:05"))
+	body := html.EscapeString(l.Text)
+	return fmt.Sprintf(`<div class="log-line log-%s">%s %s</div>`, class, clock, body)
+}
@@ -0,0 +1,120 @@
+package logs_test
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"vetting/internal/events"
+	"vetting/internal/logs"
+)
+
+// TestAppendFansOutToSSE pins both halves of the log hub contract:
+// every appended line is written to the per-run file with its raw
+// text, and every line is published as an SSE event named log-<runID>
+// whose payload is HTML-escaped. The UI needs the file for replay on
+// reload and the event for the live tail.
+func TestAppendFansOutToSSE(t *testing.T) {
+	logDir := t.TempDir()
+	bus := events.NewHub()
+	lh, err := logs.NewHub(logDir, bus)
+	if err != nil {
+		t.Fatalf("NewHub: %v", err)
+	}
+	defer lh.Close()
+
+	_, stream, unsubscribe := bus.Subscribe()
+	defer unsubscribe()
+
+	writer, err := lh.WriterFor(77)
+	if err != nil {
+		t.Fatalf("WriterFor: %v", err)
+	}
+	for _, ln := range []logs.Line{
+		{Level: "info", Text: "hello from agent"},
+		{Level: "error", Text: "<script>pwn</script>"},
+	} {
+		writer.Append(ln)
+	}
+
+	all := collect(stream, 3, 500*time.Millisecond)
+	// Heartbeats may share the stream; keep only the log-* events.
+	var logEvents []events.Event
+	for _, ev := range all {
+		if !strings.HasPrefix(ev.Name, "log-") {
+			continue
+		}
+		logEvents = append(logEvents, ev)
+	}
+	if len(logEvents) < 2 {
+		t.Fatalf("expected 2 log events, got %d (all=%+v)", len(logEvents), all)
+	}
+	for _, ev := range logEvents {
+		if ev.Name != "log-77" {
+			t.Fatalf("unexpected event name %q", ev.Name)
+		}
+	}
+
+	// The second event carries the hostile text; it must arrive escaped.
+	escaped := logEvents[1].Payload
+	if strings.Contains(escaped, "<script>") {
+		t.Fatalf("log payload not escaped: %q", escaped)
+	}
+	if !strings.Contains(escaped, "&lt;script&gt;") {
+		t.Fatalf("expected escaped <script>, got %q", escaped)
+	}
+
+	// The file keeps the text exactly as sent, plus the level prefix.
+	body, err := os.ReadFile(filepath.Join(logDir, "run-77.log"))
+	if err != nil {
+		t.Fatalf("read log file: %v", err)
+	}
+	onDisk := string(body)
+	if !strings.Contains(onDisk, "hello from agent") {
+		t.Fatalf("disk log missing info line: %q", onDisk)
+	}
+	if !strings.Contains(onDisk, "<script>pwn</script>") {
+		t.Fatalf("disk log should keep raw text (unescaped): %q", onDisk)
+	}
+	if !(strings.Contains(onDisk, "INFO") && strings.Contains(onDisk, "ERROR")) {
+		t.Fatalf("disk log missing level prefix: %q", onDisk)
+	}
+}
+
+// TestWriterForIsCached checks that asking twice for the same run ID
+// yields the same *Writer. Without that, parallel /log POSTs would
+// each open the file and could stomp on one another's writes.
+func TestWriterForIsCached(t *testing.T) {
+	lh, err := logs.NewHub(t.TempDir(), events.NewHub())
+	if err != nil {
+		t.Fatalf("NewHub: %v", err)
+	}
+	defer lh.Close()
+
+	first, err := lh.WriterFor(1)
+	if err != nil {
+		t.Fatalf("WriterFor: %v", err)
+	}
+	second, err := lh.WriterFor(1)
+	if err != nil {
+		t.Fatalf("WriterFor: %v", err)
+	}
+	if first != second {
+		t.Fatalf("Writer not cached: %p vs %p", first, second)
+	}
+}
+
+// collect reads from ch until it has max events, the channel is
+// closed, or deadline elapses, whichever comes first, and returns
+// what it gathered (an empty, non-nil slice if nothing arrived).
+func collect(ch <-chan events.Event, max int, deadline time.Duration) []events.Event {
+	expired := time.NewTimer(deadline)
+	defer expired.Stop()
+
+	got := []events.Event{}
+	for {
+		if len(got) >= max {
+			return got
+		}
+		select {
+		case <-expired.C:
+			return got
+		case ev, open := <-ch:
+			if !open {
+				return got
+			}
+			got = append(got, ev)
+		}
+	}
+}
@@ -0,0 +1,96 @@
+package model
+
+import "time"
+
+// Host is one machine record. The dispatcher uses Name in its log
+// output and passes MAC, WoLBroadcastIP and WoLPort to SendWoL to wake
+// the machine.
+//
+// NOTE(review): ExpectedSpecYAML, PDUConfigJSON and IPMIConfigJSON look
+// like serialized documents stored as text (YAML / JSON, going by the
+// names) — confirm against the store and spec packages.
+type Host struct {
+	ID               int64
+	Name             string
+	MAC              string
+	WoLBroadcastIP   string
+	WoLPort          int
+	ExpectedSpecYAML string
+	PDUConfigJSON    string
+	IPMIConfigJSON   string
+	Notes            string
+	CreatedAt        time.Time
+	UpdatedAt        time.Time
+}
+
+// RunState names a run's position in its lifecycle. Values are plain
+// strings, so they print as-is in transition log lines.
+type RunState string
+
+// Run states. The dispatcher picks up runs in StateQueued and counts
+// StateWaitingWoL through StateReporting as in flight when enforcing
+// its concurrency cap.
+const (
+	StateRegistered     RunState = "Registered"
+	StateQueued         RunState = "Queued"
+	StateWaitingWoL     RunState = "WaitingWoL"
+	StateBooting        RunState = "Booting"
+	StateInventoryCheck RunState = "InventoryCheck"
+	StateSpecValidate   RunState = "SpecValidate"
+	StateSMART          RunState = "SMART"
+	StateCPUStress      RunState = "CPUStress"
+	StateStorage        RunState = "Storage"
+	StateNetwork        RunState = "Network"
+	StateGPU            RunState = "GPU"
+	StatePSU            RunState = "PSU"
+	StateReporting      RunState = "Reporting"
+	StateCompleted      RunState = "Completed"
+	StateFailed         RunState = "Failed"
+	StateFailedHolding  RunState = "FailedHolding"
+	StateReleased       RunState = "Released"
+)
+
+// Run is one vetting attempt against the host identified by HostID.
+// State holds its current RunState.
+//
+// NOTE(review): CompletedAt is a pointer, presumably nil until the run
+// finishes; AgentTokenHash presumably stores a hash rather than the
+// agent token itself; OverrideFlagsJSON presumably holds JSON text —
+// confirm all three against the store package.
+type Run struct {
+	ID                int64
+	HostID            int64
+	State             RunState
+	Result            string
+	FailedStage       string
+	NextBootTarget    string
+	AgentTokenHash    string
+	StartedAt         time.Time
+	CompletedAt       *time.Time
+	ReportPath        string
+	HoldIP            string
+	OverrideFlagsJSON string
+}
+
+// StageState is the status of a single Stage row, stored as a
+// lower-case string.
+type StageState string
+
+// Stage statuses.
+const (
+	StagePending StageState = "pending"
+	StageRunning StageState = "running"
+	StagePassed  StageState = "passed"
+	StageFailed  StageState = "failed"
+	StageSkipped StageState = "skipped"
+)
+
+// Stage is one named step of a run; the orchestrator looks stages up
+// by (RunID, Name) when marking them running.
+//
+// NOTE(review): Ordinal presumably fixes display/execution order, and
+// StartedAt / CompletedAt are presumably nil until the stage starts or
+// finishes; SummaryJSON presumably holds JSON text — confirm against
+// the store package.
+type Stage struct {
+	ID          int64
+	RunID       int64
+	Name        string
+	Ordinal     int
+	State       StageState
+	StartedAt   *time.Time
+	CompletedAt *time.Time
+	SummaryJSON string
+}
+
+// Measurement is a single numeric sample attached to a run: a Value
+// with its Unit, labelled by Kind and Key and stamped with TS.
+//
+// NOTE(review): StageID is a pointer, presumably nil for samples not
+// tied to a specific stage — confirm against the store package.
+type Measurement struct {
+	ID      int64
+	RunID   int64
+	StageID *int64
+	TS      time.Time
+	Kind    string
+	Key     string
+	Value   float64
+	Unit    string
+}
+
+// SpecDiff records one field-level difference found for a run: the
+// Field name with its Expected and Actual values as text, plus a
+// Severity.
+//
+// NOTE(review): Ignored presumably marks a diff the operator chose to
+// disregard — confirm where it is set.
+type SpecDiff struct {
+	ID       int64
+	RunID    int64
+	Field    string
+	Expected string
+	Actual   string
+	Severity string // critical|warning|info
+	Ignored  bool
+}
@@ -0,0 +1,56 @@
+package notify
+
+import (
+	"fmt"
+	"time"
+
+	"vetting/internal/config"
+)
+
+// BuildRegistry translates the config surface into a live Registry.
+// Unknown notifier types produce an error so typos fail startup loudly
+// rather than silently drop events.
+func BuildRegistry(notifiers []config.Notifier, routes []config.Route) (*Registry, error) {
+	reg := NewRegistry(10 * time.Second)
+	for _, n := range notifiers {
+		switch n.Type {
+		case "":
+			continue // skip blank entries; useful for commented-out examples
+		case "ntfy":
+			reg.Register(NewNtfy(n.Name, n.Server, n.Topic))
+		case "discord":
+			reg.Register(NewDiscord(n.Name, n.WebhookURL))
+		case "smtp":
+			reg.Register(NewSMTP(n.Name, n.SMTP.Host, n.SMTP.Port, n.SMTP.From, n.SMTP.To))
+		default:
+			return nil, fmt.Errorf("notify: unknown notifier type %q (name=%q)", n.Type, n.Name)
+		}
+	}
+	for _, r := range routes {
+		if r.Notifier == "" {
+			return nil, fmt.Errorf("notify: route has no notifier name")
+		}
+		reg.AddRoute(Route{
+			MatchKind:     toKinds(r.MatchKind),
+			MatchSeverity: toSeverities(r.MatchSeverity),
+			Notifier:      r.Notifier,
+		})
+	}
+	return reg, nil
+}
+
+// toKinds converts config strings to Kind values one-for-one, without
+// validating them. The result is always non-nil.
+func toKinds(ss []string) []Kind {
+	kinds := make([]Kind, len(ss))
+	for i := range ss {
+		kinds[i] = Kind(ss[i])
+	}
+	return kinds
+}
+
+// toSeverities converts config strings to Severity values one-for-one,
+// without validating them. The result is always non-nil.
+func toSeverities(ss []string) []Severity {
+	sevs := make([]Severity, len(ss))
+	for i := range ss {
+		sevs[i] = Severity(ss[i])
+	}
+	return sevs
+}
@@ -0,0 +1,87 @@
+package notify
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+)
+
+// DiscordNotifier posts to a Discord incoming webhook. Body is rendered
+// as a single embed so Discord shows a colored sidebar matching event
+// severity. Discord rejects empty content+embeds; we always include the
+// embed so that never happens.
+//
+// NameStr is the name routes refer to, WebhookURL is the POST target
+// (Send fails if it is empty), and HTTP is the client used for the
+// request; NewDiscord sets it to a client with a 10s timeout.
+type DiscordNotifier struct {
+	NameStr    string
+	WebhookURL string
+	HTTP       *http.Client
+}
+
+// NewDiscord builds a DiscordNotifier for the given route name and
+// webhook URL, with its own HTTP client capped at a 10s timeout.
+func NewDiscord(name, webhookURL string) *DiscordNotifier {
+	d := &DiscordNotifier{NameStr: name, WebhookURL: webhookURL}
+	d.HTTP = &http.Client{Timeout: 10 * time.Second}
+	return d
+}
+
+// Name returns the name routes use to address this notifier.
+func (d *DiscordNotifier) Name() string {
+	return d.NameStr
+}
+
+// discordPayload is the JSON body POSTed to the webhook. Only the
+// embeds array is sent; Send always fills it with exactly one embed.
+type discordPayload struct {
+	Embeds []discordEmbed `json:"embeds"`
+}
+
+// discordEmbed is one embed object. Every field is omitempty, so an
+// empty title, description or URL is left out of the JSON entirely.
+type discordEmbed struct {
+	Title       string `json:"title,omitempty"`
+	Description string `json:"description,omitempty"`
+	URL         string `json:"url,omitempty"`
+	Color       int    `json:"color,omitempty"`
+}
+
+func (d *DiscordNotifier) Send(ctx context.Context, ev Event) error {
+	if d.WebhookURL == "" {
+		return fmt.Errorf("discord: no webhook_url configured")
+	}
+	payload := discordPayload{Embeds: []discordEmbed{{
+		Title:       ev.Title,
+		Description: ev.Body,
+		URL:         ev.URL,
+		Color:       discordColor(ev.Severity),
+	}}}
+	buf, err := json.Marshal(payload)
+	if err != nil {
+		return err
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, d.WebhookURL, bytes.NewReader(buf))
+	if err != nil {
+		return err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	resp, err := d.HTTP.Do(req)
+	if err != nil {
+		return err
+	}
+	defer func() { _ = resp.Body.Close() }()
+	if resp.StatusCode >= 300 {
+		b, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("discord: %d: %s", resp.StatusCode, strings.TrimSpace(string(b)))
+	}
+	return nil
+}
+
+// discordColor picks the embed sidebar color for a severity: red for
+// critical, yellow for warning, green for everything else. The values
+// are 24-bit RGB integers.
+func discordColor(s Severity) int {
+	if s == SeverityCritical {
+		return 0xE74C3C // red
+	}
+	if s == SeverityWarning {
+		return 0xF1C40F // yellow
+	}
+	return 0x2ECC71 // green
+}
@@ -0,0 +1,179 @@
+// Package notify owns outbound operator notifications. The orchestrator
+// fires Events at well-known points (stage failure, hold opened, run
+// completed, spec mismatch); a Registry matches each Event against
+// config-declared routes and dispatches to the matching Notifiers.
+//
+// Delivery is fire-and-forget: a single HTTP/SMTP attempt per notifier
+// with a bounded timeout. Failures are logged and nothing is persisted
+// — on a solo LAN deployment the orchestrator UI is the source of truth
+// and we don't want to build a durable queue for a convenience feature.
+package notify
+
+import (
+	"context"
+	"log"
+	"sync"
+	"time"
+)
+
+// Kind enumerates the event types the orchestrator can fire. Names are
+// stable: they appear verbatim in config files' match_kind lists, so
+// changing a value breaks existing configs.
+type Kind string
+
+// Event kinds fired by the orchestrator.
+const (
+	KindStageFailed   Kind = "StageFailed"
+	KindSpecMismatch  Kind = "SpecMismatch"
+	KindHoldingOpened Kind = "HoldingOpened"
+	KindRunCompleted  Kind = "RunCompleted"
+)
+
+// Severity classifies an Event so routes can filter on it. "critical"
+// pairs with StageFailed/SpecMismatch/HoldingOpened; RunCompleted uses
+// "info". The ntfy and Discord notifiers also derive priority and
+// embed color from it, treating anything other than warning or
+// critical as info.
+type Severity string
+
+// Severity levels, in increasing order of urgency.
+const (
+	SeverityInfo     Severity = "info"
+	SeverityWarning  Severity = "warning"
+	SeverityCritical Severity = "critical"
+)
+
+// Event is the payload passed to each Notifier's Send method. Title and
+// Body are pre-rendered; notifiers shape them for their own transport
+// (e.g. Discord embed vs SMTP body) but shouldn't re-compose semantics.
+//
+// Kind and Severity are what routes match on. RunID is included in the
+// log line the Registry writes when a send fails.
+//
+// URL links back to the orchestrator UI so a push notification can be
+// clicked through for full context.
+type Event struct {
+	Kind     Kind
+	Severity Severity
+	RunID    int64
+	HostName string
+	Title    string
+	Body     string
+	URL      string // optional; UI link for this run/host
+}
+
+// Notifier is one delivery target. Implementations must not block on
+// remote-side failure any longer than their own timeout — the Registry
+// calls Send from a goroutine but still wants the goroutine to exit.
+//
+// Name is the key routes use to select the notifier. Send delivers one
+// event; the ctx it receives carries the Registry's per-send timeout,
+// and a returned error is logged by the Registry, not retried.
+type Notifier interface {
+	Name() string
+	Send(ctx context.Context, ev Event) error
+}
+
+// Route binds an event selector to a notifier name. A route matches an
+// event when every non-empty field is satisfied; empty fields are wildcards.
+// Within a non-empty MatchKind or MatchSeverity list, any one entry
+// equal to the event's value is enough.
+type Route struct {
+	MatchKind     []Kind
+	MatchSeverity []Severity
+	Notifier      string // name of a registered Notifier
+}
+
+// Registry holds notifiers + routes and fans events out. Safe for
+// concurrent Dispatch. It's built once at startup from config.
+//
+// Register and AddRoute mutate the maps/slices without locking, so
+// they belong to the startup phase, before any Dispatch runs.
+// NOTE(review): mu is not used by any method visible in this file.
+type Registry struct {
+	notifiers map[string]Notifier
+	routes    []Route
+	timeout   time.Duration
+
+	mu sync.Mutex // guards in-flight goroutine count (future-use metrics)
+}
+
+// NewRegistry returns an empty Registry whose sends are each bounded
+// by timeout. A zero or negative timeout is replaced with 10s so tests
+// and prod both get a sane default.
+func NewRegistry(timeout time.Duration) *Registry {
+	reg := &Registry{
+		notifiers: make(map[string]Notifier),
+		timeout:   10 * time.Second,
+	}
+	if timeout > 0 {
+		reg.timeout = timeout
+	}
+	return reg
+}
+
+// Register stores n under n.Name(). A nil Notifier is ignored. A later
+// registration with the same name silently replaces the earlier one,
+// which lets a config shadow an entry by listing the name twice.
+func (r *Registry) Register(n Notifier) {
+	if n != nil {
+		r.notifiers[n.Name()] = n
+	}
+}
+
+// AddRoute appends a route rule. Order is preserved for deterministic
+// multi-match dispatch: match walks the routes in the order they were
+// added. The notifier name is not validated here.
+func (r *Registry) AddRoute(rt Route) {
+	r.routes = append(r.routes, rt)
+}
+
+// Dispatch sends ev to every notifier selected by the route table.
+// Each send runs on its own goroutine with a fresh context bounded by
+// the Registry timeout, so Dispatch returns without waiting for
+// delivery. A failed send is logged and affects no other notifier.
+func (r *Registry) Dispatch(ev Event) {
+	for _, target := range r.match(ev) {
+		go func(n Notifier) {
+			ctx, cancel := context.WithTimeout(context.Background(), r.timeout)
+			defer cancel()
+			err := n.Send(ctx, ev)
+			if err == nil {
+				return
+			}
+			log.Printf("notify: %s send(%s run=%d): %v", n.Name(), ev.Kind, ev.RunID, err)
+		}(target)
+	}
+}
+
+// match returns the notifiers ev should go to, in route-table order.
+// A notifier named by several matching routes appears once: the
+// operator wants delivery, not duplicate delivery. A matching route
+// whose notifier name is not registered is logged and skipped.
+func (r *Registry) match(ev Event) []Notifier {
+	picked := []Notifier{}
+	already := make(map[string]bool)
+	for i := range r.routes {
+		rt := &r.routes[i]
+		eligible := matchesKind(rt.MatchKind, ev.Kind) && matchesSeverity(rt.MatchSeverity, ev.Severity)
+		if !eligible || already[rt.Notifier] {
+			continue
+		}
+		n, known := r.notifiers[rt.Notifier]
+		if !known {
+			log.Printf("notify: route references unknown notifier %q", rt.Notifier)
+			continue
+		}
+		already[rt.Notifier] = true
+		picked = append(picked, n)
+	}
+	return picked
+}
+
+// matchesKind reports whether got passes the allow list. An empty list
+// is a wildcard and accepts every kind.
+func matchesKind(allow []Kind, got Kind) bool {
+	for _, k := range allow {
+		if k == got {
+			return true
+		}
+	}
+	// No entry matched: only an empty (wildcard) list still accepts.
+	return len(allow) == 0
+}
+
+// matchesSeverity reports whether got passes the allow list. An empty
+// list is a wildcard and accepts every severity.
+func matchesSeverity(allow []Severity, got Severity) bool {
+	for i := 0; i < len(allow); i++ {
+		if allow[i] == got {
+			return true
+		}
+	}
+	// No entry matched: only an empty (wildcard) list still accepts.
+	return len(allow) == 0
+}
@@ -0,0 +1,268 @@
+package notify
+
+import (
+	"context"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"net/smtp"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+// stubNotifier is an in-memory Notifier for exercising Registry
+// routing without touching the network: it records every event handed
+// to Send and can be told to fail for one Kind.
+type stubNotifier struct {
+	name   string
+	calls  []Event
+	mu     sync.Mutex
+	failOn Kind // if non-empty, returns an error when ev.Kind == failOn
+}
+
+// Name returns the name routes use to address this stub.
+func (s *stubNotifier) Name() string {
+	return s.name
+}
+
+// Send records ev, then returns a forced error if ev.Kind equals a
+// non-empty failOn. The event is recorded either way.
+func (s *stubNotifier) Send(_ context.Context, ev Event) error {
+	s.mu.Lock()
+	s.calls = append(s.calls, ev)
+	s.mu.Unlock()
+	if s.failOn == "" || ev.Kind != s.failOn {
+		return nil
+	}
+	return errFake("forced failure")
+}
+
+// seen returns a copy of the events recorded so far, so callers can
+// inspect it while sends are still in flight on other goroutines.
+func (s *stubNotifier) seen() []Event {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	var snapshot []Event
+	snapshot = append(snapshot, s.calls...)
+	return snapshot
+}
+
+// errFake is a string-backed error used for forced failures.
+type errFake string
+
+// Error returns the string itself as the error message.
+func (e errFake) Error() string {
+	return string(e)
+}
+
+// awaitCalls polls every 5ms until each stub has recorded at least its
+// wanted number of calls, or 2s have passed. Dispatch delivers on
+// goroutines, so tests have to wait for the sends to land. On timeout
+// it reports one t.Errorf per stub listed in want and returns.
+func awaitCalls(t *testing.T, want map[*stubNotifier]int) {
+	t.Helper()
+	satisfied := func() bool {
+		for s, n := range want {
+			if len(s.seen()) < n {
+				return false
+			}
+		}
+		return true
+	}
+
+	deadline := time.Now().Add(2 * time.Second)
+	for !satisfied() {
+		if time.Now().After(deadline) {
+			for s, n := range want {
+				t.Errorf("notifier %q: got %d calls, want %d", s.name, len(s.seen()), n)
+			}
+			return
+		}
+		time.Sleep(5 * time.Millisecond)
+	}
+}
+
+// TestRegistryRoutesByKind checks kind filtering: a route restricted
+// to StageFailed sees only that event, while a wildcard route sees
+// both dispatched events.
+func TestRegistryRoutesByKind(t *testing.T) {
+	failsOnly := &stubNotifier{name: "fails-only"}
+	everything := &stubNotifier{name: "everything"}
+
+	reg := NewRegistry(time.Second)
+	for _, n := range []*stubNotifier{failsOnly, everything} {
+		reg.Register(n)
+	}
+	reg.AddRoute(Route{MatchKind: []Kind{KindStageFailed}, Notifier: "fails-only"})
+	reg.AddRoute(Route{Notifier: "everything"})
+
+	for _, ev := range []Event{
+		{Kind: KindStageFailed, Severity: SeverityCritical},
+		{Kind: KindRunCompleted, Severity: SeverityInfo},
+	} {
+		reg.Dispatch(ev)
+	}
+
+	awaitCalls(t, map[*stubNotifier]int{failsOnly: 1, everything: 2})
+	if got := failsOnly.seen()[0].Kind; got != KindStageFailed {
+		t.Fatalf("a got %q, want StageFailed", got)
+	}
+}
+
+// TestRegistryRoutesBySeverity checks severity filtering: a
+// critical-only route ignores the info event and receives the
+// critical one.
+func TestRegistryRoutesBySeverity(t *testing.T) {
+	critOnly := &stubNotifier{name: "crit-only"}
+
+	reg := NewRegistry(time.Second)
+	reg.Register(critOnly)
+	reg.AddRoute(Route{
+		MatchSeverity: []Severity{SeverityCritical},
+		Notifier:      "crit-only",
+	})
+
+	reg.Dispatch(Event{Kind: KindRunCompleted, Severity: SeverityInfo})
+	reg.Dispatch(Event{Kind: KindHoldingOpened, Severity: SeverityCritical})
+
+	awaitCalls(t, map[*stubNotifier]int{critOnly: 1})
+	first := critOnly.seen()[0]
+	if got := first.Severity; got != SeverityCritical {
+		t.Fatalf("got severity %q, want critical", got)
+	}
+}
+
+// TestRegistryDeduplicatesNotifiers registers two routes that both
+// match the event and both name the same notifier, then waits for the
+// notifier to receive the event from a single Dispatch.
+func TestRegistryDeduplicatesNotifiers(t *testing.T) {
+	only := &stubNotifier{name: "only"}
+
+	reg := NewRegistry(time.Second)
+	reg.Register(only)
+	// Both routes match the event below and both point at "only".
+	for _, rt := range []Route{
+		{MatchKind: []Kind{KindStageFailed}, Notifier: "only"},
+		{MatchSeverity: []Severity{SeverityCritical}, Notifier: "only"},
+	} {
+		reg.AddRoute(rt)
+	}
+
+	reg.Dispatch(Event{Kind: KindStageFailed, Severity: SeverityCritical})
+
+	awaitCalls(t, map[*stubNotifier]int{only: 1})
+}
+
+// TestRegistryUnknownNotifierIsNoop dispatches through a route whose
+// notifier was never registered; the call must neither panic nor block.
+func TestRegistryUnknownNotifierIsNoop(t *testing.T) {
+	reg := NewRegistry(time.Second)
+	dangling := Route{Notifier: "does-not-exist"}
+	reg.AddRoute(dangling)
+
+	// Nothing to assert beyond returning normally.
+	reg.Dispatch(Event{Kind: KindRunCompleted})
+}
+
+// TestRegistryFailureDoesNotPoisonOthers checks isolation between
+// notifiers: one that returns an error from Send must not stop the
+// other from receiving the same event.
+func TestRegistryFailureDoesNotPoisonOthers(t *testing.T) {
+	failing := &stubNotifier{name: "bad", failOn: KindStageFailed}
+	healthy := &stubNotifier{name: "good"}
+
+	reg := NewRegistry(time.Second)
+	for _, n := range []*stubNotifier{failing, healthy} {
+		reg.Register(n)
+	}
+	reg.AddRoute(Route{Notifier: "bad"})
+	reg.AddRoute(Route{Notifier: "good"})
+
+	reg.Dispatch(Event{Kind: KindStageFailed, Severity: SeverityCritical})
+
+	awaitCalls(t, map[*stubNotifier]int{failing: 1, healthy: 1})
+}
+
+// TestNtfyNotifierPOSTsBodyAndHeaders sends one critical event to a
+// local test server and checks the request ntfy would see: POST to
+// /<topic>, title / click-through / priority in X-* headers, and the
+// event body as the plain request body.
+func TestNtfyNotifierPOSTsBodyAndHeaders(t *testing.T) {
+	var (
+		seenReq  *http.Request
+		seenBody string
+	)
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		seenReq = r
+		raw, _ := io.ReadAll(r.Body)
+		seenBody = string(raw)
+		w.WriteHeader(http.StatusOK)
+	}
+	srv := httptest.NewServer(http.HandlerFunc(handler))
+	defer srv.Close()
+
+	ev := Event{
+		Kind:     KindStageFailed,
+		Severity: SeverityCritical,
+		Title:    "host-01 FAILED",
+		Body:     "SMART failed",
+		URL:      "https://vetting.example/reports/42",
+	}
+	if err := NewNtfy("n", srv.URL, "vetting").Send(context.Background(), ev); err != nil {
+		t.Fatalf("send: %v", err)
+	}
+
+	if seenReq.Method != http.MethodPost {
+		t.Fatalf("method = %s, want POST", seenReq.Method)
+	}
+	if seenReq.URL.Path != "/vetting" {
+		t.Fatalf("path = %s, want /vetting", seenReq.URL.Path)
+	}
+	hdr := seenReq.Header
+	if got := hdr.Get("X-Title"); got != "host-01 FAILED" {
+		t.Fatalf("X-Title = %q", got)
+	}
+	if got := hdr.Get("X-Click"); got != "https://vetting.example/reports/42" {
+		t.Fatalf("X-Click = %q", got)
+	}
+	if got := hdr.Get("X-Priority"); got != "5" {
+		t.Fatalf("X-Priority = %q, want 5 for critical", got)
+	}
+	if seenBody != "SMART failed" {
+		t.Fatalf("body = %q, want %q", seenBody, "SMART failed")
+	}
+}
+
+// TestNtfyNotifierNon2xxErrors checks that a 429 from the server is
+// surfaced as an error mentioning the status code.
+func TestNtfyNotifierNon2xxErrors(t *testing.T) {
+	rateLimited := func(w http.ResponseWriter, _ *http.Request) {
+		http.Error(w, "rate limited", http.StatusTooManyRequests)
+	}
+	srv := httptest.NewServer(http.HandlerFunc(rateLimited))
+	defer srv.Close()
+
+	ntfy := NewNtfy("n", srv.URL, "t")
+	err := ntfy.Send(context.Background(), Event{Kind: KindRunCompleted, Body: "x"})
+	if err == nil || !strings.Contains(err.Error(), "429") {
+		t.Fatalf("want 429 error, got %v", err)
+	}
+}
+
+// TestDiscordNotifierPOSTsEmbed sends one event to a local test server
+// and checks the JSON it received contains an embeds array carrying
+// the event's title, description and URL.
+func TestDiscordNotifierPOSTsEmbed(t *testing.T) {
+	var received string
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		raw, _ := io.ReadAll(r.Body)
+		received = string(raw)
+		w.WriteHeader(http.StatusNoContent)
+	}
+	srv := httptest.NewServer(http.HandlerFunc(handler))
+	defer srv.Close()
+
+	ev := Event{
+		Kind:     KindRunCompleted,
+		Severity: SeverityInfo,
+		Title:    "host-01 passed",
+		Body:     "all green",
+		URL:      "https://vetting.example/reports/1",
+	}
+	if err := NewDiscord("d", srv.URL).Send(context.Background(), ev); err != nil {
+		t.Fatalf("send: %v", err)
+	}
+
+	// Substring checks are enough; the exact JSON layout is not pinned.
+	expected := []string{`"embeds"`, `"host-01 passed"`, `"all green"`, `reports/1`}
+	for _, want := range expected {
+		if !strings.Contains(received, want) {
+			t.Errorf("body missing %q: %s", want, received)
+		}
+	}
+}
+
+// TestSMTPNotifierInvokesSendMail swaps SendMailFn for a recorder and
+// checks that Send calls it once with host:port as the address, the
+// configured sender and recipients, and a message containing the
+// subject, body and click-through link.
+func TestSMTPNotifierInvokesSendMail(t *testing.T) {
+	var (
+		calls    int32
+		addrSeen string
+		fromSeen string
+		toSeen   []string
+		msgSeen  []byte
+	)
+	notifier := NewSMTP("s", "mail.example", 2525, "vetting@example", []string{"ops@example"})
+	notifier.SendMailFn = func(addr string, _ smtp.Auth, from string, to []string, msg []byte) error {
+		atomic.AddInt32(&calls, 1)
+		addrSeen, fromSeen, toSeen, msgSeen = addr, from, to, msg
+		return nil
+	}
+
+	ev := Event{
+		Kind:  KindStageFailed,
+		Title: "subj",
+		Body:  "failure body",
+		URL:   "https://vetting.example/reports/9",
+	}
+	if err := notifier.Send(context.Background(), ev); err != nil {
+		t.Fatalf("send: %v", err)
+	}
+
+	if atomic.LoadInt32(&calls) != 1 {
+		t.Fatal("SendMailFn not called")
+	}
+	if addrSeen != "mail.example:2525" {
+		t.Fatalf("addr = %q", addrSeen)
+	}
+	if fromSeen != "vetting@example" {
+		t.Fatalf("from = %q", fromSeen)
+	}
+	if len(toSeen) != 1 || toSeen[0] != "ops@example" {
+		t.Fatalf("to = %v", toSeen)
+	}
+	rendered := string(msgSeen)
+	expected := []string{"Subject: subj", "failure body", "Link: https://vetting.example/reports/9"}
+	for _, want := range expected {
+		if !strings.Contains(rendered, want) {
+			t.Errorf("message missing %q", want)
+		}
+	}
+}
+
+// TestSMTPNotifierRejectsIncompleteConfig checks that a notifier with
+// no host, sender or recipients refuses to send.
+func TestSMTPNotifierRejectsIncompleteConfig(t *testing.T) {
+	bare := &SMTPNotifier{NameStr: "s"}
+	err := bare.Send(context.Background(), Event{Kind: KindRunCompleted})
+	if err == nil {
+		t.Fatal("want error, got nil")
+	}
+}
@@ -0,0 +1,90 @@
+package notify
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+)
+
+// NtfyNotifier posts to ntfy.sh (or a self-hosted ntfy server). Message
+// body is the plain text body; title and URL are passed via X-Title and
+// X-Click headers so ntfy renders them as the push title + deep link.
+//
+// The request URL is Server + "/" + Topic; Send fails if Topic is
+// empty. NewNtfy defaults Server to https://ntfy.sh, trims its
+// trailing slashes, and sets HTTP to a client with a 10s timeout.
+type NtfyNotifier struct {
+	NameStr string
+	Server  string // e.g. "https://ntfy.sh" or self-hosted
+	Topic   string
+	HTTP    *http.Client
+}
+
+// NewNtfy builds an NtfyNotifier. An empty server means the public
+// https://ntfy.sh; trailing slashes on the server are trimmed so Send
+// can join it with the topic. The HTTP client is capped at 10s.
+func NewNtfy(name, server, topic string) *NtfyNotifier {
+	base := server
+	if base == "" {
+		base = "https://ntfy.sh"
+	}
+	n := &NtfyNotifier{NameStr: name, Topic: topic}
+	n.Server = strings.TrimRight(base, "/")
+	n.HTTP = &http.Client{Timeout: 10 * time.Second}
+	return n
+}
+
+// Name returns the name routes use to address this notifier.
+func (n *NtfyNotifier) Name() string {
+	return n.NameStr
+}
+
+func (n *NtfyNotifier) Send(ctx context.Context, ev Event) error {
+	if n.Topic == "" {
+		return fmt.Errorf("ntfy: no topic configured")
+	}
+	url := n.Server + "/" + n.Topic
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, strings.NewReader(ev.Body))
+	if err != nil {
+		return err
+	}
+	if ev.Title != "" {
+		req.Header.Set("X-Title", ev.Title)
+	}
+	if ev.URL != "" {
+		req.Header.Set("X-Click", ev.URL)
+	}
+	req.Header.Set("X-Priority", priorityForSeverity(ev.Severity))
+	req.Header.Set("X-Tags", ntfyTag(ev.Kind, ev.Severity))
+
+	resp, err := n.HTTP.Do(req)
+	if err != nil {
+		return err
+	}
+	defer func() { _ = resp.Body.Close() }()
+	if resp.StatusCode >= 300 {
+		b, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("ntfy: %d: %s", resp.StatusCode, strings.TrimSpace(string(b)))
+	}
+	return nil
+}
+
+// priorityForSeverity translates a severity into ntfy's 1–5 priority
+// scale: critical → "5", warning → "4", and anything else (including
+// info) → "3", ntfy's default.
+func priorityForSeverity(s Severity) string {
+	if s == SeverityCritical {
+		return "5"
+	}
+	if s == SeverityWarning {
+		return "4"
+	}
+	return "3"
+}
+
+// ntfyTag builds the X-Tags value: an optional emoji tag followed by
+// the event kind. Critical severity wins over kind-specific tags, so a
+// critical event always gets rotating_light; otherwise RunCompleted
+// and HoldingOpened get their own emoji and other kinds are sent bare.
+func ntfyTag(k Kind, s Severity) string {
+	kind := string(k)
+	if s == SeverityCritical {
+		return "rotating_light," + kind
+	}
+	switch k {
+	case KindRunCompleted:
+		return "white_check_mark," + kind
+	case KindHoldingOpened:
+		return "construction," + kind
+	}
+	return kind
+}
@@ -0,0 +1,81 @@
+package notify
+
+import (
+	"context"
+	"fmt"
+	"net/smtp"
+	"strconv"
+	"strings"
+)
+
+// SMTPNotifier sends a plaintext email. Authentication is left at zero
+// (LAN-only relay assumed); if the configured server requires auth the
+// Send call will return an error and the Registry will log it.
+//
+// Host, From and at least one To address are required; Send rejects
+// the event otherwise. Host and Port are joined as "host:port" for the
+// dial address (NewSMTP defaults Port to 25).
+//
+// SendMailFn is overridable so tests can capture the outgoing message
+// without needing a live SMTP server.
+type SMTPNotifier struct {
+	NameStr    string
+	Host       string
+	Port       int
+	From       string
+	To         []string
+	SendMailFn func(addr string, a smtp.Auth, from string, to []string, msg []byte) error
+}
+
+// NewSMTP builds an SMTPNotifier that delivers through smtp.SendMail.
+// A port of 0 is replaced with the standard SMTP port 25.
+func NewSMTP(name, host string, port int, from string, to []string) *SMTPNotifier {
+	s := &SMTPNotifier{
+		NameStr:    name,
+		Host:       host,
+		Port:       25,
+		From:       from,
+		To:         to,
+		SendMailFn: smtp.SendMail,
+	}
+	if port != 0 {
+		s.Port = port
+	}
+	return s
+}
+
+// Name returns the name routes use to address this notifier.
+func (s *SMTPNotifier) Name() string {
+	return s.NameStr
+}
+
+// Send renders ev as a plaintext email and hands it to SendMailFn,
+// addressed to "host:port" with no authentication. It returns an error
+// when host, sender or recipients are missing, or whatever error the
+// mail function returns.
+//
+// A nil SendMailFn (an SMTPNotifier built as a literal rather than via
+// NewSMTP) falls back to smtp.SendMail instead of panicking.
+func (s *SMTPNotifier) Send(ctx context.Context, ev Event) error {
+	if s.Host == "" || s.From == "" || len(s.To) == 0 {
+		return fmt.Errorf("smtp: incomplete config (host/from/to required)")
+	}
+	// We intentionally don't honour ctx here — net/smtp.SendMail doesn't
+	// accept a context; for a LAN relay with a short TCP timeout the
+	// Registry's goroutine will outlive the timeout but only by seconds.
+	send := s.SendMailFn
+	if send == nil {
+		send = smtp.SendMail
+	}
+	addr := s.Host + ":" + strconv.Itoa(s.Port)
+	msg := buildEmail(s.From, s.To, ev)
+	return send(addr, nil, s.From, s.To, msg)
+}
+
+// buildEmail produces a minimal RFC 5322 message: From, To, Subject
+// and a text/plain Content-Type header, a blank line, then the body.
+// The subject is ev.Title, or "[vetting] <Kind>" when the title is
+// empty. A non-empty ev.URL is appended after the body as a "Link:"
+// line so the recipient can click through from a text mail client.
+//
+// Carriage returns and line feeds in header values are replaced with
+// spaces. Event titles are rendered from run data, and a stray newline
+// there would otherwise end the header section early or let the value
+// introduce extra headers.
+func buildEmail(from string, to []string, ev Event) []byte {
+	oneLine := strings.NewReplacer("\r", " ", "\n", " ").Replace
+
+	subject := ev.Title
+	if subject == "" {
+		subject = "[vetting] " + string(ev.Kind)
+	}
+
+	var b strings.Builder
+	b.WriteString("From: ")
+	b.WriteString(oneLine(from))
+	b.WriteString("\r\n")
+	b.WriteString("To: ")
+	b.WriteString(oneLine(strings.Join(to, ", ")))
+	b.WriteString("\r\n")
+	b.WriteString("Subject: ")
+	b.WriteString(oneLine(subject))
+	b.WriteString("\r\n")
+	b.WriteString("Content-Type: text/plain; charset=UTF-8\r\n")
+	// Blank line separates the headers from the body.
+	b.WriteString("\r\n")
+	b.WriteString(ev.Body)
+	if ev.URL != "" {
+		b.WriteString("\r\n\r\nLink: ")
+		b.WriteString(ev.URL)
+	}
+	b.WriteString("\r\n")
+	return []byte(b.String())
+}
@@ -0,0 +1,124 @@
+package orchestrator
+
+import (
+	"context"
+	"log"
+	"time"
+
+	"vetting/internal/model"
+	"vetting/internal/store"
+)
+
+// Dispatcher picks Queued runs off the DB and drives them through
+// WaitingWoL (sending a WoL packet). Concurrency is capped at Max.
+//
+// For Phase 2 the dispatcher's job ends at WaitingWoL; further
+// transitions are driven by iPXE and agent callbacks. Phase 4+ will
+// return here and shepherd each run through stage execution.
+//
+// active is a buffered channel of capacity Max used as a slot
+// semaphore; pickNext takes a slot without blocking and gives it back
+// before returning, so the cap on concurrent runs is enforced by
+// counting in-flight run states from the DB. Closing stop ends the
+// polling loop.
+type Dispatcher struct {
+	Max    int
+	Runs   *store.Runs
+	Hosts  *store.Hosts
+	Runner *Runner
+
+	active chan struct{}
+	stop   chan struct{}
+}
+
+// NewDispatcher wires a Dispatcher to its stores and runner. A max
+// below 1 is raised to 1 so at least one run can always be in flight;
+// the slot channel is sized to the resulting cap.
+func NewDispatcher(max int, runs *store.Runs, hosts *store.Hosts, runner *Runner) *Dispatcher {
+	slots := max
+	if slots < 1 {
+		slots = 1
+	}
+	d := &Dispatcher{Max: slots, Runs: runs, Hosts: hosts, Runner: runner}
+	d.active = make(chan struct{}, slots)
+	d.stop = make(chan struct{})
+	return d
+}
+
+// Start launches the polling loop on its own goroutine and returns
+// immediately. The loop ends when ctx is done or Stop is called.
+func (d *Dispatcher) Start(ctx context.Context) {
+	go d.loop(ctx)
+}
+
+// Stop ends the polling loop by closing the stop channel. Calling it
+// again after it has returned is a no-op rather than a double-close
+// panic, so shutdown paths may call it defensively. It is not meant to
+// be called from several goroutines at the same time.
+func (d *Dispatcher) Stop() {
+	select {
+	case <-d.stop:
+		// Already closed by an earlier Stop.
+	default:
+		close(d.stop)
+	}
+}
+
+// loop polls for work every 2 seconds, calling pickNext on each tick,
+// until Stop is called or ctx is done.
+func (d *Dispatcher) loop(ctx context.Context) {
+	ticker := time.NewTicker(2 * time.Second)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ticker.C:
+			d.pickNext(ctx)
+		case <-d.stop:
+			return
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+// pickNext performs one dispatch attempt: if the in-flight count is
+// below Max and a Queued run exists, it transitions the first such run
+// via TriggerDispatched and sends its host a Wake-on-LAN packet.
+//
+// A slot on d.active is taken without blocking for the duration of the
+// call and always given back on return; the concurrency cap itself is
+// enforced by counting in-flight run states from the DB. Every failure
+// is logged and ends the attempt; the next tick tries again.
+func (d *Dispatcher) pickNext(ctx context.Context) {
+	select {
+	case d.active <- struct{}{}:
+	default:
+		return // at capacity
+	}
+	defer func() { <-d.active }()
+
+	runs, err := d.Runs.Active(ctx)
+	if err != nil {
+		log.Printf("dispatcher: list active: %v", err)
+		return
+	}
+
+	// States between dispatch and the final report count against Max.
+	isInFlight := func(s model.RunState) bool {
+		switch s {
+		case model.StateWaitingWoL, model.StateBooting, model.StateInventoryCheck,
+			model.StateSpecValidate, model.StateSMART, model.StateCPUStress,
+			model.StateStorage, model.StateNetwork, model.StateGPU,
+			model.StatePSU, model.StateReporting:
+			return true
+		}
+		return false
+	}
+
+	var queued *model.Run
+	inFlight := 0
+	for i := range runs {
+		run := &runs[i]
+		if run.State == model.StateQueued {
+			// Only the first queued run in list order is dispatched.
+			if queued == nil {
+				queued = run
+			}
+			continue
+		}
+		if isInFlight(run.State) {
+			inFlight++
+		}
+	}
+	if queued == nil || inFlight >= d.Max {
+		return
+	}
+
+	host, err := d.Hosts.Get(ctx, queued.HostID)
+	if err != nil {
+		log.Printf("dispatcher: get host %d: %v", queued.HostID, err)
+		return
+	}
+	if _, err := d.Runner.Transition(ctx, queued.ID, TriggerDispatched); err != nil {
+		log.Printf("dispatcher: transition run %d: %v", queued.ID, err)
+		return
+	}
+	if err := SendWoL(host.MAC, host.WoLBroadcastIP, host.WoLPort); err != nil {
+		// The run has already been transitioned; it stays where it is
+		// so the operator can retry or investigate.
+		log.Printf("dispatcher: WoL run %d host %s: %v", queued.ID, host.Name, err)
+		return
+	}
+	log.Printf("dispatcher: WoL sent for run %d (host=%s mac=%s)", queued.ID, host.Name, host.MAC)
+}
@@ -0,0 +1,92 @@
+package orchestrator
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log"
+	"os"
+	"os/exec"
+	"strconv"
+	"sync"
+	"time"
+)
+
+// IperfSupervisor runs a single `iperf3 -s` process under the
+// orchestrator so the Network stage has a stable server to dial. Each
+// run's Network test is sequential (stages are always serial), so one
+// server process handles every host under test.
+//
+// A missing iperf3 binary is logged on each Start attempt and the
+// supervisor stays a no-op — the agent's Network stage will then fail
+// to connect and is expected to handle that through its own error path.
+type IperfSupervisor struct {
+	Port int // default 5201
+
+	mu      sync.Mutex
+	cmd     *exec.Cmd
+	done    chan struct{} // closed by wait() once cmd has been reaped
+	started bool
+	fatal   error
+}
+
+// NewIperfSupervisor returns a supervisor for the given port; a port
+// of zero or below selects iperf3's default, 5201.
+func NewIperfSupervisor(port int) *IperfSupervisor {
+	if port <= 0 {
+		port = 5201
+	}
+	return &IperfSupervisor{Port: port}
+}
+
+// Start launches `iperf3 -s -p <Port>` tied to ctx and begins waiting
+// on it in the background. It is a no-op while a process is already
+// running. If iperf3 is not in PATH the condition is logged and nil is
+// returned so the orchestrator can still start; a failure to start the
+// process itself is returned.
+func (s *IperfSupervisor) Start(ctx context.Context) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.started {
+		return nil
+	}
+	if _, err := exec.LookPath("iperf3"); err != nil {
+		s.fatal = fmt.Errorf("iperf3 not in PATH: %w", err)
+		log.Printf("iperf supervisor: %v (Network stage will fail to connect)", s.fatal)
+		return nil
+	}
+	cmd := exec.CommandContext(ctx, "iperf3", "-s", "-p", strconv.Itoa(s.Port))
+	if err := cmd.Start(); err != nil {
+		s.fatal = err
+		return err
+	}
+	s.cmd = cmd
+	s.done = make(chan struct{})
+	s.started = true
+	log.Printf("iperf supervisor: iperf3 -s -p %d (pid=%d)", s.Port, cmd.Process.Pid)
+	go s.wait()
+	return nil
+}
+
+// Shutdown politely stops the iperf3 subprocess. Called from main on
+// SIGINT. It sends an interrupt, then waits up to timeout for the
+// process to be reaped; after that it kills the process and returns an
+// error. It returns nil if no process was ever started.
+//
+// Shutdown never calls cmd.Wait itself: the wait goroutine started by
+// Start is the only waiter, and Shutdown watches the channel that
+// goroutine closes. Calling Wait twice on one exec.Cmd is an error.
+func (s *IperfSupervisor) Shutdown(timeout time.Duration) error {
+	s.mu.Lock()
+	cmd, done := s.cmd, s.done
+	s.mu.Unlock()
+	if cmd == nil || cmd.Process == nil {
+		return nil
+	}
+	// os.Interrupt is cross-platform; on Linux it maps to SIGINT which
+	// iperf3 handles gracefully. On Windows (dev only) it's a no-op and
+	// we'll fall through to Kill after the timeout.
+	_ = cmd.Process.Signal(os.Interrupt)
+
+	timer := time.NewTimer(timeout)
+	defer timer.Stop()
+	select {
+	case <-done:
+		return nil
+	case <-timer.C:
+		_ = cmd.Process.Kill()
+		return errors.New("iperf3 did not exit in time; killed")
+	}
+}
+
+// wait reaps the subprocess started by Start, marks the supervisor as
+// not running so a later Start can launch a new process, and closes
+// the done channel to release any Shutdown waiting on it. The exit
+// error is deliberately ignored: iperf3 exiting is not actionable here.
+func (s *IperfSupervisor) wait() {
+	// Start launches this goroutine while holding mu, so this blocks
+	// until Start has returned and then reads a consistent pair.
+	s.mu.Lock()
+	cmd, done := s.cmd, s.done
+	s.mu.Unlock()
+
+	_ = cmd.Wait()
+
+	s.mu.Lock()
+	s.started = false
+	s.mu.Unlock()
+	close(done)
+}
@@ -0,0 +1,118 @@
+package orchestrator
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"time"
+
+	"vetting/internal/events"
+	"vetting/internal/model"
+	"vetting/internal/store"
+)
+
+// Runner is the authoritative mutator for run state. State
+// transitions go through (*Runner).Transition so the DB update and
+// the event publication happen together; (*Runner).Override is the
+// one other method here that writes run state directly.
+type Runner struct {
+	Runs     *store.Runs   // run rows: read, state updates, override bookkeeping
+	Hosts    *store.Hosts  // host rows, read when a tile is re-rendered
+	Stages   *store.Stages // stage rows, marked running by StartStage
+	EventHub *events.Hub   // receives the "tile-<hostID>" events published after a change
+}
+
+// Transition applies trigger to the run identified by runID: it loads
+// the run, asks Next for the target state, persists that state, logs
+// the change and publishes a tile refresh for the run's host. It
+// returns the new state, or an error if the run cannot be loaded, the
+// trigger is not legal from the current state, or the update fails.
+func (r *Runner) Transition(ctx context.Context, runID int64, trigger Trigger) (model.RunState, error) {
+	current, err := r.Runs.Get(ctx, runID)
+	if err != nil {
+		return "", fmt.Errorf("get run: %w", err)
+	}
+
+	target, err := Next(current.State, trigger)
+	if err != nil {
+		return "", err
+	}
+
+	if persistErr := r.Runs.SetState(ctx, runID, target); persistErr != nil {
+		return "", fmt.Errorf("persist transition: %w", persistErr)
+	}
+
+	log.Printf("run %d: %s -> %s (%s)", runID, current.State, target, trigger)
+	r.publishTileUpdate(ctx, current.HostID)
+	return target, nil
+}
+
+// StartStage marks a stage row running and publishes a tile refresh.
+//
+// Only a failure to update the stage row is returned. The tile refresh
+// is best effort: if the run cannot be re-read the stage is already
+// marked running, so the problem is logged and nil is returned.
+func (r *Runner) StartStage(ctx context.Context, runID int64, name string) error {
+	if err := r.Stages.StartByName(ctx, runID, name); err != nil {
+		return err
+	}
+	run, err := r.Runs.Get(ctx, runID)
+	if err != nil {
+		// Previously dropped silently, which left a stale tile with no
+		// trace of why it was not refreshed.
+		log.Printf("StartStage: get run %d for tile refresh: %v", runID, err)
+		return nil
+	}
+	r.publishTileUpdate(ctx, run.HostID)
+	return nil
+}
+
+// publishTileUpdate re-renders the tile for hostID from the host row
+// and its latest run, then publishes it on the event hub under the
+// name "tile-<hostID>". Lookup failures are logged and the update is
+// dropped; nothing is returned to the caller.
+func (r *Runner) publishTileUpdate(ctx context.Context, hostID int64) {
+	host, hostErr := r.Hosts.Get(ctx, hostID)
+	if hostErr != nil {
+		log.Printf("publishTileUpdate: get host %d: %v", hostID, hostErr)
+		return
+	}
+
+	latest, runErr := r.Runs.LatestForHost(ctx, hostID)
+	if runErr != nil {
+		log.Printf("publishTileUpdate: latest run: %v", runErr)
+		return
+	}
+
+	fragment := renderTileSSE(ctx, *host, latest)
+	r.EventHub.Publish(events.Event{
+		Name:    fmt.Sprintf("tile-%d", hostID),
+		Payload: fragment,
+	})
+}
+
+// TileRenderer renders a single tile fragment. It is a package-level
+// hook meant to be assigned at startup, which keeps the orchestrator
+// package free of template / store-enrichment imports. The assigned
+// function is expected to perform whatever DB lookups it needs (spec-
+// diff count, hold-key path, …) before handing data to the template
+// package.
+var TileRenderer func(ctx context.Context, host model.Host, latest *model.Run) string
+
+// renderTileSSE produces the payload for a tile event. With no
+// TileRenderer registered it falls back to a minimal placeholder
+// <article> keyed by the host ID.
+func renderTileSSE(ctx context.Context, host model.Host, latest *model.Run) string {
+	if TileRenderer != nil {
+		return TileRenderer(ctx, host, latest)
+	}
+	return fmt.Sprintf(`<article id="host-%d">state change</article>`, host.ID)
+}
+
+// TouchHeartbeat is the hook for agent heartbeats. It is currently a
+// placeholder: both statements in the body are discards, so nothing is
+// logged or persisted yet. The stated plan (Phase 3+) is to record a
+// last_seen_at value here.
+func (r *Runner) TouchHeartbeat(runID int64) {
+	_ = runID
+	_ = time.Now() // no effect; also the only use of the time import visible in this file's diff
+}
+
+// Override re-enters a held stage after the operator has acknowledged
+// the failure condition (e.g. wipe-probe override).
+//
+// Steps, in order: load the run, require a non-empty failed_stage,
+// resolve the target state with NextForOverride, store flagsJSON,
+// persist the new state, clear failed_stage, then log and publish a
+// tile refresh. The three writes are separate calls; a failure while
+// clearing failed_stage is only logged, earlier failures abort with an
+// error. Returns the state the run was moved to.
+func (r *Runner) Override(ctx context.Context, runID int64, flagsJSON string) (model.RunState, error) {
+	held, err := r.Runs.Get(ctx, runID)
+	if err != nil {
+		return "", fmt.Errorf("get run: %w", err)
+	}
+	if held.FailedStage == "" {
+		return "", fmt.Errorf("override: run has no failed_stage")
+	}
+
+	resume, err := NextForOverride(held.State, held.FailedStage)
+	if err != nil {
+		return "", err
+	}
+
+	if err = r.Runs.SetOverrideFlags(ctx, runID, flagsJSON); err != nil {
+		return "", fmt.Errorf("persist override flags: %w", err)
+	}
+	if err = r.Runs.SetState(ctx, runID, resume); err != nil {
+		return "", fmt.Errorf("override transition: %w", err)
+	}
+	// Best effort: the run has already left the hold state.
+	if clearErr := r.Runs.ClearFailedStage(ctx, runID); clearErr != nil {
+		log.Printf("override: clear failed_stage: %v", clearErr)
+	}
+
+	log.Printf("run %d: %s -> %s (OperatorOverride stage=%s flags=%s)", runID, held.State, resume, held.FailedStage, flagsJSON)
+	r.publishTileUpdate(ctx, held.HostID)
+	return resume, nil
+}
@@ -0,0 +1,129 @@
+package orchestrator
+
+import (
+	"fmt"
+
+	"vetting/internal/model"
+)
+
+// Trigger is an event that drives a state transition.
+//
+// Most triggers are resolved through the static transition table.
+// Two are not table entries: TriggerStageCompleted is resolved by
+// walking stageOrder (see Next), and TriggerOperatorOverride has no
+// table entry at all, so Next rejects it as unknown; override targets
+// are computed by NextForOverride instead.
+type Trigger string
+
+const (
+	TriggerStartRequested   Trigger = "StartRequested"   // user clicks Start Vetting
+	TriggerDispatched       Trigger = "Dispatched"       // dispatcher picked this run
+	TriggerPXEObserved      Trigger = "PXEObserved"      // iPXE fetched cmdline for MAC
+	TriggerAgentClaimed     Trigger = "AgentClaimed"     // agent POSTed /claim with valid token
+	TriggerStageFailed      Trigger = "StageFailed"      // a stage reported failure
+	TriggerStageCompleted   Trigger = "StageCompleted"   // a stage reported success → advance
+	TriggerAllStagesPassed  Trigger = "AllStagesPassed"  // final stage passed
+	TriggerOperatorReleased Trigger = "OperatorReleased" // user clicked Release on a held run
+	TriggerOperatorOverride Trigger = "OperatorOverride" // user overrode a held stage; re-enter it
+)
+
+// stageStates maps the canonical stage name (from DefaultStageOrder)
+// to the matching RunState. The two are named differently for
+// historical reasons: the first stage row is called "Inventory" while
+// its run state is InventoryCheck. Later stages share a name with
+// their state. It backs both StateForStage and NextForOverride.
+var stageStates = map[string]model.RunState{
+	"Inventory":    model.StateInventoryCheck,
+	"SpecValidate": model.StateSpecValidate,
+	"SMART":        model.StateSMART,
+	"CPUStress":    model.StateCPUStress,
+	"Storage":      model.StateStorage,
+	"Network":      model.StateNetwork,
+	"GPU":          model.StateGPU,
+	"PSU":          model.StatePSU,
+	"Reporting":    model.StateReporting,
+}
+
+// stageOrder is the sequence of RunStates the run walks through from
+// first stage to Completed. Kept in sync with store.DefaultStageOrder
+// by hand. nextStageState advances one position per StageCompleted and
+// maps the last entry (Reporting) to Completed.
+var stageOrder = []model.RunState{
+	model.StateInventoryCheck,
+	model.StateSpecValidate,
+	model.StateSMART,
+	model.StateCPUStress,
+	model.StateStorage,
+	model.StateNetwork,
+	model.StateGPU,
+	model.StatePSU,
+	model.StateReporting,
+}
+
+// transition is one row of the trigger table: the trigger is legal
+// from any state listed in from and always lands on to.
+type transition struct {
+	from []model.RunState
+	to   model.RunState
+}
+
+// table holds every trigger whose target does not depend on where the
+// run currently is. TriggerStageCompleted and TriggerOperatorOverride
+// are deliberately absent (see Next and NextForOverride).
+var table = map[Trigger]transition{
+	TriggerStartRequested:   {from: []model.RunState{model.StateRegistered}, to: model.StateQueued},
+	TriggerDispatched:       {from: []model.RunState{model.StateQueued}, to: model.StateWaitingWoL},
+	TriggerPXEObserved:      {from: []model.RunState{model.StateWaitingWoL, model.StateBooting}, to: model.StateBooting},
+	TriggerAgentClaimed:     {from: []model.RunState{model.StateBooting, model.StateWaitingWoL}, to: model.StateInventoryCheck},
+	TriggerStageFailed:      {from: allActiveStates(), to: model.StateFailedHolding},
+	TriggerAllStagesPassed:  {from: []model.RunState{model.StateReporting}, to: model.StateCompleted},
+	TriggerOperatorReleased: {from: []model.RunState{model.StateFailedHolding}, to: model.StateReleased},
+}
+
+// Next resolves the state a run moves to when trigger t fires while it
+// is in current. TriggerStageCompleted is delegated to the stage walk;
+// every other trigger must have a table row whose from-list contains
+// current. An error is returned for an unknown trigger or a trigger
+// that is not legal from current.
+func Next(current model.RunState, t Trigger) (model.RunState, error) {
+	if t == TriggerStageCompleted {
+		return nextStageState(current)
+	}
+
+	rule, known := table[t]
+	if !known {
+		return "", fmt.Errorf("unknown trigger %q", t)
+	}
+
+	for i := range rule.from {
+		if rule.from[i] == current {
+			return rule.to, nil
+		}
+	}
+	return "", fmt.Errorf("trigger %q not allowed from %q", t, current)
+}
+
+// NextForOverride returns the state to jump to when the operator
+// overrides a held stage. It lives outside the generic table because
+// the target is derived from failedStage rather than from the current
+// state, which must be FailedHolding. An error is returned when the
+// run is not holding or failedStage is not a known stage name.
+func NextForOverride(current model.RunState, failedStage string) (model.RunState, error) {
+	if current != model.StateFailedHolding {
+		return "", fmt.Errorf("override not allowed from %q", current)
+	}
+	if target, found := stageStates[failedStage]; found {
+		return target, nil
+	}
+	return "", fmt.Errorf("override: unknown failed stage %q", failedStage)
+}
+
+// StateForStage looks up the RunState that corresponds to a stage
+// name; the boolean reports whether the name is known. Handlers use it
+// to guard against stale or out-of-order agent reports.
+func StateForStage(name string) (state model.RunState, ok bool) {
+	state, ok = stageStates[name]
+	return state, ok
+}
+
+// nextStageState returns the stage state that follows current in
+// stageOrder, or StateCompleted when current is the last stage. States
+// that are not part of stageOrder yield an error.
+func nextStageState(current model.RunState) (model.RunState, error) {
+	last := len(stageOrder) - 1
+	for idx := 0; idx <= last; idx++ {
+		if stageOrder[idx] != current {
+			continue
+		}
+		if idx == last {
+			return model.StateCompleted, nil
+		}
+		return stageOrder[idx+1], nil
+	}
+	return "", fmt.Errorf("StageCompleted not valid from %q", current)
+}
+
+// allActiveStates lists every state a run can fail from: the three
+// pre-agent states followed by each stage state in stageOrder. A fresh
+// slice is returned on every call.
+func allActiveStates() []model.RunState {
+	active := []model.RunState{
+		model.StateQueued,
+		model.StateWaitingWoL,
+		model.StateBooting,
+	}
+	// stageOrder already enumerates InventoryCheck … Reporting in the
+	// same order the original literal spelled out.
+	return append(active, stageOrder...)
+}
@@ -0,0 +1,67 @@
+package orchestrator_test
+
+import (
+	"testing"
+
+	"vetting/internal/model"
+	"vetting/internal/orchestrator"
+)
+
+func TestNextForOverride(t *testing.T) {
+	tests := []struct {
+		name        string
+		from        model.RunState
+		failedStage string
+		want        model.RunState
+		wantErr     bool
+	}{
+		{"storage override", model.StateFailedHolding, "Storage", model.StateStorage, false},
+		{"smart override", model.StateFailedHolding, "SMART", model.StateSMART, false},
+		{"inventory override", model.StateFailedHolding, "Inventory", model.StateInventoryCheck, false},
+		{"unknown stage", model.StateFailedHolding, "NotAStage", "", true},
+		{"not holding", model.StateStorage, "Storage", "", true},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := orchestrator.NextForOverride(tc.from, tc.failedStage)
+			if tc.wantErr {
+				if err == nil {
+					t.Fatalf("expected error, got %q", got)
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if got != tc.want {
+				t.Fatalf("got %q, want %q", got, tc.want)
+			}
+		})
+	}
+}
+
+// TestNextStageWalk drives StageCompleted from every stage in turn and
+// expects to land on the following stage, with Reporting leading to
+// Completed.
+func TestNextStageWalk(t *testing.T) {
+	chain := []model.RunState{
+		model.StateInventoryCheck,
+		model.StateSpecValidate,
+		model.StateSMART,
+		model.StateCPUStress,
+		model.StateStorage,
+		model.StateNetwork,
+		model.StateGPU,
+		model.StatePSU,
+		model.StateReporting,
+		model.StateCompleted,
+	}
+	// Pair each state with its successor; the final entry has none.
+	for i, from := range chain[:len(chain)-1] {
+		want := chain[i+1]
+		got, err := orchestrator.Next(from, orchestrator.TriggerStageCompleted)
+		if err != nil {
+			t.Fatalf("Next(%q): %v", from, err)
+		}
+		if got != want {
+			t.Fatalf("Next(%q) = %q, want %q", from, got, want)
+		}
+	}
+}
@@ -0,0 +1,26 @@
+package orchestrator
+
+import (
+	"crypto/rand"
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+)
+
+// IssueRunToken returns (plaintext, hashHex). The plaintext is passed
+// to the host via the iPXE kernel cmdline; the hash is persisted in the
+// runs table for later constant-time comparison.
+func IssueRunToken() (string, string, error) {
+	b := make([]byte, 32)
+	if _, err := rand.Read(b); err != nil {
+		return "", "", fmt.Errorf("random: %w", err)
+	}
+	plain := hex.EncodeToString(b)
+	sum := sha256.Sum256([]byte(plain))
+	return plain, hex.EncodeToString(sum[:]), nil
+}
+
+// HashRunToken returns the lowercase hex SHA-256 digest of plain. It
+// produces the same hash IssueRunToken returns for a given plaintext.
+func HashRunToken(plain string) string {
+	hasher := sha256.New()
+	hasher.Write([]byte(plain)) // hash.Hash writes never return an error
+	return hex.EncodeToString(hasher.Sum(nil))
+}
@@ -0,0 +1,38 @@
+package orchestrator
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestIssueRunTokenRoundTrip checks the shape of an issued token, that
+// HashRunToken reproduces the issued hash, and that two consecutive
+// tokens differ.
+func TestIssueRunTokenRoundTrip(t *testing.T) {
+	plain, hash, err := IssueRunToken()
+	if err != nil {
+		t.Fatalf("IssueRunToken: %v", err)
+	}
+	if n := len(plain); n != 64 {
+		t.Fatalf("plaintext should be 64 hex chars, got %d", n)
+	}
+	if n := len(hash); n != 64 {
+		t.Fatalf("hash should be 64 hex chars, got %d", n)
+	}
+	if rehashed := HashRunToken(plain); rehashed != hash {
+		t.Fatalf("HashRunToken(plain) != hash")
+	}
+	// Ensure high entropy: two consecutive issues differ.
+	second, _, _ := IssueRunToken()
+	if second == plain {
+		t.Fatalf("expected distinct tokens on consecutive calls")
+	}
+}
+
+// TestHashRunTokenDeterministic checks that hashing is repeatable for
+// one input and differs between two distinct inputs.
+func TestHashRunTokenDeterministic(t *testing.T) {
+	first, again := HashRunToken("abc"), HashRunToken("abc")
+	if first != again {
+		t.Fatalf("hash not deterministic")
+	}
+	other := HashRunToken("abd")
+	if strings.EqualFold(first, other) {
+		t.Fatalf("hash should differ for distinct inputs")
+	}
+}
@@ -0,0 +1,57 @@
+package orchestrator
+
+import (
+	"encoding/hex"
+	"fmt"
+	"net"
+	"strconv"
+	"strings"
+)
+
+// SendWoL sends a Wake-on-LAN magic packet for mac (aa:bb:cc:dd:ee:ff)
+// as a single UDP datagram to broadcastIP:port. The payload is 6 bytes
+// of 0xFF followed by the 6-byte MAC repeated 16 times (102 bytes).
+// It returns an error if the MAC does not parse, the socket cannot be
+// opened, or the write fails.
+func SendWoL(mac, broadcastIP string, port int) error {
+	hw, err := parseMAC(mac)
+	if err != nil {
+		return err
+	}
+
+	magic := make([]byte, 0, 6+16*6)
+	magic = append(magic, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff)
+	for rep := 0; rep < 16; rep++ {
+		magic = append(magic, hw...)
+	}
+
+	target := net.JoinHostPort(broadcastIP, strconv.Itoa(port))
+	conn, err := net.Dial("udp", target)
+	if err != nil {
+		return fmt.Errorf("dial wol: %w", err)
+	}
+	defer conn.Close()
+
+	if _, err = conn.Write(magic); err != nil {
+		return fmt.Errorf("write wol: %w", err)
+	}
+	return nil
+}
+
+// parseMAC converts a colon-separated MAC address into its 6 raw
+// bytes. Surrounding whitespace is trimmed and the input is lowercased
+// first, so either letter case is accepted. Exactly six two-character
+// hex octets are required; other separators (e.g. dashes) or a
+// different octet count are rejected with an error.
+func parseMAC(s string) ([]byte, error) {
+	s = strings.ToLower(strings.TrimSpace(s))
+	octets := strings.Split(s, ":")
+	if len(octets) != 6 {
+		return nil, fmt.Errorf("invalid MAC %q", s)
+	}
+
+	hw := make([]byte, 0, len(octets))
+	for _, octet := range octets {
+		if len(octet) != 2 {
+			return nil, fmt.Errorf("invalid MAC octet %q", octet)
+		}
+		decoded, err := hex.DecodeString(octet)
+		if err != nil {
+			return nil, fmt.Errorf("invalid MAC %q: %w", s, err)
+		}
+		// Two hex characters always decode to exactly one byte.
+		hw = append(hw, decoded...)
+	}
+	return hw, nil
+}
@@ -0,0 +1,37 @@
+package orchestrator
+
+import (
+	"bytes"
+	"testing"
+)
+
+func TestParseMAC(t *testing.T) {
+	got, err := parseMAC("aa:bb:cc:dd:ee:ff")
+	if err != nil {
+		t.Fatalf("parseMAC: %v", err)
+	}
+	want := []byte{0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}
+	if !bytes.Equal(got, want) {
+		t.Fatalf("parseMAC: %x != %x", got, want)
+	}
+}
+
+func TestParseMACUpper(t *testing.T) {
+	// Must be case-insensitive so users can paste either form.
+	got, err := parseMAC("AA:BB:CC:DD:EE:FF")
+	if err != nil {
+		t.Fatalf("parseMAC upper: %v", err)
+	}
+	want := []byte{0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}
+	if !bytes.Equal(got, want) {
+		t.Fatalf("parseMAC upper: %x != %x", got, want)
+	}
+}
+
+// TestParseMACInvalid checks that malformed addresses are rejected:
+// empty, too few octets, non-hex digits, dash separators, too many
+// octets.
+func TestParseMACInvalid(t *testing.T) {
+	rejected := []string{
+		"",
+		"aa:bb:cc",
+		"zz:yy:xx:ww:vv:uu",
+		"aa-bb-cc-dd-ee-ff",
+		"aa:bb:cc:dd:ee:ff:00",
+	}
+	for i := range rejected {
+		_, err := parseMAC(rejected[i])
+		if err == nil {
+			t.Errorf("expected error for %q", rejected[i])
+		}
+	}
+}
@@ -0,0 +1,231 @@
+package pxe
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"sync"
+	"text/template"
+	"time"
+
+	"vetting/internal/model"
+)
+
+// SupervisorConfig controls how dnsmasq is launched and configured.
+// Interface, DHCPRange, OrchestratorURL and TFTPRoot are substituted
+// verbatim into the generated dnsmasq.conf.
+type SupervisorConfig struct {
+	Enabled         bool   // false turns Start, Reload and Shutdown into no-ops
+	Interface       string // e.g. "eth0"
+	DHCPRange       string // e.g. "10.77.0.100,10.77.0.200,12h"
+	OrchestratorURL string // baked into iPXE scripts
+	RuntimeDir      string // writable dir for dnsmasq.conf and leases
+	TFTPRoot        string // holds ipxe.efi, undionly.kpxe
+	DNSMasqBin      string // path to dnsmasq binary (default: "dnsmasq")
+}
+
+// Supervisor owns a dnsmasq subprocess, rewrites its config when the
+// host registry changes, and sends SIGHUP to reload. The MAC allowlist
+// is the safety barrier: only registered MACs see a DHCP reply.
+type Supervisor struct {
+	cfg    SupervisorConfig   // fixed at construction by NewSupervisor
+	mu     sync.Mutex         // guards cmd and cancel
+	cmd    *exec.Cmd          // running dnsmasq process; nil until Start succeeds
+	cancel context.CancelFunc // cancels the context the subprocess is bound to
+}
+
+// NewSupervisor builds a Supervisor around a copy of cfg. An empty
+// DNSMasqBin is replaced by "dnsmasq" so the binary is resolved by
+// name. Nothing is started until Start is called.
+func NewSupervisor(cfg SupervisorConfig) *Supervisor {
+	sup := &Supervisor{cfg: cfg}
+	if sup.cfg.DNSMasqBin == "" {
+		sup.cfg.DNSMasqBin = "dnsmasq"
+	}
+	return sup
+}
+
+// Start launches dnsmasq in the background. If cfg.Enabled is false
+// Start is a no-op (useful for dev on Windows where dnsmasq isn't
+// available); when enabled on Windows it returns an error.
+//
+// It creates RuntimeDir, renders dnsmasq.conf for hosts, then runs
+// dnsmasq in the foreground (--no-daemon) with query and DHCP logging,
+// bound to a context derived from ctx. The process's stdout and stderr
+// are forwarded line by line to the standard logger. A goroutine reaps
+// the process and logs an unexpected exit.
+func (s *Supervisor) Start(ctx context.Context, hosts []model.Host) error {
+	switch {
+	case !s.cfg.Enabled:
+		log.Printf("pxe: disabled in config — skipping dnsmasq")
+		return nil
+	case runtime.GOOS == "windows":
+		return fmt.Errorf("dnsmasq supervision is not supported on Windows — run orchestrator on Linux")
+	}
+
+	if err := os.MkdirAll(s.cfg.RuntimeDir, 0o755); err != nil {
+		return fmt.Errorf("mkdir runtime: %w", err)
+	}
+	if err := s.writeConf(hosts); err != nil {
+		return err
+	}
+
+	runCtx, stop := context.WithCancel(ctx)
+	s.mu.Lock()
+	s.cancel = stop
+	s.mu.Unlock()
+
+	args := []string{
+		"--conf-file=" + filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf"),
+		"--no-daemon",
+		"--log-queries",
+		"--log-dhcp",
+	}
+	proc := exec.CommandContext(runCtx, s.cfg.DNSMasqBin, args...)
+	sink := logWriter{prefix: "dnsmasq"}
+	proc.Stdout = sink
+	proc.Stderr = sink
+	if err := proc.Start(); err != nil {
+		stop()
+		return fmt.Errorf("start dnsmasq: %w", err)
+	}
+
+	s.mu.Lock()
+	s.cmd = proc
+	s.mu.Unlock()
+
+	go func() {
+		err := proc.Wait()
+		// An exit caused by our own cancellation is expected; stay quiet.
+		if err != nil && runCtx.Err() == nil {
+			log.Printf("dnsmasq exited: %v", err)
+		}
+	}()
+	return nil
+}
+
+// Reload rewrites the conf with the latest host registry and sends
+// SIGHUP to the running dnsmasq process. It is a no-op when the
+// supervisor is disabled, and it only rewrites the file when no
+// process has been started. There is no restart fallback: if the
+// signal cannot be delivered (sighup returns an error, as the Windows
+// build always does) that error is returned.
+//
+// NOTE(review): the dnsmasq man page states that SIGHUP does NOT
+// re-read the main configuration file (only hosts files and
+// --dhcp-hostsfile/--dhcp-optsfile style inputs). The allowlist is
+// rendered as dhcp-host= lines inside dnsmasq.conf itself, so a newly
+// registered MAC may not take effect until dnsmasq is restarted.
+// Confirm, and consider moving the allowlist to a --dhcp-hostsfile or
+// restarting the subprocess here.
+func (s *Supervisor) Reload(hosts []model.Host) error {
+	if !s.cfg.Enabled {
+		return nil
+	}
+	if err := s.writeConf(hosts); err != nil {
+		return err
+	}
+	s.mu.Lock()
+	cmd := s.cmd
+	s.mu.Unlock()
+	if cmd == nil || cmd.Process == nil {
+		return nil
+	}
+	if err := sighup(cmd.Process); err != nil {
+		return fmt.Errorf("sighup dnsmasq: %w", err)
+	}
+	return nil
+}
+
+// Shutdown stops dnsmasq within the timeout. It cancels the context
+// the subprocess was started with, waits up to timeout for the process
+// to be reaped, and sends an explicit kill if that does not happen in
+// time. It always returns nil, and does nothing when the supervisor is
+// disabled or was never started.
+func (s *Supervisor) Shutdown(timeout time.Duration) error {
+	if !s.cfg.Enabled {
+		return nil
+	}
+
+	s.mu.Lock()
+	stop, proc := s.cancel, s.cmd
+	s.mu.Unlock()
+
+	if stop != nil {
+		stop()
+	}
+	if proc == nil || proc.Process == nil {
+		return nil
+	}
+
+	reaped := make(chan struct{})
+	go func() {
+		defer close(reaped)
+		_, _ = proc.Process.Wait()
+	}()
+
+	select {
+	case <-reaped:
+	case <-time.After(timeout):
+		_ = proc.Process.Kill()
+	}
+	return nil
+}
+
+// writeConf renders dnsmasqTemplate for hosts and installs the result
+// as <RuntimeDir>/dnsmasq.conf. The file is written to a ".new"
+// sibling, synced and closed, then renamed over the live path so a
+// reader never sees a half-written config. On any failure the
+// temporary file is removed and an error naming the failed step is
+// returned; the previous config is left in place.
+func (s *Supervisor) writeConf(hosts []model.Host) (err error) {
+	tmpl, err := template.New("dnsmasq").Parse(dnsmasqTemplate)
+	if err != nil {
+		return fmt.Errorf("parse conf template: %w", err)
+	}
+	conf := filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf")
+	tmp := conf + ".new"
+	f, err := os.Create(tmp)
+	if err != nil {
+		return fmt.Errorf("create conf: %w", err)
+	}
+	// Don't leave a stale or partial .new file behind on failure.
+	// Every error path below closes f before returning, so the remove
+	// happens on a closed file.
+	defer func() {
+		if err != nil {
+			_ = os.Remove(tmp)
+		}
+	}()
+
+	data := struct {
+		Cfg   SupervisorConfig
+		Hosts []model.Host
+	}{s.cfg, hosts}
+	if err = tmpl.Execute(f, data); err != nil {
+		_ = f.Close()
+		return fmt.Errorf("render conf: %w", err)
+	}
+	if err = f.Sync(); err != nil {
+		_ = f.Close()
+		return fmt.Errorf("sync conf: %w", err)
+	}
+	if err = f.Close(); err != nil {
+		return fmt.Errorf("close conf: %w", err)
+	}
+	if err = os.Rename(tmp, conf); err != nil {
+		return fmt.Errorf("rename conf: %w", err)
+	}
+	return nil
+}
+
+// Exposed for the UI handlers to show operators what config is live.
+func (s *Supervisor) ConfPath() string {
+	return filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf")
+}
+
+// logWriter forwards written bytes to the standard logger, one log
+// entry per non-empty line, each tagged with "[prefix]".
+type logWriter struct{ prefix string }
+
+// Write splits p on newlines and logs every non-empty line. It always
+// reports the whole of p as consumed and never returns an error. Each
+// call is treated independently: a line split across two writes is
+// logged as two entries.
+func (w logWriter) Write(p []byte) (int, error) {
+	rest := strings.TrimRight(string(p), "\n")
+	for rest != "" {
+		line := rest
+		if nl := strings.IndexByte(rest, '\n'); nl >= 0 {
+			line, rest = rest[:nl], rest[nl+1:]
+		} else {
+			rest = ""
+		}
+		if line != "" {
+			log.Printf("[%s] %s", w.prefix, line)
+		}
+	}
+	return len(p), nil
+}
+
+// Compile-time assertion that logWriter implements io.Writer.
+var _ io.Writer = logWriter{}
+
+// dnsmasqTemplate is the text/template source that writeConf renders
+// into <RuntimeDir>/dnsmasq.conf. It is executed with a struct that
+// exposes .Cfg (the SupervisorConfig) and .Hosts (the registered
+// hosts); every host contributes one "dhcp-host=<MAC>,set:known"
+// allowlist line. The text between the backquotes is emitted verbatim,
+// including its "#" comments, which are dnsmasq comments rather than
+// Go comments.
+const dnsmasqTemplate = `# Generated by Vetting — do not hand-edit.
+interface={{ .Cfg.Interface }}
+bind-interfaces
+port=0
+domain-needed
+bogus-priv
+no-resolv
+
+# MAC allowlist: dnsmasq only answers DHCP for MACs with a dhcp-host= below.
+dhcp-ignore=tag:!known
+{{- range .Hosts }}
+dhcp-host={{ .MAC }},set:known
+{{- end }}
+
+# DHCP range (broader subnet coverage is fine; allowlist above gates replies).
+dhcp-range={{ .Cfg.DHCPRange }}
+
+# TFTP + HTTP boot (iPXE chainload).
+enable-tftp
+tftp-root={{ .Cfg.TFTPRoot }}
+
+# BIOS (undionly.kpxe) and UEFI (ipxe.efi) clients both get iPXE first,
+# which then re-requests a per-MAC script from the orchestrator.
+dhcp-match=set:bios,option:client-arch,0
+dhcp-match=set:efi64,option:client-arch,7
+dhcp-match=set:efi64,option:client-arch,9
+
+# If the client is iPXE itself, send it the per-MAC HTTP script.
+dhcp-match=set:ipxe,175
+dhcp-boot=tag:ipxe,{{ .Cfg.OrchestratorURL }}/ipxe/${mac}
+
+# Otherwise (first boot from ROM) chainload iPXE from TFTP.
+dhcp-boot=tag:!ipxe,tag:bios,undionly.kpxe
+dhcp-boot=tag:!ipxe,tag:efi64,ipxe.efi
+
+log-facility=-
+`
@@ -0,0 +1,88 @@
+package pxe
+
+import (
+	"fmt"
+	"io"
+	"strings"
+
+	"vetting/internal/model"
+)
+
+// IPXEParams is everything an iPXE boot script needs.
+// For Phase 2 the boot target is always "linux" — Memtest chain-load
+// is not required because we replaced Memtest86+ with stress-ng under
+// Linux (see plan §3.2).
+//
+// BuildScript copies these values into the script as-is; none of them
+// is escaped or validated there.
+type IPXEParams struct {
+	OrchestratorURL string // e.g. http://10.0.0.5:8080
+	LiveKernelURL   string // e.g. http://10.0.0.5:8080/live/vmlinuz
+	LiveInitrdURL   string // e.g. http://10.0.0.5:8080/live/initrd.img
+	TLSCertFPR      string // optional; empty = skip pin
+	RunID           int64  // emitted as vetting.run_id
+	MAC             string // emitted as vetting.mac
+	Token           string // plaintext, hashed on server side
+}
+
+// BuildScript returns an iPXE script tailored for this run.
+// iPXE scripts are plain text beginning with "#!ipxe".
+func BuildScript(p IPXEParams) string {
+	cmdline := []string{
+		"initrd=initrd.img",
+		fmt.Sprintf("vetting.orchestrator=%s", p.OrchestratorURL),
+		fmt.Sprintf("vetting.run_id=%d", p.RunID),
+		fmt.Sprintf("vetting.mac=%s", p.MAC),
+		fmt.Sprintf("vetting.token=%s", p.Token),
+	}
+	if p.TLSCertFPR != "" {
+		cmdline = append(cmdline, fmt.Sprintf("vetting.cert_fpr=%s", p.TLSCertFPR))
+	}
+	// Reduce kernel log noise during the test run; keep loglevel high enough
+	// for boot failures to still show up on the console.
+	cmdline = append(cmdline,
+		"console=tty0",
+		"console=ttyS0,115200n8",
+		"ip=dhcp",
+		"quiet",
+	)
+
+	var b strings.Builder
+	fmt.Fprintln(&b, "#!ipxe")
+	fmt.Fprintf(&b, "echo Vetting run %d — booting live image for %s\n", p.RunID, p.MAC)
+	fmt.Fprintf(&b, "kernel %s %s\n", p.LiveKernelURL, strings.Join(cmdline, " "))
+	fmt.Fprintf(&b, "initrd %s\n", p.LiveInitrdURL)
+	fmt.Fprintln(&b, "boot")
+	return b.String()
+}
+
+// NotRegisteredScript is served for unknown MACs. The MAC allowlist
+// at the dnsmasq level should prevent this from ever being reachable,
+// but it exists as belt-and-braces.
+func NotRegisteredScript(mac string) string {
+	return fmt.Sprintf("#!ipxe\necho MAC %s not registered for vetting — halting.\nshell\n", mac)
+}
+
+// NoActiveRunScript is served when a registered MAC PXE-boots but has
+// no currently active run. The host is told to shut down rather than
+// loop forever.
+func NoActiveRunScript(mac string) string {
+	return fmt.Sprintf("#!ipxe\necho MAC %s has no active run — powering off in 10s.\nsleep 10\npoweroff\n", mac)
+}
+
+// BuildLiveURLs derives the live-image kernel and initrd URLs from a
+// base URL: <base>/live/vmlinuz and <base>/live/initrd.img. Trailing
+// slashes on base are dropped first. Used by handlers to compose URLs;
+// exposed for tests.
+func BuildLiveURLs(base string) (kernel, initrd string) {
+	liveRoot := strings.TrimRight(base, "/") + "/live/"
+	kernel = liveRoot + "vmlinuz"
+	initrd = liveRoot + "initrd.img"
+	return kernel, initrd
+}
+
+// WriteNotFound writes the not-registered iPXE script for mac to w, so
+// handlers can answer iPXE directly without a mime-type dance. Write
+// errors are deliberately ignored: there is nothing useful a handler
+// could do with them at this point.
+func WriteNotFound(w io.Writer, mac string) {
+	script := []byte(NotRegisteredScript(mac))
+	_, _ = w.Write(script)
+}
+
+// ScriptMarker is the line every iPXE script starts with; iPXE uses it
+// to detect that the response is a script. The script builders in this
+// file write the same literal as their first line.
+const ScriptMarker = "#!ipxe"
+
+// State returns the run's state as a plain string, the compact
+// single-word status used for logging. It takes a Run because the iPXE
+// handler has already looked one up.
+func State(run model.Run) string {
+	status := string(run.State)
+	return status
+}
@@ -0,0 +1,61 @@
+package pxe
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestBuildScriptIncludesAllCmdlineParams checks that a generated
+// script starts with the iPXE marker and carries every expected
+// cmdline parameter and boot command.
+func TestBuildScriptIncludesAllCmdlineParams(t *testing.T) {
+	s := BuildScript(IPXEParams{
+		OrchestratorURL: "http://10.0.0.5:8080",
+		LiveKernelURL:   "http://10.0.0.5:8080/live/vmlinuz",
+		LiveInitrdURL:   "http://10.0.0.5:8080/live/initrd.img",
+		RunID:           42,
+		MAC:             "aa:bb:cc:dd:ee:ff",
+		Token:           "deadbeefcafe",
+	})
+	if !strings.HasPrefix(s, "#!ipxe") {
+		// Print the whole script: slicing s[:10] would panic on a script
+		// shorter than ten bytes and hide the real failure.
+		t.Fatalf("expected #!ipxe header, got %q", s)
+	}
+	for _, want := range []string{
+		"vetting.orchestrator=http://10.0.0.5:8080",
+		"vetting.run_id=42",
+		"vetting.mac=aa:bb:cc:dd:ee:ff",
+		"vetting.token=deadbeefcafe",
+		"kernel http://10.0.0.5:8080/live/vmlinuz",
+		"initrd http://10.0.0.5:8080/live/initrd.img",
+		"ip=dhcp",
+		"boot",
+	} {
+		if !strings.Contains(s, want) {
+			t.Errorf("script missing %q\n%s", want, s)
+		}
+	}
+}
+
+// TestBuildScriptOmitsCertFPRWhenEmpty checks that no vetting.cert_fpr
+// parameter is emitted when TLSCertFPR is left empty.
+func TestBuildScriptOmitsCertFPRWhenEmpty(t *testing.T) {
+	params := IPXEParams{
+		OrchestratorURL: "http://x",
+		LiveKernelURL:   "http://x/k",
+		LiveInitrdURL:   "http://x/i",
+		RunID:           1,
+		MAC:             "aa:bb:cc:dd:ee:ff",
+		Token:           "t",
+	}
+	script := BuildScript(params)
+	if strings.Contains(script, "vetting.cert_fpr") {
+		t.Fatalf("cert_fpr should be absent when empty:\n%s", script)
+	}
+}
+
+func TestNotRegisteredScriptMentionsMAC(t *testing.T) {
+	s := NotRegisteredScript("aa:bb:cc:dd:ee:ff")
+	if !strings.Contains(s, "aa:bb:cc:dd:ee:ff") {
+		t.Fatalf("not-registered script should echo the MAC: %s", s)
+	}
+	if !strings.HasPrefix(s, "#!ipxe") {
+		t.Fatalf("missing #!ipxe header: %s", s)
+	}
+}
+
+// TestBuildLiveURLs checks that a trailing slash on the base does not
+// produce a double slash in the kernel and initrd URLs.
+func TestBuildLiveURLs(t *testing.T) {
+	kernel, initrd := BuildLiveURLs("http://h:8080/")
+	kernelOK := kernel == "http://h:8080/live/vmlinuz"
+	initrdOK := initrd == "http://h:8080/live/initrd.img"
+	if !kernelOK || !initrdOK {
+		t.Fatalf("BuildLiveURLs: %s, %s", kernel, initrd)
+	}
+}
@@ -0,0 +1,12 @@
+//go:build !windows
+
+package pxe
+
+import (
+	"os"
+	"syscall"
+)
+
+// sighup delivers SIGHUP to p. This is the non-Windows build of the
+// helper (see the build constraint at the top of the file).
+func sighup(p *os.Process) error {
+	return p.Signal(syscall.SIGHUP)
+}
@@ -0,0 +1,12 @@
+//go:build windows
+
+package pxe
+
+import (
+	"fmt"
+	"os"
+)
+
+// sighup is the Windows build of the helper: there is no SIGHUP to
+// send, so it ignores the process and always returns an error.
+func sighup(_ *os.Process) error {
+	return fmt.Errorf("SIGHUP not supported on Windows")
+}
@@ -0,0 +1,245 @@
+// Package report builds the per-run HTML summary artifact. JSON is
+// written separately (by the reporting resolver in the api package);
+// this package only deals with the human-facing HTML.
+//
+// Design: a single self-contained HTML file — inline CSS, no external
+// fetches — so the artifact is portable and can be opened straight off
+// disk. Contents are a summary (per answer to the phase-5 design
+// question): run metadata, per-stage pass/fail table, spec diff list,
+// and measurement aggregates (min/avg/max by kind+key).
+package report
+
+import (
+	"bytes"
+	"fmt"
+	"html/template"
+	"math"
+	"sort"
+	"time"
+
+	"vetting/internal/model"
+)
+
+// Data is the payload fed to the HTML template. Callers assemble it
+// from the DB rows for a given run and pass it to RenderHTML.
+type Data struct {
+	GeneratedAt time.Time        // shown in the page header, formatted in UTC
+	Run         model.Run        // the run being reported
+	Host        model.Host       // the host the run belongs to
+	Stages      []model.Stage    // one table row per entry, in slice order
+	SpecDiffs   []model.SpecDiff // expected-vs-actual rows; empty renders a "no differences" note
+	Aggregates  []Aggregate      // flattened measurement summary; see Aggregate
+}
+
+// Aggregate is a per (kind, key) summary of a run's measurements. Min/
+// Max/Avg are populated from the Measurement rows; Unit mirrors the raw
+// sample unit so the HTML can show "52.5 °C" etc.
+type Aggregate struct {
+	Kind  string  // measurement kind; first half of the series identity
+	Key   string  // measurement key; second half of the series identity
+	Unit  string  // unit of the first sample seen for this series
+	Count int     // number of samples folded into this row
+	Min   float64 // smallest sample value
+	Max   float64 // largest sample value
+	Avg   float64 // arithmetic mean of the sample values
+}
+
+// AggregateMeasurements collapses a flat []Measurement into per-(kind,
+// key) summaries, sorted first by kind then by key so the HTML renders
+// deterministically.
+//
+// Each series reports its sample count, min, max and arithmetic mean;
+// Unit is taken from the first sample seen for that series. The result
+// is always non-nil, and empty when rows is empty.
+func AggregateMeasurements(rows []model.Measurement) []Aggregate {
+	// A struct key keeps Kind and Key apart. Joining them with a
+	// separator byte could merge two distinct series whenever either
+	// string contained that byte.
+	type seriesKey struct{ kind, key string }
+
+	index := make(map[seriesKey]int) // series -> position in out
+	out := make([]Aggregate, 0)
+	for _, m := range rows {
+		id := seriesKey{kind: m.Kind, key: m.Key}
+		pos, seen := index[id]
+		if !seen {
+			pos = len(out)
+			index[id] = pos
+			out = append(out, Aggregate{
+				Kind: m.Kind,
+				Key:  m.Key,
+				Unit: m.Unit,
+				Min:  math.Inf(1),
+				Max:  math.Inf(-1),
+			})
+		}
+		agg := &out[pos]
+		agg.Count++
+		agg.Avg += m.Value // running sum; turned into the mean below
+		if m.Value < agg.Min {
+			agg.Min = m.Value
+		}
+		if m.Value > agg.Max {
+			agg.Max = m.Value
+		}
+	}
+	for i := range out {
+		// Count is at least 1 for every emitted series.
+		out[i].Avg /= float64(out[i].Count)
+	}
+
+	// (Kind, Key) pairs are unique, so the order is fully determined.
+	sort.Slice(out, func(i, j int) bool {
+		if out[i].Kind != out[j].Kind {
+			return out[i].Kind < out[j].Kind
+		}
+		return out[i].Key < out[j].Key
+	})
+	return out
+}
+
+// RenderHTML produces the self-contained report HTML.
+func RenderHTML(d Data) ([]byte, error) {
+	var buf bytes.Buffer
+	if err := reportTmpl.Execute(&buf, d); err != nil {
+		return nil, fmt.Errorf("report: render: %w", err)
+	}
+	return buf.Bytes(), nil
+}
+
+// reportTmpl is the parsed report template, built once at package
+// initialisation (template.Must panics if htmlTemplate fails to parse).
+// Helper functions available inside the template:
+//
+//	fmt4        - float with four significant digits (%.4g)
+//	fmtTime     - time.Time as RFC 3339 in UTC
+//	fmtTimep    - like fmtTime for *time.Time; nil renders as "—"
+//	resultBadge - CSS class for a stage state: pass, fail, skip, or pend
+var reportTmpl = template.Must(template.New("report").Funcs(template.FuncMap{
+	"fmt4": func(f float64) string {
+		return fmt.Sprintf("%.4g", f)
+	},
+	"fmtTime": func(t time.Time) string {
+		return t.UTC().Format(time.RFC3339)
+	},
+	"fmtTimep": func(t *time.Time) string {
+		if t == nil {
+			return "—"
+		}
+		return t.UTC().Format(time.RFC3339)
+	},
+	"resultBadge": func(s model.StageState) string {
+		if s == model.StagePassed {
+			return "pass"
+		}
+		if s == model.StageFailed {
+			return "fail"
+		}
+		if s == model.StageSkipped {
+			return "skip"
+		}
+		// Anything else (not yet finished) is shown as pending.
+		return "pend"
+	},
+}).Parse(htmlTemplate))
+
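+// htmlTemplate is the report page as a single template string, kept
+// next to the code so the package stays self-contained. CSS is inlined
+// and there are no external assets. It is executed with a Data value
+// and relies on the helper functions registered on reportTmpl.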
+const htmlTemplate = `<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<title>Vetting report — {{.Host.Name}} run {{.Run.ID}}</title>
+<style>
+  :root { color-scheme: light dark; }
+  body { font-family: -apple-system, Segoe UI, Roboto, sans-serif; margin: 2rem; max-width: 960px; }
+  h1 { margin-bottom: 0; }
+  .sub { color: #666; margin-top: .2rem; }
+  section { margin-top: 2rem; }
+  table { border-collapse: collapse; width: 100%; }
+  th, td { text-align: left; padding: .35rem .6rem; border-bottom: 1px solid #ccc3; vertical-align: top; }
+  th { background: #0001; }
+  .pass { color: #0a0; font-weight: 600; }
+  .fail { color: #c33; font-weight: 600; }
+  .skip { color: #888; }
+  .pend { color: #888; }
+  .critical { color: #c33; font-weight: 600; }
+  .warning { color: #c80; }
+  .info { color: #666; }
+  code { background: #0001; padding: .05rem .25rem; border-radius: 3px; }
+</style>
+</head>
+<body>
+<h1>{{.Host.Name}} — run {{.Run.ID}}</h1>
+<div class="sub">State: <b>{{.Run.State}}</b>{{if ne .Run.Result ""}} · result: <b>{{.Run.Result}}</b>{{end}} · generated {{fmtTime .GeneratedAt}}</div>
+
+<section>
+<h2>Host</h2>
+<table>
+  <tr><th>Name</th><td>{{.Host.Name}}</td></tr>
+  <tr><th>MAC</th><td><code>{{.Host.MAC}}</code></td></tr>
+  <tr><th>WoL</th><td>{{.Host.WoLBroadcastIP}}:{{.Host.WoLPort}}</td></tr>
+  {{if .Host.Notes}}<tr><th>Notes</th><td>{{.Host.Notes}}</td></tr>{{end}}
+</table>
+</section>
+
+<section>
+<h2>Run</h2>
+<table>
+  <tr><th>Run ID</th><td>{{.Run.ID}}</td></tr>
+  <tr><th>State</th><td>{{.Run.State}}</td></tr>
+  <tr><th>Started</th><td>{{fmtTime .Run.StartedAt}}</td></tr>
+  <tr><th>Completed</th><td>{{fmtTimep .Run.CompletedAt}}</td></tr>
+  {{if .Run.FailedStage}}<tr><th>Failed stage</th><td class="fail">{{.Run.FailedStage}}</td></tr>{{end}}
+  {{if .Run.ReportPath}}<tr><th>JSON report</th><td><code>{{.Run.ReportPath}}</code></td></tr>{{end}}
+</table>
+</section>
+
+<section>
+<h2>Stages</h2>
+<table>
+  <thead><tr><th>Stage</th><th>State</th><th>Started</th><th>Completed</th></tr></thead>
+  <tbody>
+  {{range .Stages}}
+    <tr>
+      <td>{{.Name}}</td>
+      <td class="{{resultBadge .State}}">{{.State}}</td>
+      <td>{{fmtTimep .StartedAt}}</td>
+      <td>{{fmtTimep .CompletedAt}}</td>
+    </tr>
+  {{end}}
+  </tbody>
+</table>
+</section>
+
+<section>
+<h2>Spec diffs ({{len .SpecDiffs}})</h2>
+{{if .SpecDiffs}}
+<table>
+  <thead><tr><th>Field</th><th>Expected</th><th>Actual</th><th>Severity</th></tr></thead>
+  <tbody>
+  {{range .SpecDiffs}}
+    <tr>
+      <td><code>{{.Field}}</code></td>
+      <td>{{.Expected}}</td>
+      <td>{{.Actual}}</td>
+      <td class="{{.Severity}}">{{.Severity}}</td>
+    </tr>
+  {{end}}
+  </tbody>
+</table>
+{{else}}
+<p>No differences between expected and actual hardware.</p>
+{{end}}
+</section>
+
+<section>
+<h2>Measurements ({{len .Aggregates}} series)</h2>
+{{if .Aggregates}}
+<table>
+  <thead><tr><th>Kind</th><th>Key</th><th>Samples</th><th>Min</th><th>Avg</th><th>Max</th><th>Unit</th></tr></thead>
+  <tbody>
+  {{range .Aggregates}}
+    <tr>
+      <td>{{.Kind}}</td>
+      <td>{{.Key}}</td>
+      <td>{{.Count}}</td>
+      <td>{{fmt4 .Min}}</td>
+      <td>{{fmt4 .Avg}}</td>
+      <td>{{fmt4 .Max}}</td>
+      <td>{{.Unit}}</td>
+    </tr>
+  {{end}}
+  </tbody>
+</table>
+{{else}}
+<p>No measurements recorded.</p>
+{{end}}
+</section>
+</body>
+</html>
+`
@@ -0,0 +1,232 @@
+// Package spec owns the expected-vs-actual hardware diff for Vetting.
+//
+// The operator writes an expected spec YAML per host when registering.
+// The agent submits an Inventory artifact after boot. Diff() compares
+// them and emits per-field SpecDiff rows; the orchestrator fails the
+// SpecValidate stage if any row is classified critical.
+//
+// Phase 3 rule (operator decision): every mismatch is critical. Missing
+// expected fields skip that check entirely so partial specs stay useful
+// instead of exploding.
+package spec
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+
+	"gopkg.in/yaml.v3"
+
+	"vetting/internal/model"
+)
+
+// Spec is the operator-declared expected hardware for one host, as
+// decoded from YAML by Parse. Every section is optional: Diff skips a
+// nil CPU or Memory pointer and any empty slice.
+type Spec struct {
+	CPU    *CPUSpec    `yaml:"cpu,omitempty"`
+	Memory *MemorySpec `yaml:"memory,omitempty"`
+	Disks  []DiskSpec  `yaml:"disks,omitempty"`
+	NICs   []NICSpec   `yaml:"nics,omitempty"`
+	GPUs   []GPUSpec   `yaml:"gpus,omitempty"`
+}
+
+// CPUSpec describes a CPU, either expected (inside Spec) or measured
+// (inside Inventory). When used as an expectation, an empty Model or a
+// LogicalCores of zero disables that individual check in Diff.
+type CPUSpec struct {
+	Model        string `json:"model,omitempty" yaml:"model,omitempty"`
+	LogicalCores int    `json:"logical_cores,omitempty" yaml:"logical_cores,omitempty"`
+}
+
+// MemorySpec describes total system memory in GiB. As an expectation,
+// a TotalGiB of zero disables the memory check in Diff.
+type MemorySpec struct {
+	TotalGiB int `json:"total_gib,omitempty" yaml:"total_gib,omitempty"`
+}
+
+// DiskSpec describes one disk. Serial is the identity diffDisks matches
+// on (case-insensitively); SizeGB is only compared when the expected
+// value is greater than zero.
+type DiskSpec struct {
+	Serial string `json:"serial,omitempty" yaml:"serial,omitempty"`
+	SizeGB int    `json:"size_gb,omitempty" yaml:"size_gb,omitempty"`
+}
+
+// NICSpec describes one network interface. MAC is the identity diffNICs
+// matches on (case-insensitively); SpeedGbps is only compared when both
+// the expected and the measured value are greater than zero.
+type NICSpec struct {
+	MAC       string `json:"mac,omitempty" yaml:"mac,omitempty"`
+	SpeedGbps int    `json:"speed_gbps,omitempty" yaml:"speed_gbps,omitempty"`
+}
+
+// GPUSpec describes one GPU by model string. diffGPUs compares models
+// case-insensitively and counts identical cards rather than pairing them.
+type GPUSpec struct {
+	Model string `json:"model,omitempty" yaml:"model,omitempty"`
+}
+
+// Inventory is the actual measured hardware submitted by the agent.
+// Field names deliberately match Spec so the diff reads cleanly. Unlike
+// Spec, CPU and Memory are values rather than pointers: a measurement
+// always has them, even if zero.
+type Inventory struct {
+	CPU    CPUSpec    `json:"cpu" yaml:"cpu"`
+	Memory MemorySpec `json:"memory" yaml:"memory"`
+	Disks  []DiskSpec `json:"disks" yaml:"disks"`
+	NICs   []NICSpec  `json:"nics" yaml:"nics"`
+	GPUs   []GPUSpec  `json:"gpus" yaml:"gpus"`
+}
+
+// Parse reads expected-spec YAML. Empty YAML parses to a zero Spec and
+// yields an empty diff — i.e. "no expectations" is a legal stance.
+func Parse(src string) (*Spec, error) {
+	var s Spec
+	if err := yaml.Unmarshal([]byte(src), &s); err != nil {
+		return nil, fmt.Errorf("parse spec yaml: %w", err)
+	}
+	return &s, nil
+}
+
+// Diff compares the operator's expected spec against the measured
+// inventory and returns one SpecDiff row per mismatching field.
+//
+// Phase 3 rule: every expected field that is present and mismatches is
+// critical. Expected fields left at their zero value are skipped (not
+// info-logged) so the diff list stays focused on real problems.
+//
+// A nil expected spec means "no expectations" and yields nil. A nil
+// actual inventory is treated as an empty one, so declared expectations
+// surface as mismatches instead of causing a nil-pointer panic.
+func Diff(expected *Spec, actual *Inventory) []model.SpecDiff {
+	if expected == nil {
+		return nil
+	}
+	if actual == nil {
+		// No inventory was delivered; compare against nothing.
+		actual = &Inventory{}
+	}
+	out := []model.SpecDiff{}
+
+	if expected.CPU != nil {
+		if expected.CPU.Model != "" {
+			if !cpuModelMatches(expected.CPU.Model, actual.CPU.Model) {
+				out = append(out, diff("cpu.model", expected.CPU.Model, actual.CPU.Model))
+			}
+		}
+		if expected.CPU.LogicalCores > 0 && expected.CPU.LogicalCores != actual.CPU.LogicalCores {
+			out = append(out, diff("cpu.logical_cores", itoa(expected.CPU.LogicalCores), itoa(actual.CPU.LogicalCores)))
+		}
+	}
+
+	if expected.Memory != nil && expected.Memory.TotalGiB > 0 {
+		// Allow ±2 GiB tolerance: BIOS-reserved, kernel, reporting
+		// quantization. A dead 16 GiB stick will still surface.
+		if absInt(expected.Memory.TotalGiB-actual.Memory.TotalGiB) > 2 {
+			out = append(out, diff("memory.total_gib", itoa(expected.Memory.TotalGiB), itoa(actual.Memory.TotalGiB)))
+		}
+	}
+
+	out = append(out, diffDisks(expected.Disks, actual.Disks)...)
+	out = append(out, diffNICs(expected.NICs, actual.NICs)...)
+	out = append(out, diffGPUs(expected.GPUs, actual.GPUs)...)
+
+	return out
+}
+
+// diffDisks matches expected disks to measured disks by serial number
+// (case-insensitive). It reports expected serials that are absent,
+// sizes that differ by more than 1 GB, and measured disks whose serial
+// was not declared. With no expected disks it reports nothing; entries
+// without a serial are ignored on both sides.
+func diffDisks(expected, actual []DiskSpec) []model.SpecDiff {
+	if len(expected) == 0 {
+		return nil
+	}
+
+	// Index measured disks by lower-cased serial for the lookups below.
+	measured := map[string]DiskSpec{}
+	for _, disk := range actual {
+		if disk.Serial == "" {
+			continue
+		}
+		measured[strings.ToLower(disk.Serial)] = disk
+	}
+
+	var diffs []model.SpecDiff
+	declared := map[string]bool{}
+	for _, want := range expected {
+		if want.Serial == "" {
+			continue
+		}
+		serialKey := strings.ToLower(want.Serial)
+		declared[serialKey] = true
+		found, present := measured[serialKey]
+		switch {
+		case !present:
+			diffs = append(diffs, diff("disks["+want.Serial+"].present", "true", "false"))
+		case want.SizeGB > 0 && absInt(want.SizeGB-found.SizeGB) > 1:
+			diffs = append(diffs, diff("disks["+want.Serial+"].size_gb", itoa(want.SizeGB), itoa(found.SizeGB)))
+		}
+	}
+
+	// Extra disks on the host that operator didn't declare are flagged:
+	// a leftover USB stick could be a destructive-test target we'd
+	// rather the operator know about.
+	for _, disk := range actual {
+		if disk.Serial != "" && !declared[strings.ToLower(disk.Serial)] {
+			diffs = append(diffs, diff("disks[unexpected "+disk.Serial+"]", "", "present"))
+		}
+	}
+	return diffs
+}
+
+// diffNICs matches expected NICs to measured NICs by MAC address
+// (case-insensitive). It reports expected MACs that are absent and link
+// speeds that differ when both sides report a non-zero speed. Measured
+// NICs that were not declared are not reported.
+func diffNICs(expected, actual []NICSpec) []model.SpecDiff {
+	if len(expected) == 0 {
+		return nil
+	}
+
+	measured := map[string]NICSpec{}
+	for _, nic := range actual {
+		if nic.MAC == "" {
+			continue
+		}
+		measured[strings.ToLower(nic.MAC)] = nic
+	}
+
+	var diffs []model.SpecDiff
+	for _, want := range expected {
+		if want.MAC == "" {
+			continue
+		}
+		found, present := measured[strings.ToLower(want.MAC)]
+		if !present {
+			diffs = append(diffs, diff("nics["+want.MAC+"].present", "true", "false"))
+			continue
+		}
+		// A zero speed on either side means "unknown", so only compare
+		// when both values are set.
+		bothKnown := want.SpeedGbps > 0 && found.SpeedGbps > 0
+		if bothKnown && want.SpeedGbps != found.SpeedGbps {
+			diffs = append(diffs, diff("nics["+want.MAC+"].speed_gbps", itoa(want.SpeedGbps), itoa(found.SpeedGbps)))
+		}
+	}
+	return diffs
+}
+
+// diffGPUs checks that the host carries at least the declared number of
+// each GPU model. Matching is by case-insensitive model string and by
+// count rather than identity, since PCI-slot order isn't meaningful.
+// Expected entries with an empty model are skipped, matching how disks
+// without a serial and NICs without a MAC are treated. Extra or
+// undeclared GPUs are not reported. Rows are emitted in sorted model
+// order so the output is deterministic.
+func diffGPUs(expected, actual []GPUSpec) []model.SpecDiff {
+	if len(expected) == 0 {
+		return nil
+	}
+	want := map[string]int{}
+	for _, g := range expected {
+		if g.Model == "" {
+			// No model declared: there is nothing to match against.
+			continue
+		}
+		want[strings.ToLower(g.Model)]++
+	}
+	if len(want) == 0 {
+		return nil
+	}
+	got := map[string]int{}
+	for _, g := range actual {
+		got[strings.ToLower(g.Model)]++
+	}
+	keys := make([]string, 0, len(want))
+	for k := range want {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	var out []model.SpecDiff
+	for _, k := range keys {
+		if got[k] < want[k] {
+			out = append(out, diff("gpus["+k+"].count", itoa(want[k]), itoa(got[k])))
+		}
+	}
+	return out
+}
+
+// cpuModelMatches reports whether the expected CPU model matches the
+// measured one. Both strings are trimmed and lower-cased first; the
+// expected value may be a substring (e.g. "E5-2680 v4") of the verbose
+// kernel-reported string ("Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz").
+func cpuModelMatches(expected, actual string) bool {
+	want := strings.ToLower(strings.TrimSpace(expected))
+	have := strings.ToLower(strings.TrimSpace(actual))
+	if have == want {
+		return true
+	}
+	return strings.Contains(have, want)
+}
+
+// In Phase 3 all diffs are critical. Later phases may tier them.
+func diff(field, expected, actual string) model.SpecDiff {
+	return model.SpecDiff{
+		Field:    field,
+		Expected: expected,
+		Actual:   actual,
+		Severity: "critical",
+	}
+}
+
+// absInt returns the absolute value of n.
+func absInt(n int) int {
+	if n >= 0 {
+		return n
+	}
+	return -n
+}
+
+// itoa renders n in base 10 for use in SpecDiff Expected/Actual columns.
+func itoa(n int) string {
+	return fmt.Sprint(n)
+}
@@ -0,0 +1,121 @@
+package spec
+
+import (
+	"testing"
+
+	"vetting/internal/model"
+)
+
+// TestDiffEmptySpec checks that a spec with no expectations produces no
+// diff rows.
+func TestDiffEmptySpec(t *testing.T) {
+	d := Diff(&Spec{}, &Inventory{})
+	if len(d) != 0 {
+		t.Fatalf("empty spec → empty diff, got %v", d)
+	}
+}
+
+// TestDiffCPUMismatch checks that a core-count mismatch is the only row
+// reported when the model matches by substring.
+func TestDiffCPUMismatch(t *testing.T) {
+	want := &Spec{CPU: &CPUSpec{Model: "E5-2680 v4", LogicalCores: 28}}
+	have := &Inventory{CPU: CPUSpec{Model: "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz", LogicalCores: 16}}
+	d := Diff(want, have)
+	ok := len(d) == 1 && d[0].Field == "cpu.logical_cores" && d[0].Severity == "critical"
+	if !ok {
+		t.Fatalf("expected logical_cores critical, got %+v", d)
+	}
+}
+
+// TestDiffCPUModelSubstringMatch checks that a short expected model
+// string matches the verbose measured one.
+func TestDiffCPUModelSubstringMatch(t *testing.T) {
+	want := &Spec{CPU: &CPUSpec{Model: "E5-2680 v4"}}
+	have := &Inventory{CPU: CPUSpec{Model: "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz"}}
+	d := Diff(want, have)
+	if len(d) != 0 {
+		t.Fatalf("substring should match, got %+v", d)
+	}
+}
+
+// TestDiffMemoryTolerance checks that a 1 GiB shortfall is tolerated
+// while a 16 GiB shortfall is reported.
+func TestDiffMemoryTolerance(t *testing.T) {
+	want := &Spec{Memory: &MemorySpec{TotalGiB: 128}}
+
+	nearlyFull := &Inventory{Memory: MemorySpec{TotalGiB: 127}}
+	if d := Diff(want, nearlyFull); len(d) != 0 {
+		t.Fatalf("1 GiB variance should be tolerated, got %+v", d)
+	}
+
+	missingStick := &Inventory{Memory: MemorySpec{TotalGiB: 112}}
+	d := Diff(want, missingStick)
+	if len(d) != 1 || d[0].Field != "memory.total_gib" {
+		t.Fatalf("16 GiB drop should be critical, got %+v", d)
+	}
+}
+
+// TestDiffDisksMissingAndUnexpected checks that a declared-but-absent
+// disk (B) and an undeclared-but-present disk (C) are both reported.
+func TestDiffDisksMissingAndUnexpected(t *testing.T) {
+	want := &Spec{Disks: []DiskSpec{{Serial: "A", SizeGB: 1000}, {Serial: "B", SizeGB: 500}}}
+	have := &Inventory{Disks: []DiskSpec{{Serial: "A", SizeGB: 1000}, {Serial: "C", SizeGB: 32}}}
+	d := Diff(want, have)
+
+	fields := make(map[string]bool, len(d))
+	for i := range d {
+		fields[d[i].Field] = true
+	}
+	if _, ok := fields["disks[B].present"]; !ok {
+		t.Fatalf("expected disks[B].present critical; got %+v", d)
+	}
+	if _, ok := fields["disks[unexpected C]"]; !ok {
+		t.Fatalf("expected disks[unexpected C] critical; got %+v", d)
+	}
+}
+
+// TestDiffDisksSerialCaseInsensitive checks that serials differing only
+// in letter case are treated as the same disk.
+func TestDiffDisksSerialCaseInsensitive(t *testing.T) {
+	want := &Spec{Disks: []DiskSpec{{Serial: "wd-abc123", SizeGB: 1000}}}
+	have := &Inventory{Disks: []DiskSpec{{Serial: "WD-ABC123", SizeGB: 1000}}}
+	d := Diff(want, have)
+	if len(d) != 0 {
+		t.Fatalf("serial compare must be case-insensitive, got %+v", d)
+	}
+}
+
+// TestDiffNICMAC checks that a NIC found by MAC but negotiating a
+// different speed yields exactly one speed_gbps row.
+func TestDiffNICMAC(t *testing.T) {
+	want := &Spec{NICs: []NICSpec{{MAC: "aa:bb:cc:dd:ee:ff", SpeedGbps: 10}}}
+	have := &Inventory{NICs: []NICSpec{{MAC: "aa:bb:cc:dd:ee:ff", SpeedGbps: 1}}}
+	d := Diff(want, have)
+	ok := len(d) == 1 && d[0].Field == "nics[aa:bb:cc:dd:ee:ff].speed_gbps"
+	if !ok {
+		t.Fatalf("expected speed mismatch, got %+v", d)
+	}
+}
+
+// TestDiffGPUCount checks that two expected cards of one model against
+// a single measured card (different letter case) yields one count row.
+func TestDiffGPUCount(t *testing.T) {
+	want := &Spec{GPUs: []GPUSpec{{Model: "NVIDIA RTX 3090"}, {Model: "NVIDIA RTX 3090"}}}
+	have := &Inventory{GPUs: []GPUSpec{{Model: "nvidia rtx 3090"}}}
+	d := Diff(want, have)
+	ok := len(d) == 1 && d[0].Field == "gpus[nvidia rtx 3090].count"
+	if !ok {
+		t.Fatalf("expected GPU count critical, got %+v", d)
+	}
+}
+
+// TestParseValidYAML checks that a representative spec document decodes
+// into the CPU and disk fields.
+func TestParseValidYAML(t *testing.T) {
+	src := `
+cpu:
+  model: "E5-2680 v4"
+  logical_cores: 28
+memory:
+  total_gib: 128
+disks:
+  - serial: A
+    size_gb: 1000
+`
+	parsed, err := Parse(src)
+	if err != nil {
+		t.Fatalf("Parse: %v", err)
+	}
+	cpuOK := parsed.CPU != nil && parsed.CPU.LogicalCores == 28
+	if !cpuOK {
+		t.Fatalf("cpu not parsed: %+v", parsed)
+	}
+	disksOK := len(parsed.Disks) == 1 && parsed.Disks[0].Serial == "A"
+	if !disksOK {
+		t.Fatalf("disks not parsed: %+v", parsed)
+	}
+}
+
+// TestDiffSeverityAlwaysCritical pins the Phase 3 rule that every diff
+// row is critical. It first asserts that a mismatch actually produced
+// rows, so the severity loop cannot pass vacuously.
+func TestDiffSeverityAlwaysCritical(t *testing.T) {
+	exp := &Spec{CPU: &CPUSpec{LogicalCores: 8}}
+	act := &Inventory{CPU: CPUSpec{LogicalCores: 4}}
+	// The explicit type keeps the test tied to the model.SpecDiff contract.
+	var got []model.SpecDiff = Diff(exp, act)
+	if len(got) == 0 {
+		t.Fatalf("expected at least one diff row for a core-count mismatch, got none")
+	}
+	for _, row := range got {
+		if row.Severity != "critical" {
+			t.Fatalf("phase-3 rule: every diff is critical; got %q for %s", row.Severity, row.Field)
+		}
+	}
+}
@@ -0,0 +1,126 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+
+	"vetting/internal/model"
+)
+
+// Artifact is one row of the artifacts table: metadata about a file
+// produced during a run. A nil StageID is stored as SQL NULL.
+type Artifact struct {
+	ID        int64
+	RunID     int64
+	StageID   *int64
+	Kind      string // inventory|spec_diff|hold_key|report|log|fio|iperf|smart
+	Path      string
+	SHA256    string
+	SizeBytes int64
+}
+
+// Artifacts is the store for rows in the artifacts table.
+type Artifacts struct {
+	DB *sql.DB
+}
+
+func (a *Artifacts) Create(ctx context.Context, art Artifact) (int64, error) {
+	res, err := a.DB.ExecContext(ctx, `
+		INSERT INTO artifacts(run_id, stage_id, kind, path, sha256, size_bytes)
+		VALUES(?,?,?,?,?,?)
+	`, art.RunID, nullInt64(art.StageID), art.Kind, art.Path, art.SHA256, art.SizeBytes)
+	if err != nil {
+		return 0, fmt.Errorf("insert artifact: %w", err)
+	}
+	return res.LastInsertId()
+}
+
+// DeleteForRun removes every artifact row for a run. Returns the rows
+// that were deleted so the caller can unlink the on-disk files. Used by
+// the janitor; ordinary flow treats artifacts as append-only.
+func (a *Artifacts) DeleteForRun(ctx context.Context, runID int64) ([]Artifact, error) {
+	arts, err := a.ListForRun(ctx, runID)
+	if err != nil {
+		return nil, err
+	}
+	if _, err := a.DB.ExecContext(ctx, `DELETE FROM artifacts WHERE run_id = ?`, runID); err != nil {
+		return nil, fmt.Errorf("delete artifacts for run %d: %w", runID, err)
+	}
+	return arts, nil
+}
+
+// ListForRun returns every artifact row for a run in insertion (id)
+// order. A NULL stage_id is returned as a nil StageID.
+func (a *Artifacts) ListForRun(ctx context.Context, runID int64) ([]Artifact, error) {
+	const selectSQL = `
+		SELECT id, run_id, stage_id, kind, path, sha256, size_bytes
+		FROM artifacts WHERE run_id = ? ORDER BY id
+	`
+	cursor, err := a.DB.QueryContext(ctx, selectSQL, runID)
+	if err != nil {
+		return nil, err
+	}
+	defer cursor.Close()
+
+	var artifacts []Artifact
+	for cursor.Next() {
+		var (
+			item  Artifact
+			stage sql.NullInt64
+		)
+		scanErr := cursor.Scan(&item.ID, &item.RunID, &stage, &item.Kind, &item.Path, &item.SHA256, &item.SizeBytes)
+		if scanErr != nil {
+			return nil, scanErr
+		}
+		if stage.Valid {
+			id := stage.Int64
+			item.StageID = &id
+		}
+		artifacts = append(artifacts, item)
+	}
+	return artifacts, cursor.Err()
+}
+
+// SpecDiffs is the store for rows in the spec_diffs table.
+type SpecDiffs struct {
+	DB *sql.DB
+}
+
+// ReplaceForRun atomically swaps a run's spec-diff rows: it deletes all
+// existing rows for runID and inserts diffs in order, in a single
+// transaction. The ignored column is always written as 0; the Ignored
+// field of the incoming rows is not persisted.
+func (s *SpecDiffs) ReplaceForRun(ctx context.Context, runID int64, diffs []model.SpecDiff) error {
+	tx, beginErr := s.DB.BeginTx(ctx, nil)
+	if beginErr != nil {
+		return beginErr
+	}
+	// Rollback is a no-op once Commit has succeeded.
+	defer func() { _ = tx.Rollback() }()
+
+	_, delErr := tx.ExecContext(ctx, `DELETE FROM spec_diffs WHERE run_id = ?`, runID)
+	if delErr != nil {
+		return delErr
+	}
+	for i := range diffs {
+		row := diffs[i]
+		if _, err := tx.ExecContext(ctx, `
+			INSERT INTO spec_diffs(run_id, field, expected, actual, severity, ignored)
+			VALUES(?,?,?,?,?,?)
+		`, runID, row.Field, row.Expected, row.Actual, row.Severity, 0); err != nil {
+			return err
+		}
+	}
+	return tx.Commit()
+}
+
+// ListForRun returns a run's spec-diff rows in id order. NULL expected
+// or actual values come back as empty strings, and the integer ignored
+// column is converted to a bool.
+func (s *SpecDiffs) ListForRun(ctx context.Context, runID int64) ([]model.SpecDiff, error) {
+	const selectSQL = `
+		SELECT id, run_id, field, COALESCE(expected,''), COALESCE(actual,''), severity, ignored
+		FROM spec_diffs WHERE run_id = ? ORDER BY id
+	`
+	cursor, err := s.DB.QueryContext(ctx, selectSQL, runID)
+	if err != nil {
+		return nil, err
+	}
+	defer cursor.Close()
+
+	var diffs []model.SpecDiff
+	for cursor.Next() {
+		var (
+			row  model.SpecDiff
+			flag int
+		)
+		scanErr := cursor.Scan(&row.ID, &row.RunID, &row.Field, &row.Expected, &row.Actual, &row.Severity, &flag)
+		if scanErr != nil {
+			return nil, scanErr
+		}
+		row.Ignored = flag != 0
+		diffs = append(diffs, row)
+	}
+	return diffs, cursor.Err()
+}
+
+// nullInt64 converts an optional int64 into a SQL argument: the pointed
+// to value when p is set, or an untyped nil (SQL NULL) when it is not.
+func nullInt64(p *int64) any {
+	if p != nil {
+		return *p
+	}
+	return nil
+}
@@ -0,0 +1,98 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"fmt"
+	"strings"
+
+	"vetting/internal/model"
+)
+
+// Hosts is the store for rows in the hosts table.
+type Hosts struct {
+	DB *sql.DB
+}
+
+// ErrNotFound is returned by store lookups and deletes when no row
+// matches the requested ID. Compare with errors.Is.
+var ErrNotFound = errors.New("not found")
+
+func (h *Hosts) Create(ctx context.Context, in model.Host) (int64, error) {
+	in.MAC = normalizeMAC(in.MAC)
+	res, err := h.DB.ExecContext(ctx, `
+		INSERT INTO hosts(name, mac, wol_broadcast_ip, wol_port, expected_spec_yaml, pdu_config_json, ipmi_config_json, notes)
+		VALUES(?,?,?,?,?,?,?,?)
+	`, in.Name, in.MAC, in.WoLBroadcastIP, in.WoLPort, in.ExpectedSpecYAML, nullIfEmpty(in.PDUConfigJSON), nullIfEmpty(in.IPMIConfigJSON), in.Notes)
+	if err != nil {
+		return 0, fmt.Errorf("insert host: %w", err)
+	}
+	return res.LastInsertId()
+}
+
+// List returns every host ordered by name, case-insensitively. NULL
+// PDU/IPMI config columns come back as empty strings.
+func (h *Hosts) List(ctx context.Context) ([]model.Host, error) {
+	const selectSQL = `
+		SELECT id, name, mac, wol_broadcast_ip, wol_port, expected_spec_yaml,
+		       COALESCE(pdu_config_json,''), COALESCE(ipmi_config_json,''),
+		       notes, created_at, updated_at
+		FROM hosts
+		ORDER BY name COLLATE NOCASE
+	`
+	cursor, err := h.DB.QueryContext(ctx, selectSQL)
+	if err != nil {
+		return nil, fmt.Errorf("list hosts: %w", err)
+	}
+	defer cursor.Close()
+
+	var hosts []model.Host
+	for cursor.Next() {
+		var item model.Host
+		scanErr := cursor.Scan(
+			&item.ID, &item.Name, &item.MAC, &item.WoLBroadcastIP, &item.WoLPort,
+			&item.ExpectedSpecYAML, &item.PDUConfigJSON, &item.IPMIConfigJSON,
+			&item.Notes, &item.CreatedAt, &item.UpdatedAt,
+		)
+		if scanErr != nil {
+			return nil, fmt.Errorf("scan host: %w", scanErr)
+		}
+		hosts = append(hosts, item)
+	}
+	return hosts, cursor.Err()
+}
+
+// Get loads one host by ID. It returns ErrNotFound when no such row
+// exists; other failures are wrapped with the "get host" prefix.
+func (h *Hosts) Get(ctx context.Context, id int64) (*model.Host, error) {
+	const selectSQL = `
+		SELECT id, name, mac, wol_broadcast_ip, wol_port, expected_spec_yaml,
+		       COALESCE(pdu_config_json,''), COALESCE(ipmi_config_json,''),
+		       notes, created_at, updated_at
+		FROM hosts WHERE id = ?
+	`
+	found := new(model.Host)
+	err := h.DB.QueryRowContext(ctx, selectSQL, id).Scan(
+		&found.ID, &found.Name, &found.MAC, &found.WoLBroadcastIP, &found.WoLPort,
+		&found.ExpectedSpecYAML, &found.PDUConfigJSON, &found.IPMIConfigJSON,
+		&found.Notes, &found.CreatedAt, &found.UpdatedAt,
+	)
+	switch {
+	case errors.Is(err, sql.ErrNoRows):
+		return nil, ErrNotFound
+	case err != nil:
+		return nil, fmt.Errorf("get host: %w", err)
+	}
+	return found, nil
+}
+
+// Delete removes the host with the given ID. It returns ErrNotFound
+// when no row was deleted. A failure to read the affected-row count is
+// reported as its own error rather than being mistaken for "not found".
+func (h *Hosts) Delete(ctx context.Context, id int64) error {
+	res, err := h.DB.ExecContext(ctx, `DELETE FROM hosts WHERE id = ?`, id)
+	if err != nil {
+		return fmt.Errorf("delete host: %w", err)
+	}
+	n, err := res.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("delete host: rows affected: %w", err)
+	}
+	if n == 0 {
+		return ErrNotFound
+	}
+	return nil
+}
+
+// normalizeMAC canonicalizes a MAC address for storage and comparison by
+// trimming surrounding whitespace and lower-casing it. Separators are
+// left as given.
+func normalizeMAC(m string) string {
+	trimmed := strings.TrimSpace(m)
+	return strings.ToLower(trimmed)
+}
+
+// nullIfEmpty converts a string into a SQL argument: the string itself
+// when non-empty, or an untyped nil (SQL NULL) when empty.
+func nullIfEmpty(s string) any {
+	if s != "" {
+		return s
+	}
+	return nil
+}
@@ -0,0 +1,85 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"time"
+
+	"vetting/internal/model"
+)
+
+// Measurements persists timestamped numeric samples: temps, fan speeds,
+// PSU voltages, fio IOPS, iperf throughput, SMART attributes. The schema
+// stores (kind, key, value, unit) so Phase 5 reports can group freely
+// without new tables per source. Each row also carries its run ID, an
+// optional stage ID, and a timestamp.
+type Measurements struct {
+	DB *sql.DB
+}
+
+func (m *Measurements) Create(ctx context.Context, in model.Measurement) (int64, error) {
+	if in.TS.IsZero() {
+		in.TS = time.Now().UTC()
+	}
+	res, err := m.DB.ExecContext(ctx, `
+		INSERT INTO measurements(run_id, stage_id, ts, kind, key, value, unit)
+		VALUES(?,?,?,?,?,?,?)
+	`, in.RunID, nullInt64(in.StageID), in.TS, in.Kind, in.Key, in.Value, in.Unit)
+	if err != nil {
+		return 0, fmt.Errorf("insert measurement: %w", err)
+	}
+	return res.LastInsertId()
+}
+
+// CreateBatch inserts a batch in one transaction. The sensor endpoint
+// hands us ~5–20 samples per tick; a single commit keeps SQLite happy.
+// Samples with a zero timestamp all receive the same "now" (UTC) taken
+// once after the transaction begins. An empty batch is a no-op. The
+// caller's slice is not modified.
+func (m *Measurements) CreateBatch(ctx context.Context, rows []model.Measurement) error {
+	if len(rows) == 0 {
+		return nil
+	}
+	tx, beginErr := m.DB.BeginTx(ctx, nil)
+	if beginErr != nil {
+		return beginErr
+	}
+	// Rollback is a no-op once Commit has succeeded.
+	defer func() { _ = tx.Rollback() }()
+
+	fallbackTS := time.Now().UTC()
+	for i := range rows {
+		sample := rows[i] // copy, so the default timestamp stays local
+		if sample.TS.IsZero() {
+			sample.TS = fallbackTS
+		}
+		if _, err := tx.ExecContext(ctx, `
+			INSERT INTO measurements(run_id, stage_id, ts, kind, key, value, unit)
+			VALUES(?,?,?,?,?,?,?)
+		`, sample.RunID, nullInt64(sample.StageID), sample.TS, sample.Kind, sample.Key, sample.Value, sample.Unit); err != nil {
+			return fmt.Errorf("insert measurement: %w", err)
+		}
+	}
+	return tx.Commit()
+}
+
+// ListForRun returns all measurements for a run ordered by timestamp,
+// then id. Callers filter by kind in memory; the row count is small per
+// run (≈thousands). A NULL unit comes back as an empty string and a
+// NULL stage_id as a nil StageID.
+func (m *Measurements) ListForRun(ctx context.Context, runID int64) ([]model.Measurement, error) {
+	const selectSQL = `
+		SELECT id, run_id, stage_id, ts, kind, key, value, COALESCE(unit,'')
+		FROM measurements WHERE run_id = ? ORDER BY ts, id
+	`
+	cursor, err := m.DB.QueryContext(ctx, selectSQL, runID)
+	if err != nil {
+		return nil, err
+	}
+	defer cursor.Close()
+
+	var samples []model.Measurement
+	for cursor.Next() {
+		var (
+			sample model.Measurement
+			stage  sql.NullInt64
+		)
+		scanErr := cursor.Scan(&sample.ID, &sample.RunID, &stage, &sample.TS, &sample.Kind, &sample.Key, &sample.Value, &sample.Unit)
+		if scanErr != nil {
+			return nil, scanErr
+		}
+		if stage.Valid {
+			id := stage.Int64
+			sample.StageID = &id
+		}
+		samples = append(samples, sample)
+	}
+	return samples, cursor.Err()
+}
@@ -0,0 +1,226 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"fmt"
+	"time"
+
+	"vetting/internal/model"
+)
+
+// Runs is the store for rows in the runs table.
+type Runs struct {
+	DB *sql.DB
+}
+
+func (r *Runs) Create(ctx context.Context, hostID int64, tokenHash string) (int64, error) {
+	now := time.Now().UTC()
+	res, err := r.DB.ExecContext(ctx, `
+		INSERT INTO runs(host_id, state, agent_token_hash, next_boot_target, started_at)
+		VALUES(?,?,?,?,?)
+	`, hostID, string(model.StateQueued), tokenHash, "linux", now)
+	if err != nil {
+		return 0, fmt.Errorf("insert run: %w", err)
+	}
+	return res.LastInsertId()
+}
+
+func (r *Runs) SetState(ctx context.Context, runID int64, state model.RunState) error {
+	_, err := r.DB.ExecContext(ctx, `UPDATE runs SET state = ? WHERE id = ?`, string(state), runID)
+	return err
+}
+
+// RotateTokenHash replaces the stored token hash. Called on each iPXE
+// fetch so only the most-recently-booted agent can claim the run.
+func (r *Runs) RotateTokenHash(ctx context.Context, runID int64, hash string) error {
+	_, err := r.DB.ExecContext(ctx, `UPDATE runs SET agent_token_hash = ? WHERE id = ?`, hash, runID)
+	return err
+}
+
+// SetHoldIP records the agent's LAN IP so the UI can show the ssh
+// command. Called when the agent POSTs /hold.
+func (r *Runs) SetHoldIP(ctx context.Context, runID int64, ip string) error {
+	_, err := r.DB.ExecContext(ctx, `UPDATE runs SET hold_ip = ? WHERE id = ?`, ip, runID)
+	return err
+}
+
+// SetFailedStage records which stage tripped the run; used by the tile
+// and by reports. Does not change state.
+func (r *Runs) SetFailedStage(ctx context.Context, runID int64, stage string) error {
+	_, err := r.DB.ExecContext(ctx, `UPDATE runs SET failed_stage = ? WHERE id = ?`, stage, runID)
+	return err
+}
+
+// ClearFailedStage wipes the failed_stage marker. Called when the
+// operator overrides a stage and the run re-enters the pipeline.
+func (r *Runs) ClearFailedStage(ctx context.Context, runID int64) error {
+	_, err := r.DB.ExecContext(ctx, `UPDATE runs SET failed_stage = NULL WHERE id = ?`, runID)
+	return err
+}
+
+// SetOverrideFlags persists the operator's override decisions (JSON blob
+// like `{"wipe":true}`). Passed back to the agent on the next heartbeat
+// so it can resume the held stage with the gate bypassed.
+func (r *Runs) SetOverrideFlags(ctx context.Context, runID int64, flagsJSON string) error {
+	_, err := r.DB.ExecContext(ctx, `UPDATE runs SET override_flags_json = ? WHERE id = ?`, flagsJSON, runID)
+	return err
+}
+
+func (r *Runs) MarkFailed(ctx context.Context, runID int64, failedStage, holdIP string) error {
+	now := time.Now().UTC()
+	_, err := r.DB.ExecContext(ctx, `
+		UPDATE runs SET state = ?, result = 'fail', failed_stage = ?, hold_ip = ?, completed_at = ?
+		WHERE id = ?
+	`, string(model.StateFailedHolding), failedStage, holdIP, now, runID)
+	return err
+}
+
+func (r *Runs) MarkCompleted(ctx context.Context, runID int64, reportPath string) error {
+	now := time.Now().UTC()
+	_, err := r.DB.ExecContext(ctx, `
+		UPDATE runs SET state = ?, result = 'pass', report_path = ?, completed_at = ?
+		WHERE id = ?
+	`, string(model.StateCompleted), reportPath, now, runID)
+	return err
+}
+
+// Get loads one run by ID. It returns ErrNotFound when no such row
+// exists. Nullable text columns come back as empty strings and a NULL
+// completed_at as a nil CompletedAt.
+func (r *Runs) Get(ctx context.Context, id int64) (*model.Run, error) {
+	const selectSQL = `
+		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
+		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
+		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
+		       COALESCE(override_flags_json,'')
+		FROM runs WHERE id = ?
+	`
+	var (
+		found    model.Run
+		finished sql.NullTime
+	)
+	err := r.DB.QueryRowContext(ctx, selectSQL, id).Scan(
+		&found.ID, &found.HostID, &found.State, &found.Result, &found.FailedStage,
+		&found.NextBootTarget, &found.AgentTokenHash, &found.StartedAt,
+		&finished, &found.ReportPath, &found.HoldIP, &found.OverrideFlagsJSON,
+	)
+	switch {
+	case errors.Is(err, sql.ErrNoRows):
+		return nil, ErrNotFound
+	case err != nil:
+		return nil, fmt.Errorf("get run: %w", err)
+	}
+	if finished.Valid {
+		found.CompletedAt = &finished.Time
+	}
+	return &found, nil
+}
+
+// LatestForHost returns the host's run with the highest ID, or
+// (nil, nil) when the host has no runs at all.
+func (r *Runs) LatestForHost(ctx context.Context, hostID int64) (*model.Run, error) {
+	const selectSQL = `
+		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
+		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
+		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
+		       COALESCE(override_flags_json,'')
+		FROM runs WHERE host_id = ?
+		ORDER BY id DESC LIMIT 1
+	`
+	var (
+		latest   model.Run
+		finished sql.NullTime
+	)
+	err := r.DB.QueryRowContext(ctx, selectSQL, hostID).Scan(
+		&latest.ID, &latest.HostID, &latest.State, &latest.Result, &latest.FailedStage,
+		&latest.NextBootTarget, &latest.AgentTokenHash, &latest.StartedAt,
+		&finished, &latest.ReportPath, &latest.HoldIP, &latest.OverrideFlagsJSON,
+	)
+	switch {
+	case errors.Is(err, sql.ErrNoRows):
+		return nil, nil
+	case err != nil:
+		return nil, fmt.Errorf("latest run: %w", err)
+	}
+	if finished.Valid {
+		latest.CompletedAt = &finished.Time
+	}
+	return &latest, nil
+}
+
+// Active returns, in id order, every run whose state is neither
+// 'Completed' nor 'Released'. Runs in any other state, including
+// failed-holding ones, are included.
+func (r *Runs) Active(ctx context.Context) ([]model.Run, error) {
+	const selectSQL = `
+		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
+		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
+		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
+		       COALESCE(override_flags_json,'')
+		FROM runs
+		WHERE state NOT IN ('Completed','Released')
+		ORDER BY id
+	`
+	cursor, err := r.DB.QueryContext(ctx, selectSQL)
+	if err != nil {
+		return nil, err
+	}
+	defer cursor.Close()
+
+	var active []model.Run
+	for cursor.Next() {
+		// Declared per iteration so each CompletedAt pointer refers to
+		// its own time value.
+		var (
+			item     model.Run
+			finished sql.NullTime
+		)
+		scanErr := cursor.Scan(
+			&item.ID, &item.HostID, &item.State, &item.Result, &item.FailedStage,
+			&item.NextBootTarget, &item.AgentTokenHash, &item.StartedAt,
+			&finished, &item.ReportPath, &item.HoldIP, &item.OverrideFlagsJSON,
+		)
+		if scanErr != nil {
+			return nil, scanErr
+		}
+		if finished.Valid {
+			item.CompletedAt = &finished.Time
+		}
+		active = append(active, item)
+	}
+	return active, cursor.Err()
+}
+
+// CompletedOlderThan returns, in id order, the IDs of terminal runs
+// (Completed/Released/FailedHolding) whose completed_at is older than
+// cutoff. Runs with a NULL completed_at fall back to started_at so a
+// stuck run doesn't get garbage-collected out from under its own logs.
+// Used by the janitor.
+func (r *Runs) CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error) {
+	const selectSQL = `
+		SELECT id FROM runs
+		WHERE state IN ('Completed','Released','FailedHolding')
+		  AND COALESCE(completed_at, started_at) < ?
+		ORDER BY id
+	`
+	cursor, err := r.DB.QueryContext(ctx, selectSQL, cutoff)
+	if err != nil {
+		return nil, err
+	}
+	defer cursor.Close()
+
+	var ids []int64
+	for cursor.Next() {
+		var runID int64
+		if scanErr := cursor.Scan(&runID); scanErr != nil {
+			return nil, scanErr
+		}
+		ids = append(ids, runID)
+	}
+	return ids, cursor.Err()
+}
+
+// FindActiveByMAC returns the newest active run (state neither
+// 'Completed' nor 'Released') for the host with the given MAC, or
+// (nil, nil) if the MAC is unknown or the host has no active run.
+//
+// The MAC is normalized the same way Hosts.Create stores it (trimmed,
+// lower-cased), so callers may pass it in any letter case.
+func (r *Runs) FindActiveByMAC(ctx context.Context, mac string) (*model.Run, error) {
+	row := r.DB.QueryRowContext(ctx, `
+		SELECT r.id, r.host_id, r.state, COALESCE(r.result,''), COALESCE(r.failed_stage,''),
+		       COALESCE(r.next_boot_target,''), r.agent_token_hash, r.started_at,
+		       r.completed_at, COALESCE(r.report_path,''), COALESCE(r.hold_ip,''),
+		       COALESCE(r.override_flags_json,'')
+		FROM runs r
+		JOIN hosts h ON h.id = r.host_id
+		WHERE h.mac = ? AND r.state NOT IN ('Completed','Released')
+		ORDER BY r.id DESC LIMIT 1
+	`, normalizeMAC(mac))
+	var run model.Run
+	var completedAt sql.NullTime
+	err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
+		&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
+		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON)
+	if errors.Is(err, sql.ErrNoRows) {
+		return nil, nil
+	}
+	if err != nil {
+		return nil, fmt.Errorf("find active run by mac: %w", err)
+	}
+	if completedAt.Valid {
+		run.CompletedAt = &completedAt.Time
+	}
+	return &run, nil
+}
@@ -0,0 +1,91 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"time"
+
+	"vetting/internal/model"
+)
+
+// Stages is the store for rows in the stages table.
+type Stages struct {
+	DB *sql.DB
+}
+
+// DefaultStageOrder is the canonical sequence for every run. Phase 2 only
+// reaches Inventory; later phases add more executors but the list is fixed.
+// Seed inserts one row per entry and uses the slice index as the stage's
+// ordinal, so reordering this list changes the ordinals of new runs.
+var DefaultStageOrder = []string{
+	"Inventory",
+	"SpecValidate",
+	"SMART",
+	"CPUStress",
+	"Storage",
+	"Network",
+	"GPU",
+	"PSU",
+	"Reporting",
+}
+
+// Seed inserts one pending stage row per DefaultStageOrder entry for
+// the given run, using the slice index as the ordinal. All inserts share
+// one transaction, so a failure leaves no partial set behind.
+func (s *Stages) Seed(ctx context.Context, runID int64) error {
+	tx, beginErr := s.DB.BeginTx(ctx, nil)
+	if beginErr != nil {
+		return beginErr
+	}
+	// Rollback is a no-op once Commit has succeeded.
+	defer func() { _ = tx.Rollback() }()
+
+	pending := string(model.StagePending)
+	for ordinal := 0; ordinal < len(DefaultStageOrder); ordinal++ {
+		stageName := DefaultStageOrder[ordinal]
+		_, err := tx.ExecContext(ctx,
+			`INSERT INTO stages(run_id, name, ordinal, state) VALUES(?,?,?,?)`,
+			runID, stageName, ordinal, pending)
+		if err != nil {
+			return fmt.Errorf("seed stage %s: %w", stageName, err)
+		}
+	}
+	return tx.Commit()
+}
+
+// ListForRun returns a run's stages in ordinal order. NULL started_at
+// and completed_at columns come back as nil pointers, and a NULL
+// summary_json as an empty string.
+func (s *Stages) ListForRun(ctx context.Context, runID int64) ([]model.Stage, error) {
+	const selectSQL = `
+		SELECT id, run_id, name, ordinal, state, started_at, completed_at, COALESCE(summary_json,'')
+		FROM stages WHERE run_id = ? ORDER BY ordinal
+	`
+	cursor, err := s.DB.QueryContext(ctx, selectSQL, runID)
+	if err != nil {
+		return nil, err
+	}
+	defer cursor.Close()
+
+	var stages []model.Stage
+	for cursor.Next() {
+		// Declared per iteration so each time pointer refers to its own
+		// value.
+		var (
+			item          model.Stage
+			began, ended sql.NullTime
+		)
+		scanErr := cursor.Scan(&item.ID, &item.RunID, &item.Name, &item.Ordinal, &item.State,
+			&began, &ended, &item.SummaryJSON)
+		if scanErr != nil {
+			return nil, scanErr
+		}
+		if began.Valid {
+			item.StartedAt = &began.Time
+		}
+		if ended.Valid {
+			item.CompletedAt = &ended.Time
+		}
+		stages = append(stages, item)
+	}
+	return stages, cursor.Err()
+}
+
+func (s *Stages) StartByName(ctx context.Context, runID int64, name string) error {
+	now := time.Now().UTC()
+	_, err := s.DB.ExecContext(ctx, `
+		UPDATE stages SET state = ?, started_at = ?
+		WHERE run_id = ? AND name = ?
+	`, string(model.StageRunning), now, runID, name)
+	return err
+}
+
+func (s *Stages) CompleteByName(ctx context.Context, runID int64, name string, state model.StageState, summaryJSON string) error {
+	now := time.Now().UTC()
+	_, err := s.DB.ExecContext(ctx, `
+		UPDATE stages SET state = ?, completed_at = ?, summary_json = ?
+		WHERE run_id = ? AND name = ?
+	`, string(state), now, nullIfEmpty(summaryJSON), runID, name)
+	return err
+}
@@ -0,0 +1,229 @@
+package store_test
+
+import (
+	"context"
+	"path/filepath"
+	"testing"
+
+	"vetting/internal/db"
+	"vetting/internal/model"
+	"vetting/internal/store"
+)
+
+// newDB opens a fresh database file under the test's temp directory and
+// returns a Runs store backed by it. The connection is closed through
+// t.Cleanup when the test ends.
+func newDB(t *testing.T) *store.Runs {
+	t.Helper()
+	conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
+	if err != nil {
+		t.Fatalf("open db: %v", err)
+	}
+	t.Cleanup(func() {
+		// The close result is not checked; the test is already over.
+		_ = conn.Close()
+	})
+	return &store.Runs{DB: conn}
+}
+
+// seedRun creates one host and one run for it, returning (hostID, runID).
+// The other store tests start from this pair so that their run_id
+// foreign keys resolve.
+func seedRun(t *testing.T, runs *store.Runs) (int64, int64) {
+	t.Helper()
+	ctx := context.Background()
+
+	host := model.Host{
+		Name:             "t-host",
+		MAC:              "aa:bb:cc:dd:ee:ff",
+		WoLBroadcastIP:   "10.0.0.255",
+		WoLPort:          9,
+		ExpectedSpecYAML: "memory:\n  total_gib: 16\n",
+	}
+	hostStore := &store.Hosts{DB: runs.DB}
+	hostID, err := hostStore.Create(ctx, host)
+	if err != nil {
+		t.Fatalf("create host: %v", err)
+	}
+
+	runID, err := runs.Create(ctx, hostID, "deadbeef")
+	if err != nil {
+		t.Fatalf("create run: %v", err)
+	}
+	return hostID, runID
+}
+
+// TestArtifactsRoundtrip stores two artifacts on one run and checks that
+// ListForRun returns both, in insertion order, with their paths intact.
+func TestArtifactsRoundtrip(t *testing.T) {
+	ctx := context.Background()
+	runs := newDB(t)
+	_, runID := seedRun(t, runs)
+	arts := &store.Artifacts{DB: runs.DB}
+
+	inventory := store.Artifact{
+		RunID:     runID,
+		Kind:      "inventory",
+		Path:      "/var/artifacts/run-1/inventory.json",
+		SHA256:    "abc123",
+		SizeBytes: 42,
+	}
+	id, err := arts.Create(ctx, inventory)
+	if err != nil {
+		t.Fatalf("Create: %v", err)
+	}
+	if id == 0 {
+		t.Fatalf("expected non-zero id")
+	}
+
+	// A hold key on the same run: ListForRun should return both rows in
+	// insertion order, and TileEnricher picks the hold_key row.
+	holdKey := store.Artifact{
+		RunID:     runID,
+		Kind:      "hold_key",
+		Path:      "/var/artifacts/run-1/hold.key",
+		SHA256:    "def456",
+		SizeBytes: 400,
+	}
+	if _, err := arts.Create(ctx, holdKey); err != nil {
+		t.Fatalf("Create hold_key: %v", err)
+	}
+
+	list, err := arts.ListForRun(ctx, runID)
+	if err != nil {
+		t.Fatalf("ListForRun: %v", err)
+	}
+	if n := len(list); n != 2 {
+		t.Fatalf("ListForRun returned %d, want 2", n)
+	}
+	if !(list[0].Kind == "inventory" && list[1].Kind == "hold_key") {
+		t.Fatalf("unexpected order: %+v", list)
+	}
+	if got := list[1].Path; got != "/var/artifacts/run-1/hold.key" {
+		t.Fatalf("hold_key path lost: %q", got)
+	}
+}
+
+// TestSpecDiffsReplaceForRun checks that ReplaceForRun overwrites the
+// diffs previously stored for a run instead of appending to them.
+func TestSpecDiffsReplaceForRun(t *testing.T) {
+	ctx := context.Background()
+	runs := newDB(t)
+	_, runID := seedRun(t, runs)
+	sd := &store.SpecDiffs{DB: runs.DB}
+
+	// First write: three diffs.
+	initial := []model.SpecDiff{
+		{RunID: runID, Field: "cpu.model", Expected: "Xeon", Actual: "EPYC", Severity: "critical"},
+		{RunID: runID, Field: "memory.total_gib", Expected: "16", Actual: "8", Severity: "critical"},
+		{RunID: runID, Field: "note", Expected: "", Actual: "dusty", Severity: "info"},
+	}
+	if err := sd.ReplaceForRun(ctx, runID, initial); err != nil {
+		t.Fatalf("ReplaceForRun: %v", err)
+	}
+	list, err := sd.ListForRun(ctx, runID)
+	if err != nil {
+		t.Fatalf("ListForRun: %v", err)
+	}
+	if n := len(list); n != 3 {
+		t.Fatalf("got %d rows, want 3", n)
+	}
+
+	// Second write must replace, not append — otherwise a re-run would
+	// double-count spec diffs and the tile badge would grow without bound.
+	replacement := []model.SpecDiff{
+		{RunID: runID, Field: "cpu.model", Expected: "Xeon", Actual: "Xeon Gold", Severity: "info"},
+	}
+	if err := sd.ReplaceForRun(ctx, runID, replacement); err != nil {
+		t.Fatalf("second ReplaceForRun: %v", err)
+	}
+	list, err = sd.ListForRun(ctx, runID)
+	if err != nil {
+		t.Fatalf("ListForRun after replace: %v", err)
+	}
+	if n := len(list); n != 1 {
+		t.Fatalf("expected 1 row after replace, got %d", n)
+	}
+	if sev := list[0].Severity; sev != "info" {
+		t.Fatalf("expected severity info, got %q", sev)
+	}
+}
+
+// TestMeasurementsBatchAndList writes a batch of three measurements,
+// confirms an empty batch is accepted, and checks that ListForRun
+// returns the three rows including the iperf throughput value.
+func TestMeasurementsBatchAndList(t *testing.T) {
+	ctx := context.Background()
+	runs := newDB(t)
+	_, runID := seedRun(t, runs)
+	meas := &store.Measurements{DB: runs.DB}
+
+	batch := []model.Measurement{
+		{RunID: runID, Kind: "thermal", Key: "cpu", Value: 52.5, Unit: "C"},
+		{RunID: runID, Kind: "iperf", Key: "throughput_mbps", Value: 940.1, Unit: "Mbps"},
+		{RunID: runID, Kind: "psu", Key: "in0", Value: 12.04, Unit: "V"},
+	}
+	if err := meas.CreateBatch(ctx, batch); err != nil {
+		t.Fatalf("CreateBatch: %v", err)
+	}
+
+	// Zero-length batch must be a no-op, not an error.
+	if err := meas.CreateBatch(ctx, nil); err != nil {
+		t.Fatalf("empty CreateBatch: %v", err)
+	}
+
+	rows, err := meas.ListForRun(ctx, runID)
+	if err != nil {
+		t.Fatalf("ListForRun: %v", err)
+	}
+	if n := len(rows); n != 3 {
+		t.Fatalf("got %d rows, want 3", n)
+	}
+
+	foundIperf := false
+	for i := range rows {
+		m := rows[i]
+		if m.Kind != "iperf" || m.Key != "throughput_mbps" {
+			continue
+		}
+		if m.Value > 900 {
+			foundIperf = true
+		}
+	}
+	if !foundIperf {
+		t.Fatalf("iperf row missing or wrong value: %+v", rows)
+	}
+}
+
+func TestRunsOverrideFlagsAndClearFailedStage(t *testing.T) {
+	runs := newDB(t)
+	_, runID := seedRun(t, runs)
+	ctx := context.Background()
+
+	if err := runs.SetFailedStage(ctx, runID, "Storage"); err != nil {
+		t.Fatalf("SetFailedStage: %v", err)
+	}
+	if err := runs.SetOverrideFlags(ctx, runID, `{"wipe":true}`); err != nil {
+		t.Fatalf("SetOverrideFlags: %v", err)
+	}
+	run, err := runs.Get(ctx, runID)
+	if err != nil {
+		t.Fatalf("Get: %v", err)
+	}
+	if run.OverrideFlagsJSON != `{"wipe":true}` {
+		t.Fatalf("OverrideFlagsJSON = %q, want {\"wipe\":true}", run.OverrideFlagsJSON)
+	}
+	if run.FailedStage != "Storage" {
+		t.Fatalf("FailedStage = %q, want Storage", run.FailedStage)
+	}
+	if err := runs.ClearFailedStage(ctx, runID); err != nil {
+		t.Fatalf("ClearFailedStage: %v", err)
+	}
+	run, err = runs.Get(ctx, runID)
+	if err != nil {
+		t.Fatalf("Get after clear: %v", err)
+	}
+	if run.FailedStage != "" {
+		t.Fatalf("FailedStage not cleared: %q", run.FailedStage)
+	}
+	// override_flags_json should persist across ClearFailedStage so the
+	// agent can still read it on its next heartbeat.
+	if run.OverrideFlagsJSON != `{"wipe":true}` {
+		t.Fatalf("OverrideFlagsJSON lost after ClearFailedStage: %q", run.OverrideFlagsJSON)
+	}
+}
+
+// TestRunsHoldAndFailedStage checks that the hold IP and failed stage
+// written through SetHoldIP / SetFailedStage are returned by Get.
+func TestRunsHoldAndFailedStage(t *testing.T) {
+	ctx := context.Background()
+	runs := newDB(t)
+	_, runID := seedRun(t, runs)
+
+	if err := runs.SetHoldIP(ctx, runID, "10.0.0.42"); err != nil {
+		t.Fatalf("SetHoldIP: %v", err)
+	}
+	if err := runs.SetFailedStage(ctx, runID, "SpecValidate"); err != nil {
+		t.Fatalf("SetFailedStage: %v", err)
+	}
+
+	got, err := runs.Get(ctx, runID)
+	if err != nil {
+		t.Fatalf("Get: %v", err)
+	}
+	if ip := got.HoldIP; ip != "10.0.0.42" {
+		t.Fatalf("HoldIP = %q, want 10.0.0.42", ip)
+	}
+	if stage := got.FailedStage; stage != "SpecValidate" {
+		t.Fatalf("FailedStage = %q, want SpecValidate", stage)
+	}
+}
@@ -0,0 +1,6 @@
+package web
+
+import "embed"
+
+// Static is the embedded file system holding the files matched by the
+// static/* pattern, compiled into the binary at build time.
+//
+//go:embed static/*
+var Static embed.FS
@@ -0,0 +1,210 @@
+/* Design tokens: colours, corner radius and font stacks. The rules
+   below reference these through var(). */
+:root {
+  --bg: #0f1115;
+  --bg-elev: #171a21;
+  --bg-elev-2: #1f232c;
+  --border: #2a2f3a;
+  --text: #e5e8ef;
+  --text-dim: #9aa2b1;
+  --accent: #6aa9ff;
+  --accent-strong: #3c82f6;
+  --success: #35c27b;
+  --warn: #e4a94b;
+  --danger: #e56466;
+  --radius: 8px;
+  --font: system-ui, -apple-system, "Segoe UI", Roboto, sans-serif;
+  --mono: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+}
+
+/* Base reset: border-box sizing everywhere, page background/text colour
+   and default font, plus link colouring (underlined only on hover). */
+* { box-sizing: border-box; }
+
+html, body {
+  margin: 0;
+  padding: 0;
+  background: var(--bg);
+  color: var(--text);
+  font: 15px/1.45 var(--font);
+}
+
+a { color: var(--accent); text-decoration: none; }
+a:hover { text-decoration: underline; }
+
+/* Top bar: brand, nav links (the nav takes the remaining width via
+   flex: 1) and the session area laid out in one flex row. */
+.topbar {
+  display: flex;
+  align-items: center;
+  gap: 24px;
+  padding: 12px 24px;
+  border-bottom: 1px solid var(--border);
+  background: var(--bg-elev);
+}
+.topbar .brand { font-weight: 700; letter-spacing: .2px; }
+.topbar nav { display: flex; gap: 16px; flex: 1; }
+.topbar nav a { color: var(--text-dim); }
+.topbar nav a:hover { color: var(--text); text-decoration: none; }
+.topbar .session { display: flex; align-items: center; gap: 12px; }
+.topbar .heartbeat { color: var(--text-dim); font-family: var(--mono); font-size: 12px; }
+.topbar .logout-form { margin: 0; }
+
+/* Page content column: centred, capped at 1280px. */
+main { max-width: 1280px; margin: 0 auto; padding: 24px; }
+
+/* Shared button look for <button> elements and for links styled as
+   buttons. .button-like is the class the host tile puts on its
+   "View report" link; it is listed here so that link is styled like the
+   other tile actions instead of rendering as a bare anchor. */
+button, .button, .button-secondary, .button-like {
+  appearance: none;
+  font: inherit;
+  padding: 8px 14px;
+  border-radius: var(--radius);
+  border: 1px solid var(--border);
+  background: var(--bg-elev-2);
+  color: var(--text);
+  cursor: pointer;
+  text-decoration: none;
+  display: inline-block;
+}
+button:hover, .button:hover, .button-like:hover { border-color: var(--accent); }
+button:disabled { opacity: .5; cursor: not-allowed; }
+/* Destructive actions: outlined in the danger colour, tinted on hover. */
+button.danger { border-color: var(--danger); color: var(--danger); background: transparent; }
+button.danger:hover { background: rgba(229,100,102,.1); }
+.button-secondary { background: transparent; }
+
+/* Error banner in the danger colour. */
+.error {
+  background: rgba(229,100,102,.12);
+  border: 1px solid var(--danger);
+  color: var(--danger);
+  padding: 10px 14px;
+  border-radius: var(--radius);
+  margin-bottom: 16px;
+}
+
+/* Dashboard heading row: title on the left, action on the right. */
+.dashboard-header {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  margin-bottom: 20px;
+}
+.dashboard-header h1 { font-size: 20px; margin: 0; }
+
+/* Empty-state box with a dashed outline. */
+.empty {
+  text-align: center;
+  padding: 48px 24px;
+  border: 1px dashed var(--border);
+  border-radius: var(--radius);
+  color: var(--text-dim);
+}
+.empty .button { margin-top: 12px; }
+
+/* Tile grid: as many columns of at least 320px as fit the width. */
+.tile-grid {
+  display: grid;
+  grid-template-columns: repeat(auto-fill, minmax(320px, 1fr));
+  gap: 16px;
+}
+
+/* One host card: a vertical stack of head, meta list, optional panes
+   and the action row. */
+.tile {
+  background: var(--bg-elev);
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+  padding: 16px;
+  display: flex;
+  flex-direction: column;
+  gap: 12px;
+}
+.tile-head { display: flex; justify-content: space-between; align-items: center; }
+.tile-name { font-weight: 600; }
+.tile-status { font-size: 12px; color: var(--text-dim); text-transform: uppercase; letter-spacing: .5px; }
+.tile-idle .tile-status { color: var(--text-dim); }
+
+/* Two-column definition list; each <div> holds one dt/dd pair. */
+.tile-meta { display: grid; grid-template-columns: 1fr 1fr; gap: 4px 16px; margin: 0; font-size: 13px; }
+.tile-meta div { display: flex; justify-content: space-between; align-items: baseline; }
+.tile-meta dt { color: var(--text-dim); }
+.tile-meta dd { margin: 0; font-family: var(--mono); }
+
+.tile-actions { display: flex; gap: 8px; }
+.tile-actions .inline { margin: 0; flex: 0; }
+
+/* Highlights a meta value in the danger colour. */
+.tile-meta dd.bad { color: var(--danger); }
+
+/* Hold banner inside a tile: danger-tinted box with a title and the
+   SSH command line. */
+.tile-hold {
+  background: rgba(229,100,102,.08);
+  border: 1px solid rgba(229,100,102,.35);
+  border-radius: var(--radius);
+  padding: 8px 10px;
+  display: flex;
+  flex-direction: column;
+  gap: 4px;
+}
+.tile-hold .hold-title {
+  font-size: 12px;
+  color: var(--danger);
+  text-transform: uppercase;
+  letter-spacing: .5px;
+}
+/* user-select: all selects the whole command with a single click. */
+.tile-hold .hold-ssh {
+  font-family: var(--mono);
+  font-size: 12px;
+  color: var(--text);
+  word-break: break-all;
+  user-select: all;
+}
+
+/* Per-run log pane: monospace, capped at 160px tall with vertical
+   scrolling. */
+.tile-log {
+  background: #0b0d12;
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+  padding: 8px 10px;
+  font-family: var(--mono);
+  font-size: 12px;
+  color: var(--text-dim);
+  max-height: 160px;
+  overflow-y: auto;
+  display: flex;
+  flex-direction: column;
+  gap: 2px;
+}
+/* Hidden while it has no children, i.e. before any log line arrives. */
+.tile-log:empty { display: none; }
+.tile-log .log-line { white-space: pre-wrap; }
+.tile-log .log-warn { color: var(--warn); }
+.tile-log .log-error { color: var(--danger); }
+
+/* Tile border colour per mood class (tile-<mood> on the article). */
+.tile-fail { border-color: rgba(229,100,102,.6); }
+.tile-pass { border-color: rgba(53,194,123,.5); }
+.tile-active { border-color: var(--accent); }
+
+/* Host form: narrow column, labels stacked above monospace inputs. */
+.form-wrap { max-width: 640px; }
+.form-wrap h1 { font-size: 20px; }
+
+.host-form { display: flex; flex-direction: column; gap: 14px; }
+.host-form label { display: flex; flex-direction: column; gap: 4px; color: var(--text-dim); font-size: 13px; }
+.host-form input,
+.host-form textarea {
+  font: inherit;
+  font-family: var(--mono);
+  color: var(--text);
+  background: var(--bg-elev);
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+  padding: 8px 10px;
+}
+.host-form textarea { resize: vertical; min-height: 96px; }
+/* Two fields side by side in a 2:1 width ratio. */
+.host-form .grid-2 { display: grid; grid-template-columns: 2fr 1fr; gap: 14px; }
+.host-form .actions { display: flex; gap: 10px; margin-top: 4px; }
+
+/* Login card: centred box with a full-width primary button. */
+.login-card {
+  max-width: 360px;
+  margin: 12vh auto;
+  padding: 28px;
+  background: var(--bg-elev);
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+}
+.login-card h1 { margin: 0 0 16px; font-size: 22px; }
+.login-card label { display: flex; flex-direction: column; gap: 4px; color: var(--text-dim); font-size: 13px; }
+.login-card input {
+  font: inherit;
+  color: var(--text);
+  background: var(--bg-elev-2);
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+  padding: 10px;
+  margin-bottom: 12px;
+}
+.login-card button { width: 100%; background: var(--accent-strong); border-color: var(--accent-strong); color: #fff; }
+.login-card button:hover { background: var(--accent); border-color: var(--accent); }
+
+/* Pages with body.bare drop the 1280px content-width cap on main. */
+body.bare main { max-width: none; }
@@ -0,0 +1,36 @@
+package templates
+
+import "vetting/internal/model"
+
+// TileData pairs a host with its latest run and the derived fields the
+// tile needs to render: spec-diff count (server-side diff result) and
+// the on-disk path to the hold-key artifact when the run is holding.
+type TileData struct {
+	// Host is the registered host the tile describes.
+	Host model.Host
+	// Latest is the host's most recent run; nil renders the tile as idle.
+	Latest *model.Run
+	// SpecDiffCritical is shown as "<n> critical" when greater than zero.
+	SpecDiffCritical int
+	// HoldKeyPath is used as the ssh -i argument in the hold banner; an
+	// empty value renders a "hold key not yet recorded" hint instead.
+	HoldKeyPath string
+}
+
+// Dashboard renders the host overview inside Layout("Dashboard"). With
+// no tiles it shows an empty-state prompt linking to /hosts/new;
+// otherwise it renders one HostTile per entry inside a grid that opens
+// an SSE connection to /events for the tiles to swap from.
+templ Dashboard(tiles []TileData) {
+	@Layout("Dashboard") {
+		<section class="dashboard">
+			<div class="dashboard-header">
+				<h1>Registered hosts</h1>
+				<a class="button" href="/hosts/new">Register host</a>
+			</div>
+			if len(tiles) == 0 {
+				<div class="empty">
+					<p>No hosts registered yet.</p>
+					<a class="button" href="/hosts/new">Register your first host</a>
+				</div>
+			} else {
+				<div class="tile-grid" hx-ext="sse" sse-connect="/events">
+					for _, t := range tiles {
+						@HostTile(t)
+					}
+				</div>
+			}
+		</section>
+	}
+}
@@ -0,0 +1,95 @@
+// Code generated by templ - DO NOT EDIT.
+
+// templ: version: v0.3.1001
+package templates
+
+//lint:file-ignore SA4006 This context is only used if a nested component is present.
+
+import "github.com/a-h/templ"
+import templruntime "github.com/a-h/templ/runtime"
+
+import "vetting/internal/model"
+
+// TileData pairs a host with its latest run and the derived fields the
+// tile needs to render: spec-diff count (server-side diff result) and
+// the on-disk path to the hold-key artifact when the run is holding.
+type TileData struct {
+	Host             model.Host
+	Latest           *model.Run
+	SpecDiffCritical int
+	HoldKeyPath      string
+}
+
+func Dashboard(tiles []TileData) templ.Component {
+	return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
+		templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
+		if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil {
+			return templ_7745c5c3_CtxErr
+		}
+		templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
+		if !templ_7745c5c3_IsBuffer {
+			defer func() {
+				templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
+				if templ_7745c5c3_Err == nil {
+					templ_7745c5c3_Err = templ_7745c5c3_BufErr
+				}
+			}()
+		}
+		ctx = templ.InitializeContext(ctx)
+		templ_7745c5c3_Var1 := templ.GetChildren(ctx)
+		if templ_7745c5c3_Var1 == nil {
+			templ_7745c5c3_Var1 = templ.NopComponent
+		}
+		ctx = templ.ClearChildren(ctx)
+		templ_7745c5c3_Var2 := templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
+			templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
+			templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
+			if !templ_7745c5c3_IsBuffer {
+				defer func() {
+					templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
+					if templ_7745c5c3_Err == nil {
+						templ_7745c5c3_Err = templ_7745c5c3_BufErr
+					}
+				}()
+			}
+			ctx = templ.InitializeContext(ctx)
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "<section class=\"dashboard\"><div class=\"dashboard-header\"><h1>Registered hosts</h1><a class=\"button\" href=\"/hosts/new\">Register host</a></div>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			if len(tiles) == 0 {
+				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 2, "<div class=\"empty\"><p>No hosts registered yet.</p><a class=\"button\" href=\"/hosts/new\">Register your first host</a></div>")
+				if templ_7745c5c3_Err != nil {
+					return templ_7745c5c3_Err
+				}
+			} else {
+				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "<div class=\"tile-grid\" hx-ext=\"sse\" sse-connect=\"/events\">")
+				if templ_7745c5c3_Err != nil {
+					return templ_7745c5c3_Err
+				}
+				for _, t := range tiles {
+					templ_7745c5c3_Err = HostTile(t).Render(ctx, templ_7745c5c3_Buffer)
+					if templ_7745c5c3_Err != nil {
+						return templ_7745c5c3_Err
+					}
+				}
+				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "</div>")
+				if templ_7745c5c3_Err != nil {
+					return templ_7745c5c3_Err
+				}
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, "</section>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			return nil
+		})
+		templ_7745c5c3_Err = Layout("Dashboard").Render(templ.WithChildren(ctx, templ_7745c5c3_Var2), templ_7745c5c3_Buffer)
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		return nil
+	})
+}
+
+var _ = templruntime.GeneratedTemplate
@@ -0,0 +1,144 @@
+package templates
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"vetting/internal/model"
+)
+
+// HostTile renders a single dashboard card. It's the SSE-swap target
+// for per-host tile refreshes (`tile-N`, swapped as outerHTML) and, when
+// the host has a run, contains a per-run log pane (`log-M`) whose live
+// tail is appended by the events hub (hx-swap="beforeend").
+//
+// The hold banner is shown only while the latest run is in
+// StateFailedHolding with a HoldIP recorded. The action row always
+// offers Delete and conditionally offers Start, Override wipe-probe and
+// View report (see canStart, canOverrideWipe and hasReport).
+//
+// NOTE(review): the log pane is rendered empty, so an outerHTML tile
+// refresh presumably discards the lines appended so far — confirm that
+// is intended.
+templ HostTile(t TileData) {
+	<article
+		id={ fmt.Sprintf("host-%d", t.Host.ID) }
+		class={ "tile", "tile-" + tileMood(t.Latest) }
+		sse-swap={ fmt.Sprintf("tile-%d", t.Host.ID) }
+		hx-swap="outerHTML"
+	>
+		<header class="tile-head">
+			<div class="tile-name">{ t.Host.Name }</div>
+			<div class="tile-status">{ tileStatus(t.Latest) }</div>
+		</header>
+		<dl class="tile-meta">
+			<div>
+				<dt>MAC</dt>
+				<dd>{ t.Host.MAC }</dd>
+			</div>
+			<div>
+				<dt>WoL</dt>
+				<dd>{ fmt.Sprintf("%s:%d", t.Host.WoLBroadcastIP, t.Host.WoLPort) }</dd>
+			</div>
+			if t.Latest != nil && t.Latest.FailedStage != "" {
+				<div>
+					<dt>Failed at</dt>
+					<dd>{ t.Latest.FailedStage }</dd>
+				</div>
+			}
+			if t.SpecDiffCritical > 0 {
+				<div>
+					<dt>Spec diffs</dt>
+					<dd class="bad">{ fmt.Sprintf("%d critical", t.SpecDiffCritical) }</dd>
+				</div>
+			}
+		</dl>
+		if t.Latest != nil && t.Latest.State == model.StateFailedHolding && t.Latest.HoldIP != "" {
+			<div class="tile-hold">
+				<div class="hold-title">Host is holding — SSH available</div>
+				<code class="hold-ssh">{ sshInvocation(t.HoldKeyPath, t.Latest.HoldIP) }</code>
+			</div>
+		}
+		if t.Latest != nil {
+			<div
+				class="tile-log"
+				id={ fmt.Sprintf("log-%d", t.Latest.ID) }
+				sse-swap={ fmt.Sprintf("log-%d", t.Latest.ID) }
+				hx-swap="beforeend"
+			></div>
+		}
+		<div class="tile-actions">
+			if canStart(t.Latest) {
+				<form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/start", t.Host.ID)) } class="inline">
+					<button type="submit">Start vetting</button>
+				</form>
+			} else {
+				<button type="button" disabled>Run in flight</button>
+			}
+			if canOverrideWipe(t.Latest) {
+				<form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/override-wipe", t.Host.ID)) } class="inline">
+					<button type="submit" class="danger">Override wipe-probe</button>
+				</form>
+			}
+			if hasReport(t.Latest) {
+				<a class="button-like" href={ templ.SafeURL(fmt.Sprintf("/reports/%d", t.Latest.ID)) } target="_blank" rel="noopener">View report</a>
+			}
+			<form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/delete", t.Host.ID)) } class="inline">
+				<button type="submit" class="danger">Delete</button>
+			</form>
+		</div>
+	</article>
+}
+
+// canOverrideWipe reports whether the tile should offer the wipe-probe
+// override: only for a run that is holding after failing at the
+// "Storage" stage. A nil run never qualifies.
+func canOverrideWipe(r *model.Run) bool {
+	return r != nil &&
+		r.State == model.StateFailedHolding &&
+		r.FailedStage == "Storage"
+}
+
+// hasReport decides whether the tile links to a report. It does not
+// look for the HTML artifact itself; it relies on the shortcut that a
+// Completed run always has one, and Completed is the only state in
+// which the tile wants to surface the link.
+func hasReport(r *model.Run) bool {
+	if r == nil {
+		return false
+	}
+	return r.State == model.StateCompleted
+}
+
+// canStart reports whether a new vetting run may be started from the
+// tile: when the host has no run yet, or when its latest run is
+// Completed, Released or FailedHolding. Any other state renders the
+// disabled "Run in flight" button.
+//
+// NOTE(review): StateFailed is not in the startable set, so a run in
+// that state shows "Run in flight" — confirm that is intended.
+func canStart(r *model.Run) bool {
+	if r == nil {
+		return true
+	}
+	return r.State == model.StateCompleted ||
+		r.State == model.StateReleased ||
+		r.State == model.StateFailedHolding
+}
+
+// tileStatus returns the status label for the tile header: the run's
+// state as a string, or "Idle" when the host has no run.
+func tileStatus(r *model.Run) string {
+	if r != nil {
+		return string(r.State)
+	}
+	return "Idle"
+}
+
+// tileMood maps the latest run to the suffix of the tile's CSS class
+// (tile-<mood>): "idle" for no run or a Released run, "pass" for
+// Completed, "fail" for Failed or FailedHolding, and "active" for every
+// other state.
+func tileMood(r *model.Run) string {
+	if r == nil {
+		return "idle"
+	}
+	mood := "active"
+	switch r.State {
+	case model.StateReleased:
+		mood = "idle"
+	case model.StateFailed, model.StateFailedHolding:
+		mood = "fail"
+	case model.StateCompleted:
+		mood = "pass"
+	}
+	return mood
+}
+
+// sshInvocation builds the copy-pasteable ssh command shown in the hold
+// banner. With a key path it returns "ssh -i <keyPath> root@<ip>";
+// without one it returns the bare command followed by a hint that the
+// hold key has not been recorded yet.
+//
+// NOTE(review): keyPath is inserted verbatim, so a path containing
+// spaces would not paste as a single argument — confirm paths are safe.
+func sshInvocation(keyPath, ip string) string {
+	if keyPath != "" {
+		return fmt.Sprintf("ssh -i %s root@%s", keyPath, ip)
+	}
+	return fmt.Sprintf("ssh root@%s  (hold key not yet recorded)", ip)
+}
+
+// RenderTileString renders a single tile fragment so the orchestrator
+// can publish it over SSE without threading a context through every
+// event publisher.
+func RenderTileString(t TileData) string {
+	var buf bytes.Buffer
+	_ = HostTile(t).Render(context.Background(), &buf)
+	return buf.String()
+}
@@ -0,0 +1,385 @@
+// Code generated by templ - DO NOT EDIT.
+
+// templ: version: v0.3.1001
+package templates
+
+//lint:file-ignore SA4006 This context is only used if a nested component is present.
+
+import "github.com/a-h/templ"
+import templruntime "github.com/a-h/templ/runtime"
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"vetting/internal/model"
+)
+
+// HostTile renders a single dashboard card. It's the SSE-swap target
+// for per-host tile refreshes (`tile-N`) and contains a per-run log
+// pane (`log-M`) whose live tail is appended by the events hub.
+func HostTile(t TileData) templ.Component {
+	return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
+		templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
+		if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil {
+			return templ_7745c5c3_CtxErr
+		}
+		templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
+		if !templ_7745c5c3_IsBuffer {
+			defer func() {
+				templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
+				if templ_7745c5c3_Err == nil {
+					templ_7745c5c3_Err = templ_7745c5c3_BufErr
+				}
+			}()
+		}
+		ctx = templ.InitializeContext(ctx)
+		templ_7745c5c3_Var1 := templ.GetChildren(ctx)
+		if templ_7745c5c3_Var1 == nil {
+			templ_7745c5c3_Var1 = templ.NopComponent
+		}
+		ctx = templ.ClearChildren(ctx)
+		var templ_7745c5c3_Var2 = []any{"tile", "tile-" + tileMood(t.Latest)}
+		templ_7745c5c3_Err = templ.RenderCSSItems(ctx, templ_7745c5c3_Buffer, templ_7745c5c3_Var2...)
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "<article id=\"")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		var templ_7745c5c3_Var3 string
+		templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("host-%d", t.Host.ID))
+		if templ_7745c5c3_Err != nil {
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 15, Col: 40}
+		}
+		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 2, "\" class=\"")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		var templ_7745c5c3_Var4 string
+		templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String())
+		if templ_7745c5c3_Err != nil {
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 1, Col: 0}
+		}
+		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "\" sse-swap=\"")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		var templ_7745c5c3_Var5 string
+		templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("tile-%d", t.Host.ID))
+		if templ_7745c5c3_Err != nil {
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 17, Col: 46}
+		}
+		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5))
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "\" hx-swap=\"outerHTML\"><header class=\"tile-head\"><div class=\"tile-name\">")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		var templ_7745c5c3_Var6 string
+		templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(t.Host.Name)
+		if templ_7745c5c3_Err != nil {
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 21, Col: 39}
+		}
+		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6))
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, "</div><div class=\"tile-status\">")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		var templ_7745c5c3_Var7 string
+		templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(tileStatus(t.Latest))
+		if templ_7745c5c3_Err != nil {
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 22, Col: 50}
+		}
+		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 6, "</div></header><dl class=\"tile-meta\"><div><dt>MAC</dt><dd>")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		var templ_7745c5c3_Var8 string
+		templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(t.Host.MAC)
+		if templ_7745c5c3_Err != nil {
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 27, Col: 20}
+		}
+		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 7, "</dd></div><div><dt>WoL</dt><dd>")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		var templ_7745c5c3_Var9 string
+		templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%s:%d", t.Host.WoLBroadcastIP, t.Host.WoLPort))
+		if templ_7745c5c3_Err != nil {
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 31, Col: 69}
+		}
+		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9))
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 8, "</dd></div>")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		if t.Latest != nil && t.Latest.FailedStage != "" {
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 9, "<div><dt>Failed at</dt><dd>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var10 string
+			templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(t.Latest.FailedStage)
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 36, Col: 31}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 10, "</dd></div>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+		}
+		if t.SpecDiffCritical > 0 {
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 11, "<div><dt>Spec diffs</dt><dd class=\"bad\">")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var11 string
+			templ_7745c5c3_Var11, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d critical", t.SpecDiffCritical))
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 42, Col: 69}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var11))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 12, "</dd></div>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 13, "</dl>")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		if t.Latest != nil && t.Latest.State == model.StateFailedHolding && t.Latest.HoldIP != "" {
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 14, "<div class=\"tile-hold\"><div class=\"hold-title\">Host is holding — SSH available</div><code class=\"hold-ssh\">")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var12 string
+			templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(sshInvocation(t.HoldKeyPath, t.Latest.HoldIP))
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 49, Col: 74}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 15, "</code></div>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+		}
+		if t.Latest != nil {
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 16, "<div class=\"tile-log\" id=\"")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var13 string
+			templ_7745c5c3_Var13, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("log-%d", t.Latest.ID))
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 55, Col: 43}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var13))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 17, "\" sse-swap=\"")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var14 string
+			templ_7745c5c3_Var14, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("log-%d", t.Latest.ID))
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 56, Col: 49}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var14))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 18, "\" hx-swap=\"beforeend\"></div>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 19, "<div class=\"tile-actions\">")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		if canStart(t.Latest) {
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 20, "<form method=\"post\" action=\"")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var15 templ.SafeURL
+			templ_7745c5c3_Var15, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/start", t.Host.ID)))
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 62, Col: 89}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var15))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 21, "\" class=\"inline\"><button type=\"submit\">Start vetting</button></form>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+		} else {
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 22, "<button type=\"button\" disabled>Run in flight</button> ")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+		}
+		if canOverrideWipe(t.Latest) {
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 23, "<form method=\"post\" action=\"")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var16 templ.SafeURL
+			templ_7745c5c3_Var16, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/override-wipe", t.Host.ID)))
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 69, Col: 97}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var16))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 24, "\" class=\"inline\"><button type=\"submit\" class=\"danger\">Override wipe-probe</button></form>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+		}
+		if hasReport(t.Latest) {
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 25, "<a class=\"button-like\" href=\"")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var17 templ.SafeURL
+			templ_7745c5c3_Var17, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/reports/%d", t.Latest.ID)))
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 74, Col: 88}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var17))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 26, "\" target=\"_blank\" rel=\"noopener\">View report</a>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 27, "<form method=\"post\" action=\"")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		var templ_7745c5c3_Var18 templ.SafeURL
+		templ_7745c5c3_Var18, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/delete", t.Host.ID)))
+		if templ_7745c5c3_Err != nil {
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 76, Col: 89}
+		}
+		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var18))
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 28, "\" class=\"inline\"><button type=\"submit\" class=\"danger\">Delete</button></form></div></article>")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		return nil
+	})
+}
+
+// canOverrideWipe reports whether the tile should offer the "Override
+// wipe-probe" action. That is only the case for a run that is in the
+// FailedHolding state and whose failed stage is "Storage"; a nil run
+// never qualifies.
+func canOverrideWipe(r *model.Run) bool {
+	holding := r != nil && r.State == model.StateFailedHolding
+	// && short-circuits, so r is never dereferenced when it is nil.
+	return holding && r.FailedStage == "Storage"
+}
+
+// hasReport reports whether the tile should link to the run's HTML
+// report. It does not look for the artifact itself; instead it uses the
+// Completed state as a stand-in, on the premise that a Completed run
+// always has a report and that no other state wants to show the link.
+func hasReport(r *model.Run) bool {
+	if r == nil {
+		return false
+	}
+	return r.State == model.StateCompleted
+}
+
+// canStart reports whether a new vetting run may be started for a host.
+// A host with no run at all (nil) may start; otherwise its latest run
+// must be in the Completed, Released or FailedHolding state.
+//
+// NOTE(review): model.StateFailed is not accepted here even though
+// tileMood treats it as a failure, so a run sitting in that state gets
+// the disabled "Run in flight" button — confirm the state is transient.
+func canStart(r *model.Run) bool {
+	if r == nil {
+		return true
+	}
+	s := r.State
+	return s == model.StateCompleted ||
+		s == model.StateReleased ||
+		s == model.StateFailedHolding
+}
+
+// tileStatus returns the status text for a tile: "Idle" when the host
+// has no run, otherwise the run's state converted to a string.
+func tileStatus(r *model.Run) string {
+	if r != nil {
+		return string(r.State)
+	}
+	return "Idle"
+}
+
+// tileMood maps a run to one of four mood keywords:
+//
+//   - "idle"   — no run, or the run is Released
+//   - "pass"   — the run is Completed
+//   - "fail"   — the run is Failed or FailedHolding
+//   - "active" — any other state
+func tileMood(r *model.Run) string {
+	if r == nil {
+		return "idle"
+	}
+	state := r.State
+	if state == model.StateCompleted {
+		return "pass"
+	}
+	if state == model.StateFailed || state == model.StateFailedHolding {
+		return "fail"
+	}
+	if state == model.StateReleased {
+		return "idle"
+	}
+	return "active"
+}
+
+// sshInvocation builds the SSH command line shown on a holding host's
+// tile. With a key path it yields "ssh -i <keyPath> root@<ip>"; without
+// one it yields a plain "ssh root@<ip>" followed by a note that the hold
+// key has not been recorded yet. Both arguments are inserted verbatim,
+// with no shell quoting.
+func sshInvocation(keyPath, ip string) string {
+	if keyPath != "" {
+		cmd := fmt.Sprintf("ssh -i %s root@%s", keyPath, ip)
+		return cmd
+	}
+	return "ssh root@" + ip + "  (hold key not yet recorded)"
+}
+
+// RenderTileString renders a single tile fragment so the orchestrator
+// can publish it over SSE without threading a context through every
+// event publisher.
+func RenderTileString(t TileData) string {
+	var buf bytes.Buffer
+	_ = HostTile(t).Render(context.Background(), &buf)
+	return buf.String()
+}
+
+var _ = templruntime.GeneratedTemplate
@@ -0,0 +1,50 @@
+package templates
+
+// Layout is the full application page shell. It loads the stylesheet
+// plus htmx and the htmx SSE extension (both from unpkg, pinned with
+// integrity hashes), enables hx-boost on <body>, and renders the top bar:
+// brand, navigation links, a heartbeat indicator that opens the SSE
+// connection to /events, and the logout form. The caller's children are
+// rendered inside <main>; title is placed before " — Vetting" in <title>.
+templ Layout(title string) {
+	<!DOCTYPE html>
+	<html lang="en">
+		<head>
+			<meta charset="utf-8"/>
+			<meta name="viewport" content="width=device-width, initial-scale=1"/>
+			<title>{ title } — Vetting</title>
+			<link rel="stylesheet" href="/static/app.css"/>
+			<script src="https://unpkg.com/htmx.org@2.0.2" integrity="sha384-Y7hw+L/jvKeWIRRkqWYfPcvVxHzVzn5REgzbawhxAuQGwX1XWe70vji+VSeHOThJ" crossorigin="anonymous"></script>
+			<script src="https://unpkg.com/htmx-ext-sse@2.2.2" integrity="sha384-Y4gc0CK6Kg4hmulDc1rNM+vbMvjbW/5rRCA6pC5gj5dLV1/4+OZGkQpJtHQvQTCr" crossorigin="anonymous"></script>
+		</head>
+		<body hx-boost="true">
+			<header class="topbar">
+				<div class="brand">Vetting</div>
+				<nav>
+					<a href="/">Dashboard</a>
+					<a href="/hosts/new">Register host</a>
+				</nav>
+				<div class="session">
+					<span class="heartbeat" hx-ext="sse" sse-connect="/events" sse-swap="heartbeat">·</span>
+					<form method="post" action="/logout" class="logout-form">
+						<button type="submit">Log out</button>
+					</form>
+				</div>
+			</header>
+			<main>
+				{ children... }
+			</main>
+		</body>
+	</html>
+}
+
+// BareLayout is the minimal page shell: stylesheet only, no htmx scripts,
+// no top bar and no SSE connection. The caller's children are rendered
+// inside <main> of a <body class="bare">; title is placed before
+// " — Vetting" in <title>.
+templ BareLayout(title string) {
+	<!DOCTYPE html>
+	<html lang="en">
+		<head>
+			<meta charset="utf-8"/>
+			<meta name="viewport" content="width=device-width, initial-scale=1"/>
+			<title>{ title } — Vetting</title>
+			<link rel="stylesheet" href="/static/app.css"/>
+		</head>
+		<body class="bare">
+			<main>
+				{ children... }
+			</main>
+		</body>
+	</html>
+}
@@ -0,0 +1,111 @@
+// Code generated by templ - DO NOT EDIT.
+
+// templ: version: v0.3.1001
+package templates
+
+//lint:file-ignore SA4006 This context is only used if a nested component is present.
+
+import "github.com/a-h/templ"
+import templruntime "github.com/a-h/templ/runtime"
+
+// Layout returns the component for the full application page shell: the
+// document head (stylesheet, htmx and its SSE extension), the top bar
+// with navigation, heartbeat indicator and logout form, and the caller's
+// children inside <main>. title is written through templ.EscapeString
+// ahead of the fixed " — Vetting" suffix in <title>.
+func Layout(title string) templ.Component {
+	return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
+		templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
+		// Bail out before writing anything if the context already carries an error.
+		if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil {
+			return templ_7745c5c3_CtxErr
+		}
+		templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
+		if !templ_7745c5c3_IsBuffer {
+			// The buffer came from GetBuffer rather than the caller, so release
+			// it on exit; a release error is reported only if nothing failed earlier.
+			defer func() {
+				templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
+				if templ_7745c5c3_Err == nil {
+					templ_7745c5c3_Err = templ_7745c5c3_BufErr
+				}
+			}()
+		}
+		ctx = templ.InitializeContext(ctx)
+		// Pick up the children supplied by the caller, defaulting to a no-op
+		// component, then clear them from ctx.
+		templ_7745c5c3_Var1 := templ.GetChildren(ctx)
+		if templ_7745c5c3_Var1 == nil {
+			templ_7745c5c3_Var1 = templ.NopComponent
+		}
+		ctx = templ.ClearChildren(ctx)
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "<!doctype html><html lang=\"en\"><head><meta charset=\"utf-8\"><meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"><title>")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		var templ_7745c5c3_Var2 string
+		templ_7745c5c3_Var2, templ_7745c5c3_Err = templ.JoinStringErrs(title)
+		if templ_7745c5c3_Err != nil {
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/layout.templ`, Line: 9, Col: 17}
+		}
+		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var2))
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 2, " — Vetting</title><link rel=\"stylesheet\" href=\"/static/app.css\"><script src=\"https://unpkg.com/htmx.org@2.0.2\" integrity=\"sha384-Y7hw+L/jvKeWIRRkqWYfPcvVxHzVzn5REgzbawhxAuQGwX1XWe70vji+VSeHOThJ\" crossorigin=\"anonymous\"></script><script src=\"https://unpkg.com/htmx-ext-sse@2.2.2\" integrity=\"sha384-Y4gc0CK6Kg4hmulDc1rNM+vbMvjbW/5rRCA6pC5gj5dLV1/4+OZGkQpJtHQvQTCr\" crossorigin=\"anonymous\"></script></head><body hx-boost=\"true\"><header class=\"topbar\"><div class=\"brand\">Vetting</div><nav><a href=\"/\">Dashboard</a> <a href=\"/hosts/new\">Register host</a></nav><div class=\"session\"><span class=\"heartbeat\" hx-ext=\"sse\" sse-connect=\"/events\" sse-swap=\"heartbeat\">·</span><form method=\"post\" action=\"/logout\" class=\"logout-form\"><button type=\"submit\">Log out</button></form></div></header><main>")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		// Render the caller's children between <main> and </main>.
+		templ_7745c5c3_Err = templ_7745c5c3_Var1.Render(ctx, templ_7745c5c3_Buffer)
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "</main></body></html>")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		return nil
+	})
+}
+
+// BareLayout returns the component for the minimal page shell: a head
+// with only the stylesheet and a <body class="bare"> whose <main> holds
+// the caller's children. title is written through templ.EscapeString
+// ahead of the fixed " — Vetting" suffix in <title>.
+func BareLayout(title string) templ.Component {
+	return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
+		templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
+		// Bail out before writing anything if the context already carries an error.
+		if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil {
+			return templ_7745c5c3_CtxErr
+		}
+		templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
+		if !templ_7745c5c3_IsBuffer {
+			// The buffer came from GetBuffer rather than the caller, so release
+			// it on exit; a release error is reported only if nothing failed earlier.
+			defer func() {
+				templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
+				if templ_7745c5c3_Err == nil {
+					templ_7745c5c3_Err = templ_7745c5c3_BufErr
+				}
+			}()
+		}
+		ctx = templ.InitializeContext(ctx)
+		// Pick up the children supplied by the caller, defaulting to a no-op
+		// component, then clear them from ctx.
+		templ_7745c5c3_Var3 := templ.GetChildren(ctx)
+		if templ_7745c5c3_Var3 == nil {
+			templ_7745c5c3_Var3 = templ.NopComponent
+		}
+		ctx = templ.ClearChildren(ctx)
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "<!doctype html><html lang=\"en\"><head><meta charset=\"utf-8\"><meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"><title>")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		var templ_7745c5c3_Var4 string
+		templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(title)
+		if templ_7745c5c3_Err != nil {
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/layout.templ`, Line: 41, Col: 17}
+		}
+		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, " — Vetting</title><link rel=\"stylesheet\" href=\"/static/app.css\"></head><body class=\"bare\"><main>")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		// Render the caller's children between <main> and </main>.
+		templ_7745c5c3_Err = templ_7745c5c3_Var3.Render(ctx, templ_7745c5c3_Buffer)
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 6, "</main></body></html>")
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		return nil
+	})
+}
+
+var _ = templruntime.GeneratedTemplate
@@ -0,0 +1,20 @@
+package templates
+
+// Login is the sign-in page, rendered inside BareLayout. It shows errMsg
+// in an error box when it is non-empty, and posts a password to /login
+// together with next, which is carried along in a hidden form field.
+templ Login(errMsg, next string) {
+	@BareLayout("Sign in") {
+		<div class="login-card">
+			<h1>Vetting</h1>
+			if errMsg != "" {
+				<div class="error">{ errMsg }</div>
+			}
+			<form method="post" action="/login">
+				<input type="hidden" name="next" value={ next }/>
+				<label>
+					Password
+					<input type="password" name="password" autofocus required/>
+				</label>
+				<button type="submit">Sign in</button>
+			</form>
+		</div>
+	}
+}
@@ -0,0 +1,94 @@
+// Code generated by templ - DO NOT EDIT.
+
+// templ: version: v0.3.1001
+package templates
+
+//lint:file-ignore SA4006 This context is only used if a nested component is present.
+
+import "github.com/a-h/templ"
+import templruntime "github.com/a-h/templ/runtime"
+
+// Login returns the component for the sign-in page. The login card is
+// built as a nested component and rendered as the children of
+// BareLayout("Sign in"). errMsg, when non-empty, is shown in an error
+// box; next is emitted as the value of a hidden "next" form field. Both
+// are written through templ.EscapeString.
+func Login(errMsg, next string) templ.Component {
+	return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
+		templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
+		// Bail out before writing anything if the context already carries an error.
+		if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil {
+			return templ_7745c5c3_CtxErr
+		}
+		templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
+		if !templ_7745c5c3_IsBuffer {
+			defer func() {
+				templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
+				if templ_7745c5c3_Err == nil {
+					templ_7745c5c3_Err = templ_7745c5c3_BufErr
+				}
+			}()
+		}
+		ctx = templ.InitializeContext(ctx)
+		templ_7745c5c3_Var1 := templ.GetChildren(ctx)
+		if templ_7745c5c3_Var1 == nil {
+			templ_7745c5c3_Var1 = templ.NopComponent
+		}
+		ctx = templ.ClearChildren(ctx)
+		// Var2 is the login card itself; it is handed to BareLayout below as
+		// that layout's children.
+		templ_7745c5c3_Var2 := templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
+			templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
+			templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
+			if !templ_7745c5c3_IsBuffer {
+				defer func() {
+					templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
+					if templ_7745c5c3_Err == nil {
+						templ_7745c5c3_Err = templ_7745c5c3_BufErr
+					}
+				}()
+			}
+			ctx = templ.InitializeContext(ctx)
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "<div class=\"login-card\"><h1>Vetting</h1>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			// The error box is emitted only when there is a message to show.
+			if errMsg != "" {
+				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 2, "<div class=\"error\">")
+				if templ_7745c5c3_Err != nil {
+					return templ_7745c5c3_Err
+				}
+				var templ_7745c5c3_Var3 string
+				templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(errMsg)
+				if templ_7745c5c3_Err != nil {
+					return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/login.templ`, Line: 8, Col: 31}
+				}
+				_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
+				if templ_7745c5c3_Err != nil {
+					return templ_7745c5c3_Err
+				}
+				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "</div>")
+				if templ_7745c5c3_Err != nil {
+					return templ_7745c5c3_Err
+				}
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "<form method=\"post\" action=\"/login\"><input type=\"hidden\" name=\"next\" value=\"")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var4 string
+			templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(next)
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/login.templ`, Line: 11, Col: 49}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, "\"> <label>Password <input type=\"password\" name=\"password\" autofocus required></label> <button type=\"submit\">Sign in</button></form></div>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			return nil
+		})
+		// Render the bare layout with the login card attached as its children.
+		templ_7745c5c3_Err = BareLayout("Sign in").Render(templ.WithChildren(ctx, templ_7745c5c3_Var2), templ_7745c5c3_Buffer)
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		return nil
+	})
+}
+
+var _ = templruntime.GeneratedTemplate
@@ -0,0 +1,61 @@
+package templates
+
+// RegistrationForm carries the values used to (re)populate the host
+// registration form, plus an optional error message. Every field is a
+// string, including the port.
+type RegistrationForm struct {
+	Name             string // value of the "name" input
+	MAC              string // value of the "mac" input
+	WoLBroadcastIP   string // value of the "wol_broadcast_ip" input
+	WoLPort          string // value of the "wol_port" input; empty is shown as "9"
+	ExpectedSpecYAML string // contents of the "expected_spec_yaml" textarea
+	Notes            string // contents of the "notes" textarea
+	Error            string // shown in an error box above the form when non-empty
+}
+
+// Registration is the "Register host" page, rendered inside Layout. It
+// shows form.Error in an error box when non-empty and pre-fills every
+// input from form, so the same component serves both the blank form and
+// a re-display of submitted values. The form posts to /hosts. The WoL
+// port input falls back to defaultPort's value when form.WoLPort is empty.
+templ Registration(form RegistrationForm) {
+	@Layout("Register host") {
+		<section class="form-wrap">
+			<h1>Register host</h1>
+			if form.Error != "" {
+				<div class="error">{ form.Error }</div>
+			}
+			<form method="post" action="/hosts" class="host-form">
+				<label>
+					Name
+					<input type="text" name="name" value={ form.Name } required pattern="[A-Za-z0-9_\-\.]+" placeholder="pve-node-03"/>
+				</label>
+				<label>
+					MAC address
+					<input type="text" name="mac" value={ form.MAC } required placeholder="aa:bb:cc:dd:ee:ff"/>
+				</label>
+				<div class="grid-2">
+					<label>
+						WoL broadcast IP
+						<input type="text" name="wol_broadcast_ip" value={ form.WoLBroadcastIP } required placeholder="10.0.0.255"/>
+					</label>
+					<label>
+						WoL port
+						<input type="number" name="wol_port" value={ defaultPort(form.WoLPort) } min="1" max="65535"/>
+					</label>
+				</div>
+				<label>
+					Expected hardware spec (YAML)
+					<textarea name="expected_spec_yaml" rows="12" required placeholder="cpu:&#10;  model_match: ...">{ form.ExpectedSpecYAML }</textarea>
+				</label>
+				<label>
+					Notes
+					<textarea name="notes" rows="3">{ form.Notes }</textarea>
+				</label>
+				<div class="actions">
+					<button type="submit">Register</button>
+					<a class="button-secondary" href="/">Cancel</a>
+				</div>
+			</form>
+		</section>
+	}
+}
+
+// defaultPort returns v unchanged unless it is empty, in which case it
+// returns "9". It supplies the pre-filled value of the WoL port input.
+func defaultPort(v string) string {
+	if v != "" {
+		return v
+	}
+	return "9"
+}
@@ -0,0 +1,176 @@
+// Code generated by templ - DO NOT EDIT.
+
+// templ: version: v0.3.1001
+package templates
+
+//lint:file-ignore SA4006 This context is only used if a nested component is present.
+
+import "github.com/a-h/templ"
+import templruntime "github.com/a-h/templ/runtime"
+
+// RegistrationForm carries the values used to (re)populate the host
+// registration form, plus an optional error message. Every field is a
+// string, including the port.
+type RegistrationForm struct {
+	Name             string // value of the "name" input
+	MAC              string // value of the "mac" input
+	WoLBroadcastIP   string // value of the "wol_broadcast_ip" input
+	WoLPort          string // value of the "wol_port" input; empty is shown as "9"
+	ExpectedSpecYAML string // contents of the "expected_spec_yaml" textarea
+	Notes            string // contents of the "notes" textarea
+	Error            string // shown in an error box above the form when non-empty
+}
+
+// Registration returns the component for the "Register host" page. The
+// form section is built as a nested component and rendered as the
+// children of Layout("Register host"). Every input is pre-filled from
+// form, each value written through templ.EscapeString; form.Error, when
+// non-empty, is shown in an error box above the form.
+func Registration(form RegistrationForm) templ.Component {
+	return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
+		templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
+		// Bail out before writing anything if the context already carries an error.
+		if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil {
+			return templ_7745c5c3_CtxErr
+		}
+		templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
+		if !templ_7745c5c3_IsBuffer {
+			defer func() {
+				templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
+				if templ_7745c5c3_Err == nil {
+					templ_7745c5c3_Err = templ_7745c5c3_BufErr
+				}
+			}()
+		}
+		ctx = templ.InitializeContext(ctx)
+		templ_7745c5c3_Var1 := templ.GetChildren(ctx)
+		if templ_7745c5c3_Var1 == nil {
+			templ_7745c5c3_Var1 = templ.NopComponent
+		}
+		ctx = templ.ClearChildren(ctx)
+		// Var2 is the form section itself; it is handed to Layout below as
+		// that layout's children.
+		templ_7745c5c3_Var2 := templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
+			templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
+			templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
+			if !templ_7745c5c3_IsBuffer {
+				defer func() {
+					templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
+					if templ_7745c5c3_Err == nil {
+						templ_7745c5c3_Err = templ_7745c5c3_BufErr
+					}
+				}()
+			}
+			ctx = templ.InitializeContext(ctx)
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "<section class=\"form-wrap\"><h1>Register host</h1>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			// The error box is emitted only when there is a message to show.
+			if form.Error != "" {
+				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 2, "<div class=\"error\">")
+				if templ_7745c5c3_Err != nil {
+					return templ_7745c5c3_Err
+				}
+				var templ_7745c5c3_Var3 string
+				templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(form.Error)
+				if templ_7745c5c3_Err != nil {
+					return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 18, Col: 35}
+				}
+				_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
+				if templ_7745c5c3_Err != nil {
+					return templ_7745c5c3_Err
+				}
+				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "</div>")
+				if templ_7745c5c3_Err != nil {
+					return templ_7745c5c3_Err
+				}
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "<form method=\"post\" action=\"/hosts\" class=\"host-form\"><label>Name <input type=\"text\" name=\"name\" value=\"")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var4 string
+			templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(form.Name)
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 23, Col: 53}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, "\" required pattern=\"[A-Za-z0-9_\\-\\.]+\" placeholder=\"pve-node-03\"></label> <label>MAC address <input type=\"text\" name=\"mac\" value=\"")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var5 string
+			templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(form.MAC)
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 27, Col: 51}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 6, "\" required placeholder=\"aa:bb:cc:dd:ee:ff\"></label><div class=\"grid-2\"><label>WoL broadcast IP <input type=\"text\" name=\"wol_broadcast_ip\" value=\"")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var6 string
+			templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(form.WoLBroadcastIP)
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 32, Col: 76}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 7, "\" required placeholder=\"10.0.0.255\"></label> <label>WoL port <input type=\"number\" name=\"wol_port\" value=\"")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			// An empty WoLPort is replaced by defaultPort's fallback before rendering.
+			var templ_7745c5c3_Var7 string
+			templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(defaultPort(form.WoLPort))
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 36, Col: 76}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 8, "\" min=\"1\" max=\"65535\"></label></div><label>Expected hardware spec (YAML) <textarea name=\"expected_spec_yaml\" rows=\"12\" required placeholder=\"cpu:&#10;  model_match: ...\">")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var8 string
+			templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(form.ExpectedSpecYAML)
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 41, Col: 125}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 9, "</textarea></label> <label>Notes <textarea name=\"notes\" rows=\"3\">")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			var templ_7745c5c3_Var9 string
+			templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(form.Notes)
+			if templ_7745c5c3_Err != nil {
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 45, Col: 49}
+			}
+			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9))
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 10, "</textarea></label><div class=\"actions\"><button type=\"submit\">Register</button> <a class=\"button-secondary\" href=\"/\">Cancel</a></div></form></section>")
+			if templ_7745c5c3_Err != nil {
+				return templ_7745c5c3_Err
+			}
+			return nil
+		})
+		// Render the full layout with the form section attached as its children.
+		templ_7745c5c3_Err = Layout("Register host").Render(templ.WithChildren(ctx, templ_7745c5c3_Var2), templ_7745c5c3_Buffer)
+		if templ_7745c5c3_Err != nil {
+			return templ_7745c5c3_Err
+		}
+		return nil
+	})
+}
+
+// defaultPort returns v unchanged unless it is empty, in which case it
+// returns "9". It supplies the pre-filled value of the WoL port input.
+func defaultPort(v string) string {
+	if v != "" {
+		return v
+	}
+	return "9"
+}
+
+var _ = templruntime.GeneratedTemplate