diff --git a/README.md b/README.md
index a831a49..9079883 100644
--- a/README.md
+++ b/README.md
@@ -11,13 +11,70 @@ Built for solo-operator home labs: one Go binary, SQLite + flat files,
 HTMX + SSE UI, bundled dnsmasq, optional ntfy / Discord / SMTP
 notifications.
 
+## Features
+
+- **Automated PXE boot** — dnsmasq proxy-DHCP serves a disposable
+  Debian live image to registered MACs. No VLAN, no dedicated bridge.
+- **11-stage validation pipeline** — Inventory, Firmware, SpecValidate,
+  SMART, CPUStress, Storage, Network, Burn, GPU, PSU, Reporting.
+- **Three vetting profiles** — quick (~10 min), deep (~8-12 h),
+  soak (~36-40 h). Same probes and gates; only durations scale.
+- **Server-side threshold engine** — per-run rules evaluate every
+  sensor batch in real time. Critical breaches (thermal runaway,
+  EDAC UE, voltage sag) fail the run immediately.
+- **FailedHolding with SSH** — when a stage fails the pipeline parks
+  the host and issues a one-time SSH key so you can triage in the
+  live image.
+- **Real-time dashboard** — HTMX + SSE push tile updates, stage
+  progress, sub-step detail, and live log tailing to the browser.
+- **Pluggable notifications** — ntfy, Discord webhooks, and SMTP with
+  severity-routed delivery.
+- **Non-destructive mode** — skip badblocks + wipe for hosts with
+  data you want to keep.
+- **Host-mode agent** — a persistent reporter that heartbeats from
+  installed hosts and reboots into the live image on command.
+- **Self-contained HTML reports** — offline-viewable summaries with
+  inlined CSS; machine-readable JSON alongside.
+- **Four-layer safety gates** — a MAC allowlist, signed run token,
+  wipe probe, and device allowlist guard against accidental disk wipes.
+- **Janitor** — automatic retention-based cleanup of artifact files
+  and log files.
+
+## How it works
+
+1. Install the host-mode agent on each node (one-liner from the
+   dashboard's quick-register script).
+2. Register the host in the web UI — name, MAC, expected hardware
+   spec (YAML).
+3. Click **Start Vetting** and choose a profile (quick / deep / soak).
+4. The host-mode agent receives a `reboot_for_vetting` heartbeat
+   command and reboots into PXE.
+5. dnsmasq serves the iPXE script; the host boots a disposable Linux
+   live image containing the vetting agent.
+6. The agent claims the run (token auth), then walks through each
+   stage — posting logs, sensor readings, and results back to the
+   orchestrator.
+7. Thresholds are evaluated server-side on every sensor batch.
+8. **Pass** — auto-reboot to local disk, HTML report generated,
+   notification fires.
+9. **Fail** — pipeline parks in FailedHolding, SSH key issued,
+   notification fires. Operator triages and retries or releases.
+
 ## Documentation
 
-- [docs/operations.md](docs/operations.md) — install + first run +
+- [docs/operations.md](docs/operations.md) — install, first run,
   troubleshooting
 - [docs/architecture.md](docs/architecture.md) — packages, state
-  machine, protocol
+  machine, protocol, safety model
 - [docs/test-suite.md](docs/test-suite.md) — what each stage measures
+- [docs/configuration.md](docs/configuration.md) — every YAML config
+  knob, profiles, thresholds
+- [docs/api-reference.md](docs/api-reference.md) — HTTP API with
+  request/response schemas, SSE events
+- [docs/database.md](docs/database.md) — SQLite schema, tables,
+  entity relationships
+- [docs/development.md](docs/development.md) — dev setup, building,
+  testing, adding stages
 
 ## Quick start (local, against QEMU)
 
diff --git a/cmd/vetting-agent/main.go b/cmd/vetting-agent/main.go
index 81a3834..f65f648 100644
--- a/cmd/vetting-agent/main.go
+++ b/cmd/vetting-agent/main.go
@@ -1,3 +1,7 @@
+// Agent binary. Runs in two modes: live-image (default, no args)
+// parses /proc/cmdline and enters the claim loop; host-mode
+// ("vetting-agent host") reads /etc/vetting/host-agent.yaml and
+// becomes a persistent heartbeat reporter.
 package main
 
 import (
diff --git a/cmd/vetting/main.go b/cmd/vetting/main.go
index dfa286c..470c885 100644
--- a/cmd/vetting/main.go
+++ b/cmd/vetting/main.go
@@ -1,3 +1,6 @@
+// Orchestrator binary. Wires config, stores, runner, dispatcher,
+// PXE supervisor, iperf supervisor, janitor, notifiers, and HTTP
+// router, then serves until SIGTERM/SIGINT.
 package main
 
 import (
diff --git a/docs/api-reference.md b/docs/api-reference.md
new file mode 100644
index 0000000..6229d61
--- /dev/null
+++ b/docs/api-reference.md
@@ -0,0 +1,490 @@
+# API reference
+
+Complete HTTP API for the vetting orchestrator. Routes are assembled
+in `internal/httpserver/router.go`; handler logic lives in
+`internal/api/agent_handlers.go` (agent-facing) and
+`internal/api/ui_handlers.go` (browser + host-mode).
+
+---
+
+## Agent API
+
+These endpoints are called by the in-image vetting agent during a run.
+Every `/api/v1/runs/{id}/*` request must carry an `Authorization: Bearer <token>`
+header. The token is issued per-run in the iPXE kernel cmdline and verified
+against the hash stored in `runs.agent_token_hash` (see [Authentication](#authentication)).
+
+### `GET /ipxe/{mac}`
+
+iPXE chainload script. Called by iPXE itself after dnsmasq hands it
+the chainload URL. No auth required — the MAC path parameter is the
+key.
+
+**Responses:**
+
+| Scenario | Script |
+|----------|--------|
+| Known MAC with an active run | Boot script: kernel + initrd + cmdline (run_id, mac, token, orchestrator_url, tls_fpr). Triggers `PXEObserved` transition. |
+| Known MAC, no active run | Poweroff script. |
+| Unknown MAC | Halt/error script. |
+
+---
+
+### `POST /api/v1/runs/{id}/hello`
+
+First call the agent makes once userspace is up. Idempotent. Writes a
+log line; the authoritative transition comes from `/claim`.
+
+**Request body:**
+
+```json
+{}
+```
+
+**Response (200):**
+
+```json
+{ "ok": true, "run_id": 42 }
+```
+
+---
+
+### `POST /api/v1/runs/{id}/claim`
+
+Binding call: the agent proves it holds the plaintext token for this
+run. In return the orchestrator seeds stage rows, transitions to
+`InventoryCheck`, and returns the stage list + per-profile config.
+Subsequent claims are idempotent (safe after transient network
+failures).
+
+**Request body:**
+
+```json
+{
+  "agent_ip": "192.168.1.42"   // optional; falls back to RemoteAddr
+}
+```
+
+**Response (200):**
+
+```json
+{
+  "ok": true,
+  "run_id": 42,
+  "stages": ["Inventory", "Firmware", "SpecValidate", "SMART", "CPUStress",
+             "Storage", "Network", "Burn", "GPU", "PSU", "Reporting"],
+  "expected_disks": [
+    { "serial": "WD-ABC123", "size_gb": 500 }
+  ],
+  "iperf_port": 5201,
+  "non_destructive": false,
+  "current_state": "InventoryCheck",
+  "stage_config": {
+    "profile": "quick",
+    "stage_timeouts": { "CPUStress": "5m0s", "Storage": "5m0s" },
+    "cpustress": { "cpu_pass": "2m", "mem_pass": "2m", "edac_poll": "10s" },
+    "storage": { "mode": "fio_sample", "fio_size": "1GiB", "fio_time": "3m",
+                 "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5" },
+    "network": { "duration": "60s" },
+    "burn": { "duration": "2m", "cpu_workers": "all", "mem_pct": 50,
+              "fio_on_spare": true, "iperf_parallel": 2 }
+  }
+}
+```
+
+**`stage_config` shape:**
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `profile` | string | `quick`, `deep`, or `soak`. |
+| `stage_timeouts` | map[string]string | Per-stage timeout durations (Go duration strings). |
+| `cpustress.cpu_pass` | string | stress-ng CPU pass duration. |
+| `cpustress.mem_pass` | string | stress-ng memory pass duration. |
+| `cpustress.edac_poll` | string | EDAC error counter polling interval. |
+| `storage.mode` | string | `fio_sample` (skip badblocks) or `full_disk`. |
+| `storage.fio_size` | string | fio test file size (fio_sample mode only). |
+| `storage.fio_time` | string | fio runtime. |
+| `storage.fio_bs` | string | fio block size. |
+| `storage.fio_rw` | string | fio I/O pattern. |
+| `storage.verify` | string | fio integrity mode (`md5` or empty). |
+| `network.duration` | string | iperf3 test duration. |
+| `burn.duration` | string | Total burn-in window. |
+| `burn.cpu_workers` | string | `all` or a numeric string. |
+| `burn.mem_pct` | int | Percentage of MemAvailable to stress. |
+| `burn.fio_on_spare` | bool | Run fio inside Burn. |
+| `burn.iperf_parallel` | int | iperf3 parallel stream count. |
+
+---
+
+### `POST /api/v1/runs/{id}/heartbeat`
+
+Periodic liveness ping. The response body acts as a control channel.
+
+**Request body:**
+
+```json
+{}
+```
+
+**Response (200):**
+
+```json
+{
+  "state": "CPUStress",
+  "cmd": "continue"
+}
+```
+
+**`cmd` values:**
+
+| cmd | When | Agent action |
+|-----|------|--------------|
+| `continue` | Normal case (including FailedHolding) | No-op; keep running current stage or wait for override. |
+| `reboot` | Run reached `Completed` | `systemctl reboot` (falls through iPXE to local disk). |
+| `abort` | Run in `Released` | Stop heartbeat loop. |
+| `retry_stage` | Operator pressed "Override wipe & retry" | Re-enter the named stage with override flags. Response includes `stage` and `override_flags`. |
+| `cancel_stage` | Operator clicked Cancel mid-stage | Kill running stage subprocess, then power off. |
+
+---
+
+### `POST /api/v1/runs/{id}/log`
+
+Batch of log lines from the agent. Written to per-run flat file and
+fanned out to SSE subscribers.
+
+**Request body:**
+
+```json
+{
+  "lines": [
+    {
+      "ts": "2026-04-21T15:32:18.123Z",
+      "level": "info",
+      "stage": "SMART",
+      "text": "smartctl -a /dev/sda: PASSED"
+    }
+  ]
+}
+```
+
+| Field | Type | Required | Description |
+|-------|------|----------|-------------|
+| `ts` | string | no | RFC 3339 timestamp. Server clock used if empty. |
+| `level` | string | no | `info`, `warn`, `error`, `debug`. |
+| `stage` | string | no | Stage tag for per-stage SSE fan-out. |
+| `text` | string | yes | Log message. |
+
+**Response (200):**
+
+```json
+{ "ok": true, "written": 1 }
+```
+
+---
+
+### `POST /api/v1/runs/{id}/sensor`
+
+Batch of numeric samples (thermals, fan RPM, PSU rails, iperf
+throughput, fio IOPS). Each sample is evaluated against the run's
+seeded thresholds — critical breaches fail the run immediately.
+
+**Request body:**
+
+```json
+{
+  "samples": [
+    {
+      "ts": "2026-04-21T15:32:18Z",
+      "kind": "temp",
+      "key": "cpu/0",
+      "value": 72.5,
+      "unit": "C"
+    }
+  ]
+}
+```
+
+| Field | Type | Required | Description |
+|-------|------|----------|-------------|
+| `ts` | string | no | RFC 3339 timestamp. Defaults to server-now. |
+| `kind` | string | yes | `temp`, `fan`, `psu_volt`, `iperf`, `fio`, `fio_p99_us`, `smart_attr`, `nic_retrans`, `edac_ue`, `edac_ce`, `mce`. |
+| `key` | string | yes | Identifies the source (e.g. `cpu/0`, `+12V`, `throughput_mbps`). |
+| `value` | float | yes | Numeric sample value. |
+| `unit` | string | no | Display unit (e.g. `C`, `V`, `Mbps`). |
+
+**Response (200):**
+
+```json
+{
+  "ok": true,
+  "written": 1,
+  "breach": false,
+  "breach_kind": ""
+}
+```
+
+When a critical breach is detected, `breach` is `true` and
+`breach_kind` contains a human-readable label like
+`"temp cpu/0=92.5 breached lt 92"`. The run transitions to
+`FailedHolding`.
+
+---
+
+### `POST /api/v1/runs/{id}/result`
+
+Stage outcome. Drives the state machine forward (pass) or into
+`FailedHolding` (fail).
+
+**Request body:**
+
+```json
+{
+  "stage": "SMART",
+  "passed": true,
+  "summary": { "disks_checked": 2, "reallocated": 0 },
+  "message": "",
+  "inventory": null,
+  "firmware": [],
+  "sub_steps": []
+}
+```
+
+| Field | Type | Required | Description |
+|-------|------|----------|-------------|
+| `stage` | string | yes | Stage name (must match `DefaultStageOrder`). |
+| `passed` | bool | yes | `true` = advance; `false` = fail. |
+| `summary` | object | no | Arbitrary JSON persisted in `stages.summary_json`. |
+| `message` | string | no | Human-readable detail (shown in notifications on failure). |
+| `inventory` | object | no | Only set for `stage=Inventory`. Full `spec.Inventory` JSON. |
+| `firmware` | array | no | Only set for `stage=Firmware`. Array of firmware snapshots. |
+| `sub_steps` | array | no | Per-disk/per-NIC/per-GPU granular results. |
+
+**`firmware[]` shape:**
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `component` | string | `bios`, `bmc`, `nic`, `hba`, `microcode`, `nvme_fw`. |
+| `identifier` | string | Slot, serial, or device path that distinguishes this component. |
+| `version` | string | Firmware version string. |
+| `vendor` | string | Vendor name (optional). |
+| `raw` | map | Additional key-value metadata (optional). |
+
+**`sub_steps[]` shape:**
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `name` | string | Human-readable label (e.g. `sda SMART`, `eth0 iperf`). |
+| `passed` | bool | Sub-step result. |
+| `skipped` | bool | `true` if the sub-step was skipped (e.g. no GPU). |
+| `started_at` | string | RFC 3339 timestamp. |
+| `completed_at` | string | RFC 3339 timestamp. |
+| `summary` | object | Arbitrary JSON persisted in `sub_steps.summary_json`. |
+
+**Response (200, pass):**
+
+```json
+{ "ok": true, "next_state": "CPUStress" }
+```
+
+**Response (200, fail):**
+
+```json
+{ "ok": true, "next_state": "FailedHolding" }
+```
+
+**Response (409, stage mismatch):**
+
+Returned when the agent reports a stage that doesn't match the
+orchestrator's expected state. The run is parked in `FailedHolding`.
+
+```json
+{ "ok": false, "error": "stage mismatch: got SMART, expected CPUStress" }
+```
+
+---
+
+### `POST /api/v1/runs/{id}/hold`
+
+Request the per-run SSH key so the operator can SSH into a held host.
+
+**Request body:**
+
+```json
+{
+  "agent_ip": "192.168.1.42"
+}
+```
+
+**Response (200):**
+
+```json
+{
+  "authorized_key": "ssh-ed25519 AAAAC3... vetting-run-42",
+  "run_id": 42
+}
+```
+
+The private key is written to
+`artifacts/run-<N>/hold.key` on the orchestrator. The agent installs
+the `authorized_key` into `/root/.ssh/authorized_keys` in the live
+image.
+
+---
+
+## Host API
+
+LAN-trusted endpoints called by the host-mode agent and the quick-register
+script. No bearer token; same threat model as the browser UI.
+
+### `POST /api/v1/hosts`
+
+JSON host registration. Called by the quick-register one-liner.
+
+**Request body:**
+
+```json
+{
+  "name": "node-01",
+  "mac": "aa:bb:cc:dd:ee:ff",
+  "wol_broadcast_ip": "192.168.1.255",
+  "wol_port": 9,
+  "expected_spec_yaml": "memory:\n  total_gib: 64\ncpu:\n  logical_cores: 16\n",
+  "notes": ""
+}
+```
+
+**Response (201):**
+
+```json
+{ "ok": true, "id": 5 }
+```
+
+### `POST /api/v1/hosts/{mac}/heartbeat`
+
+Host-mode agent liveness ping. Stamps `hosts.last_seen_at` and
+triggers a dashboard tile refresh via SSE.
+
+**Request body:** empty.
+
+**Response (200):**
+
+```json
+{ "ok": true }
+```
+
+When a run is queued for this host:
+
+```json
+{ "ok": true, "cmd": "reboot_for_vetting", "run_id": 42 }
+```
+
+The agent reboots the host on receiving `cmd=reboot_for_vetting`.
+The `run_id` is informational (for agent logging).
+
+---
+
+## Browser UI routes
+
+No auth. Bind to loopback or LAN only, or front with a reverse proxy.
+
+| Method | Path | Description |
+|--------|------|-------------|
+| GET | `/` | Dashboard — host tile grid. |
+| GET | `/hosts/new` | New host registration form. |
+| POST | `/hosts` | Create host (form submission). |
+| GET | `/hosts/{id}` | Host detail page (summary, actions, run history). |
+| POST | `/hosts/{id}/delete` | Delete host. |
+| POST | `/hosts/{id}/start` | Start a vetting run (queue it). |
+| POST | `/hosts/{id}/cancel` | Cancel the active run. |
+| POST | `/hosts/{id}/override-wipe` | Override the wipe-probe guard and retry Storage. |
+| GET | `/runs/{runID}` | Run detail page (stages, spec diffs, pipeline). |
+| GET | `/reports/{runID}` | HTML report artifact. |
+| GET | `/register/quick.sh` | Quick-register bash one-liner script. |
+| GET | `/events` | SSE event stream (browser subscriptions). |
+
+**Static assets:**
+
+| Path | Description |
+|------|-------------|
+| `/static/*` | Embedded CSS + JS (`internal/web/static/`). |
+| `/live/*` | Live image files (vmlinuz + initrd.img) served from `pxe.live_dir`. |
+| `/assets/*` | Agent binary served from `agent.asset_dir`. |
+
+---
+
+## SSE events
+
+The browser connects to `GET /events` and receives server-sent events.
+Each event has a `name` (the SSE `event:` field) and a `data` payload
+containing a pre-rendered HTML fragment with `hx-swap-oob` attributes
+that HTMX uses to swap the target DOM element.
+
+### Connection events
+
+| Event name | Payload | Description |
+|------------|---------|-------------|
+| `hello` | `ok` | Sent immediately on connection. |
+| `heartbeat` | `<span data-heartbeat="<unix-ts>"></span>` | 15-second keep-alive. |
+
+### Dashboard events
+
+| Event name | Payload | Description |
+|------------|---------|-------------|
+| `tile-{hostID}` | Host tile HTML fragment | Refreshed on state transitions, heartbeats, holds. |
+
+### Host detail page events
+
+| Event name | Payload | Description |
+|------------|---------|-------------|
+| `detail-summary-{hostID}` | Summary section HTML | Host metadata + latest run status. |
+| `detail-actions-{hostID}` | Actions row HTML | Start/Cancel/Override buttons. |
+| `detail-inflight-{hostID}` | In-flight banner HTML | Active run progress indicator. |
+| `runrow-{runID}` | Run history row HTML | Updated when a run completes or fails. |
+
+### Run detail page events
+
+| Event name | Payload | Description |
+|------------|---------|-------------|
+| `run-header-{runID}` | Run metadata HTML | State, profile, timing. |
+| `detail-hold-{runID}` | Hold banner HTML | SSH command + hold IP. |
+| `detail-specdiffs-{runID}` | Spec diffs list HTML | Expected-vs-actual divergences. |
+| `pipeline-{runID}` | Pipeline dot visualization HTML | Stage progress dots. |
+| `substep-{runID}-{stage}-{ordinal}` | Sub-step row HTML | Per-disk, per-NIC, per-GPU detail. |
+
+### Log events
+
+| Event name | Payload | Description |
+|------------|---------|-------------|
+| `log-{runID}` | Log line HTML | All log lines for a run. |
+| `log-{runID}-{stage}` | Log line HTML | Stage-filtered log lines. |
+
+---
+
+## Authentication
+
+### Agent bearer token lifecycle
+
+1. **Issuance** — when a registered host's iPXE script is fetched
+   (`GET /ipxe/{mac}`), the orchestrator generates a random token,
+   hashes it with SHA-256, and stores the hash in
+   `runs.agent_token_hash`. The plaintext token is embedded in the
+   iPXE kernel cmdline as `token=<plaintext>`.
+
+2. **Rotation** — each iPXE fetch rotates the token. Only the most
+   recent PXE boot can claim the run.
+
+3. **Verification** — every `/api/v1/runs/{id}/*` endpoint extracts
+   the bearer token from the `Authorization` header, SHA-256 hashes it,
+   and compares the digest against the stored hash using `crypto/subtle.ConstantTimeCompare`.
+
+4. **Scope** — the token authenticates a single run. It cannot be
+   used to access other runs or host-level endpoints.
+
+### LAN-trust model
+
+Host-mode endpoints (`POST /api/v1/hosts`, `POST /api/v1/hosts/{mac}/heartbeat`)
+and the browser UI have no authentication. They share a LAN-trust
+assumption: anything that can reach the orchestrator's bind address is
+trusted. To add a password, front the orchestrator with a reverse
+proxy (Caddy, nginx, Traefik) that adds basic-auth or OIDC. See
+[operations.md § Exposing outside the LAN](operations.md#exposing-outside-the-lan).
diff --git a/docs/architecture.md b/docs/architecture.md
index 9a0443f..3e608a0 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -37,10 +37,10 @@ Operator browser (HTMX + SSE, admin login)
 |---|---|
 | `cmd/vetting` | Orchestrator entrypoint. Wires config, stores, runner, dispatcher, iperf supervisor, PXE supervisor, janitor, HTTP router. |
 | `cmd/vetting-agent` | In-image agent entrypoint. Reads kernel cmdline params, starts the agent loop. |
-| `internal/config` | YAML loader + types. |
+| `internal/config` | YAML loader + types. `ProfileRegistry` holds the quick/deep/soak profile definitions, threshold defaults, and per-stage probe knobs. |
 | `internal/db` | SQLite open + embedded migrations. Pure Go via modernc.org/sqlite. |
 | `internal/model` | Plain structs: `Host`, `Run`, `Stage`, `Measurement`, `SpecDiff`, `Artifact`. |
-| `internal/store` | Repository layer; SQL is hand-written. |
+| `internal/store` | Repository layer; SQL is hand-written (no ORM). Stores for hosts, runs, stages, sub-steps, artifacts, spec diffs, measurements, thresholds, firmware. |
 | `internal/orchestrator` | State machine, dispatcher, per-run runner, WoL sender, HMAC run tokens, iperf supervisor. |
 | `internal/api` | HTTP handlers: `agent_handlers.go` (the agent-facing API) and `ui_handlers.go` (HTMX fragments + SSE). |
 | `internal/httpserver` | chi router assembly — lives here to avoid `api ↔ orchestrator` cyclic imports. |
@@ -66,11 +66,13 @@ Per-run state is the single source of truth; the UI is a pure
 projection of DB + event stream.
 
 ```
-Registered → Queued → WaitingWoL → Booting → InventoryCheck
-  → SpecValidate → SMART → CPUStress → Storage → Network
-  → GPU → PSU → Reporting → Completed
+Registered → Queued → WaitingWoL / WaitingReboot → Booting
+  → InventoryCheck → Firmware → SpecValidate → SMART
+  → CPUStress → Storage → Network → Burn → GPU → PSU
+  → Reporting → Completed
 
 any stage → Failed → FailedHolding → Released
+any active state → Cancelled
 ```
 
 Key points:
@@ -97,7 +99,10 @@ POST /api/v1/runs/{id}/result        → stage result; response says next_state
 POST /api/v1/runs/{id}/hold          → on FailedHolding, receive authorized_key
 ```
 
-Auth on every `/api/v1/*` call: the bearer token is stored as a bcrypt
+See [api-reference.md](api-reference.md) for full request/response
+schemas and SSE event types.
+
+Auth on every `/api/v1/runs/*` call: the bearer token is stored as a bcrypt
 hash in `runs.agent_token_hash` and compared in constant time. The
 plaintext is in the kernel cmdline — unforgeable by anyone not on the
 trusted bridge, because the iPXE script is issued per-MAC and the MAC
@@ -165,6 +170,56 @@ The janitor goroutine (`internal/janitor`) runs a sweep every
 **never** deleted by the janitor — host histories and aggregate
 metrics survive cleanups.
 
+## Threshold engine
+
+Every `/sensor` batch is evaluated against rules seeded per-run at
+creation time from the `ProfileRegistry` + per-host overrides. Rules
+are immutable for the life of a run — a late config edit can't
+retroactively pass or fail an in-flight run.
+
+Comparison operators: `lt`, `lte`, `gt`, `gte`, `within_pct`. Key matching is
+glob-ish: `*` matches all keys, `cpu/*` matches any key starting with
+`cpu/`, exact strings for specific keys. Stage matching works the same
+way (`*` for global, exact name for stage-specific).
+
+Severity drives the action:
+
+- **critical** — fail the run immediately. The current stage is marked
+  failed, the run enters `FailedHolding`, and a `StageFailed`
+  notification fires.
+- **warning** — record the breach for the report. The stage continues.
+
+Every evaluation (pass or fail) is persisted as a
+`threshold_evaluations` row so the report can render per-sample
+verdict badges. See [configuration.md § thresholds](configuration.md#vettingthresholds)
+for the config-level reference.
+
+## Host-mode agent
+
+The agent binary's host mode (`vetting-agent host`) runs as a systemd
+service on installed hosts. It heartbeats to `POST /api/v1/hosts/{mac}/heartbeat`
+every 30 s so the dashboard shows online/offline status.
+
+The quick-register one-liner (`GET /register/quick.sh`) downloads the
+agent binary from `/assets/vetting-agent-linux-amd64`, installs it as
+a systemd service, and auto-POSTs to `POST /api/v1/hosts` to register
+the host — no manual MAC entry needed.
+
+When the operator clicks **Start Vetting**, the orchestrator's
+dispatcher sets `cmd=reboot_for_vetting` on the next heartbeat
+response. The host-mode agent reboots the host, which PXE-boots into
+the live image and enters the normal vetting flow.
+
+## Host API
+
+These endpoints are LAN-trusted (no bearer token) and share the same
+threat model as the browser UI:
+
+```
+POST /api/v1/hosts                  → JSON host registration (quick-register)
+POST /api/v1/hosts/{mac}/heartbeat  → host-mode liveness + command channel
+```
+
 ## Reproducible builds
 
 The orchestrator and agent are pure Go; `make orchestrator-linux`
diff --git a/docs/configuration.md b/docs/configuration.md
new file mode 100644
index 0000000..56ece1a
--- /dev/null
+++ b/docs/configuration.md
@@ -0,0 +1,353 @@
+# Configuration reference
+
+The orchestrator reads a single YAML file at startup. Production
+installs use `/etc/vetting/vetting.yaml`; the dev default is
+`deploy/vetting.example.yaml`. Pass the path with `--config`:
+
+```
+vetting --config /etc/vetting/vetting.yaml
+```
+
+Every key has a compile-time default (see `internal/config/config.go`),
+so an empty file produces a working orchestrator bound to
+`127.0.0.1:8080` with PXE disabled.
+
+---
+
+## `server`
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| `bind` | string | `127.0.0.1:8080` | Address and port the HTTP server listens on. |
+| `public_url` | string | *(empty)* | External URL the orchestrator is reachable at from a browser. Used in notification click-throughs (e.g. `https://vetting.lan:8443`). |
+| `tls.enabled` | bool | `false` | Terminate TLS at the orchestrator. |
+| `tls.cert_file` | string | *(empty)* | Path to the PEM-encoded certificate. |
+| `tls.key_file` | string | *(empty)* | Path to the PEM-encoded private key. |
+
+## `database`
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| `path` | string | `./var/vetting.db` | SQLite database file. Created on first run. |
+
+## `artifacts`
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| `dir` | string | `./var/artifacts` | Directory for per-run files (reports, fio logs, iperf logs, hold keys). |
+| `retention_days` | int | `30` | Days to keep artifact files before the janitor prunes them. `0` = keep forever. DB rows are never pruned. |
+
+## `logs`
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| `dir` | string | `./var/logs` | Directory for per-run append-only log files. |
+| `retention_days` | int | `30` | Days to keep log files. `0` = keep forever. |
+
+## `janitor`
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| `interval_minutes` | int | `60` | Minutes between cleanup sweeps. `0` defaults to `60`. |
+
+## `dispatcher`
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| `max_concurrent_runs` | int | `3` | Semaphore limiting how many vetting runs execute in parallel. |
+
+## `network`
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| `iperf_port` | int | `5201` | Port the orchestrator-supervised `iperf3 -s` binds to. The agent connects here during the Network stage. |
+
+## `pxe`
+
+PXE is disabled by default. Enable it after running
+[`vetting-pxe-setup`](operations.md#pxe-enablement).
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| `enabled` | bool | `false` | Enable dnsmasq + iPXE serving. |
+| `interface` | string | *(empty)* | LAN NIC the dnsmasq proxy-DHCP binds to (e.g. `eth0`). |
+| `subnet` | string | *(empty)* | LAN CIDR (e.g. `192.168.1.0/24`). Scopes the proxy-DHCP responses. |
+| `orchestrator_url` | string | *(empty)* | URL the live-image agent uses to reach the orchestrator (e.g. `http://192.168.1.135:8080`). Baked into the iPXE kernel cmdline. |
+| `tftp_root` | string | *(empty)* | Directory containing `ipxe.efi` + `undionly.kpxe`. |
+| `live_dir` | string | *(empty)* | Directory containing `vmlinuz` + `initrd.img`. Served at `/live/*`. |
+
+dnsmasq runs in **proxy-DHCP mode**: it coexists with your existing
+router's DHCP server and only supplements PXE options. See
+[operations.md](operations.md#pxe-enablement) for the full setup
+walkthrough.
+
+## `agent`
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| `asset_dir` | string | `<database.dir>/../assets` | Directory containing `vetting-agent-linux-amd64`. Served at `/assets/*` so the quick-register one-liner can download the agent binary. Empty string disables the route. |
+
+## `notifiers`
+
+An array of notification targets. Each entry declares a named notifier
+with a type-specific set of fields. Delivery is fire-and-forget (one
+attempt per event, 10 s timeout, failures logged).
+
+### ntfy
+
+```yaml
+notifiers:
+  - name: ops-ntfy
+    type: ntfy
+    server: https://ntfy.sh
+    topic: vetting-YOUR-TOPIC
+```
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `name` | string | Identifier referenced by `routes[].notifier`. |
+| `type` | string | `ntfy` |
+| `server` | string | ntfy server URL. |
+| `topic` | string | Topic to publish to. |
+
+### Discord
+
+```yaml
+notifiers:
+  - name: ops-discord
+    type: discord
+    webhook_url: https://discord.com/api/webhooks/XXX/YYY
+```
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `name` | string | Identifier referenced by `routes[].notifier`. |
+| `type` | string | `discord` |
+| `webhook_url` | string | Discord webhook URL. |
+
+### SMTP
+
+```yaml
+notifiers:
+  - name: ops-email
+    type: smtp
+    smtp:
+      host: mail.lan
+      port: 25
+      from: vetting@lan.local
+      to: [ops@lan.local]
+```
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `name` | string | Identifier referenced by `routes[].notifier`. |
+| `type` | string | `smtp` |
+| `smtp.host` | string | SMTP server hostname. |
+| `smtp.port` | int | SMTP server port. |
+| `smtp.from` | string | Sender address. |
+| `smtp.to` | string[] | Recipient addresses. |
+
+## `routes`
+
+Routes map notification events to notifiers by kind and severity.
+Each route is evaluated independently; an event can match multiple
+routes and fire on multiple notifiers.
+
+```yaml
+routes:
+  - match_severity: [critical]
+    notifier: ops-ntfy
+  - match_severity: [critical]
+    notifier: ops-discord
+  - match_kind: [RunCompleted]
+    notifier: ops-ntfy
+```
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `match_kind` | string[] | Event kinds to match: `StageFailed`, `SpecMismatch`, `HoldingOpened`, `RunCompleted`. Omit to match all kinds. |
+| `match_severity` | string[] | Severities to match: `critical`, `warning`, `info`. Omit to match all severities. |
+| `notifier` | string | Name of a declared notifier to deliver to. |
+
+## `vetting`
+
+Shared pipeline defaults that apply to all profiles.
+
+### `vetting.stages`
+
+Ordered list of stage names the pipeline walks. Default:
+
+```yaml
+vetting:
+  stages:
+    - Inventory
+    - Firmware
+    - SpecValidate
+    - SMART
+    - CPUStress
+    - Storage
+    - Network
+    - Burn
+    - GPU
+    - PSU
+    - Reporting
+```
+
+### `vetting.thresholds`
+
+Array of threshold rules evaluated against every `/sensor` batch.
+Rules apply across all profiles — breaching the 92 C CPU limit fails a
+~10-minute quick run just as it fails a ~36-hour soak.
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `stage` | string | Stage selector. `*` matches any stage; exact name (e.g. `PSU`) limits to that stage. |
+| `kind` | string | Measurement kind to match: `temp`, `psu_volt`, `iperf`, `fio_p99_us`, `nic_retrans`, `edac_ue`, `edac_ce`, `mce`, `smart_attr`, `fan`. |
+| `key` | string | Key selector. Glob-ish matching: `*` matches all, `cpu/*` matches keys starting with `cpu/`, exact string for specific keys. |
+| `op` | string | Comparison operator (see table below). |
+| `value` | float | Threshold limit. |
+| `nominal` | float | Reference value, only used by `within_pct` (e.g. `12.0` for a +12 V rail). |
+| `unit` | string | Display unit (e.g. `C`, `V`, `Mbps`). Informational only. |
+| `severity` | string | `critical` = fail the run immediately. `warning` = record for the report only. |
+
+**Threshold operators:**
+
+| Operator | Pass condition | Typical use |
+|----------|---------------|-------------|
+| `lt` | `observed < value` | CPU temp < 92 C |
+| `lte` | `observed <= value` | EDAC UE count <= 0 |
+| `gt` | `observed > value` | — |
+| `gte` | `observed >= value` | iperf throughput >= 900 Mbps |
+| `within_pct` | `abs(observed - nominal) / nominal * 100 <= value` | +12 V rail within 5 % of 12.0 V |
+
+**Default thresholds** (from `deploy/vetting.example.yaml`):
+
+```yaml
+thresholds:
+  - { stage: "*",       kind: temp,        key: "cpu/*",         op: lt,         value: 92,    unit: C,    severity: critical }
+  - { stage: PSU,       kind: psu_volt,    key: "+12V",          op: within_pct, value: 5,     nominal: 12.0, severity: critical }
+  - { stage: PSU,       kind: psu_volt,    key: "+5V",           op: within_pct, value: 5,     nominal: 5.0,  severity: critical }
+  - { stage: PSU,       kind: psu_volt,    key: "+3.3V",         op: within_pct, value: 5,     nominal: 3.3,  severity: critical }
+  - { stage: Storage,   kind: fio_p99_us,  key: "*",             op: lt,         value: 50000, severity: warning  }
+  - { stage: Network,   kind: iperf,       key: throughput_mbps, op: gte,        value: 900,   severity: critical }
+  - { stage: Network,   kind: nic_retrans, key: "*/rate",        op: lt,         value: 0.001, severity: warning  }
+  - { stage: CPUStress, kind: edac_ue,     key: "*",             op: lte,        value: 0,     severity: critical }
+  - { stage: CPUStress, kind: mce,         key: "*",             op: lte,        value: 0,     severity: critical }
+```
+
+## `profiles`
+
+Three built-in profiles control per-stage durations and probe knobs.
+Every profile exercises every probe and gate — only the durations
+scale. Quick is a ~10-minute same-day sanity check; deep is the
+8-12 hour overnight run; soak is the opt-in 36-40 hour extreme run.
+
+### Profile inheritance
+
+A profile can declare `inherit: <parent>` to merge the parent's
+timeouts and defaults before applying its own overrides. Child keys
+win. The default `soak` profile inherits from `deep`.
+
+### `stage_timeouts`
+
+Per-stage time limits. The orchestrator kills the agent's stage
+subprocess when a timeout fires.
+
+| Stage | quick | deep | soak |
+|-------|-------|------|------|
+| CPUStress | 5 m | 2 h | 14 h |
+| Storage | 5 m | 4 h | 8 h |
+| Network | 2 m | 35 m | 2 h 30 m |
+| Burn | 3 m | 3 h | 20 h |
+| PSU | 1 m | 10 m | 15 m |
+
+### `defaults`
+
+Per-stage probe knobs shipped to the agent on `/claim`. Empty values
+mean "fall back to the agent's compile-time default".
+
+#### `cpustress`
+
+| Knob | Type | Description | quick | deep | soak |
+|------|------|-------------|-------|------|------|
+| `cpu_pass` | duration | `stress-ng --cpu` duration | 2 m | 60 m | 12 h |
+| `mem_pass` | duration | `stress-ng --vm` duration | 2 m | 60 m | *(inherit)* |
+| `edac_poll` | duration | EDAC error counter polling interval | 10 s | 10 s | *(inherit)* |
+
+#### `storage`
+
+| Knob | Type | Description | quick | deep | soak |
+|------|------|-------------|-------|------|------|
+| `mode` | string | `fio_sample` (skip badblocks) or `full_disk` (badblocks + fio) | fio_sample | full_disk | full_disk |
+| `fio_size` | string | fio test file size (only in `fio_sample` mode) | 1 GiB | *(unset)* | *(inherit)* |
+| `fio_time` | duration | fio runtime | 3 m | 2 h | 6 h |
+| `fio_bs` | string | fio block size | 4 k | 4 k | *(inherit)* |
+| `fio_rw` | string | fio I/O pattern | randrw | randrw | *(inherit)* |
+| `verify` | string | fio integrity mode (`md5` or empty) | md5 | md5 | *(inherit)* |
+
+#### `network`
+
+| Knob | Type | Description | quick | deep | soak |
+|------|------|-------------|-------|------|------|
+| `duration` | duration | `iperf3` test duration | 60 s | 30 m | 2 h |
+
+#### `burn`
+
+| Knob | Type | Description | quick | deep | soak |
+|------|------|-------------|-------|------|------|
+| `duration` | duration | Total burn-in window (CPU + mem + disk + net simultaneously) | 2 m | 2 h | 18 h |
+| `cpu_workers` | string | `all` (= `runtime.NumCPU()`) or a numeric string | all | all | *(inherit)* |
+| `mem_pct` | int | Percentage of MemAvailable to stress | 50 | 70 | *(inherit)* |
+| `fio_on_spare` | bool | Run fio inside Burn (requires a spare partition) | true | true | *(inherit)* |
+| `iperf_parallel` | int | Parallel stream count fed to `iperf3 -P` | 2 | 4 | 8 |
+
+### Example profile block
+
+```yaml
+profiles:
+  quick:
+    stage_timeouts:
+      CPUStress: 5m
+      Storage:   5m
+      Network:   2m
+    defaults:
+      cpustress: { cpu_pass: 2m, mem_pass: 2m, edac_poll: 10s }
+      storage:   { mode: fio_sample, fio_size: 1GiB, fio_time: 3m, fio_bs: 4k, fio_rw: randrw, verify: md5 }
+      network:   { duration: 60s }
+      burn:      { duration: 2m, cpu_workers: all, mem_pct: 50, fio_on_spare: true, iperf_parallel: 2 }
+  deep:
+    stage_timeouts:
+      CPUStress: 2h
+      Storage:   4h
+      Network:   35m
+    defaults:
+      cpustress: { cpu_pass: 60m, mem_pass: 60m, edac_poll: 10s }
+      storage:   { mode: full_disk, fio_time: 2h, fio_bs: 4k, fio_rw: randrw, verify: md5 }
+      network:   { duration: 30m }
+      burn:      { duration: 2h, cpu_workers: all, mem_pct: 70, fio_on_spare: true, iperf_parallel: 4 }
+  soak:
+    inherit: deep
+    stage_timeouts:
+      CPUStress: 14h
+      Storage:   8h
+      Network:   2h30m
+    defaults:
+      cpustress: { cpu_pass: 12h }
+      storage:   { mode: full_disk, fio_time: 6h }
+      network:   { duration: 2h }
+      burn:      { duration: 18h, iperf_parallel: 8 }
+```
+
+---
+
+## Host-mode agent config
+
+The persistent host-mode agent reads a separate file at
+`/etc/vetting/host-agent.yaml`. This is installed by the
+quick-register one-liner and is distinct from the orchestrator config.
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| `orchestrator_url` | string | *(required)* | URL of the orchestrator (e.g. `http://192.168.1.135:8080`). |
+| `mac` | string | *(auto-detected)* | MAC address to heartbeat as. Auto-detected from the default route NIC if omitted. |
+| `interval` | duration | `30s` | Heartbeat interval. |
diff --git a/docs/database.md b/docs/database.md
new file mode 100644
index 0000000..60e4ab7
--- /dev/null
+++ b/docs/database.md
@@ -0,0 +1,279 @@
+# Database schema
+
+The orchestrator uses SQLite via
+[modernc.org/sqlite](https://pkg.go.dev/modernc.org/sqlite) — a pure
+Go driver with no cgo dependency. The database file is created on
+first startup at the path in `database.path`
+(default `./var/vetting.db`).
+
+**Pragmas set at open time:**
+
+- `PRAGMA journal_mode = WAL` — write-ahead logging for concurrent
+  readers.
+- `PRAGMA foreign_keys = ON` — enforced referential integrity.
+
+**Migrations** are embedded via `go:embed` in `internal/db/` and
+applied in filename order at startup. A `schema_migrations` table
+tracks which migrations have run.
+
+---
+
+## Tables
+
+### `hosts`
+
+Registered hardware nodes in the vetting cluster.
+
+| Column | Type | Constraints | Default | Description |
+|--------|------|-------------|---------|-------------|
+| `id` | INTEGER | PK AUTOINCREMENT | | |
+| `name` | TEXT | NOT NULL UNIQUE | | Human-readable host name. |
+| `mac` | TEXT | NOT NULL UNIQUE | | Lowercase colon form (e.g. `aa:bb:cc:dd:ee:ff`). |
+| `wol_broadcast_ip` | TEXT | NOT NULL | | LAN broadcast IP for Wake-on-LAN magic packets. |
+| `wol_port` | INTEGER | NOT NULL | `9` | WoL UDP port. |
+| `expected_spec_yaml` | TEXT | NOT NULL | | YAML describing expected hardware (CPU, memory, disks, firmware). |
+| `pdu_config_json` | TEXT | | | PDU power control config (future use). |
+| `ipmi_config_json` | TEXT | | | IPMI config (future use). |
+| `notes` | TEXT | NOT NULL | `''` | Operator notes. |
+| `created_at` | TIMESTAMP | NOT NULL | `CURRENT_TIMESTAMP` | |
+| `updated_at` | TIMESTAMP | NOT NULL | `CURRENT_TIMESTAMP` | |
+| `last_seen_at` | TIMESTAMP | | | Host-mode agent heartbeat timestamp. NULL = never seen. |
+
+### `runs`
+
+Vetting run instances. Each run belongs to one host and walks through
+the state machine.
+
+| Column | Type | Constraints | Default | Description |
+|--------|------|-------------|---------|-------------|
+| `id` | INTEGER | PK AUTOINCREMENT | | |
+| `host_id` | INTEGER | NOT NULL FK → hosts(id) CASCADE | | |
+| `state` | TEXT | NOT NULL | | Current `RunState` (see `internal/model`). |
+| `result` | TEXT | | | `pass` or `fail` once terminal. |
+| `failed_stage` | TEXT | | | Stage name that halted the pipeline. |
+| `next_boot_target` | TEXT | | | `linux`, `memtest`, etc. (future use). |
+| `agent_token_hash` | TEXT | NOT NULL | | SHA-256 hash of the bearer token. |
+| `started_at` | TIMESTAMP | NOT NULL | `CURRENT_TIMESTAMP` | |
+| `completed_at` | TIMESTAMP | | | Set when run reaches a terminal state. |
+| `report_path` | TEXT | | | Path to `report.json` on disk. |
+| `hold_ip` | TEXT | | | Agent IP during FailedHolding (for SSH command). |
+| `override_flags_json` | TEXT | | | JSON blob (e.g. `{"wipe": true}`). |
+| `non_destructive` | INTEGER | NOT NULL | `0` | `1` = skip badblocks + wipe probe. |
+| `profile` | TEXT | NOT NULL | `'quick'` | `quick`, `deep`, or `soak`. |
+
+**Indices:**
+- `idx_runs_host` on `(host_id)`
+- `idx_runs_state` on `(state)`
+
+### `stages`
+
+Per-stage results within a run. Seeded at `/claim` time with one row
+per stage in `DefaultStageOrder`.
+
+| Column | Type | Constraints | Default | Description |
+|--------|------|-------------|---------|-------------|
+| `id` | INTEGER | PK AUTOINCREMENT | | |
+| `run_id` | INTEGER | NOT NULL FK → runs(id) CASCADE | | |
+| `name` | TEXT | NOT NULL | | Stage name (e.g. `SMART`, `CPUStress`). |
+| `ordinal` | INTEGER | NOT NULL | | 0-based position in the pipeline. |
+| `state` | TEXT | NOT NULL | | `pending`, `running`, `passed`, `failed`, `skipped`. |
+| `started_at` | TIMESTAMP | | | Set when the stage begins. |
+| `completed_at` | TIMESTAMP | | | Set when the stage finishes. |
+| `summary_json` | TEXT | | | Arbitrary JSON from the agent's result. |
+
+**Indices:**
+- `idx_stages_run_ordinal` on `(run_id, ordinal)`
+
+### `sub_steps`
+
+Finer-grained units within a stage (per-disk SMART, per-NIC iperf,
+CPU/memory pass, per-GPU run). Not every stage has sub-steps.
+
+| Column | Type | Constraints | Default | Description |
+|--------|------|-------------|---------|-------------|
+| `id` | INTEGER | PK AUTOINCREMENT | | |
+| `run_id` | INTEGER | NOT NULL FK → runs(id) CASCADE | | |
+| `stage_name` | TEXT | NOT NULL | | Parent stage name. |
+| `ordinal` | INTEGER | NOT NULL | | 0-based within `(run_id, stage_name)`. |
+| `name` | TEXT | NOT NULL | | Human label (e.g. `sda SMART`, `eth0 iperf`). |
+| `state` | TEXT | NOT NULL | `'pending'` | `pending`, `running`, `passed`, `failed`, `skipped`. |
+| `started_at` | TIMESTAMP | | | |
+| `completed_at` | TIMESTAMP | | | |
+| `summary_json` | TEXT | NOT NULL | `'{}'` | |
+
+**Constraints:** `UNIQUE (run_id, stage_name, ordinal)`
+**Indices:** `idx_sub_steps_run` on `(run_id, stage_name, ordinal)`
+
+### `measurements`
+
+Time-series sensor data from the thermal sidecar and stage executors.
+
+| Column | Type | Constraints | Default | Description |
+|--------|------|-------------|---------|-------------|
+| `id` | INTEGER | PK AUTOINCREMENT | | |
+| `run_id` | INTEGER | NOT NULL FK → runs(id) CASCADE | | |
+| `stage_id` | INTEGER | FK → stages(id) SET NULL | | Optional link to a specific stage. |
+| `ts` | TIMESTAMP | NOT NULL | | Sample timestamp. |
+| `kind` | TEXT | NOT NULL | | `temp`, `power`, `iperf`, `fio`, `smart_attr`, `psu_volt`, `fan`, etc. |
+| `key` | TEXT | NOT NULL | | Source identifier (e.g. `cpu/0`, `+12V`). |
+| `value` | REAL | | | Numeric sample. |
+| `unit` | TEXT | | | Display unit. |
+
+**Indices:** `idx_measurements_run_kind_ts` on `(run_id, kind, ts)`
+
+### `artifacts`
+
+On-disk file references (reports, fio logs, iperf logs, hold keys).
+
+| Column | Type | Constraints | Default | Description |
+|--------|------|-------------|---------|-------------|
+| `id` | INTEGER | PK AUTOINCREMENT | | |
+| `run_id` | INTEGER | NOT NULL FK → runs(id) CASCADE | | |
+| `stage_id` | INTEGER | FK → stages(id) SET NULL | | |
+| `kind` | TEXT | NOT NULL | | `inventory`, `report`, `report_html`, `hold_key`, `fio`, `iperf`. |
+| `path` | TEXT | NOT NULL | | Absolute path on disk. |
+| `sha256` | TEXT | NOT NULL | | SHA-256 hex digest. |
+| `size_bytes` | INTEGER | NOT NULL | | File size. |
+
+### `spec_diffs`
+
+Expected-vs-actual hardware divergences from SpecValidate.
+
+| Column | Type | Constraints | Default | Description |
+|--------|------|-------------|---------|-------------|
+| `id` | INTEGER | PK AUTOINCREMENT | | |
+| `run_id` | INTEGER | NOT NULL FK → runs(id) CASCADE | | |
+| `field` | TEXT | NOT NULL | | Dotted path (e.g. `memory.total_gib`, `cpu.logical_cores`). |
+| `expected` | TEXT | | | Expected value from the host's spec YAML. |
+| `actual` | TEXT | | | Observed value from the inventory probe. |
+| `severity` | TEXT | NOT NULL | | `critical`, `warning`, `info`. |
+| `ignored` | INTEGER | NOT NULL | `0` | `1` = operator chose to ignore this diff. |
+
+### `thresholds`
+
+Per-run threshold rules, seeded from the `ProfileRegistry` + per-host
+overrides at run creation. Immutable for the run's lifetime.
+
+| Column | Type | Constraints | Default | Description |
+|--------|------|-------------|---------|-------------|
+| `id` | INTEGER | PK AUTOINCREMENT | | |
+| `run_id` | INTEGER | NOT NULL FK → runs(id) CASCADE | | |
+| `stage_name` | TEXT | NOT NULL | | `*` matches any stage. |
+| `kind` | TEXT | NOT NULL | | Measurement kind to match. |
+| `key` | TEXT | NOT NULL | | Key selector (glob-ish). |
+| `op` | TEXT | NOT NULL | | `lt`, `lte`, `gt`, `gte`, `within_pct`. |
+| `threshold` | REAL | NOT NULL | | Limit value. |
+| `nominal` | REAL | NOT NULL | `0` | Reference for `within_pct`. |
+| `unit` | TEXT | NOT NULL | `''` | Display unit. |
+| `severity` | TEXT | NOT NULL | | `critical` or `warning`. |
+| `source` | TEXT | NOT NULL | | `profile` or `host_override`. |
+
+**Indices:**
+- `idx_thresholds_run` on `(run_id)`
+- `idx_thresholds_kind` on `(run_id, stage_name, kind)`
+
+### `threshold_evaluations`
+
+Per-sample pass/fail results from threshold evaluation. Drives
+report badges and pipeline verdict rendering.
+
+| Column | Type | Constraints | Default | Description |
+|--------|------|-------------|---------|-------------|
+| `id` | INTEGER | PK AUTOINCREMENT | | |
+| `run_id` | INTEGER | NOT NULL FK → runs(id) CASCADE | | |
+| `threshold_id` | INTEGER | NOT NULL FK → thresholds(id) CASCADE | | |
+| `stage_name` | TEXT | NOT NULL | | Stage the sample belongs to. |
+| `kind` | TEXT | NOT NULL | | Measurement kind. |
+| `key` | TEXT | NOT NULL | | Source key. |
+| `ts` | TIMESTAMP | NOT NULL | | Sample timestamp. |
+| `observed` | REAL | NOT NULL | | Observed value. |
+| `passed` | INTEGER | NOT NULL | | `1` = within threshold, `0` = breach. |
+
+**Indices:** `idx_threshold_evals_run` on `(run_id, passed)`
+
+### `firmware_snapshots`
+
+Per-run firmware version captures (BIOS, BMC, NIC, HBA, microcode,
+NVMe). Populated by the Firmware stage; consumed by SpecValidate for
+firmware version diffing.
+
+| Column | Type | Constraints | Default | Description |
+|--------|------|-------------|---------|-------------|
+| `id` | INTEGER | PK AUTOINCREMENT | | |
+| `run_id` | INTEGER | NOT NULL FK → runs(id) CASCADE | | |
+| `component` | TEXT | NOT NULL | | `bios`, `bmc`, `nic`, `hba`, `microcode`, `nvme_fw`. |
+| `identifier` | TEXT | NOT NULL | | Slot, serial, or device path distinguishing this component. |
+| `version` | TEXT | NOT NULL | | Firmware version string. |
+| `vendor` | TEXT | NOT NULL | `''` | |
+| `raw_json` | TEXT | NOT NULL | `'{}'` | Additional metadata. |
+
+**Indices:** `idx_firmware_run` on `(run_id, component)`
+
+### `events`
+
+Event log table. Reserved for future use.
+
+| Column | Type | Constraints | Default | Description |
+|--------|------|-------------|---------|-------------|
+| `id` | INTEGER | PK AUTOINCREMENT | | |
+| `run_id` | INTEGER | FK → runs(id) CASCADE | | |
+| `host_id` | INTEGER | FK → hosts(id) CASCADE | | |
+| `ts` | TIMESTAMP | NOT NULL | | |
+| `level` | TEXT | NOT NULL | | |
+| `kind` | TEXT | NOT NULL | | |
+| `message` | TEXT | NOT NULL | | |
+| `data_json` | TEXT | | | |
+
+### `settings`
+
+Key-value store for orchestrator-level settings.
+
+| Column | Type | Constraints | Description |
+|--------|------|-------------|-------------|
+| `key` | TEXT | PK | |
+| `value` | TEXT | NOT NULL | |
+
+---
+
+## Entity relationships
+
+```
+hosts 1───N runs
+              ├──N stages
+              │     ├──(FK) measurements (stage_id, SET NULL)
+              │     └──(FK) artifacts (stage_id, SET NULL)
+              ├──N sub_steps
+              ├──N measurements (run_id)
+              ├──N artifacts (run_id)
+              ├──N spec_diffs
+              ├──N thresholds
+              │     └──N threshold_evaluations
+              └──N firmware_snapshots
+```
+
+All foreign keys use `ON DELETE CASCADE` (except `stage_id` references
+which use `SET NULL`). Deleting a host cascades through its runs and
+all dependent rows.
+
+## Data retention
+
+The janitor goroutine prunes **on-disk files** (artifacts, logs) based
+on `artifacts.retention_days` and `logs.retention_days`. **Database
+rows are never deleted** by the janitor — run histories, measurement
+time-series, spec diffs, and threshold evaluations survive cleanups
+indefinitely.
+
+See [architecture.md § Data retention](architecture.md#data-retention)
+and [configuration.md § janitor](configuration.md#janitor).
+
+## Migration history
+
+| File | What it adds |
+|------|-------------|
+| `0001_init.sql` | Core schema: `hosts`, `runs`, `stages`, `measurements`, `artifacts`, `spec_diffs`, `events`, `settings`. |
+| `0002_add_hosts_last_seen_at.sql` | `hosts.last_seen_at` column for host-mode agent heartbeats. |
+| `0003_add_runs_non_destructive.sql` | `runs.non_destructive` boolean flag. |
+| `0004_add_sub_steps.sql` | `sub_steps` table for per-disk/per-NIC granular stage detail. |
+| `0005_profiles_thresholds_firmware.sql` | `runs.profile` column, `thresholds` + `threshold_evaluations` tables, `firmware_snapshots` table. |
+
+All migrations are additive — no schema deletions or renames.
diff --git a/docs/development.md b/docs/development.md
new file mode 100644
index 0000000..7d03ebe
--- /dev/null
+++ b/docs/development.md
@@ -0,0 +1,193 @@
+# Development guide
+
+How to build, test, and contribute to the vetting orchestrator and
+agent.
+
+## Prerequisites
+
+| Tool | Version | Notes |
+|------|---------|-------|
+| Go | 1.22+ | Pure Go — no cgo required. |
+| templ | latest | `go install github.com/a-h/templ/cmd/templ@latest` |
+| make | any | GNU Make on Linux/macOS/WSL; `make` ships with Git for Windows. |
+| mkosi | 25.3+ | Only needed for `make live-image`. Linux/WSL only. |
+
+Windows hosts can build and test everything except `live-image` and
+`e2e`. Those targets require a real Linux userspace — use WSL:
+`wsl make live-image`.
+
+## Repository structure
+
+```
+cmd/
+  vetting/              orchestrator binary — HTTP server, dispatcher, runner
+  vetting-agent/        agent binary — dual-mode (live-image + host-mode)
+internal/
+  config/               YAML loader, ProfileRegistry (quick/deep/soak)
+  db/                   SQLite open + embedded migrations (pure Go via modernc.org/sqlite)
+  model/                Plain structs: Host, Run, Stage, SubStep, Measurement, SpecDiff
+  store/                Repository layer — hand-written SQL, no ORM
+  orchestrator/         State machine, dispatcher, runner, WoL, HMAC tokens, iperf supervisor
+  api/                  HTTP handlers — agent_handlers.go + ui_handlers.go
+  httpserver/           chi router assembly (exists to break api ↔ orchestrator import cycle)
+  web/                  Embedded static assets + compiled Templ templates
+  pxe/                  dnsmasq subprocess supervisor + per-MAC iPXE script generator
+  events/               In-process SSE hub (fan-out to browser clients)
+  logs/                 Per-run flat-file writer + SSE fan-out
+  spec/                 Expected-vs-actual hardware diff engine
+  notify/               Pluggable notifier registry (ntfy, Discord, SMTP)
+  report/               HTML + JSON report generation
+  hold/                 Per-run SSH key issuance for FailedHolding
+  janitor/              Retention-based cleanup (artifact + log files)
+agent/
+  runner.go             In-image agent: claim loop, stage dispatch, heartbeat, log forwarder
+  client.go             HTTP client for orchestrator API
+  sensor_mux.go         Thermal + performance metric sidecar
+  bootstate/            Kernel cmdline parser (run_id, mac, orchestrator_url, token)
+  hostmode/             Persistent host-mode reporter (systemd service)
+  probes/               Hardware interrogation (lshw, dmidecode, smartctl, etc.)
+  tests/                Per-stage test implementations
+live-image/             mkosi config + scripts for Debian live image
+deploy/                 systemd unit, install.sh, pxe-setup.sh, example config
+docs/                   You are here
+test/e2e/               Build-tagged QEMU + PXE full-stack integration test
+```
+
+**Key architectural insight:** `internal/httpserver` exists solely to
+break the `api ↔ orchestrator` import cycle. The `internal/` tree is
+the orchestrator binary's code; the `agent/` tree is the agent
+binary's code. They share only `internal/model` (plain structs) and
+`internal/spec` (diff engine, used by the agent's inventory probe and
+the orchestrator's SpecValidate resolver).
+
+## Building
+
+| Target | Command | Description |
+|--------|---------|-------------|
+| Everything | `make all` | Build orchestrator + agent for host OS. |
+| Orchestrator | `make orchestrator` | Host OS binary (`bin/vetting`). |
+| Orchestrator (Linux) | `make orchestrator-linux` | Cross-compile to `bin/vetting-linux-amd64`. |
+| Agent | `make agent` | Host OS binary (dev/testing only). |
+| Agent (Linux) | `make agent-linux` | Cross-compile to `bin/vetting-agent.linux-amd64`. |
+| Templates | `make templ` | Regenerate `.templ` → `.go` files. Run before build if templates changed. |
+| Live image | `make live-image` | Build Debian live image via mkosi (Linux/WSL only). |
+| Release bundle | `make release` | Slim tarball: binaries + deploy scripts + VERSION pointer. |
+| Tidy | `make tidy` | `go mod tidy`. |
+| Format | `make fmt` | `go fmt ./...`. |
+| Lint | `make vet` | `go vet ./...`. |
+| Clean | `make clean` | Remove `bin/`, `build/`, `tmp/`, `out/`, `dist/`. |
+
+Build flags: the git SHA is baked into the binary via
+`-ldflags "-X vetting/internal/version.GitSHA=<sha>"`.
+
+## Running locally
+
+```bash
+make run
+# → builds orchestrator, launches with deploy/vetting.example.yaml
+# → http://localhost:8080
+```
+
+The example config binds to `127.0.0.1:8080`, disables PXE, and uses
+`./var/` relative paths for the database, artifacts, and logs. Edit
+`deploy/vetting.example.yaml` to tune for your dev environment.
+
+For a QEMU walkthrough (register a host, PXE-boot a VM, watch the
+pipeline), see [operations.md § First vetting run](operations.md#first-vetting-run).
+
+## Testing
+
+| Command | What it does |
+|---------|--------------|
+| `make test` | Unit + smoke tests across all packages. Cross-platform. |
+| `make test-race` | Same tests with Go's race detector (`-race -count=1`). |
+| `make vet` | `go vet ./...` — catches common mistakes. |
+| `make e2e` | QEMU + PXE full-stack integration test. Requires Linux root, a built live image, and a running orchestrator with a registered host and queued run. |
+
+**Test design:**
+
+- Tests use real SQLite (in-memory or temp file) — no mocking the
+  database.
+- The `agent/tests/fakes/` directory contains mock binaries
+  (`dmidecode`, `stress-ng`, etc.) used by agent probe tests.
+- E2E tests are guarded by the `e2e` build tag (`-tags=e2e`) and
+  live in `test/e2e/qemu_test.go`.
+
+## Adding a new test stage
+
+1. Add a `State<Name>` constant to `internal/model/model.go`.
+2. Wire it into `internal/orchestrator/statemachine.go` — both the
+   forward transition table and the stage-for-state lookup.
+3. Add the stage name to `DefaultStages()` in
+   `internal/config/profiles.go`.
+4. Add a `case "<Name>":` to the `runStage` switch in
+   `agent/runner.go`.
+5. Drop the implementation into `agent/tests/<name>.go`.
+6. If the stage is **orchestrator-owned** (like SpecValidate or
+   Reporting), add a `resolve<Name>` helper to
+   `internal/api/agent_handlers.go` and call it from `resultAdvance`.
+7. Add the stage to `vetting.stages` in
+   `deploy/vetting.example.yaml`.
+
+See [test-suite.md](test-suite.md) for what each existing stage
+measures and its pass/fail criteria.
+
+## Adding a new notifier
+
+1. Implement the `notify.Notifier` interface (single `Send` method)
+   in a new file under `internal/notify/`.
+2. Register the new type in the notifier builder (the switch in
+   `internal/notify/build.go` or equivalent factory).
+3. Add the type-specific config fields to the `Notifier` struct in
+   `internal/config/config.go`.
+4. Document the new notifier type in
+   [configuration.md § notifiers](configuration.md#notifiers).
+
+## Code conventions
+
+- **No cgo** — the SQLite driver is `modernc.org/sqlite` (pure Go).
+  Builds cross-compile to Linux from Windows/macOS without a C
+  toolchain.
+- **Hand-written SQL** — no ORM. Queries are explicit and testable.
+  Each store method is a single SQL statement or a short transaction.
+- **Templ for UI** — `.templ` files compile to type-safe Go functions.
+  The report module uses `html/template` instead (self-contained HTML
+  with inlined CSS).
+- **chi for routing** — `github.com/go-chi/chi/v5`. Standard
+  middleware stack: `RealIP`, `Recoverer`, `Logger`.
+- **Error handling** — fail-soft in SSE/tile paths (log and skip),
+  fail-hard in store/migration paths (return error up).
+- **Log convention** — `log.Printf` with a context prefix
+  (e.g. `"claim: seed stages run %d: %v"`).
+
+## CI/CD
+
+Three Gitea Actions workflows in `.gitea/workflows/`:
+
+| Workflow | Trigger | What it does |
+|----------|---------|--------------|
+| `ci.yml` | Push to main + PRs | Templ generate, tidy check, vet, build (native + linux), test with race detector + coverage. |
+| `release.yml` | Push to main (skips doc/test paths) | Detects `live-image/VERSION` changes → builds + publishes live image to registry. Always builds slim bundle → publishes to `vetting/latest/`. |
+| `e2e.yml` | Manual dispatch | Builds live image + orchestrator, installs QEMU + deps, runs `make e2e`. |
+
+**Release bundle structure:**
+
+```
+vetting-bundle/
+  bin/
+    vetting-linux-amd64
+    vetting-agent.linux-amd64
+  live-image/
+    VERSION                    # pointer — actual vmlinuz/initrd.img fetched on install
+  install.sh
+  pxe-setup.sh
+  vetting.service
+  vetting.production.yaml
+  ipxe-shas.txt
+  VERSION                      # git SHA
+```
+
+The ~30 MB bundle is published on every push to main. The ~300 MB live
+image (`vmlinuz` + `initrd.img`) is published separately under
+`live-image/<version>/` and only rebuilds when `live-image/VERSION`
+changes.
diff --git a/docs/test-suite.md b/docs/test-suite.md
index b3bbdc6..4c67ee1 100644
--- a/docs/test-suite.md
+++ b/docs/test-suite.md
@@ -8,8 +8,8 @@ to fix, override, or abandon.
 ## Stage order
 
 ```
-Inventory → SpecValidate → SMART → CPUStress → Storage
-         → Network → GPU → PSU → Reporting
+Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage
+          → Network → Burn → GPU → PSU → Reporting
 ```
 
 Stages marked *orchestrator-owned* resolve inside `/result` and never
@@ -27,6 +27,20 @@ merged into a single JSON blob.
 `nvidia-smi` on a GPU-less host) are tolerated.
 **Artifacts:** `inventory.json` under `artifacts/run-<N>/`.
 
+## Firmware
+
+**Owner:** agent.
+**What it does:** probes firmware versions across all discoverable
+components: BIOS (`dmidecode -t bios`), BMC (`ipmitool mc info`), NIC
+firmware (`ethtool -i` per interface), NVMe firmware (`nvme id-ctrl`),
+HBA firmware (`lspci -vv`), and CPU microcode (`/proc/cpuinfo`).
+Missing tools and absent hardware are tolerated — a consumer board
+has no BMC for `ipmitool`, a SATA-only host has nothing for `nvme`.
+**Pass:** always passes. Firmware is advisory-only; SpecValidate is the
+gate that fails on version mismatches.
+**Artifacts:** `firmware_snapshots` table rows (one per component,
+keyed by `(run_id, component, identifier)`).
+
 ## SpecValidate *(orchestrator-owned)*
 
 **Owner:** orchestrator (resolves inline inside the `/result` for the
@@ -93,6 +107,40 @@ binds to the configured `network.iperf_port`.
 for 10GbE).
 **Artifacts:** `iperf-<nic>.json`.
 
+## Burn
+
+**Owner:** agent.
+**What it does:** runs CPU stress, memory stress, disk I/O, and
+network throughput **simultaneously** for the profile's burn duration.
+The goal is to stress every subsystem at once and surface failures that
+only appear under combined load (thermal throttling, PSU voltage sag,
+memory errors under thermal pressure).
+
+Sub-workloads run as parallel goroutines:
+
+- **CPU** — `stress-ng --cpu <workers>` for the burn duration.
+- **Memory** — `stress-ng --vm --vm-bytes <mem_pct>%` for the burn
+  duration.
+- **Disk** — `fio` against a spare partition (when `fio_on_spare` is
+  enabled).
+- **Network** — `iperf3 -c <orchestrator> -P <parallel>` for the burn
+  duration.
+
+**Pass:** every enabled sub-workload exits 0 and no critical threshold
+breach fires during the window.
+**Configurable knobs** (per profile):
+
+| Knob | Description |
+|------|-------------|
+| `duration` | Total burn-in window. |
+| `cpu_workers` | `all` = `runtime.NumCPU()`, or a fixed count. |
+| `mem_pct` | Percentage of MemAvailable to stress. |
+| `fio_on_spare` | Run fio inside Burn (requires a spare partition). |
+| `iperf_parallel` | Parallel stream count for `iperf3 -P`. |
+
+See [configuration.md § burn](configuration.md#burn) for per-profile
+default values.
+
 ## GPU
 
 **Owner:** agent.
@@ -153,6 +201,29 @@ the next batch.
 - `artifacts` — on-disk files (report, fio logs, iperf logs, etc).
 - `spec_diffs` — one row per expected-vs-actual divergence.
 
+## Profile duration summary
+
+Three profiles scale every stage's duration. Probes and gates are
+identical across profiles — only the work size changes. See
+[configuration.md § profiles](configuration.md#profiles) for the full
+knob reference.
+
+| Stage | quick (~10 min) | deep (~8-12 h) | soak (~36-40 h) |
+|-------|----------------|----------------|-----------------|
+| Inventory | seconds | seconds | seconds |
+| Firmware | seconds | seconds | seconds |
+| SpecValidate | instant (server) | instant (server) | instant (server) |
+| SMART | seconds per disk | seconds per disk | seconds per disk |
+| CPUStress | 2 m cpu + 2 m mem | 60 m cpu + 60 m mem | 12 h cpu + 60 m mem |
+| Storage | 3 m fio (sample) | badblocks + 2 h fio | badblocks + 6 h fio |
+| Network | 60 s iperf | 30 m iperf | 2 h iperf |
+| Burn | 2 m all-at-once | 2 h all-at-once | 18 h all-at-once |
+| GPU | seconds | seconds | seconds |
+| PSU | 1 m load burst | 10 m load burst | 15 m load burst |
+| Reporting | instant (server) | instant (server) | instant (server) |
+
+---
+
 ## Adding a new stage
 
 1. Add the name to `store.DefaultStageOrder`.
diff --git a/internal/api/agent_handlers.go b/internal/api/agent_handlers.go
index dfb2c8b..251869f 100644
--- a/internal/api/agent_handlers.go
+++ b/internal/api/agent_handlers.go
@@ -1,3 +1,5 @@
+// Package api contains the HTTP handlers for both the agent-facing
+// endpoints (/api/v1/runs/:id/*) and the browser-facing UI routes.
 package api
 
 import (
diff --git a/internal/config/config.go b/internal/config/config.go
index 1c0460b..37c006a 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -1,3 +1,7 @@
+// Package config loads the orchestrator's YAML configuration file and
+// exposes typed structs for every config block. The ProfileRegistry
+// (quick/deep/soak) is built during Load from the vetting: and
+// profiles: top-level blocks.
 package config
 
 import (
diff --git a/internal/db/db.go b/internal/db/db.go
index 96c0357..6ffa871 100644
--- a/internal/db/db.go
+++ b/internal/db/db.go
@@ -1,3 +1,6 @@
+// Package db opens the SQLite database and applies embedded SQL
+// migrations in filename order at startup. It uses modernc.org/sqlite
+// (pure Go, no cgo).
 package db
 
 import (
diff --git a/internal/events/events.go b/internal/events/events.go
index 5ac1559..374a328 100644
--- a/internal/events/events.go
+++ b/internal/events/events.go
@@ -1,3 +1,6 @@
+// Package events provides an in-process SSE fan-out hub. Browser
+// clients subscribe via GET /events; the orchestrator publishes
+// pre-rendered HTML fragments that HTMX swaps into the DOM.
 package events
 
 import (
diff --git a/internal/model/model.go b/internal/model/model.go
index 85543ba..91892fe 100644
--- a/internal/model/model.go
+++ b/internal/model/model.go
@@ -1,7 +1,11 @@
+// Package model defines the domain value types shared across the
+// orchestrator: Host, Run, Stage, SubStep, Measurement, and SpecDiff.
+// These are plain structs with no behaviour beyond state classification.
 package model
 
 import "time"
 
+// Host is a registered hardware node in the vetting cluster.
 type Host struct {
 	ID               int64
 	Name             string
@@ -17,6 +21,7 @@ type Host struct {
 	LastSeenAt       *time.Time // host-mode agent heartbeat; nil = never seen
 }
 
+// RunState is the current position of a run in the state machine.
 type RunState string
 
 const (
@@ -51,6 +56,7 @@ func (s RunState) IsTerminal() bool {
 	return false
 }
 
+// Run is a single vetting pass on a host, walking through the stage pipeline.
 type Run struct {
 	ID                int64
 	HostID            int64
@@ -68,6 +74,7 @@ type Run struct {
 	Profile           string // quick|deep|soak; empty is treated as "quick"
 }
 
+// StageState tracks whether a stage is pending, running, passed, failed, or skipped.
 type StageState string
 
 const (
@@ -78,6 +85,7 @@ const (
 	StageSkipped StageState = "skipped"
 )
 
+// Stage is a single test step within a run (e.g. SMART, CPUStress, Storage).
 type Stage struct {
 	ID          int64
 	RunID       int64
@@ -107,6 +115,7 @@ type SubStep struct {
 	SummaryJSON string
 }
 
+// Measurement is a single time-series sample from the thermal sidecar or a stage executor.
 type Measurement struct {
 	ID      int64
 	RunID   int64
@@ -118,6 +127,7 @@ type Measurement struct {
 	Unit    string
 }
 
+// SpecDiff records a single expected-vs-actual hardware divergence from SpecValidate.
 type SpecDiff struct {
 	ID       int64
 	RunID    int64
diff --git a/internal/orchestrator/statemachine.go b/internal/orchestrator/statemachine.go
index 497e5b0..b88d90c 100644
--- a/internal/orchestrator/statemachine.go
+++ b/internal/orchestrator/statemachine.go
@@ -1,3 +1,6 @@
+// Package orchestrator contains the run state machine, dispatcher,
+// per-run runner, WoL sender, HMAC token issuer, threshold evaluator,
+// and iperf3 supervisor.
 package orchestrator
 
 import (
diff --git a/internal/pxe/dnsmasq.go b/internal/pxe/dnsmasq.go
index 98e44a1..3dadbb3 100644
--- a/internal/pxe/dnsmasq.go
+++ b/internal/pxe/dnsmasq.go
@@ -1,3 +1,6 @@
+// Package pxe supervises a dnsmasq subprocess for proxy-DHCP PXE
+// boot and generates per-MAC iPXE scripts that chainload the live
+// image with run-specific kernel cmdline parameters.
 package pxe
 
 import (
diff --git a/internal/store/hosts.go b/internal/store/hosts.go
index 7f70396..542a7bc 100644
--- a/internal/store/hosts.go
+++ b/internal/store/hosts.go
@@ -1,3 +1,6 @@
+// Package store is the repository layer for the orchestrator's SQLite
+// database. Each store type (Hosts, Runs, Stages, etc.) wraps a
+// *sql.DB and exposes hand-written SQL queries — no ORM.
 package store
 
 import (
diff --git a/internal/web/embed.go b/internal/web/embed.go
index d19c94d..a0260da 100644
--- a/internal/web/embed.go
+++ b/internal/web/embed.go
@@ -1,3 +1,5 @@
+// Package web embeds the static assets (CSS, JS) and compiled Templ
+// templates served by the orchestrator's HTTP routes.
 package web
 
 import "embed"