deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -42,4 +42,20 @@ jobs:
          GOOS=linux GOARCH=amd64 go build ./...
      - name: Test
-        run: go test -race -count=1 ./...
+        run: go test -race -count=1 -coverprofile=coverage.out ./...
      - name: Coverage summary
        run: |
          go tool cover -func=coverage.out | tee coverage.txt
          go tool cover -html=coverage.out -o coverage.html
      - name: Upload coverage artifact
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: coverage
          path: |
            coverage.out
            coverage.txt
            coverage.html
          retention-days: 14
@@ -124,6 +124,56 @@ type ClaimResponse struct {
 	// at the right stage instead of silently replaying Inventory and
 	// letting the orchestrator advance past the crashed stage.
 	CurrentState string `json:"current_state"`
 	// StageConfig carries per-profile stage knobs (Phase 2): stage-level
 	// timeouts and probe-level durations/modes. Empty when the agent
 	// talks to a pre-Phase-2 orchestrator; the agent applies compile-
 	// time defaults in that case.
 	StageConfig ClaimStageConfig `json:"stage_config"`
 }
 // ClaimStageConfig mirrors config.StageConfig server-side — duplicated so
 // the agent doesn't need to import internal/config. Durations arrive as
 // strings ("2m", "2h") and are parsed by the tests package at the point
 // of use. An empty field means "use the agent-side default" so a missing
 // knob doesn't silently turn CPUStress / Storage into a no-op.
 //
 // Only StageTimeouts is tagged omitempty; Profile and the four knob
 // structs are always present in the encoded JSON, even when zero-valued.
 type ClaimStageConfig struct {
 	// Profile is the name of the profile the run was started with.
 	Profile       string                `json:"profile"`
 	// StageTimeouts holds stage-level timeouts as strings.
 	// NOTE(review): presumably keyed by stage name with duration-string
 	// values ("2m") — confirm against the orchestrator side.
 	StageTimeouts map[string]string     `json:"stage_timeouts,omitempty"`
 	// CPUStress carries the knobs for the CPUStress stage.
 	CPUStress     ClaimCPUStressKnobs   `json:"cpustress"`
 	// Storage carries the knobs for the Storage stage.
 	Storage       ClaimStorageKnobs     `json:"storage"`
 	// Network carries the knobs for the Network stage.
 	Network       ClaimNetworkKnobs     `json:"network"`
 	// Burn carries the knobs for the Burn super-stage.
 	Burn          ClaimBurnKnobs        `json:"burn"`
 }
 // ClaimCPUStressKnobs is the CPUStress member of ClaimStageConfig. Every
 // field is a string tagged omitempty, so an unset knob is simply absent
 // from the JSON payload.
 // NOTE(review): the values are presumably duration strings ("2m") for the
 // CPU pass, the memory pass and the EDAC polling interval — confirm
 // against the code that consumes them.
 type ClaimCPUStressKnobs struct {
 	CPUPass  string `json:"cpu_pass,omitempty"`
 	MemPass  string `json:"mem_pass,omitempty"`
 	EDACPoll string `json:"edac_poll,omitempty"`
 }
 // ClaimStorageKnobs is the Storage member of ClaimStageConfig. Every
 // field is a string tagged omitempty, so an unset knob is absent from the
 // JSON payload.
 // NOTE(review): the Fio* and Verify fields presumably map onto fio's
 // size / runtime / block-size / rw / verify options — confirm against the
 // Storage stage before relying on that.
 type ClaimStorageKnobs struct {
 	Mode    string `json:"mode,omitempty"`
 	FioSize string `json:"fio_size,omitempty"`
 	FioTime string `json:"fio_time,omitempty"`
 	FioBS   string `json:"fio_bs,omitempty"`
 	FioRW   string `json:"fio_rw,omitempty"`
 	Verify  string `json:"verify,omitempty"`
 }
 // ClaimNetworkKnobs is the Network member of ClaimStageConfig. Its single
 // knob is a string tagged omitempty, so it is absent from the JSON payload
 // when unset.
 // NOTE(review): Duration is presumably a duration string ("2m") bounding
 // the network test — confirm against the Network stage.
 type ClaimNetworkKnobs struct {
 	Duration string `json:"duration,omitempty"`
 }
 // ClaimBurnKnobs mirrors config.BurnKnobs. Duration/CPUWorkers arrive as
 // strings so the agent can treat empty as "use compile-time default".
 // MemPct is a percentage (0-100); IperfParallel is the parallel stream
 // count fed to iperf3 -P. FioOnSpare gates whether fio runs inside Burn.
 //
 // All fields are tagged omitempty, so a zero MemPct / IperfParallel and a
 // false FioOnSpare are omitted from the JSON and cannot be told apart from
 // "not set" on the wire.
 type ClaimBurnKnobs struct {
 	Duration      string `json:"duration,omitempty"`
 	CPUWorkers    string `json:"cpu_workers,omitempty"`
 	MemPct        int    `json:"mem_pct,omitempty"`
 	FioOnSpare    bool   `json:"fio_on_spare,omitempty"`
 	IperfParallel int    `json:"iperf_parallel,omitempty"`
 }
 type ClaimExpectedDiskSpec struct {
@@ -0,0 +1,70 @@
 package probes
 import (
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
 )
 // EDACSample is one counter reading from /sys/devices/system/edac/mc/.
 // Kind is "edac_ce" (correctable ECC errors) or "edac_ue"
 // (uncorrectable — always a critical signal). Key identifies the memory
 // controller (e.g. "mc0"). Value is the cumulative count since boot;
 // the threshold evaluator flags it the moment it exceeds 0.
 type EDACSample struct {
 	// Kind is the counter family: "edac_ce" or "edac_ue".
 	Kind  string
 	// Key is the memory-controller directory name the counter was read from.
 	Key   string
 	// Value is the raw counter converted to float64.
 	Value float64
 	// Unit labels Value; EDAC always sets it to "count".
 	Unit  string
 }
 // EDAC walks /sys/devices/system/edac/mc and reports the ce_count and
 // ue_count counters of every entry whose name starts with "mc". For each
 // controller the correctable sample (Kind "edac_ce") is emitted before the
 // uncorrectable one (Kind "edac_ue"); both carry the controller name as Key
 // and "count" as Unit.
 //
 // The probe is best-effort: if the directory cannot be listed the result is
 // nil, and a counter file that cannot be read or parsed is skipped without
 // any diagnostic. Callers must therefore read an empty result as "no data",
 // not as "no errors". The emitted value is the raw counter as found on
 // disk; no delta is computed here.
 func EDAC() []EDACSample {
 	const edacRoot = "/sys/devices/system/edac/mc"
 	controllers, err := os.ReadDir(edacRoot)
 	if err != nil {
 		return nil
 	}
 	// Counter file name → sample kind, in emission order.
 	counters := [...]struct{ file, kind string }{
 		{"ce_count", "edac_ce"},
 		{"ue_count", "edac_ue"},
 	}
 	var samples []EDACSample
 	for _, ctl := range controllers {
 		mc := ctl.Name()
 		if !strings.HasPrefix(mc, "mc") {
 			continue
 		}
 		for _, c := range counters {
 			val, ok := readCount(filepath.Join(edacRoot, mc, c.file))
 			if !ok {
 				continue
 			}
 			samples = append(samples, EDACSample{Kind: c.kind, Key: mc, Value: val, Unit: "count"})
 		}
 	}
 	return samples
 }
 // readCount loads the file at path, strips surrounding whitespace and
 // parses the remainder as a base-10 signed 64-bit integer, returning it as
 // a float64. The boolean is false (and the value 0) when the file cannot be
 // read or its content is not a valid integer, which lets callers drop the
 // sample silently.
 func readCount(path string) (float64, bool) {
 	raw, readErr := os.ReadFile(path)
 	if readErr != nil {
 		return 0, false
 	}
 	count, parseErr := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
 	if parseErr != nil {
 		return 0, false
 	}
 	return float64(count), true
 }
@@ -0,0 +1,496 @@
 package probes
 import (
 	"bufio"
 	"context"
 	"fmt"
 	"io"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"regexp"
 	"strings"
 	"time"
 )
 // FirmwareSnapshot is the on-wire shape the agent POSTs alongside the
 // Firmware stage result. Mirrors internal/store.FirmwareSnapshot without
 // the import — the /result handler converts to the store type and
 // persists. One run produces many snapshots (one per BIOS / BMC / NIC
 // port / HBA / microcode / NVMe); identifier distinguishes siblings
 // (e.g. "eth0" / "eth1"), version is the canonical string to diff.
 type FirmwareSnapshot struct {
 	// Component names the firmware family the snapshot belongs to.
 	Component  string            `json:"component"` // bios|bmc|nic|hba|microcode|nvme_fw
 	// Identifier tells siblings of the same component apart; the probes in
 	// this file use "system", "bmc0", "cpu", the interface name, the NVMe
 	// controller name, or the PCI address.
 	Identifier string            `json:"identifier"`
 	// Version is the version string reported by the probe.
 	Version    string            `json:"version"`
 	// Vendor is optional and omitted from JSON when empty. Its meaning is
 	// probe-specific: the NIC probe stores the driver name and the NVMe
 	// probe stores the model string here.
 	Vendor     string            `json:"vendor,omitempty"`
 	// Raw carries the probe's parsed key/value pairs for diagnostics;
 	// omitted from JSON when empty.
 	Raw        map[string]string `json:"raw,omitempty"`
 }
 // Firmware runs the firmware sub-probes one after another, in the fixed
 // order BIOS, BMC, NIC, NVMe, HBA, microcode, and returns every snapshot
 // they produced together with a list of warnings.
 //
 // Only the BIOS and BMC probes contribute warnings: when one of them
 // yields no snapshot but does return a non-empty message, that message is
 // recorded. The NIC, NVMe, HBA and microcode probes are purely
 // best-effort — when they find nothing they add nothing, and no warning is
 // raised. Commands the probes launch go through runCmd, which bounds each
 // invocation with its own timeout derived from ctx.
 func Firmware(ctx context.Context) ([]FirmwareSnapshot, []string) {
 	var (
 		snaps []FirmwareSnapshot
 		warns []string
 	)
 	// Probes that yield at most one snapshot and may explain a failure.
 	single := []func(context.Context) (*FirmwareSnapshot, string){probeBIOS, probeBMC}
 	for _, probe := range single {
 		snap, warn := probe(ctx)
 		switch {
 		case snap != nil:
 			snaps = append(snaps, *snap)
 		case warn != "":
 			warns = append(warns, warn)
 		}
 	}
 	// Probes that yield one snapshot per device and never warn.
 	multi := []func(context.Context) []FirmwareSnapshot{probeNICFirmware, probeNVMeFirmware, probeHBAFirmware}
 	for _, probe := range multi {
 		snaps = append(snaps, probe(ctx)...)
 	}
 	if mc := probeMicrocode(); mc != nil {
 		snaps = append(snaps, *mc)
 	}
 	return snaps, warns
 }
 // runCmd runs name with args under a 5-second deadline layered on top of
 // ctx and returns the command's combined stdout+stderr as a string along
 // with the execution error, if any. The captured output is returned even
 // when the command fails so callers can quote it in a warning.
 //
 // The deadline is deliberately short: firmware tools talk to hardware and
 // can block indefinitely on a wedged controller, and the stage would
 // rather report "not readable" than stall the pipeline.
 func runCmd(ctx context.Context, name string, args ...string) (string, error) {
 	deadlineCtx, release := context.WithTimeout(ctx, 5*time.Second)
 	defer release()
 	combined, runErr := exec.CommandContext(deadlineCtx, name, args...).CombinedOutput()
 	return string(combined), runErr
 }
 // ----- BIOS --------------------------------------------------------------
 // probeBIOS shells out to `dmidecode -t bios` and hands the output to
 // parseDmidecodeBIOS. It returns either a snapshot and an empty string, or
 // a nil snapshot and a "bios: ..." warning describing why nothing could be
 // read: the binary is not on PATH, the command failed (dmidecode normally
 // needs root), or the output held no usable BIOS block.
 func probeBIOS(ctx context.Context) (*FirmwareSnapshot, string) {
 	const tool = "dmidecode"
 	if _, lookErr := exec.LookPath(tool); lookErr != nil {
 		return nil, "bios: dmidecode not installed"
 	}
 	raw, runErr := runCmd(ctx, tool, "-t", "bios")
 	if runErr != nil {
 		return nil, fmt.Sprintf("bios: dmidecode failed: %v", trimErr(runErr, raw))
 	}
 	if parsed := parseDmidecodeBIOS(strings.NewReader(raw)); parsed != nil {
 		return parsed, ""
 	}
 	return nil, "bios: dmidecode produced no usable output"
 }
 // parseDmidecodeBIOS reads `dmidecode -t bios` output from r and builds the
 // BIOS snapshot from its "BIOS Information" block. Version comes from the
 // "Version" field, falling back to "Firmware Revision"; Vendor comes from
 // "Vendor"; every parsed field (Release Date included) is kept in Raw.
 // Returns nil when the block is absent or carries no version. Takes an
 // io.Reader so unit tests can feed fixtures.
 func parseDmidecodeBIOS(r io.Reader) *FirmwareSnapshot {
 	fields := parseDmidecodeSection(r, "BIOS Information")
 	if fields == nil {
 		return nil
 	}
 	version := firstNonEmpty(fields["Version"], fields["Firmware Revision"])
 	if version == "" {
 		return nil
 	}
 	return &FirmwareSnapshot{
 		Component:  "bios",
 		Identifier: "system",
 		Version:    version,
 		Vendor:     fields["Vendor"],
 		Raw:        fields,
 	}
 }
 // parseDmidecodeSection returns the "Key: value" pairs of the first
 // dmidecode block whose title line equals title, or nil when no such block
 // exists. A block starts after a line beginning with "Handle " and its
 // title is the next line that (after trimming) matches:
 //
 //	Handle 0x0000, ...
 //	        BIOS Information
 //	        Vendor: American Megatrends
 //	        Version: 3.0
 //
 // Collection stops at the next "Handle " line or at end of input. Lines
 // without a colon (blank lines, bulleted sub-list entries) and keys whose
 // value is empty (e.g. "Characteristics:") are skipped, so a matched block
 // with no usable fields yields an empty, non-nil map.
 func parseDmidecodeSection(r io.Reader, title string) map[string]string {
 	lines := bufio.NewScanner(r)
 	lines.Buffer(make([]byte, 0, 64*1024), 1024*1024)
 	// section stays nil until the title has been seen; from then on it is
 	// the map being filled for the matched block.
 	var section map[string]string
 	for lines.Scan() {
 		raw := lines.Text()
 		if strings.HasPrefix(raw, "Handle ") {
 			if section != nil {
 				// The matched block just ended.
 				return section
 			}
 			continue
 		}
 		text := strings.TrimSpace(raw)
 		if section == nil {
 			if text == title {
 				section = make(map[string]string)
 			}
 			continue
 		}
 		key, val, found := strings.Cut(text, ":")
 		if !found {
 			continue
 		}
 		if val = strings.TrimSpace(val); val != "" {
 			section[strings.TrimSpace(key)] = val
 		}
 	}
 	return section
 }
 // ----- BMC / IPMI --------------------------------------------------------
 // probeBMC shells out to `ipmitool mc info` and hands the output to
 // parseIpmitoolMCInfo. Hosts without a BMC are expected (home-lab boxes
 // often have none), so every failure — binary not on PATH, non-zero exit,
 // unparseable output — is reported as a "bmc: ..." warning with a nil
 // snapshot instead of an error. On success the warning is empty.
 func probeBMC(ctx context.Context) (*FirmwareSnapshot, string) {
 	const tool = "ipmitool"
 	if _, lookErr := exec.LookPath(tool); lookErr != nil {
 		return nil, "bmc: ipmitool not installed"
 	}
 	raw, runErr := runCmd(ctx, tool, "mc", "info")
 	if runErr != nil {
 		return nil, fmt.Sprintf("bmc: ipmitool mc info failed: %v", trimErr(runErr, raw))
 	}
 	if parsed := parseIpmitoolMCInfo(strings.NewReader(raw)); parsed != nil {
 		return parsed, ""
 	}
 	return nil, "bmc: ipmitool output not parseable"
 }
 // parseIpmitoolMCInfo reads `ipmitool mc info` text from r, splitting each
 // line at its first colon into a trimmed key and value. Version is taken
 // from "Firmware Revision", falling back to "Aux Firmware Rev Info";
 // Vendor from "Manufacturer Name". All parsed pairs are kept in Raw.
 // Returns nil when neither version field has content.
 func parseIpmitoolMCInfo(r io.Reader) *FirmwareSnapshot {
 	fields := make(map[string]string)
 	for lines := bufio.NewScanner(r); lines.Scan(); {
 		key, val, found := strings.Cut(strings.TrimSpace(lines.Text()), ":")
 		if !found {
 			continue
 		}
 		fields[strings.TrimSpace(key)] = strings.TrimSpace(val)
 	}
 	rev := firstNonEmpty(fields["Firmware Revision"], fields["Aux Firmware Rev Info"])
 	if rev == "" {
 		return nil
 	}
 	return &FirmwareSnapshot{
 		Component:  "bmc",
 		Identifier: "bmc0",
 		Version:    rev,
 		Vendor:     fields["Manufacturer Name"],
 		Raw:        fields,
 	}
 }
 // ----- NIC firmware ------------------------------------------------------
 // probeNICFirmware lists /sys/class/net, keeps the entries isRealNIC
 // accepts, and runs `ethtool -i <iface>` on each. Every interface whose
 // output parses becomes its own snapshot, so one bad port shows up in the
 // diff without hiding its siblings. Returns nil when ethtool is not on
 // PATH or /sys/class/net cannot be listed; an interface whose ethtool call
 // fails is skipped silently.
 func probeNICFirmware(ctx context.Context) []FirmwareSnapshot {
 	if _, lookErr := exec.LookPath("ethtool"); lookErr != nil {
 		return nil
 	}
 	entries, listErr := os.ReadDir("/sys/class/net")
 	if listErr != nil {
 		return nil
 	}
 	var snaps []FirmwareSnapshot
 	for _, e := range entries {
 		iface := e.Name()
 		if !isRealNIC(iface) {
 			continue
 		}
 		info, runErr := runCmd(ctx, "ethtool", "-i", iface)
 		if runErr != nil {
 			continue
 		}
 		if parsed := parseEthtoolI(strings.NewReader(info), iface); parsed != nil {
 			snaps = append(snaps, *parsed)
 		}
 	}
 	return snaps
 }
 // parseEthtoolI reads `ethtool -i` output from r, splitting each line at
 // its first colon into a trimmed key and value. The snapshot's Version is
 // the "firmware-version" field and its Vendor slot carries the "driver"
 // field; all pairs are kept in Raw. Returns nil only when both of those
 // fields are empty — a driver without a firmware version still yields a
 // snapshot (with an empty Version).
 func parseEthtoolI(r io.Reader, iface string) *FirmwareSnapshot {
 	fields := make(map[string]string)
 	for lines := bufio.NewScanner(r); lines.Scan(); {
 		key, val, found := strings.Cut(lines.Text(), ":")
 		if !found {
 			continue
 		}
 		fields[strings.TrimSpace(key)] = strings.TrimSpace(val)
 	}
 	driver, fw := fields["driver"], fields["firmware-version"]
 	if driver == "" && fw == "" {
 		return nil
 	}
 	return &FirmwareSnapshot{
 		Component:  "nic",
 		Identifier: iface,
 		Version:    fw,
 		Vendor:     driver,
 		Raw:        fields,
 	}
 }
 // isRealNIC reports whether name looks like a physical network interface.
 // It rejects the empty name, "lo", and any name starting with one of the
 // known virtual prefixes (docker, br-, veth, virbr, tun, tap, bond). What
 // remains is accepted only if /sys/class/net/<name>/device exists — pure
 // virtual devices such as dummy0 or wg* have no such entry.
 func isRealNIC(name string) bool {
 	switch name {
 	case "", "lo":
 		return false
 	}
 	virtualPrefixes := [...]string{"docker", "br-", "veth", "virbr", "tun", "tap", "bond"}
 	for i := range virtualPrefixes {
 		if strings.HasPrefix(name, virtualPrefixes[i]) {
 			return false
 		}
 	}
 	_, statErr := os.Stat(filepath.Join("/sys/class/net", name, "device"))
 	return statErr == nil
 }
 // ----- NVMe --------------------------------------------------------------
 // probeNVMeFirmware emits one snapshot per entry under /sys/class/nvme
 // that has a readable firmware revision. The revision and model come from
 // the controller's sysfs files firmware_rev and model. When firmware_rev is
 // empty and the nvme CLI is on PATH, `nvme id-ctrl /dev/<ctrl>` is used
 // instead: its "fr" field supplies the revision and, if the model is still
 // unknown, its "mn" field supplies the model. The CLI's exit status is
 // ignored on purpose — whatever it printed is parsed, and a useless output
 // simply yields no revision. Controllers without a revision are skipped.
 // Identifier is the controller name; the model is stored in Vendor.
 func probeNVMeFirmware(ctx context.Context) []FirmwareSnapshot {
 	const sysRoot = "/sys/class/nvme"
 	ctrls, err := os.ReadDir(sysRoot)
 	if err != nil {
 		return nil
 	}
 	var snaps []FirmwareSnapshot
 	for _, entry := range ctrls {
 		id := entry.Name()
 		attr := func(file string) string {
 			return strings.TrimSpace(readFile(filepath.Join(sysRoot, id, file)))
 		}
 		fwRev, model := attr("firmware_rev"), attr("model")
 		if fwRev == "" {
 			if _, lookErr := exec.LookPath("nvme"); lookErr == nil {
 				idCtrl, _ := runCmd(ctx, "nvme", "id-ctrl", "/dev/"+id)
 				fwRev = parseNVMeIDCtrl(strings.NewReader(idCtrl), "fr")
 				if model == "" {
 					model = parseNVMeIDCtrl(strings.NewReader(idCtrl), "mn")
 				}
 			}
 		}
 		if fwRev == "" {
 			continue
 		}
 		snaps = append(snaps, FirmwareSnapshot{
 			Component:  "nvme_fw",
 			Identifier: id,
 			Version:    fwRev,
 			Vendor:     model,
 			Raw:        map[string]string{"model": model, "firmware_rev": fwRev},
 		})
 	}
 	return snaps
 }
 // parseNVMeIDCtrl returns the value of one field from `nvme id-ctrl`
 // output, e.g. "fr        : FW1234" or "mn        : Samsung SSD 980 PRO".
 // A line matches when, after trimming, it starts with key followed by a
 // space and contains a colon; the result is everything after the first
 // colon, trimmed (so values may contain spaces). The first matching line
 // wins; "" is returned when none matches.
 func parseNVMeIDCtrl(r io.Reader, key string) string {
 	want := key + " "
 	for lines := bufio.NewScanner(r); lines.Scan(); {
 		text := strings.TrimSpace(lines.Text())
 		if strings.HasPrefix(text, want) {
 			if idx := strings.IndexByte(text, ':'); idx >= 0 {
 				return strings.TrimSpace(text[idx+1:])
 			}
 		}
 	}
 	return ""
 }
 // ----- HBA ---------------------------------------------------------------
 // lspciClassHBA matches, case-insensitively, text naming a SAS or RAID
 // controller: "Serial Attached SCSI", "SAS controller" or "RAID bus
 // controller". parseLspciHBA applies it to whole lspci device lines.
 var lspciClassHBA = regexp.MustCompile(`(?i)(serial attached scsi|sas controller|raid bus controller)`)
 // probeHBAFirmware runs `lspci -Dvvnn` and passes the output to
 // parseLspciHBA to pick out SAS/RAID controllers. Only what lspci itself
 // prints is captured, which keeps the probe independent of vendor CLIs
 // (storcli, mpt-status) that are not always installed; comparing the
 // captured strings is left to SpecValidate. Returns nil, without any
 // warning, when lspci is not on PATH or the command fails.
 func probeHBAFirmware(ctx context.Context) []FirmwareSnapshot {
 	if _, lookErr := exec.LookPath("lspci"); lookErr != nil {
 		return nil
 	}
 	listing, runErr := runCmd(ctx, "lspci", "-Dvvnn")
 	if runErr != nil {
 		return nil
 	}
 	return parseLspciHBA(strings.NewReader(listing))
 }
 // lspciRevRe extracts the hexadecimal revision from the "(rev NN)" suffix
 // of an lspci device line. Compiled once at package scope instead of on
 // every parseLspciHBA call.
 var lspciRevRe = regexp.MustCompile(`\(rev\s+([0-9a-fA-F]+)\)`)
 // parseLspciHBA walks `lspci -Dvvnn` stanzas and picks SAS/RAID
 // controllers (device lines matching lspciClassHBA). One snapshot per
 // device; Identifier is the PCI address (the first space-separated token
 // of the device line) and Vendor is the remainder of that line.
 //
 // Version is the device line's revision ("rev NN"). lspci omits the
 // "(rev NN)" suffix for devices whose revision is 0, so when no revision
 // is printed Version falls back to the stanza's "Kernel modules" string;
 // a controller with neither is dropped. The "Kernel modules" and "Kernel
 // driver in use" values are stored in Raw with surrounding whitespace
 // removed, next to the verbatim device line.
 func parseLspciHBA(r io.Reader) []FirmwareSnapshot {
 	sc := bufio.NewScanner(r)
 	sc.Buffer(make([]byte, 0, 64*1024), 1024*1024)
 	var out []FirmwareSnapshot
 	var cur *FirmwareSnapshot
 	// flush finalizes the stanza being collected, if any.
 	flush := func() {
 		if cur == nil {
 			return
 		}
 		if cur.Version == "" {
 			// No "(rev NN)" on the device line: use the documented fallback.
 			cur.Version = cur.Raw["kernel_modules"]
 		}
 		if cur.Version != "" {
 			out = append(out, *cur)
 		}
 		cur = nil
 	}
 	for sc.Scan() {
 		line := sc.Text()
 		if !strings.HasPrefix(line, "\t") && strings.Contains(line, " ") {
 			// New device line: close the previous stanza first.
 			flush()
 			if lspciClassHBA.MatchString(line) {
 				addr, rest, _ := strings.Cut(line, " ")
 				cur = &FirmwareSnapshot{
 					Component:  "hba",
 					Identifier: addr,
 					Vendor:     strings.TrimSpace(rest),
 					Raw:        map[string]string{"device_line": line},
 				}
 				if m := lspciRevRe.FindStringSubmatch(line); len(m) == 2 {
 					cur.Version = "rev " + m[1]
 				}
 			}
 			continue
 		}
 		if cur == nil {
 			// Detail line of a device we are not interested in.
 			continue
 		}
 		trim := strings.TrimSpace(line)
 		if strings.HasPrefix(trim, "Kernel modules:") {
 			cur.Raw["kernel_modules"] = strings.TrimSpace(strings.TrimPrefix(trim, "Kernel modules:"))
 		}
 		if strings.HasPrefix(trim, "Kernel driver in use:") {
 			cur.Raw["kernel_driver"] = strings.TrimSpace(strings.TrimPrefix(trim, "Kernel driver in use:"))
 		}
 	}
 	flush()
 	return out
 }
 // ----- Microcode ---------------------------------------------------------
 // probeMicrocode opens /proc/cpuinfo and lets parseMicrocode extract the
 // microcode revision from it. A single snapshot is produced because only
 // the first "microcode" line is used. Returns nil when the file cannot be
 // opened or holds no microcode line.
 func probeMicrocode() *FirmwareSnapshot {
 	src, err := os.Open("/proc/cpuinfo")
 	if err != nil {
 		return nil
 	}
 	defer func() { _ = src.Close() }()
 	return parseMicrocode(src)
 }
 // parseMicrocode scans /proc/cpuinfo-style "key : value" lines from r and
 // captures the first non-empty "microcode" value as Version and the first
 // non-empty "vendor_id" value as Vendor. Scanning stops as soon as both are
 // known. Returns nil when no microcode value was found; the vendor is
 // optional.
 func parseMicrocode(r io.Reader) *FirmwareSnapshot {
 	var ucode, vendorID string
 	lines := bufio.NewScanner(r)
 	for (ucode == "" || vendorID == "") && lines.Scan() {
 		name, value, found := strings.Cut(lines.Text(), ":")
 		if !found {
 			continue
 		}
 		value = strings.TrimSpace(value)
 		switch strings.TrimSpace(name) {
 		case "microcode":
 			if ucode == "" {
 				ucode = value
 			}
 		case "vendor_id":
 			if vendorID == "" {
 				vendorID = value
 			}
 		}
 	}
 	if ucode == "" {
 		return nil
 	}
 	return &FirmwareSnapshot{
 		Component:  "microcode",
 		Identifier: "cpu",
 		Version:    ucode,
 		Vendor:     vendorID,
 	}
 }
 // ----- helpers -----------------------------------------------------------
 // firstNonEmpty returns the first argument that contains something other
 // than whitespace, exactly as passed (it is not trimmed). Returns "" when
 // every argument is blank or no arguments are given.
 func firstNonEmpty(ss ...string) string {
 	for i := range ss {
 		if candidate := ss[i]; strings.TrimSpace(candidate) != "" {
 			return candidate
 		}
 	}
 	return ""
 }
 // readFile returns the entire content of the file at p as a string, or ""
 // when the file cannot be read. The content is returned untrimmed.
 func readFile(p string) string {
 	if content, err := os.ReadFile(p); err == nil {
 		return string(content)
 	}
 	return ""
 }
 // trimErr renders err for a warning message. When out (the command's
 // combined output) has content, its first line — taken after trimming
 // surrounding whitespace from the whole output — is appended in
 // parentheses; otherwise only the error text is returned. This gives the
 // operator a hint of what the tool complained about without dumping its
 // full output.
 func trimErr(err error, out string) string {
 	headline, _, _ := strings.Cut(strings.TrimSpace(out), "\n")
 	if headline != "" {
 		return fmt.Sprintf("%v (%s)", err, headline)
 	}
 	return err.Error()
 }
@@ -0,0 +1,232 @@
 package probes
 import (
 	"strings"
 	"testing"
 )
 // Golden dmidecode -t bios output (trimmed, representative). A real
 // host will have more lines; parse must tolerate the unknown fields.
 const dmidecodeBIOS = `# dmidecode 3.3
 Getting SMBIOS data from sysfs.
 SMBIOS 3.2.0 present.
 Handle 0x0000, DMI type 0, 26 bytes
 BIOS Information
 	Vendor: American Megatrends Inc.
 	Version: 3.2
 	Release Date: 07/15/2021
 	Address: 0xF0000
 	Runtime Size: 64 kB
 	ROM Size: 32 MB
 	Characteristics:
 		PCI is supported
 		BIOS is upgradeable
 Handle 0x0001, DMI type 1, 27 bytes
 System Information
 	Manufacturer: Supermicro
 	Product Name: X11SSL-F
 `
 // TestParseDmidecodeBIOS feeds the golden dmidecode fixture through
 // parseDmidecodeBIOS and checks component, version, vendor and the
 // release date preserved in Raw.
 func TestParseDmidecodeBIOS(t *testing.T) {
 	got := parseDmidecodeBIOS(strings.NewReader(dmidecodeBIOS))
 	if got == nil {
 		t.Fatal("parseDmidecodeBIOS returned nil")
 	}
 	if c := got.Component; c != "bios" {
 		t.Errorf("component = %q, want bios", c)
 	}
 	if v := got.Version; v != "3.2" {
 		t.Errorf("version = %q, want 3.2", v)
 	}
 	if v := got.Vendor; v != "American Megatrends Inc." {
 		t.Errorf("vendor = %q, want American Megatrends Inc.", v)
 	}
 	if d := got.Raw["Release Date"]; d != "07/15/2021" {
 		t.Errorf("release date = %q, want 07/15/2021", d)
 	}
 }
 // TestParseDmidecodeBIOSMissingBlock checks that output without a BIOS
 // Information handle yields nil rather than a panic or a bogus snapshot.
 func TestParseDmidecodeBIOSMissingBlock(t *testing.T) {
 	// Only a System Information handle is present.
 	const noBIOS = "Handle 0x0001, DMI type 1, 27 bytes\nSystem Information\n\tManufacturer: Acme\n"
 	snap := parseDmidecodeBIOS(strings.NewReader(noBIOS))
 	if snap != nil {
 		t.Fatalf("expected nil when BIOS block absent, got %+v", snap)
 	}
 }
 const ipmitoolMCInfo = `Device ID                 : 32
 Device Revision           : 1
 Firmware Revision         : 1.74
 IPMI Version              : 2.0
 Manufacturer ID           : 10876
 Manufacturer Name         : Supermicro
 Product ID                : 2051 (0x0803)
 Product Name              : Unknown (0x803)
 `
 // TestParseIpmitoolMCInfo feeds the golden `ipmitool mc info` fixture
 // through the parser and checks component, version and vendor.
 func TestParseIpmitoolMCInfo(t *testing.T) {
 	got := parseIpmitoolMCInfo(strings.NewReader(ipmitoolMCInfo))
 	if got == nil {
 		t.Fatal("parseIpmitoolMCInfo returned nil")
 	}
 	if c := got.Component; c != "bmc" {
 		t.Errorf("component = %q, want bmc", c)
 	}
 	if v := got.Version; v != "1.74" {
 		t.Errorf("version = %q, want 1.74", v)
 	}
 	if v := got.Vendor; v != "Supermicro" {
 		t.Errorf("vendor = %q, want Supermicro", v)
 	}
 }
 // TestParseIpmitoolMCInfoEmpty checks that empty input yields nil.
 func TestParseIpmitoolMCInfoEmpty(t *testing.T) {
 	snap := parseIpmitoolMCInfo(strings.NewReader(""))
 	if snap != nil {
 		t.Fatalf("expected nil on empty input, got %+v", snap)
 	}
 }
 const ethtoolEth0 = `driver: mlx5_core
 version: 5.15.0
 firmware-version: 16.32.1010 (MT_0000000008)
 expansion-rom-version:
 bus-info: 0000:5e:00.0
 supports-statistics: yes
 `
 // TestParseEthtoolI feeds the golden `ethtool -i` fixture through the
 // parser and checks that the interface name becomes the identifier, the
 // firmware-version becomes the version and the driver lands in Vendor.
 func TestParseEthtoolI(t *testing.T) {
 	got := parseEthtoolI(strings.NewReader(ethtoolEth0), "eth0")
 	if got == nil {
 		t.Fatal("parseEthtoolI returned nil")
 	}
 	if !(got.Component == "nic" && got.Identifier == "eth0") {
 		t.Errorf("component/id = %q/%q, want nic/eth0", got.Component, got.Identifier)
 	}
 	if v := got.Version; v != "16.32.1010 (MT_0000000008)" {
 		t.Errorf("version = %q, want 16.32.1010 (MT_0000000008)", v)
 	}
 	if v := got.Vendor; v != "mlx5_core" {
 		t.Errorf("vendor = %q, want mlx5_core", v)
 	}
 }
 // TestParseEthtoolIEmpty checks that input carrying neither a driver nor
 // a firmware-version line yields nil.
 func TestParseEthtoolIEmpty(t *testing.T) {
 	snap := parseEthtoolI(strings.NewReader("not a valid output"), "eth0")
 	if snap != nil {
 		t.Fatalf("expected nil on garbage input, got %+v", snap)
 	}
 }
 const nvmeIDCtrl = `NVME Identify Controller:
 vid       : 0x144d
 ssvid     : 0x144d
 sn        : S5GYNX0R500123X
 mn        : Samsung SSD 980 PRO 1TB
 fr        : 5B2QGXA7
 rab       : 2
 `
 // TestParseNVMeIDCtrl looks up the firmware revision and model number in
 // the golden `nvme id-ctrl` fixture and confirms an unknown key yields "".
 func TestParseNVMeIDCtrl(t *testing.T) {
 	present := []struct{ key, want string }{
 		{"fr", "5B2QGXA7"},
 		{"mn", "Samsung SSD 980 PRO 1TB"},
 	}
 	for _, tc := range present {
 		got := parseNVMeIDCtrl(strings.NewReader(nvmeIDCtrl), tc.key)
 		if got != tc.want {
 			t.Errorf("%s = %q, want %s", tc.key, got, tc.want)
 		}
 	}
 	if got := parseNVMeIDCtrl(strings.NewReader(nvmeIDCtrl), "missing"); got != "" {
 		t.Errorf("missing key should be empty, got %q", got)
 	}
 }
 const lspciHBA = `0000:01:00.0 Ethernet controller [0200]: Intel Corporation I350 [8086:1521] (rev 01)
 	Subsystem: Intel Corporation I350 [8086:0001]
 	Kernel driver in use: igb
 	Kernel modules: igb
 0000:03:00.0 Serial Attached SCSI controller [0107]: Broadcom / LSI SAS3008 PCI-Express Fusion-MPT SAS-3 [1000:0097] (rev 02)
 	Subsystem: Broadcom / LSI SAS9300-8i [1000:30e0]
 	Kernel driver in use: mpt3sas
 	Kernel modules: mpt3sas
 0000:04:00.0 RAID bus controller [0104]: LSI MegaRAID SAS-3 3108 [1000:005d] (rev 02)
 	Subsystem: LSI MegaRAID SAS 9361-8i [1000:9361]
 	Kernel driver in use: megaraid_sas
 	Kernel modules: megaraid_sas
 `
 // TestParseLspciHBA runs the golden lspci fixture (one Ethernet, one SAS
 // and one RAID controller) through the parser and expects exactly the two
 // storage controllers, in file order, each reporting "rev 02".
 func TestParseLspciHBA(t *testing.T) {
 	hbas := parseLspciHBA(strings.NewReader(lspciHBA))
 	if n := len(hbas); n != 2 {
 		t.Fatalf("got %d HBA snapshots, want 2 (SAS + RAID; Ethernet must be skipped)", n)
 	}
 	for i := range hbas {
 		if c := hbas[i].Component; c != "hba" {
 			t.Errorf("component = %q, want hba", c)
 		}
 		if v := hbas[i].Version; v != "rev 02" {
 			t.Errorf("version = %q, want 'rev 02'", v)
 		}
 	}
 	first, second := hbas[0], hbas[1]
 	if first.Identifier != "0000:03:00.0" {
 		t.Errorf("first identifier = %q, want 0000:03:00.0", first.Identifier)
 	}
 	if second.Identifier != "0000:04:00.0" {
 		t.Errorf("second identifier = %q, want 0000:04:00.0", second.Identifier)
 	}
 }
 const cpuinfo = `processor	: 0
 vendor_id	: GenuineIntel
 cpu family	: 6
 model		: 85
 model name	: Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
 stepping	: 7
 microcode	: 0x5003006
 cpu MHz		: 2100.000
 `
 // TestParseMicrocode feeds the golden /proc/cpuinfo fixture through the
 // parser and checks version, vendor and the fixed "cpu" identifier.
 func TestParseMicrocode(t *testing.T) {
 	got := parseMicrocode(strings.NewReader(cpuinfo))
 	if got == nil {
 		t.Fatal("parseMicrocode returned nil")
 	}
 	if v := got.Version; v != "0x5003006" {
 		t.Errorf("version = %q, want 0x5003006", v)
 	}
 	if v := got.Vendor; v != "GenuineIntel" {
 		t.Errorf("vendor = %q, want GenuineIntel", v)
 	}
 	if id := got.Identifier; id != "cpu" {
 		t.Errorf("identifier = %q, want cpu", id)
 	}
 }
 // TestParseMicrocodeMissing checks that cpuinfo text lacking a microcode
 // line yields nil even though a vendor_id is present.
 func TestParseMicrocodeMissing(t *testing.T) {
 	const noMicrocode = "processor\t: 0\nvendor_id\t: GenuineIntel\n"
 	snap := parseMicrocode(strings.NewReader(noMicrocode))
 	if snap != nil {
 		t.Fatalf("expected nil when microcode line absent, got %+v", snap)
 	}
 }
 // TestIsRealNIC covers the name-based rejections only. Every name below is
 // refused by the empty/"lo"/prefix filter before isRealNIC consults
 // /sys/class/net/<name>/device, so the expectations hold on any host.
 func TestIsRealNIC(t *testing.T) {
 	rejected := []string{"lo", "", "docker0", "br-abc", "veth1234", "virbr0", "bond0", "tun0"}
 	const want = false
 	for _, name := range rejected {
 		if got := isRealNIC(name); got != want {
 			t.Errorf("isRealNIC(%q) = %v, want %v", name, got, want)
 		}
 	}
 }
@@ -0,0 +1,85 @@
 package probes
 import (
 	"bufio"
 	"io"
 	"os"
 	"strconv"
 	"strings"
 )
 // NetDevSnapshot is the per-interface counter row from /proc/net/dev at
 // a single instant. Used by the Network stage to compute deltas across
 // an iperf window — a rising rx_errors or tx_dropped during a loaded
 // link is a real NIC problem, not general noise.
 //
 // The fields are the raw counters as read; parseNetDev fills them from the
 // rx bytes/errs/drop and tx bytes/errs/drop columns and stores 0 for a
 // column that does not parse as an unsigned integer.
 type NetDevSnapshot struct {
 	// Iface is the interface name (the text before the colon).
 	Iface   string
 	RxBytes uint64
 	RxErrs  uint64
 	RxDrop  uint64
 	TxBytes uint64
 	TxErrs  uint64
 	TxDrop  uint64
 }
 // NetDev opens /proc/net/dev and returns what parseNetDev extracts from
 // it: one snapshot per interface other than "lo". The probe is
 // best-effort — when the file cannot be opened the result is nil and the
 // caller is expected to skip delta reporting for that tick.
 func NetDev() []NetDevSnapshot {
 	procFile, openErr := os.Open("/proc/net/dev")
 	if openErr != nil {
 		return nil
 	}
 	defer func() { _ = procFile.Close() }()
 	return parseNetDev(procFile)
 }
 // parseNetDev parses /proc/net/dev-formatted text from r; it is separate
 // from NetDev so tests can supply a fixture instead of the real /proc. The
 // format is two header lines followed by one row per interface:
 //
 //	iface: rx_bytes rx_packets rx_errs rx_drop ... tx_bytes tx_packets tx_errs tx_drop ...
 //
 // with 16 whitespace-separated counters after the colon, of which six are
 // kept. Rows without a colon, with an empty name, for "lo", or with fewer
 // than 16 counters are skipped. A counter that fails to parse is stored
 // as 0.
 func parseNetDev(r io.Reader) []NetDevSnapshot {
 	lines := bufio.NewScanner(r)
 	// Discard the two header rows.
 	for skipped := 0; skipped < 2; skipped++ {
 		if !lines.Scan() {
 			break
 		}
 	}
 	var rows []NetDevSnapshot
 	for lines.Scan() {
 		name, counters, found := strings.Cut(strings.TrimSpace(lines.Text()), ":")
 		if !found {
 			continue
 		}
 		name = strings.TrimSpace(name)
 		if name == "" || name == "lo" {
 			continue
 		}
 		cols := strings.Fields(counters)
 		if len(cols) < 16 {
 			continue
 		}
 		// Column layout after the colon:
 		//   0 rx_bytes  1 rx_packets  2 rx_errs  3 rx_drop  4 fifo  5 frame  6 compressed  7 multicast
 		//   8 tx_bytes  9 tx_packets 10 tx_errs 11 tx_drop 12 fifo 13 colls 14 carrier 15 compressed
 		rows = append(rows, NetDevSnapshot{
 			Iface:   name,
 			RxBytes: parseU64(cols[0]),
 			RxErrs:  parseU64(cols[2]),
 			RxDrop:  parseU64(cols[3]),
 			TxBytes: parseU64(cols[8]),
 			TxErrs:  parseU64(cols[10]),
 			TxDrop:  parseU64(cols[11]),
 		})
 	}
 	return rows
 }
 // parseU64 parses s as a base-10 unsigned 64-bit integer and returns 0
 // for anything that does not parse (empty, negative, non-numeric, or out
 // of range).
 func parseU64(s string) uint64 {
 	if v, err := strconv.ParseUint(s, 10, 64); err == nil {
 		return v
 	}
 	return 0
 }
@@ -0,0 +1,84 @@
 package probes
 import (
 	"strings"
 	"testing"
 )
 // TestParseNetDev_RealSample feeds parseNetDev a synthetic fixture in
 // the full 16-column /proc/net/dev layout and checks three things: the
 // two header lines are skipped, the loopback row is dropped, and each
 // of the six retained counters for eth0 lands in the matching field.
 func TestParseNetDev_RealSample(t *testing.T) {
 	// Counter positions after "iface:":
 	//   0 rx_bytes  1 rx_packets  2 rx_errs  3 rx_drop
 	//   4 fifo  5 frame  6 compressed  7 multicast
 	//   8 tx_bytes  9 tx_packets 10 tx_errs 11 tx_drop
 	//  12 fifo 13 colls 14 carrier 15 compressed
 	fixture := `Inter-|   Receive                                                |  Transmit
 face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed
    lo: 1000000   10000    0    0    0     0          0         0  1000000   10000    0    0    0     0       0          0
  eth0: 50000000  100000   7   12    0     0          0         0  40000000   90000   3    5    0     0       0          0
  eth1: 12345      200     0    0    0     0          0         0    54321     180    0    0    0     0       0          0
 `
 	snaps := parseNetDev(strings.NewReader(fixture))
 	if len(snaps) != 2 {
 		t.Fatalf("got %d snapshots, want 2 (lo should be dropped)", len(snaps))
 	}
 	byIface := make(map[string]NetDevSnapshot, len(snaps))
 	for _, snap := range snaps {
 		byIface[snap.Iface] = snap
 	}
 	eth0, found := byIface["eth0"]
 	if !found {
 		t.Fatalf("eth0 missing from parsed snapshots")
 	}
 	// One row per retained counter; messages match "eth0 <Field>=<got>, want <want>".
 	checks := []struct {
 		field string
 		got   uint64
 		want  uint64
 	}{
 		{"RxBytes", eth0.RxBytes, 50000000},
 		{"RxErrs", eth0.RxErrs, 7},
 		{"RxDrop", eth0.RxDrop, 12},
 		{"TxBytes", eth0.TxBytes, 40000000},
 		{"TxErrs", eth0.TxErrs, 3},
 		{"TxDrop", eth0.TxDrop, 5},
 	}
 	for _, c := range checks {
 		if c.got != c.want {
 			t.Errorf("eth0 %s=%d, want %d", c.field, c.got, c.want)
 		}
 	}
 	if _, present := byIface["lo"]; present {
 		t.Errorf("lo should have been filtered out")
 	}
 }
 // TestParseNetDev_Empty checks that an empty reader produces zero
 // snapshots and does not panic.
 func TestParseNetDev_Empty(t *testing.T) {
 	if n := len(parseNetDev(strings.NewReader(""))); n != 0 {
 		t.Errorf("got %d snapshots from empty reader, want 0", n)
 	}
 }
 // TestParseNetDev_MalformedRow checks that a row with fewer than 16
 // counters is skipped without panicking and without hiding the
 // well-formed row that follows it.
 func TestParseNetDev_MalformedRow(t *testing.T) {
 	fixture := `header line 1
 header line 2
  bad0: 123 456
  eth0: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
 `
 	got := parseNetDev(strings.NewReader(fixture))
 	if n := len(got); n != 1 {
 		t.Fatalf("got %d snapshots, want 1 (bad0 should be dropped)", n)
 	}
 	if name := got[0].Iface; name != "eth0" {
 		t.Errorf("got iface=%q, want eth0", name)
 	}
 }
@@ -26,6 +26,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -71,7 +72,10 @@ func Run(ctx context.Context, p *bootstate.Params) error {
 	}
 	fwd.info(fmt.Sprintf("claimed run; stages=%v current_state=%s", claim.Stages, claim.CurrentState))
-	go thermalSidecar(ctx, c, fwd)
+	mux := NewSensorMux(ctx, c)
 	defer mux.Close()
 	go thermalSidecar(ctx, mux, fwd)
 	hbCh := make(chan HeartbeatResponse, 4)
 	go heartbeatLoop(ctx, c, fwd, hbCh)
@@ -101,7 +105,7 @@ func Run(ctx context.Context, p *bootstate.Params) error {
 		default:
 		}
 		fwd.info("stage: starting " + nextStage)
-		outcome := runStageCancellable(ctx, nextStage, claim, fwd, c, overrideFlags{})
+		outcome := runStageCancellable(ctx, nextStage, claim, fwd, c, mux, overrideFlags{})
 		if outcome.Cancelled {
 			fwd.warn("stage cancelled by operator; posting result and exiting")
 			_, _ = postResult(ctx, c, nextStage, outcome)
@@ -119,7 +123,7 @@ func Run(ctx context.Context, p *bootstate.Params) error {
 				return err
 			}
 			// Park and wait for an override directive.
-			return waitForOverride(ctx, c, fwd, hbCh, claim)
+			return waitForOverride(ctx, c, fwd, mux, hbCh, claim)
 		}
 		if resp.NextState == "Completed" || resp.NextState == "" {
 			fwd.info("pipeline complete")
@@ -144,10 +148,10 @@ func Run(ctx context.Context, p *bootstate.Params) error {
 // it runs the inventory probe and passes the result as the /result body
 // (the orchestrator persists it as an artifact). Every other stage
 // returns a tests.Outcome which postResult marshals generically.
-func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, ovr overrideFlags) stageOutcome {
+func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, mux *SensorMux, ovr overrideFlags) stageOutcome {
 	fwd.SetStage(stage)
 	defer fwd.ClearStage()
-	deps := newDeps(ctx, c, fwd, ovr, claim)
+	deps := newDeps(ctx, c, fwd, mux, ovr, claim, stage)
 	switch stage {
 	case "Inventory":
 		fwd.info("Inventory: probing host hardware")
@@ -163,6 +167,25 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF
 			},
 			Inventory: inv,
 		}
 	case "Firmware":
 		fwd.info("Firmware: probing firmware versions")
 		snaps, warns := probes.Firmware(ctx)
 		for _, w := range warns {
 			fwd.warn(w)
 		}
 		summary := firmwareSummary(snaps)
 		fwd.info("Firmware: " + summary)
 		return stageOutcome{
 			Outcome: tests.Outcome{
 				Passed:  true,
 				Summary: summary,
 				Extras: map[string]any{
 					"warnings":  warns,
 					"snapshots": len(snaps),
 				},
 			},
 			Firmware: snaps,
 		}
 	case "SMART":
 		return stageOutcome{Outcome: tests.SMART(ctx, deps)}
 	case "CPUStress":
@@ -170,10 +193,19 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF
 	case "Storage":
 		return stageOutcome{Outcome: tests.Storage(ctx, deps)}
 	case "Network":
 		duration := deps.NetworkKnobs.Duration
 		if duration <= 0 {
 			duration = 10 * time.Second
 		}
 		return stageOutcome{Outcome: tests.Network(ctx, deps, tests.NetworkConfig{
 			OrchestratorURL: c.BaseURL,
 			IperfPort:       claim.IperfPort,
-			Duration:        10 * time.Second,
+			Duration:        duration,
 		})}
 	case "Burn":
 		return stageOutcome{Outcome: tests.Burn(ctx, deps, tests.BurnConfig{
 			OrchestratorURL: c.BaseURL,
 			IperfPort:       claim.IperfPort,
 		})}
 	case "GPU":
 		return stageOutcome{Outcome: tests.GPU(ctx, deps)}
@@ -189,6 +221,7 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF
 type stageOutcome struct {
 	Outcome   tests.Outcome
 	Inventory *spec.Inventory           // only for Inventory stage
 	Firmware  []probes.FirmwareSnapshot // only for Firmware stage
 	Cancelled bool                      // set when the stage was cut short by operator cancel
 }
@@ -197,14 +230,14 @@ type stageOutcome struct {
 // is currently running. If the derived context was cancelled while the
 // stage executed, the outcome is rewritten as a cancellation record so
 // the orchestrator has something to persist.
-func runStageCancellable(parent context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, ovr overrideFlags) stageOutcome {
+func runStageCancellable(parent context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, mux *SensorMux, ovr overrideFlags) stageOutcome {
 	stageCtx, cancel := context.WithCancel(parent)
 	stageCancel.Store(cancel)
 	defer func() {
 		cancel()
 		stageCancel.Store(context.CancelFunc(nil))
 	}()
-	out := runStage(stageCtx, stage, claim, fwd, c, ovr)
+	out := runStage(stageCtx, stage, claim, fwd, c, mux, ovr)
 	// If the parent is still live but the stage ctx was cancelled, the
 	// operator fired a cancel — mark the outcome so the caller can exit
 	// the pipeline cleanly. Plain ctx-cancel on ctx.Done (e.g. shutdown)
@@ -235,7 +268,7 @@ type overrideFlags struct {
 	Wipe bool `json:"wipe"`
 }
-func newDeps(ctx context.Context, c *Client, fwd *logForwarder, ovr overrideFlags, claim *ClaimResponse) tests.Deps {
+func newDeps(ctx context.Context, c *Client, fwd *logForwarder, mux *SensorMux, ovr overrideFlags, claim *ClaimResponse, stage string) tests.Deps {
 	var expected []tests.ExpectedDisk
 	for _, e := range claim.ExpectedDisks {
 		expected = append(expected, tests.ExpectedDisk{Serial: e.Serial, SizeGB: e.SizeGB})
@@ -247,17 +280,73 @@ func newDeps(ctx context.Context, c *Client, fwd *logForwarder, ovr overrideFlag
 		OverrideWipe:   ovr.Wipe,
 		NonDestructive: claim.NonDestructive,
 		ExpectedDisks:  expected,
-		StageTimeout:   2 * time.Minute,
+		StageTimeout:   stageTimeout(claim, stage),
-		Sensor: func(ctx context.Context, samples []tests.Sample) error {
+		CPUStressKnobs: tests.CPUStressKnobs{
 			CPUPass:  parseDur(claim.StageConfig.CPUStress.CPUPass),
 			MemPass:  parseDur(claim.StageConfig.CPUStress.MemPass),
 			EDACPoll: parseDur(claim.StageConfig.CPUStress.EDACPoll),
 		},
 		StorageKnobs: tests.StorageKnobs{
 			Mode:    claim.StageConfig.Storage.Mode,
 			FioSize: claim.StageConfig.Storage.FioSize,
 			FioTime: parseDur(claim.StageConfig.Storage.FioTime),
 			FioBS:   claim.StageConfig.Storage.FioBS,
 			FioRW:   claim.StageConfig.Storage.FioRW,
 			Verify:  claim.StageConfig.Storage.Verify,
 		},
 		NetworkKnobs: tests.NetworkKnobs{
 			Duration: parseDur(claim.StageConfig.Network.Duration),
 		},
 		BurnKnobs: tests.BurnKnobs{
 			Duration:      parseDur(claim.StageConfig.Burn.Duration),
 			CPUWorkers:    claim.StageConfig.Burn.CPUWorkers,
 			MemPct:        claim.StageConfig.Burn.MemPct,
 			FioOnSpare:    claim.StageConfig.Burn.FioOnSpare,
 			IperfParallel: claim.StageConfig.Burn.IperfParallel,
 		},
 		Sensor: func(_ context.Context, samples []tests.Sample) error {
 			out := make([]SensorSample, 0, len(samples))
 			for _, s := range samples {
 				out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit})
 			}
-			return c.Sensor(ctx, out)
+			mux.Send(out)
 			return nil
 		},
 	}
 }
 // stageTimeout resolves the timeout for one stage from
 // claim.StageConfig.StageTimeouts[stage]. A nil claim, a nil map, a
 // missing or empty entry, an unparseable duration, or a non-positive
 // value all resolve to the 2-minute default (the pre-Phase-2 value):
 // we'd rather run the stage than refuse on a typo. The fallback is
 // silent; nothing is logged from here.
 func stageTimeout(claim *ClaimResponse, stage string) time.Duration {
 	const fallback = 2 * time.Minute
 	if claim == nil {
 		return fallback
 	}
 	// Indexing a nil map yields "", and ParseDuration rejects "", so the
 	// nil-map, missing-key and empty-value cases all land on the fallback.
 	raw := claim.StageConfig.StageTimeouts[stage]
 	if parsed, err := time.ParseDuration(raw); err == nil && parsed > 0 {
 		return parsed
 	}
 	return fallback
 }
 // parseDur leniently parses a knob duration string. An empty string, a
 // string time.ParseDuration rejects, or a negative duration all yield
 // 0, which callers can read as "use the compile-time default".
 func parseDur(s string) time.Duration {
 	// ParseDuration already rejects "", so no separate empty check is needed.
 	if d, err := time.ParseDuration(s); err == nil && d >= 0 {
 		return d
 	}
 	return 0
 }
 // postResult marshals stageOutcome for the /result endpoint. The
 // Inventory shape is special-cased: it includes the inventory blob so
 // the orchestrator can persist it and run server-side spec diff.
@@ -276,6 +365,9 @@ func postResult(ctx context.Context, c *Client, stage string, s stageOutcome) (*
 	if s.Inventory != nil {
 		body["inventory"] = s.Inventory
 	}
 	if len(s.Firmware) > 0 {
 		body["firmware"] = s.Firmware
 	}
 	if len(s.Outcome.SubSteps) > 0 {
 		wire := make([]SubStepReport, 0, len(s.Outcome.SubSteps))
 		for _, ss := range s.Outcome.SubSteps {
@@ -304,7 +396,7 @@ func stageForState(state string) string {
 	switch state {
 	case "InventoryCheck":
 		return "Inventory"
-	case "SMART", "CPUStress", "Storage", "Network", "GPU", "PSU":
+	case "Firmware", "SMART", "CPUStress", "Storage", "Network", "Burn", "GPU", "PSU":
 		return state
 	}
 	// SpecValidate and Reporting are orchestrator-owned; we never see
@@ -315,7 +407,7 @@ func stageForState(state string) string {
 // waitForOverride parks the agent in FailedHolding. It listens for a
 // heartbeat directive that tells it to retry a stage (e.g. Storage
 // with wipe-override armed) and re-enters runStage from that point.
-func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-chan HeartbeatResponse, claim *ClaimResponse) error {
+func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, mux *SensorMux, hb <-chan HeartbeatResponse, claim *ClaimResponse) error {
 	fwd.info("holding: awaiting operator decision (heartbeat directive or ctx cancel)")
 	for {
 		select {
@@ -333,7 +425,7 @@ func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-cha
 			if len(cmd.OverrideFlags) > 0 {
 				_ = json.Unmarshal(cmd.OverrideFlags, &ovr)
 			}
-			outcome := runStageCancellable(ctx, cmd.Stage, claim, fwd, c, ovr)
+			outcome := runStageCancellable(ctx, cmd.Stage, claim, fwd, c, mux, ovr)
 			if outcome.Cancelled {
 				fwd.warn("stage cancelled by operator; posting result and exiting")
 				_, _ = postResult(ctx, c, cmd.Stage, outcome)
@@ -362,7 +454,7 @@ func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-cha
 					default:
 					}
 					fwd.info("stage: starting " + nextStage)
-					out := runStageCancellable(ctx, nextStage, claim, fwd, c, overrideFlags{})
+					out := runStageCancellable(ctx, nextStage, claim, fwd, c, mux, overrideFlags{})
 					if out.Cancelled {
 						fwd.warn("stage cancelled by operator; posting result and exiting")
 						_, _ = postResult(ctx, c, nextStage, out)
@@ -417,11 +509,32 @@ func inventorySummary(inv *spec.Inventory) string {
 		len(inv.Disks), len(inv.NICs), len(inv.GPUs))
 }
 // firmwareSummary renders the one-line stage summary as per-component
 // counts, e.g. "bios=1 nic=2 nvme_fw=1", so an operator can see what
 // was probed without opening the report.
 //
 // The six well-known components come first, in a fixed order. Any
 // other component value follows in first-seen order, so an
 // unrecognized component is still counted instead of vanishing from
 // the summary; an empty component name is shown as "unknown". With no
 // snapshots at all the result is "no firmware readable".
 func firmwareSummary(snaps []probes.FirmwareSnapshot) string {
 	if len(snaps) == 0 {
 		return "no firmware readable"
 	}
 	counts := map[string]int{}
 	var seen []string // distinct components in order of first appearance
 	for _, s := range snaps {
 		if counts[s.Component] == 0 {
 			seen = append(seen, s.Component)
 		}
 		counts[s.Component]++
 	}
 	keys := []string{"bios", "bmc", "nic", "hba", "nvme_fw", "microcode"}
 	parts := make([]string, 0, len(seen))
 	for _, k := range keys {
 		if n := counts[k]; n > 0 {
 			parts = append(parts, fmt.Sprintf("%s=%d", k, n))
 			// Remove so the second pass only sees unrecognized components.
 			delete(counts, k)
 		}
 	}
 	for _, k := range seen {
 		n, ok := counts[k]
 		if !ok {
 			continue // already emitted as a well-known component
 		}
 		label := k
 		if label == "" {
 			label = "unknown"
 		}
 		parts = append(parts, fmt.Sprintf("%s=%d", label, n))
 	}
 	return strings.Join(parts, " ")
 }
 // thermalSidecar posts a batch of /sys/class/hwmon samples every 5s.
 // Idempotent: a dead sensor just drops out of the next batch. Errors
 // are logged but never fatal — we'd rather have a run with partial
 // thermal data than kill the agent over an I/O hiccup.
-func thermalSidecar(ctx context.Context, c *Client, fwd *logForwarder) {
+func thermalSidecar(ctx context.Context, mux *SensorMux, fwd *logForwarder) {
 	t := time.NewTicker(5 * time.Second)
 	defer t.Stop()
 	for {
@@ -437,11 +550,7 @@ func thermalSidecar(ctx context.Context, c *Client, fwd *logForwarder) {
 			for _, s := range samples {
 				out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit})
 			}
-			sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+			mux.Send(out)
 			if err := c.Sensor(sendCtx, out); err != nil {
 				fwd.warn("thermal sidecar: " + err.Error())
 			}
 			cancel()
 		}
 	}
 }
@@ -0,0 +1,139 @@
 package agent
 import (
 	"context"
 	"log"
 	"sync"
 	"time"
 )
 // SensorMux coalesces sensor samples from every stage + sidecar into a
 // single batched HTTP POST stream. Without it, a Burn run that fans out
 // four concurrent workloads + thermal + PSU + EDAC sidecars can push ~50
 // samples/sec, each as a separate /sensor request — enough to either
 // saturate the orchestrator's request budget or stall a stage on its
 // own sensor-forwarding path.
 //
 // Contract:
 //   - Send is non-blocking; a full input channel drops a batch on the
 //     floor and logs a warning. That's preferred over back-pressuring
 //     a workload goroutine and skewing its timing.
 //   - Flush happens every flushInterval *or* whenever the pending buffer
 //     reaches maxBatch samples. Chunk-at-flush keeps each HTTP request
 //     bounded regardless of the incoming rate.
 //   - Close flushes whatever is in the buffer. Callers that need the
 //     final flush to reach the server should defer Close before other
 //     deferred shutdown work.
 type SensorMux struct {
 	// c is the client whose Sensor method performs each batched POST.
 	c             *Client
 	// in carries batches from Send to the flush loop; Send never blocks on it.
 	in            chan []SensorSample
 	// flushInterval is the ticker period between time-based flushes.
 	flushInterval time.Duration
 	// maxBatch is both the buffered-sample count that triggers an early
 	// flush and the upper bound on samples per HTTP request.
 	maxBatch      int
 	// ctx/cancel control the flush loop's lifetime: ctx is derived from
 	// the parent passed to NewSensorMux, and Close calls cancel.
 	ctx    context.Context
 	cancel context.CancelFunc
 	// wg tracks the flush-loop goroutine so Close can wait for its exit.
 	wg     sync.WaitGroup
 }
 // NewSensorMux builds a mux bound to client c and starts its flush
 // loop in a background goroutine. The loop's context derives from
 // parent, so cancelling parent stops the loop just as Close does.
 // Defaults: a 32-batch input queue, a 2 s flush interval and a
 // 500-sample cap per request. The returned mux is meant to be shared
 // by every code path that emits sensor samples during the run.
 func NewSensorMux(parent context.Context, c *Client) *SensorMux {
 	const (
 		queueDepth = 32
 		flushEvery = 2 * time.Second
 		batchCap   = 500
 	)
 	loopCtx, stop := context.WithCancel(parent)
 	mux := new(SensorMux)
 	mux.c = c
 	mux.in = make(chan []SensorSample, queueDepth)
 	mux.flushInterval = flushEvery
 	mux.maxBatch = batchCap
 	mux.ctx = loopCtx
 	mux.cancel = stop
 	// Register the goroutine before starting it so Close can never
 	// observe a zero wait count while the loop is still launching.
 	mux.wg.Add(1)
 	go mux.loop()
 	return mux
 }
 // Send hands a batch to the flush loop without ever blocking. A nil
 // receiver or an empty batch is a no-op, so call sites with
 // conditional sample lists need no guard. When the input queue is full
 // the batch is dropped and a warning is logged.
 func (m *SensorMux) Send(samples []SensorSample) {
 	if m == nil {
 		return
 	}
 	if len(samples) == 0 {
 		return
 	}
 	// Queue a private copy so later caller writes to samples cannot race
 	// with the flush loop reading the batch.
 	batch := append(make([]SensorSample, 0, len(samples)), samples...)
 	select {
 	case m.in <- batch:
 		return
 	default:
 	}
 	log.Printf("sensor mux: input channel full, dropping %d samples", len(batch))
 }
 // Close cancels the flush loop and blocks until it has exited; the
 // loop flushes its residual buffer on the way out. A nil receiver is a
 // no-op, and a second call returns immediately because the context is
 // already cancelled and the loop has already finished.
 func (m *SensorMux) Close() {
 	if m != nil {
 		m.cancel()
 		m.wg.Wait()
 	}
 }
 // loop is the mux's single consumer goroutine. It accumulates batches
 // from m.in into a pending buffer and flushes that buffer when it
 // reaches maxBatch samples, on every flushInterval tick (if non-empty),
 // and once more on shutdown.
 //
 // On shutdown (m.ctx cancelled by Close or by the parent context) it
 // first merges every batch still queued on m.in into the pending
 // buffer and then performs one final chunked flush. Merging before
 // flushing keeps the batching guarantee at teardown: POSTing each
 // queued batch separately would issue one request per batch, each
 // with its own timeout, and could stall Close for a long time against
 // a slow orchestrator.
 func (m *SensorMux) loop() {
 	defer m.wg.Done()
 	buf := make([]SensorSample, 0, m.maxBatch)
 	t := time.NewTicker(m.flushInterval)
 	defer t.Stop()
 	for {
 		select {
 		case <-m.ctx.Done():
 			// Drain without blocking so a workload that pushed right
 			// before Close doesn't lose those final samples.
 		drain:
 			for {
 				select {
 				case batch := <-m.in:
 					buf = append(buf, batch...)
 				default:
 					break drain
 				}
 			}
 			m.flushChunks(buf)
 			return
 		case batch := <-m.in:
 			buf = append(buf, batch...)
 			if len(buf) >= m.maxBatch {
 				m.flushChunks(buf)
 				buf = buf[:0]
 			}
 		case <-t.C:
 			if len(buf) > 0 {
 				m.flushChunks(buf)
 				buf = buf[:0]
 			}
 		}
 	}
 }
 // flushChunks POSTs all in consecutive slices of at most maxBatch
 // samples, so no single request exceeds the configured cap. Each
 // request gets its own 10-second timeout rooted in
 // context.Background(), not m.ctx, so the final flush still goes out
 // after the mux's context has been cancelled. A failed chunk is logged
 // and the remaining chunks are still attempted.
 func (m *SensorMux) flushChunks(all []SensorSample) {
 	for start := 0; start < len(all); start += m.maxBatch {
 		end := start + m.maxBatch
 		if end > len(all) {
 			end = len(all)
 		}
 		chunk := all[start:end]
 		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 		err := m.c.Sensor(ctx, chunk)
 		cancel()
 		if err != nil {
 			log.Printf("sensor mux: flush of %d samples failed: %v", len(chunk), err)
 		}
 	}
 }
@@ -0,0 +1,144 @@
 package agent
 import (
 	"context"
 	"encoding/json"
 	"io"
 	"net/http"
 	"net/http/httptest"
 	"strings"
 	"sync"
 	"sync/atomic"
 	"testing"
 	"time"
 )
 // TestSensorMux_CloseFlushesBuffer sends two small batches and closes
 // the mux straight away, before any flush tick can fire. By the time
 // Close returns, the test server must have received all three samples
 // in at least one POST to a path ending in /sensor. This guards the
 // tail of a stage's samples against being lost at teardown.
 func TestSensorMux_CloseFlushesBuffer(t *testing.T) {
 	var posts, received int32
 	handler := func(w http.ResponseWriter, r *http.Request) {
 		if !strings.HasSuffix(r.URL.Path, "/sensor") {
 			t.Errorf("unexpected path %s", r.URL.Path)
 		}
 		var env struct {
 			Samples []SensorSample `json:"samples"`
 		}
 		raw, _ := io.ReadAll(r.Body)
 		if err := json.Unmarshal(raw, &env); err != nil {
 			t.Errorf("decode: %v", err)
 		}
 		atomic.AddInt32(&posts, 1)
 		atomic.AddInt32(&received, int32(len(env.Samples)))
 		w.WriteHeader(http.StatusOK)
 	}
 	srv := httptest.NewServer(http.HandlerFunc(handler))
 	defer srv.Close()
 	client := &Client{
 		BaseURL: srv.URL,
 		RunID:   1,
 		Token:   "t",
 		HTTP:    srv.Client(),
 	}
 	mux := NewSensorMux(context.Background(), client)
 	firstBatch := []SensorSample{
 		{Kind: "temp", Key: "cpu/0", Value: 72.5, Unit: "C"},
 		{Kind: "psu_volt", Key: "+12V", Value: 12.05, Unit: "V"},
 	}
 	secondBatch := []SensorSample{
 		{Kind: "mce", Key: "0", Value: 0, Unit: "count"},
 	}
 	mux.Send(firstBatch)
 	mux.Send(secondBatch)
 	mux.Close()
 	if got := atomic.LoadInt32(&received); got != 3 {
 		t.Errorf("expected 3 samples flushed, got %d across %d batch(es)", got, atomic.LoadInt32(&posts))
 	}
 	if atomic.LoadInt32(&posts) == 0 {
 		t.Errorf("expected at least one batch HTTP post")
 	}
 }
 // TestSensorMux_ChunksOversizedBatch pushes one 1200-sample batch
 // through the mux and checks that it arrives as several HTTP requests,
 // none larger than the default 500-sample cap, that together carry all
 // 1200 samples. A single giant POST would defeat the point of the
 // multiplexer.
 func TestSensorMux_ChunksOversizedBatch(t *testing.T) {
 	const (
 		inputLen = 1200
 		batchCap = 500
 	)
 	var (
 		mu    sync.Mutex
 		sizes []int
 	)
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		var env struct {
 			Samples []SensorSample `json:"samples"`
 		}
 		raw, _ := io.ReadAll(r.Body)
 		_ = json.Unmarshal(raw, &env)
 		mu.Lock()
 		sizes = append(sizes, len(env.Samples))
 		mu.Unlock()
 		w.WriteHeader(http.StatusOK)
 	}))
 	defer srv.Close()
 	mux := NewSensorMux(context.Background(), &Client{BaseURL: srv.URL, RunID: 1, Token: "t", HTTP: srv.Client()})
 	// 1200 samples against a cap of 500 → expect 500 + 500 + 200.
 	big := make([]SensorSample, 0, inputLen)
 	for i := 0; i < inputLen; i++ {
 		big = append(big, SensorSample{Kind: "burn/throughput_mbps", Key: "eth0", Value: float64(i), Unit: "Mbps"})
 	}
 	mux.Send(big)
 	mux.Close()
 	mu.Lock()
 	defer mu.Unlock()
 	sum := 0
 	for _, n := range sizes {
 		sum += n
 		if n > batchCap {
 			t.Errorf("batch size %d exceeds maxBatch=500", n)
 		}
 	}
 	if sum != inputLen {
 		t.Errorf("sum of batch sizes = %d, want 1200 (sizes=%v)", sum, sizes)
 	}
 	if len(sizes) < 3 {
 		t.Errorf("expected at least 3 chunks for a 1200-sample input, got %d (%v)", len(sizes), sizes)
 	}
 }
 // TestSensorMux_EmptyAndNilSafe covers the defensive guards: Send and
 // Close on a nil *SensorMux must not panic, and Send(nil) / Send([])
 // on a live mux must not produce any HTTP request. Callers can then
 // call Send unconditionally without their own emptiness checks.
 func TestSensorMux_EmptyAndNilSafe(t *testing.T) {
 	var posts int32
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		atomic.AddInt32(&posts, 1)
 		w.WriteHeader(http.StatusOK)
 	}))
 	defer srv.Close()
 	// A nil receiver must be a no-op for both methods.
 	var absent *SensorMux
 	absent.Send([]SensorSample{{Kind: "x", Key: "y"}})
 	absent.Close()
 	mux := NewSensorMux(context.Background(), &Client{BaseURL: srv.URL, RunID: 1, Token: "t", HTTP: srv.Client()})
 	for _, empty := range [][]SensorSample{nil, {}} {
 		mux.Send(empty)
 	}
 	mux.Close()
 	// Give any spurious goroutine a chance to surprise us.
 	time.Sleep(50 * time.Millisecond)
 	if got := atomic.LoadInt32(&posts); got != 0 {
 		t.Errorf("empty/nil Send must not produce HTTP batches, got %d", got)
 	}
 }
@@ -0,0 +1,486 @@
 package tests
 import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"os/exec"
 	"runtime"
 	"strconv"
 	"strings"
 	"sync"
 	"time"
 	"vetting/agent/probes"
 )
 // BurnConfig is what the agent passes to Burn: the orchestrator's iperf3
 // server address and port. Durations + concurrency knobs come from
 // Deps.BurnKnobs so they scale with profile.
 type BurnConfig struct {
 	// OrchestratorURL is handed to the iperf workload; when it is empty
 	// Burn records the iperf workload as skipped instead of running it.
 	OrchestratorURL string
 	// IperfPort is forwarded to the iperf workload unchanged.
 	IperfPort       int // 0 = 5201
 }
 // Burn is the concurrent soak stage: instead of loading one subsystem
 // at a time it runs every workload at once for a single window.
 //
 // Workloads, each reported as one sub-step:
 //   - CPU and memory always run (runBurnCPU / runBurnMemory).
 //   - fio runs only when the FioOnSpare knob is set, at least one
 //     expected disk is listed and the run is not non-destructive;
 //     otherwise a skipped sub-step records the reason.
 //   - iperf runs only when cfg.OrchestratorURL is non-empty; otherwise
 //     a skipped sub-step is recorded.
 //
 // Two sidecars (EDAC and PSU) run alongside the workloads and are
 // cancelled as soon as the workloads finish. Workloads share a context
 // that expires 30 s after the window, so a hung tool cannot hold the
 // stage open indefinitely.
 //
 // Defaults for unset or non-positive knobs: a 2-minute window and 2
 // iperf streams; CPU workers and memory percentage are normalized by
 // resolveCPUWorkers and clampMemPct.
 //
 // The fan-out is local to this function. Burn waits for every workload
 // and both sidecars before building the Outcome, so the caller still
 // sees one stage that starts and finishes serially. The stage fails if
 // any non-skipped workload fails.
 func Burn(ctx context.Context, d Deps, cfg BurnConfig) Outcome {
 	knobs := d.BurnKnobs
 	window := knobs.Duration
 	if window <= 0 {
 		window = 2 * time.Minute
 	}
 	cpuWorkers := resolveCPUWorkers(knobs.CPUWorkers)
 	memPct := clampMemPct(knobs.MemPct)
 	streams := knobs.IperfParallel
 	if streams <= 0 {
 		streams = 2
 	}
 	d.Info(fmt.Sprintf("Burn: window=%s cpu_workers=%d mem_pct=%d iperf_parallel=%d fio_on_spare=%v",
 		window, cpuWorkers, memPct, streams, knobs.FioOnSpare))

 	// Sidecars live for the whole window and are stopped explicitly once
 	// the workloads converge; the deferred cancel covers early exits.
 	sideCtx, stopSidecars := context.WithCancel(ctx)
 	defer stopSidecars()
 	var sidecars sync.WaitGroup
 	sidecars.Add(2)
 	go runEDACSidecar(sideCtx, &sidecars, d)
 	go runPSUSidecar(sideCtx, &sidecars, d)

 	runCtx, stopRun := context.WithTimeout(ctx, window+30*time.Second)
 	defer stopRun()

 	// Capacity 4 = one slot per workload (run or skipped), so no send
 	// ever blocks, including the synchronous "skipped" sends below.
 	results := make(chan burnSubResult, 4)
 	var workloads sync.WaitGroup
 	launch := func(work func() burnSubResult) {
 		workloads.Add(1)
 		go func() {
 			defer workloads.Done()
 			results <- work()
 		}()
 	}

 	launch(func() burnSubResult { return runBurnCPU(runCtx, d, window, cpuWorkers) })
 	launch(func() burnSubResult { return runBurnMemory(runCtx, d, window, memPct) })

 	fioEnabled := knobs.FioOnSpare && len(d.ExpectedDisks) > 0 && !d.NonDestructive
 	if !fioEnabled {
 		results <- burnSubResult{Name: "Burn fio", Skipped: true, Reason: burnFioSkipReason(d)}
 	} else {
 		launch(func() burnSubResult { return runBurnFio(runCtx, d, window) })
 	}

 	// Without an orchestrator base URL there is no iperf target to
 	// derive, so the workload is skipped rather than failed.
 	if cfg.OrchestratorURL == "" {
 		results <- burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "no orchestrator host"}
 	} else {
 		launch(func() burnSubResult {
 			return runBurnIperf(runCtx, d, window, cfg.OrchestratorURL, cfg.IperfPort, streams)
 		})
 	}

 	workloads.Wait()
 	stopSidecars()
 	sidecars.Wait()
 	close(results)

 	subs, samples, failures := collectBurnResults(results)
 	if d.Sensor != nil && len(samples) > 0 {
 		_ = d.Sensor(ctx, samples)
 	}

 	out := Outcome{
 		Extras: map[string]any{
 			"duration":       window.String(),
 			"cpu_workers":    cpuWorkers,
 			"mem_pct":        memPct,
 			"iperf_parallel": streams,
 			"fio_on_spare":   knobs.FioOnSpare,
 		},
 		SubSteps: subs,
 	}
 	if len(failures) == 0 {
 		d.Info(fmt.Sprintf("Burn: %s window passed; %d workloads converged", window, len(subs)))
 		out.Passed = true
 		out.Summary = fmt.Sprintf("Burn %s passed (%d workloads)", window, len(subs))
 		return out
 	}
 	msg := "Burn workloads failed: " + strings.Join(failures, ", ")
 	d.Error(msg)
 	out.Message = msg
 	out.Summary = fmt.Sprintf("Burn failed (%d of %d workloads)", len(failures), len(subs))
 	return out
 }
 // burnSubResult is the per-workload return type used by the fan-out
 // goroutines. Sample slice is merged into the stage's final /sensor
 // batch; SubStep becomes a row on the /result sub-steps list.
 type burnSubResult struct {
 	// Name labels the workload (e.g. "Burn CPU"); it is used for
 	// skipped rows and as the prefix of failure messages.
 	Name    string
 	// Passed is only consulted when Skipped is false.
 	Passed  bool
 	// Skipped marks a workload that did not run; collectBurnResults
 	// synthesizes its sub-step row from Name and Reason.
 	Skipped bool
 	Reason  string // why a workload was skipped
 	Err     string // why a workload failed
 	// Samples are the sensor samples produced by a workload that ran.
 	Samples []Sample
 	// SubStep is the sub-step row reported for a workload that ran.
 	SubStep SubStepReport
 }
 // collectBurnResults drains ch (which the caller must close) and folds
 // the per-workload results into three lists: the sub-step rows, the
 // merged sensor samples, and one "name: reason" string per failed
 // workload.
 //
 // Every result yields exactly one named sub-step row:
 //   - Skipped results get a synthesized row carrying the skip reason.
 //   - Failed results with no SubStep of their own (a workload that
 //     bailed out before running, e.g. because its tool is missing) get
 //     a synthesized row carrying the error, so the report never shows
 //     a blank, unnamed row.
 //   - All other results contribute their own SubStep unchanged.
 //
 // Rows appear in the order results were received.
 func collectBurnResults(ch <-chan burnSubResult) ([]SubStepReport, []Sample, []string) {
 	var subs []SubStepReport
 	var samples []Sample
 	var failures []string
 	for r := range ch {
 		if r.Skipped {
 			stamp := time.Now().UTC()
 			subs = append(subs, SubStepReport{
 				Name:        r.Name,
 				Skipped:     true,
 				StartedAt:   stamp,
 				CompletedAt: stamp,
 				SummaryJSON: mustJSON(map[string]any{"skipped": true, "reason": r.Reason}),
 			})
 			continue
 		}
 		reason := r.Err
 		if reason == "" {
 			reason = "unknown"
 		}
 		step := r.SubStep
 		if !r.Passed && step.Name == "" {
 			// Early-exit failure: the workload never built a sub-step.
 			stamp := time.Now().UTC()
 			step = SubStepReport{
 				Name:        r.Name,
 				StartedAt:   stamp,
 				CompletedAt: stamp,
 				SummaryJSON: mustJSON(map[string]any{"error": reason}),
 			}
 		}
 		subs = append(subs, step)
 		samples = append(samples, r.Samples...)
 		if !r.Passed {
 			failures = append(failures, r.Name+": "+reason)
 		}
 	}
 	return subs, samples, failures
 }
 // burnFioSkipReason explains why the Burn fio workload is not being
 // run. The checks are ordered, so the first unmet precondition wins:
 // knob disabled, then non-destructive run, then empty disk list.
 // "disabled" is the catch-all when none of them applies.
 func burnFioSkipReason(d Deps) string {
 	switch {
 	case !d.BurnKnobs.FioOnSpare:
 		return "fio_on_spare knob disabled"
 	case d.NonDestructive:
 		return "non-destructive run"
 	case len(d.ExpectedDisks) == 0:
 		return "no allowlisted disks"
 	default:
 		return "disabled"
 	}
 }
 // runBurnCPU is the CPU workload of the Burn window. It runs stress-ng
 // with the given number of CPU workers, every CPU method, result
 // verification and brief metrics, for the length of the window. The
 // sub-step is labelled "Burn CPU". When stress-ng is not on PATH the
 // workload fails immediately with "stress-ng missing" and no sub-step
 // of its own.
 func runBurnCPU(ctx context.Context, d Deps, duration time.Duration, workers int) burnSubResult {
 	const label = "Burn CPU"
 	res := burnSubResult{Name: label}
 	if _, err := exec.LookPath("stress-ng"); err != nil {
 		res.Err = "stress-ng missing"
 		return res
 	}
 	argv := []string{
 		"--cpu", strconv.Itoa(workers),
 		"--cpu-method", "all",
 		"--timeout", durationSeconds(duration),
 		"--metrics-brief",
 		"--verify",
 	}
 	d.Info("Burn: stress-ng " + strings.Join(argv, " "))
 	pass := runStressPass(ctx, d, label, duration, argv)
 	res.Passed = pass.Passed
 	res.Err = pass.Err
 	res.SubStep = subStepFromPass(label, pass)
 	return res
 }
 // runBurnMemory is the memory workload of the Burn window. It runs a
 // single stress-ng --vm worker (with --vm-keep and --verify) for the
 // length of the window.
 //
 // The worker's size is computed from live memory on every call:
 //   budget  = MemAvailable * memPct / 100
 //   vmBytes = budget - memHeadroomBytes
 // If vmBytes falls below memFloorBytes the workload is reported as
 // skipped, because the window would be too tight to be meaningful on
 // this box. The skip reason spells out budget, headroom and floor so
 // the arithmetic is visible to the operator.
 //
 // The workload fails immediately if stress-ng is not on PATH or
 // MemAvailable cannot be read.
 func runBurnMemory(ctx context.Context, d Deps, duration time.Duration, memPct int) burnSubResult {
 	if _, err := exec.LookPath("stress-ng"); err != nil {
 		return burnSubResult{Name: "Burn memory", Err: "stress-ng missing"}
 	}
 	avail, err := memAvailableBytes()
 	if err != nil {
 		return burnSubResult{Name: "Burn memory", Err: "read MemAvailable: " + err.Error()}
 	}
 	budget := int64(float64(avail) * float64(memPct) / 100.0)
 	// Named vmBytes rather than cap so the builtin cap is not shadowed.
 	vmBytes := budget - memHeadroomBytes
 	if vmBytes < memFloorBytes {
 		return burnSubResult{
 			Name:    "Burn memory",
 			Skipped: true,
 			Reason: fmt.Sprintf("budget %s minus headroom %s is below floor %s",
 				humanBytes(budget), humanBytes(memHeadroomBytes), humanBytes(memFloorBytes)),
 		}
 	}
 	args := []string{
 		"--vm", "1",
 		"--vm-bytes", strconv.FormatInt(vmBytes, 10),
 		"--vm-keep",
 		"--timeout", durationSeconds(duration),
 		"--metrics-brief",
 		"--verify",
 	}
 	d.Info(fmt.Sprintf("Burn: stress-ng memory cap=%s (%d%% of MemAvailable)", humanBytes(vmBytes), memPct))
 	pass := runStressPass(ctx, d, "Burn memory", duration, args)
 	return burnSubResult{
 		Name:    "Burn memory",
 		Passed:  pass.Passed,
 		Err:     pass.Err,
 		SubStep: subStepFromPass(fmt.Sprintf("Burn memory (cap %s)", humanBytes(vmBytes)), pass),
 	}
 }
 // runBurnFio runs one fio verify job (mode fio_sample, 512MiB, 4k
 // randrw, md5 verify) against the first target resolved from the disk
 // allow-list, with the Burn window as its runtime. It goes through
 // runFioVerify, and on success emits read/write IOPS samples plus p99
 // latency samples when those are non-zero. Per the original design
 // note, fio_sample is bounded by --size so Burn's write volume stays
 // predictable regardless of profile.
 //
 // A missing fio binary is an error result; no resolvable target is a
 // skip.
 func runBurnFio(ctx context.Context, d Deps, duration time.Duration) burnSubResult {
 	const label = "Burn fio"
 	if _, lookErr := exec.LookPath("fio"); lookErr != nil {
 		return burnSubResult{Name: label, Err: "fio missing"}
 	}
 	targets := resolveTargets(d.ExpectedDisks)
 	if len(targets) == 0 {
 		return burnSubResult{Name: label, Skipped: true, Reason: "no allow-listed disks present"}
 	}
 	target := targets[0]
 	opts := fioOpts{
 		Mode:    "fio_sample",
 		Size:    "512MiB",
 		Runtime: duration,
 		BS:      "4k",
 		RW:      "randrw",
 		Verify:  "md5",
 	}
 	startedAt := time.Now()
 	d.Info(fmt.Sprintf("Burn: fio %s on %s (%s window)", opts.Mode, target.Device, duration))
 	fr := runFioVerify(ctx, target.Device, opts)
 	finishedAt := time.Now()
 	ok := fr.Error == ""
 	res := burnSubResult{
 		Name:   label,
 		Passed: ok,
 		Err:    fr.Error,
 		SubStep: SubStepReport{
 			Name:        "Burn fio " + target.Device,
 			Passed:      ok,
 			StartedAt:   startedAt,
 			CompletedAt: finishedAt,
 			SummaryJSON: mustJSON(fr),
 		},
 	}
 	if !ok {
 		// A failed job contributes its sub-step row but no samples.
 		return res
 	}
 	res.Samples = append(res.Samples,
 		Sample{Kind: "fio", Key: target.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
 		Sample{Kind: "fio", Key: target.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
 	)
 	if fr.ReadP99Us > 0 {
 		res.Samples = append(res.Samples, Sample{Kind: "fio_p99_us", Key: target.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
 	}
 	if fr.WriteP99Us > 0 {
 		res.Samples = append(res.Samples, Sample{Kind: "fio_p99_us", Key: target.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
 	}
 	return res
 }
 // runBurnIperf drives `iperf3 -c <host> -P <parallel> -J` for the Burn
 // window and converts the result into a burnSubResult.
 //
 // The server host is derived from orchestratorURL; port 0 falls back to
 // 5201 and parallel below 1 is raised to 1. The JSON is decoded with
 // parseIperfJSON, the same parser the Network stage uses, and samples
 // are emitted under "burn/…" keys so the dashboard can tell which
 // window they came from.
 //
 // Outcomes:
 //   - iperf3 binary missing → error result.
 //   - host cannot be derived → skipped result.
 //   - iperf3 exits non-zero or its JSON cannot be parsed → error result
 //     carrying a sub-step row with the error text.
 //   - otherwise → passed iff throughput is positive.
 func runBurnIperf(ctx context.Context, d Deps, duration time.Duration, orchestratorURL string, port, parallel int) burnSubResult {
 	if _, err := exec.LookPath("iperf3"); err != nil {
 		return burnSubResult{Name: "Burn iperf", Err: "iperf3 missing"}
 	}
 	host, err := deriveHost(orchestratorURL)
 	if err != nil || host == "" {
 		return burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "can't derive orchestrator host"}
 	}
 	if port == 0 {
 		port = 5201
 	}
 	if parallel < 1 {
 		parallel = 1
 	}
 	// Truncating a sub-second window would yield "-t 0", which iperf3
 	// treats as "transmit until interrupted"; the client would then only
 	// stop when runCtx kills it and the sub-step would be reported as an
 	// error. Always request at least one second.
 	secs := int(duration.Seconds())
 	if secs < 1 {
 		secs = 1
 	}
 	args := []string{
 		"-c", host,
 		"-p", strconv.Itoa(port),
 		"-t", strconv.Itoa(secs),
 		"-P", strconv.Itoa(parallel),
 		"-J",
 	}
 	// 30s of slack on top of the window covers connection setup and the
 	// final report before the client is forcibly killed.
 	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
 	defer cancel()
 	start := time.Now()
 	out, err := exec.CommandContext(runCtx, "iperf3", args...).Output()
 	end := time.Now()
 	if err != nil {
 		// NOTE(review): Output() captures stdout only, so "stderr_tail"
 		// is really the tail of stdout. The key is kept as-is because
 		// the Network stage uses the same name for the same data.
 		return burnSubResult{
 			Name: "Burn iperf",
 			Err:  "iperf3 client error: " + err.Error(),
 			SubStep: SubStepReport{
 				Name:        "Burn iperf",
 				StartedAt:   start,
 				CompletedAt: end,
 				SummaryJSON: mustJSON(map[string]any{"error": err.Error(), "stderr_tail": tailLines(string(out), 20)}),
 			},
 		}
 	}
 	mbps, retrans, bytesSent, _, perr := parseIperfJSON(out)
 	if perr != nil {
 		return burnSubResult{
 			Name: "Burn iperf",
 			Err:  "parse iperf3 json: " + perr.Error(),
 			SubStep: SubStepReport{
 				Name:        "Burn iperf",
 				StartedAt:   start,
 				CompletedAt: end,
 				SummaryJSON: mustJSON(map[string]any{"error": perr.Error()}),
 			},
 		}
 	}
 	samples := []Sample{{Kind: "iperf", Key: "burn/throughput_mbps", Value: mbps, Unit: "Mbps"}}
 	if bytesSent > 0 {
 		// Approximate the packet count as bytes / 1460 (payload of a
 		// 1500-byte MTU TCP segment) to turn retransmits into a rate.
 		packets := float64(bytesSent) / 1460.0
 		samples = append(samples, Sample{
 			Kind: "nic_retrans", Key: "burn/rate",
 			Value: float64(retrans) / packets, Unit: "rate",
 		})
 	}
 	passed := mbps > 0
 	errMsg := ""
 	if !passed {
 		errMsg = "zero throughput from iperf3"
 	}
 	return burnSubResult{
 		Name:    "Burn iperf",
 		Passed:  passed,
 		Err:     errMsg,
 		Samples: samples,
 		SubStep: SubStepReport{
 			Name:        fmt.Sprintf("Burn iperf (P=%d)", parallel),
 			Passed:      passed,
 			StartedAt:   start,
 			CompletedAt: end,
 			SummaryJSON: mustJSON(map[string]any{
 				"throughput_mbps": mbps,
 				"retransmits":     retrans,
 				"bytes_sent":      bytesSent,
 				"parallel":        parallel,
 			}),
 		},
 	}
 }
 // runPSUSidecar samples the PSU rails (via scanPSURails) every 5s until
 // ctx is cancelled and posts each reading through d.Sensor as a
 // psu_volt sample. Per the stage design, the server-side threshold
 // evaluator applies the same within_pct gates as the PSU stage to
 // these samples, so a rail sagging under load can fail the run
 // mid-Burn instead of waiting for the post-Burn PSU stage.
 //
 // The function marks wg done on return. It returns immediately when
 // d.Sensor is nil, skips ticks on which no rails are found, and only
 // warns (never aborts) when a post fails.
 func runPSUSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
 	defer wg.Done()
 	if d.Sensor == nil {
 		return
 	}
 	ticker := time.NewTicker(5 * time.Second)
 	defer ticker.Stop()
 	for {
 		select {
 		case <-ctx.Done():
 			return
 		case <-ticker.C:
 		}
 		rails := scanPSURails()
 		if len(rails) == 0 {
 			continue
 		}
 		batch := make([]Sample, len(rails))
 		for i, rail := range rails {
 			batch[i] = Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"}
 		}
 		// Bound each post so a stalled orchestrator can't wedge the loop.
 		postCtx, release := context.WithTimeout(ctx, 5*time.Second)
 		if postErr := d.Sensor(postCtx, batch); postErr != nil {
 			d.Warn("Burn: PSU sample post: " + postErr.Error())
 		}
 		release()
 	}
 }
 // resolveCPUWorkers converts the raw worker-count knob into a positive
 // worker count. An empty string, "all" (any case), a non-numeric
 // value, or a number that is zero or negative all resolve to
 // runtime.NumCPU(); a positive integer is returned as given.
 func resolveCPUWorkers(raw string) int {
 	if raw != "" && !strings.EqualFold(raw, "all") {
 		if parsed, err := strconv.Atoi(raw); err == nil && parsed > 0 {
 			return parsed
 		}
 	}
 	return runtime.NumCPU()
 }
 // clampMemPct normalises the mem_pct knob. Zero or a negative value
 // means "not set" and yields the default of 50. Any other value is
 // clamped into [10, 90]; above 90 would crowd the kernel + agent +
 // fio + iperf3 workers off the page cache.
 func clampMemPct(pct int) int {
 	switch {
 	case pct <= 0:
 		return 50
 	case pct < 10:
 		return 10
 	case pct > 90:
 		return 90
 	default:
 		return pct
 	}
 }
 // mustJSON marshals v and never fails: when json.Marshal returns an
 // error, the result is a small JSON object of the form
 // {"marshal_error": "<message>"} instead.
 func mustJSON(v any) json.RawMessage {
 	b, err := json.Marshal(v)
 	if err == nil {
 		return b
 	}
 	// Encode the message through json.Marshal too, so quotes,
 	// backslashes and control characters in err.Error() are escaped.
 	// Splicing the raw text between quotes can produce invalid JSON.
 	fallback, ferr := json.Marshal(map[string]string{"marshal_error": err.Error()})
 	if ferr != nil {
 		// Marshalling a map[string]string should not fail; keep a
 		// constant last resort so the return value is always valid JSON.
 		return json.RawMessage(`{"marshal_error":"unknown"}`)
 	}
 	return fallback
 }
 // Keep the probes import referenced from this file. Assigning
 // probes.EDAC to the blank identifier means the import still compiles
 // even if no other code here calls into the package directly. Per the
 // original note, the Burn sidecars rely on probes.EDAC and on the PSU
 // rail scanner defined in psu.go, which would not pull probes in on
 // its own.
 var _ = probes.EDAC
@@ -0,0 +1,58 @@
 package tests
 import (
 	"runtime"
 	"testing"
 )
 // TestResolveCPUWorkers exercises every outcome of the worker-count
 // knob: empty and "all" (any case) resolve to NumCPU, a positive
 // integer is used verbatim, and zero, negative, or non-numeric input
 // falls back to NumCPU rather than returning zero. Zero workers would
 // make stress-ng a no-op and silently defeat Burn's CPU load.
 func TestResolveCPUWorkers(t *testing.T) {
 	type workerCase struct {
 		name string
 		in   string
 		want int
 	}
 	allCores := runtime.NumCPU()
 	table := []workerCase{
 		{name: "empty defaults to NumCPU", in: "", want: allCores},
 		{name: "all defaults to NumCPU", in: "all", want: allCores},
 		{name: "ALL is case-insensitive", in: "ALL", want: allCores},
 		{name: "explicit integer", in: "3", want: 3},
 		{name: "negative falls back", in: "-1", want: allCores},
 		{name: "zero falls back", in: "0", want: allCores},
 		{name: "garbage falls back", in: "lots", want: allCores},
 	}
 	for _, c := range table {
 		t.Run(c.name, func(t *testing.T) {
 			got := resolveCPUWorkers(c.in)
 			if got != c.want {
 				t.Errorf("resolveCPUWorkers(%q) = %d, want %d", c.in, got, c.want)
 			}
 		})
 	}
 }
 // TestClampMemPct pins the mem_pct normalisation: values above the
 // band are clamped down so the memory burner stays out of OOM
 // territory, values below it are clamped up so the pass stays useful,
 // and zero or negative input means "use default 50", so a knob
 // missing from an older orchestrator's claim response doesn't
 // collapse the workload.
 func TestClampMemPct(t *testing.T) {
 	type clampCase struct {
 		in, want int
 	}
 	table := []clampCase{
 		{in: 0, want: 50},   // default
 		{in: -10, want: 50}, // negative treated as default
 		{in: 5, want: 10},   // below lower band → clamp up
 		{in: 10, want: 10},
 		{in: 50, want: 50},
 		{in: 90, want: 90},
 		{in: 95, want: 90}, // above upper band → clamp down
 		{in: 1000, want: 90},
 	}
 	for _, c := range table {
 		got := clampMemPct(c.in)
 		if got == c.want {
 			continue
 		}
 		t.Errorf("clampMemPct(%d) = %d, want %d", c.in, got, c.want)
 	}
 }
@@ -11,7 +11,10 @@ import (
 	"runtime"
 	"strconv"
 	"strings"
 	"sync"
 	"time"
 	"vetting/agent/probes"
 )
 // CPUStress runs stress-ng as two serial passes. The previous shape
@@ -55,11 +58,28 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
 	extras := map[string]any{"cores": cores}
 	var subs []SubStepReport
 	// EDAC sidecar runs for the lifetime of the stage; cancelled on
 	// return. It polls /sys/devices/system/edac/mc/*/{ce,ue}_count and
 	// posts the current counters so the server-side threshold evaluator
 	// can gate edac_ue > 0 → fail the run. Zero-valued poll falls back
 	// to 10s — the same cadence rasdaemon uses by default.
 	sideCtx, sideCancel := context.WithCancel(ctx)
 	defer sideCancel()
 	var sideWG sync.WaitGroup
 	sideWG.Add(1)
 	go runEDACSidecar(sideCtx, &sideWG, d)
 	// Per-profile durations come from Deps; zero values (missing knobs
 	// or legacy orchestrator) fall back to the package default so the
 	// stage always has a defined budget.
 	cpuDur := nonzeroDur(d.CPUStressKnobs.CPUPass, cpuPassDuration)
 	memDur := nonzeroDur(d.CPUStressKnobs.MemPass, memPassDuration)
 	// Pass 1: CPU
-	cpu := runStressPass(ctx, d, "CPU", cpuPassDuration, []string{
+	cpu := runStressPass(ctx, d, "CPU", cpuDur, []string{
 		"--cpu", strconv.Itoa(cores),
 		"--cpu-method", "all",
-		"--timeout", durationSeconds(cpuPassDuration),
+		"--timeout", durationSeconds(cpuDur),
 		"--metrics-brief",
 		"--verify",
 	})
@@ -104,11 +124,11 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
 			SubSteps: subs,
 		}
 	}
-	mem := runStressPass(ctx, d, "memory", memPassDuration, []string{
+	mem := runStressPass(ctx, d, "memory", memDur, []string{
 		"--vm", "1",
 		"--vm-bytes", strconv.FormatInt(cap, 10),
 		"--vm-keep",
-		"--timeout", durationSeconds(memPassDuration),
+		"--timeout", durationSeconds(memDur),
 		"--metrics-brief",
 		"--verify",
 	})
@@ -133,6 +153,64 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
 	}
 }
 // runEDACSidecar posts EDAC counters for as long as ctx is alive. It
 // ticks every d.CPUStressKnobs.EDACPoll (10s when the knob is zero or
 // negative), reads probes.EDAC(), and forwards each returned sample
 // through d.Sensor. A tick that yields no samples is skipped and a
 // failed post is only warned about, so the next tick carries on.
 //
 // Per the stage design, this is what turns the critical edac_ue
 // threshold into a hard fail: once a UE counter moves past 0 the
 // server-side evaluator trips and flips the run into FailedHolding.
 // The loop is independent of stress-ng, so the signal stays live in
 // the gap between passes.
 //
 // MCE counts are intentionally not sampled here — they need rasdaemon
 // or mcelog and vary by live-image packaging. The mce threshold rule
 // stays seeded (so the DB shape is stable) but only fires once a
 // matching kind lands, which is a follow-up.
 //
 // The function marks wg done on return and returns immediately when
 // d.Sensor is nil.
 func runEDACSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
 	defer wg.Done()
 	if d.Sensor == nil {
 		return
 	}
 	every := 10 * time.Second
 	if knob := d.CPUStressKnobs.EDACPoll; knob > 0 {
 		every = knob
 	}
 	ticker := time.NewTicker(every)
 	defer ticker.Stop()
 	for {
 		select {
 		case <-ctx.Done():
 			return
 		case <-ticker.C:
 		}
 		readings := probes.EDAC()
 		if len(readings) == 0 {
 			continue
 		}
 		batch := make([]Sample, len(readings))
 		for i, r := range readings {
 			batch[i] = Sample{Kind: r.Kind, Key: r.Key, Value: r.Value, Unit: r.Unit}
 		}
 		// Bound each post so a stalled orchestrator can't wedge the loop.
 		postCtx, release := context.WithTimeout(ctx, 5*time.Second)
 		if postErr := d.Sensor(postCtx, batch); postErr != nil {
 			d.Warn("CPUStress: edac sample post: " + postErr.Error())
 		}
 		release()
 	}
 }
 // nonzeroDur returns override when it is strictly positive and
 // fallback otherwise. Callers can therefore pass a zero-value (or
 // negative) duration to mean "no override" without a separate ok
 // return.
 func nonzeroDur(override, fallback time.Duration) time.Duration {
 	if override <= 0 {
 		return fallback
 	}
 	return override
 }
 // subStepFromPass projects a stressPass into a SubStepReport — shared by
 // both passes and by the mid-stage early-return paths so the UI always
 // sees exactly one row per pass, even on failure.
@@ -0,0 +1,24 @@
 // fake_dmidecode simulates `dmidecode -t bios` for unit tests of the
 // firmware probe's BIOS parser. Prints deterministic output modeled on
 // a real Supermicro host; exits 0 regardless of flags.
 package main
 import "fmt"
 // main writes a fixed BIOS-information transcript to stdout with a
 // single Println. It does not read os.Args, so the output is the same
 // whatever flags are passed.
 func main() {
 	fmt.Println(`# dmidecode 3.3
 Getting SMBIOS data from sysfs.
 SMBIOS 3.2.0 present.
 Handle 0x0000, DMI type 0, 26 bytes
 BIOS Information
 	Vendor: American Megatrends Inc.
 	Version: 3.2
 	Release Date: 07/15/2021
 	Address: 0xF0000
 	Runtime Size: 64 kB
 	ROM Size: 32 MB
 	Characteristics:
 		PCI is supported
 		BIOS is upgradeable`)
 }
@@ -0,0 +1,22 @@
 // Package fakes is the umbrella for deterministic stand-ins for
 // external probe binaries that Vetting's stage code normally shells
 // out to (stress-ng, fio, iperf3, dmidecode, ethtool, nvidia-smi,
 // mcelog, nvme). Each real binary gets its own subpackage under
 // fakes/<name>/ with `package main` and a main() that prints golden
 // output — build with `go build -o <tmp>/<name> ./agent/tests/fakes/<name>`
 // and point a test's tests.Deps.LookPath at <tmp>/<name>.
 //
 // The seam in tests is tests.Deps.LookPath: when non-nil the stage
 // code uses it instead of os/exec.LookPath. Outside tests, nil
 // LookPath means "use the real binary on $PATH" — stages continue to
 // work on production hosts without the fakes package around.
 //
 // How to add a new fake:
 //  1. Create agent/tests/fakes/<binaryname>/main.go.
 //  2. Write `package main` with a main() that prints exactly the
 //     bytes the real tool would produce for the input you care to
 //     simulate. Determinism > completeness — tests want a known
 //     sample, not a realistic one.
 //  3. In the unit test, build the fake into t.TempDir() with
 //     `go build -o` before the test body runs, then point
 //     tests.Deps.LookPath at the resulting binary.
 package fakes
@@ -0,0 +1,18 @@
 // fake_stress_ng simulates stress-ng for unit tests. Accepts (and
 // ignores) any flag, sleeps briefly so callers that measure wall-clock
 // see a non-zero elapsed, and prints the "passed" lines CPUStress
 // expects. Exits 0.
 package main
 import (
 	"fmt"
 	"os"
 	"time"
 )
 // main logs the received arguments to stderr, sleeps 50ms so callers
 // measuring wall-clock time see a non-zero elapsed, then prints the
 // two fixed stress-ng info lines to stdout.
 func main() {
 	fmt.Fprintln(os.Stderr, "fake_stress_ng invoked:", os.Args[1:])
 	time.Sleep(50 * time.Millisecond)
 	golden := []string{
 		"stress-ng: info:  [1] dispatching hogs: 1 cpu",
 		"stress-ng: info:  [1] successful run completed in 0.05s",
 	}
 	for _, line := range golden {
 		fmt.Println(line)
 	}
 }
@@ -9,19 +9,27 @@ import (
 	"strconv"
 	"strings"
 	"time"
 	"vetting/agent/probes"
 )
 // NetworkConfig is what the agent passes to Network: the orchestrator's
-// iperf3 server address and port. We derive host from OrchestratorURL.
+// iperf3 server address, port, and the per-profile duration.
 type NetworkConfig struct {
 	// OrchestratorURL is the orchestrator's base URL.
 	// NOTE(review): the iperf3 server host is presumably derived from
 	// it via deriveHost — confirm in Network.
 	OrchestratorURL string
 	IperfPort       int // 0 = 5201
 	// Duration is the per-profile length of the iperf3 run.
 	Duration        time.Duration
 }
-// Network runs iperf3 against the orchestrator's bundled server. Records
+// Network runs iperf3 against the orchestrator's bundled server for
-// bandwidth as a measurement; fails if iperf3 is missing, the server
+// the profile-configured duration. Records throughput as a measurement;
-// isn't reachable, or throughput is zero.
+// records per-interface rx/tx error-rate deltas as nic_retrans samples
 // so the server-side threshold gate (`nic_retrans rate < 0.001`) fires
 // on a flaky PHY or a wire that drops half its packets under load.
 //
 // Failure cases: iperf3 missing, server unreachable, zero throughput.
 // Zero throughput is treated as a hard failure — an iperf that finished
 // cleanly but pushed zero bytes is indistinguishable from a bad run.
 func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
 	if _, err := exec.LookPath("iperf3"); err != nil {
 		// Live image ships iperf3; absence means packaging regression.
@@ -51,6 +59,11 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
 		duration = 10 * time.Second
 	}
 	// Snapshot /proc/net/dev before the test so we can attribute any
 	// error-count growth to *this stage's* traffic. The same snapshot
 	// taken after iperf returns is the end of the window.
 	netStart := indexNetDev(probes.NetDev())
 	args := []string{
 		"-c", host,
 		"-p", strconv.Itoa(port),
@@ -72,7 +85,7 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
 			Extras:  map[string]any{"stderr_tail": tailLines(string(out), 20)},
 		}
 	}
-	mbps, parsed, err := parseIperfJSON(out)
+	mbps, retrans, bytesSent, parsed, err := parseIperfJSON(out)
 	if err != nil {
 		d.Error("Network: parse iperf3 output: " + err.Error())
 		return Outcome{
@@ -82,12 +95,58 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
 			Extras:  map[string]any{"raw": string(out)},
 		}
 	}
 	netEnd := indexNetDev(probes.NetDev())
 	netDelta := diffNetDev(netStart, netEnd)
 	samples := []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}}
 	// iperf-derived retrans rate: retrans_count / packet_count_estimate.
 	// TCP typical MTU 1500; payload ~1460. We divide bytes by 1460 to
 	// approximate packets. This keeps the rate bounded in [0, 1].
 	if bytesSent > 0 {
 		packets := float64(bytesSent) / 1460.0
 		if packets > 0 {
 			samples = append(samples, Sample{
 				Kind:  "nic_retrans",
 				Key:   "iperf/rate",
 				Value: float64(retrans) / packets,
 				Unit:  "rate",
 			})
 		}
 	}
 	// Per-interface error-rate deltas. A flaky cable typically surfaces
 	// as tx_errs or tx_drop on the originating interface, not inside
 	// iperf's own tally.
 	for iface, delta := range netDelta {
 		if delta.TxBytes > 0 {
 			packets := float64(delta.TxBytes) / 1460.0
 			if packets > 0 {
 				rate := float64(delta.TxErrs+delta.TxDrop) / packets
 				samples = append(samples, Sample{
 					Kind: "nic_retrans", Key: iface + "/rate", Value: rate, Unit: "rate",
 				})
 			}
 		}
 		// Diagnostic raw counts so the report can show which interface
 		// bled. These don't fire a threshold today but are useful for
 		// post-mortem.
 		samples = append(samples,
 			Sample{Kind: "nic_errs", Key: iface + "/rx", Value: float64(delta.RxErrs + delta.RxDrop), Unit: "count"},
 			Sample{Kind: "nic_errs", Key: iface + "/tx", Value: float64(delta.TxErrs + delta.TxDrop), Unit: "count"},
 		)
 	}
 	if d.Sensor != nil {
-		_ = d.Sensor(ctx, []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}})
+		_ = d.Sensor(ctx, samples)
 	}
 	extras := map[string]any{
 		"throughput_mbps": mbps,
 		"retransmits":     retrans,
 		"bytes_sent":      bytesSent,
 		"net_delta":       netDelta,
 		"iperf_end":       parsed,
 	}
 	if mbps <= 0 {
@@ -98,14 +157,55 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
 			Extras:  extras,
 		}
 	}
-	d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps", mbps))
+	d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps (retransmits=%d)", mbps, retrans))
 	return Outcome{
 		Passed:  true,
-		Summary: fmt.Sprintf("%.1f Mbps to %s", mbps, host),
+		Summary: fmt.Sprintf("%.1f Mbps to %s (retransmits=%d)", mbps, host, retrans),
 		Extras:  extras,
 	}
 }
 // indexNetDev builds a lookup table from interface name to snapshot,
 // so diffNetDev can pair start and end readings by name instead of
 // scanning the slices. If the same interface name appears more than
 // once, the last entry wins.
 func indexNetDev(snaps []probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
 	byIface := make(map[string]probes.NetDevSnapshot, len(snaps))
 	for i := range snaps {
 		byIface[snaps[i].Iface] = snaps[i]
 	}
 	return byIface
 }
 // diffNetDev returns end − start per interface for the byte, error,
 // and drop counters. Only interfaces present in both snapshots are
 // reported: one that vanished mid-run, or one that only appeared at
 // the end, has no usable delta and is left out. A counter that went
 // backwards (end < start, e.g. after a reset) is clamped to 0 by
 // subU64.
 func diffNetDev(start, end map[string]probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
 	deltas := make(map[string]probes.NetDevSnapshot)
 	for name, after := range end {
 		before, seen := start[name]
 		if !seen {
 			continue
 		}
 		delta := probes.NetDevSnapshot{Iface: name}
 		delta.RxBytes = subU64(after.RxBytes, before.RxBytes)
 		delta.RxErrs = subU64(after.RxErrs, before.RxErrs)
 		delta.RxDrop = subU64(after.RxDrop, before.RxDrop)
 		delta.TxBytes = subU64(after.TxBytes, before.TxBytes)
 		delta.TxErrs = subU64(after.TxErrs, before.TxErrs)
 		delta.TxDrop = subU64(after.TxDrop, before.TxDrop)
 		deltas[name] = delta
 	}
 	return deltas
 }
 // subU64 returns a − b, or 0 when b exceeds a, so an unsigned counter
 // that went backwards never wraps around to a huge value.
 func subU64(a, b uint64) uint64 {
 	if a >= b {
 		return a - b
 	}
 	return 0
 }
 // deriveHost pulls the hostname out of an https://host:port base URL.
 func deriveHost(raw string) (string, error) {
 	if raw == "" {
@@ -119,18 +219,22 @@ func deriveHost(raw string) (string, error) {
 	return strings.TrimSpace(h), nil
 }
-// parseIperfJSON pulls end.sum_sent.bits_per_second out of iperf3 -J.
+// parseIperfJSON pulls end.sum_sent.bits_per_second and retransmits out
-// Returns (Mbps, full-json-map, err).
+// of iperf3 -J. Returns (Mbps, retransmits, bytes_sent, full-end-map, err).
-func parseIperfJSON(b []byte) (float64, map[string]any, error) {
+func parseIperfJSON(b []byte) (float64, int64, int64, map[string]any, error) {
 	var top map[string]any
 	if err := json.Unmarshal(b, &top); err != nil {
-		return 0, nil, err
+		return 0, 0, 0, nil, err
 	}
 	end, ok := top["end"].(map[string]any)
 	if !ok {
-		return 0, top, fmt.Errorf("missing end")
+		return 0, 0, 0, nil, fmt.Errorf("missing end")
 	}
-	// iperf3 reports either sum_sent (when -R not set) or sum_received.
+	// Pull the first sum that carries bits_per_second; retransmits +
 	// bytes live there too for TCP.
 	var mbps float64
 	var retrans int64
 	var bytesSent int64
 	for _, key := range []string{"sum_sent", "sum_received", "sum"} {
 		sum, ok := end[key].(map[string]any)
 		if !ok {
@@ -140,7 +244,17 @@ func parseIperfJSON(b []byte) (float64, map[string]any, error) {
 		if !ok {
 			continue
 		}
-		return bps / 1_000_000, end, nil
+		mbps = bps / 1_000_000
 		if r, ok := sum["retransmits"].(float64); ok {
 			retrans = int64(r)
 		}
-	return 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
+		if bs, ok := sum["bytes"].(float64); ok {
 			bytesSent = int64(bs)
 		}
 		break
 	}
 	if mbps == 0 {
 		return 0, 0, 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
 	}
 	return mbps, retrans, bytesSent, end, nil
 }
@@ -0,0 +1,192 @@
 package tests
 import (
 	"encoding/json"
 	"testing"
 	"vetting/agent/probes"
 )
 // TestParseIperfJSON_SumSent checks that throughput, retransmits, and
 // bytes are all read from end.sum_sent, which is where iperf3 -J
 // nests them for TCP streams.
 func TestParseIperfJSON_SumSent(t *testing.T) {
 	payload := []byte(`{
 		"end": {
 			"sum_sent": {
 				"bits_per_second": 950000000,
 				"retransmits": 42,
 				"bytes": 1187500000
 			}
 		}
 	}`)
 	gotMbps, gotRetrans, gotBytes, _, err := parseIperfJSON(payload)
 	if err != nil {
 		t.Fatalf("parseIperfJSON: %v", err)
 	}
 	if gotMbps != 950 {
 		t.Errorf("mbps = %v, want 950", gotMbps)
 	}
 	if gotRetrans != 42 {
 		t.Errorf("retransmits = %d, want 42", gotRetrans)
 	}
 	if gotBytes != 1187500000 {
 		t.Errorf("bytesSent = %d, want 1187500000", gotBytes)
 	}
 }
 // TestParseIperfJSON_MissingEnd expects an error when the iperf3
 // output has no end block, as happens on a partial or aborted run.
 func TestParseIperfJSON_MissingEnd(t *testing.T) {
 	payload := []byte(`{"start": {}}`)
 	_, _, _, _, err := parseIperfJSON(payload)
 	if err == nil {
 		t.Errorf("expected error on iperf output missing end block")
 	}
 }
 // TestParseIperfJSON_ZeroBps expects an error for a zero
 // bits_per_second so the stage can fail fast. An iperf run that exited
 // cleanly but pushed zero bits is indistinguishable from a broken run
 // and must not pass.
 func TestParseIperfJSON_ZeroBps(t *testing.T) {
 	payload := []byte(`{"end": {"sum_sent": {"bits_per_second": 0}}}`)
 	_, _, _, _, err := parseIperfJSON(payload)
 	if err == nil {
 		t.Errorf("expected error when bits_per_second is 0")
 	}
 }
 // TestParseIperfJSON_FallsBackToSumReceived covers output that has no
 // sum_sent block (UDP tests and some edge cases). The parser walks
 // sum_sent → sum_received → sum and uses the first one that carries a
 // throughput number.
 func TestParseIperfJSON_FallsBackToSumReceived(t *testing.T) {
 	payload := []byte(`{
 		"end": {
 			"sum_received": {"bits_per_second": 500000000}
 		}
 	}`)
 	gotMbps, _, _, _, err := parseIperfJSON(payload)
 	if err != nil {
 		t.Fatalf("parseIperfJSON: %v", err)
 	}
 	if gotMbps != 500 {
 		t.Errorf("mbps = %v, want 500", gotMbps)
 	}
 }
 // TestDiffNetDev_HappyPath checks end − start for an interface present
 // in both snapshots: eth0 received 1000 bytes, sent 10000 bytes, and
 // picked up 3 tx errors during the window.
 func TestDiffNetDev_HappyPath(t *testing.T) {
 	before := map[string]probes.NetDevSnapshot{
 		"eth0": {Iface: "eth0", RxBytes: 1000, RxErrs: 0, TxBytes: 5000, TxErrs: 1},
 	}
 	after := map[string]probes.NetDevSnapshot{
 		"eth0": {Iface: "eth0", RxBytes: 2000, RxErrs: 0, TxBytes: 15000, TxErrs: 4},
 	}
 	eth0, present := diffNetDev(before, after)["eth0"]
 	if !present {
 		t.Fatalf("eth0 missing from diff output")
 	}
 	if eth0.RxBytes != 1000 {
 		t.Errorf("RxBytes delta=%d, want 1000", eth0.RxBytes)
 	}
 	if eth0.TxBytes != 10000 {
 		t.Errorf("TxBytes delta=%d, want 10000", eth0.TxBytes)
 	}
 	if eth0.TxErrs != 3 {
 		t.Errorf("TxErrs delta=%d, want 3", eth0.TxErrs)
 	}
 }
 // TestDiffNetDev_InterfaceVanished checks that an interface seen at
 // start but missing at end is left out of the diff rather than
 // reported with a negative or stale number, while the surviving
 // interface still gets its delta.
 func TestDiffNetDev_InterfaceVanished(t *testing.T) {
 	before := map[string]probes.NetDevSnapshot{
 		"eth0": {Iface: "eth0", TxBytes: 1000},
 		"eth1": {Iface: "eth1", TxBytes: 500},
 	}
 	after := map[string]probes.NetDevSnapshot{
 		"eth0": {Iface: "eth0", TxBytes: 2000},
 	}
 	got := diffNetDev(before, after)
 	if _, present := got["eth1"]; present {
 		t.Errorf("eth1 should have been dropped (gone at end)")
 	}
 	if tx := got["eth0"].TxBytes; tx != 1000 {
 		t.Errorf("eth0 TxBytes delta=%d, want 1000", tx)
 	}
 }
 // TestDiffNetDev_CounterReset checks that a counter which went
 // backwards between snapshots (kernel restart, wrap-around on a 32-bit
 // counter) is clamped to 0 instead of underflowing a uint64.
 func TestDiffNetDev_CounterReset(t *testing.T) {
 	before := map[string]probes.NetDevSnapshot{
 		"eth0": {Iface: "eth0", TxBytes: 9999, TxErrs: 5},
 	}
 	after := map[string]probes.NetDevSnapshot{
 		"eth0": {Iface: "eth0", TxBytes: 100, TxErrs: 0},
 	}
 	eth0 := diffNetDev(before, after)["eth0"]
 	if eth0.TxBytes != 0 {
 		t.Errorf("reset TxBytes delta=%d, want 0 (clamped)", eth0.TxBytes)
 	}
 	if eth0.TxErrs != 0 {
 		t.Errorf("reset TxErrs delta=%d, want 0 (clamped)", eth0.TxErrs)
 	}
 }
 // TestDeriveHost checks host extraction from the orchestrator URL,
 // which is how the agent picks its iperf3 server target. Both the
 // scheme://host and scheme://host:port shapes are covered.
 func TestDeriveHost(t *testing.T) {
 	type hostCase struct {
 		raw  string
 		want string
 	}
 	table := []hostCase{
 		{raw: "https://orch.local", want: "orch.local"},
 		{raw: "https://orch.local:8443", want: "orch.local"},
 		{raw: "http://10.0.0.5:8080", want: "10.0.0.5"},
 	}
 	for _, hc := range table {
 		host, err := deriveHost(hc.raw)
 		switch {
 		case err != nil:
 			t.Errorf("deriveHost(%q) error: %v", hc.raw, err)
 		case host != hc.want:
 			t.Errorf("deriveHost(%q) = %q, want %q", hc.raw, host, hc.want)
 		}
 	}
 }
 // TestDeriveHost_Empty expects deriveHost to reject an empty URL.
 func TestDeriveHost_Empty(t *testing.T) {
 	_, err := deriveHost("")
 	if err == nil {
 		t.Errorf("deriveHost(\"\") should error")
 	}
 }
 // TestParseIperfJSON_ParsesEndMap confirms the full end map is returned
 // so extras can show every field iperf produced, not just the three we
 // extract by hand. It checks that both top-level keys of the end block
 // are present and that the map marshals back to JSON without error.
 func TestParseIperfJSON_ParsesEndMap(t *testing.T) {
 	raw := `{
 		"end": {
 			"sum_sent": {"bits_per_second": 1000000, "retransmits": 0, "bytes": 125000},
 			"cpu_utilization_percent": {"host_total": 12.3}
 		}
 	}`
 	_, _, _, endMap, err := parseIperfJSON([]byte(raw))
 	if err != nil {
 		t.Fatalf("parseIperfJSON: %v", err)
 	}
 	if endMap == nil {
 		t.Fatalf("endMap is nil")
 	}
 	// Both keys must survive, not just the sum the parser reads from.
 	for _, key := range []string{"sum_sent", "cpu_utilization_percent"} {
 		if _, ok := endMap[key]; !ok {
 			t.Errorf("endMap missing key %q", key)
 		}
 	}
 	// Sanity: the map round-trips via json.
 	b, err := json.Marshal(endMap)
 	if err != nil {
 		t.Fatalf("marshal endMap: %v", err)
 	}
 	if len(b) == 0 {
 		t.Errorf("endMap marshaled to empty")
 	}
 }
@@ -7,12 +7,20 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
 	"time"
 )
 // PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find
-// PSU rails. In home-lab hosts the kernel surfaces a handful of named
+// PSU rails, then samples each rail every psuSampleInterval for a
-// rails (12V, 5V, 3V3). No rails → auto-skip. Any rail outside a ±10%
+// window sized by the stage timeout. During Burn a separate sidecar
-// window of its nominal value → fail.
+// (see burn.go) runs the same probe concurrently with workload — the
 // PSU stage itself catches slow post-load sag that only surfaces once
 // the 12V rail starts recovering from a brownout under concurrent CPU
 // + fio + iperf load.
 //
 // Any rail outside ±10% of its nominal value at any tick fires the
 // critical threshold (server-side) and fails the stage. A host with no
 // PSU rails wired to hwmon auto-skips.
 func PSU(ctx context.Context, d Deps) Outcome {
 	rails := scanPSURails()
 	if len(rails) == 0 {
@@ -24,39 +32,150 @@ func PSU(ctx context.Context, d Deps) Outcome {
 		}
 	}
-	var samples []Sample
+	window := resolvePSUWindow(d.StageTimeout)
 	deadline := time.Now().Add(window)
 	interval := psuSampleInterval
 	if window < interval*2 {
 		// Tiny window (tests, pathological stage_timeout) — at least two
 		// ticks so aggregate stats are meaningful.
 		interval = window / 2
 		if interval < time.Second {
 			interval = time.Second
 		}
 	}
 	// Per-label tracking: min/max across the window, count of out-of-range
 	// hits, last-observed value (shown in the summary).
 	type railStats struct {
 		label    string
 		minV     float64
 		maxV     float64
 		lastV    float64
 		ticks    int
 		breaches int
 		reason   string
 	}
 	stats := map[string]*railStats{}
 	tick := time.NewTicker(interval)
 	defer tick.Stop()
 	// Start with an immediate sample so a sub-45s window still produces
 	// at least one reading.
 	sampleOnce := func() {
 		cur := scanPSURails()
 		if len(cur) == 0 {
 			return
 		}
 		batch := make([]Sample, 0, len(cur))
 		for _, r := range cur {
 			s, ok := stats[r.Label]
 			if !ok {
 				s = &railStats{label: r.Label, minV: r.Volts, maxV: r.Volts}
 				stats[r.Label] = s
 			}
 			s.ticks++
 			s.lastV = r.Volts
 			if r.Volts < s.minV {
 				s.minV = r.Volts
 			}
 			if r.Volts > s.maxV {
 				s.maxV = r.Volts
 			}
 			if ok, why := voltageInRange(r); !ok {
 				s.breaches++
 				if s.reason == "" {
 					s.reason = why
 				}
 			}
 			batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
 		}
 		if d.Sensor != nil && len(batch) > 0 {
 			sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
 			_ = d.Sensor(sendCtx, batch)
 			cancel()
 		}
 	}
 	sampleOnce()
 sampling:
 	for time.Now().Before(deadline) {
 		select {
 		case <-ctx.Done():
 			break sampling
 		case <-tick.C:
 			sampleOnce()
 		}
 	}
 	// Build the outcome. Extras carry per-rail rollup so the report can
 	// show "12V min=11.1 max=12.05 (3/120 ticks out of range)".
 	type railRollup struct {
 		Label    string  `json:"label"`
 		MinV     float64 `json:"min_v"`
 		MaxV     float64 `json:"max_v"`
 		LastV    float64 `json:"last_v"`
 		Ticks    int     `json:"ticks"`
 		Breaches int     `json:"breaches"`
 		Reason   string  `json:"reason,omitempty"`
 	}
 	rollups := make([]railRollup, 0, len(stats))
 	problems := []string{}
-	for _, rail := range rails {
+	for _, s := range stats {
-		samples = append(samples, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"})
+		rollups = append(rollups, railRollup{
-		if ok, why := voltageInRange(rail); !ok {
+			Label: s.label, MinV: s.minV, MaxV: s.maxV, LastV: s.lastV,
-			problems = append(problems, fmt.Sprintf("%s=%.2fV (%s)", rail.Label, rail.Volts, why))
+			Ticks: s.ticks, Breaches: s.breaches, Reason: s.reason,
 		})
 		if s.breaches > 0 {
 			problems = append(problems, fmt.Sprintf("%s min=%.2fV max=%.2fV (%s)", s.label, s.minV, s.maxV, s.reason))
 		}
 	}
 	if d.Sensor != nil {
 		_ = d.Sensor(ctx, samples)
 	}
 	extras := map[string]any{
-		"rails":    rails,
+		"rails":       rollups,
 		"problems":    problems,
 		"window":      window.String(),
 		"interval":    interval.String(),
 	}
 	if len(problems) > 0 {
-		d.Error("PSU: out-of-range rails: " + strings.Join(problems, ", "))
+		d.Error("PSU: out-of-range rails: " + strings.Join(problems, "; "))
 		return Outcome{
 			Passed:  false,
-			Message: "PSU rails out of range: " + strings.Join(problems, ", "),
+			Message: "PSU rails out of range: " + strings.Join(problems, "; "),
-			Summary: fmt.Sprintf("%d rails, %d failing", len(rails), len(problems)),
+			Summary: fmt.Sprintf("%d rails, %d failing", len(rollups), len(problems)),
 			Extras:  extras,
 		}
 	}
-	d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal", len(rails)))
+	d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal across %s window", len(rollups), window))
 	return Outcome{
 		Passed:  true,
-		Summary: fmt.Sprintf("%d rails nominal", len(rails)),
+		Summary: fmt.Sprintf("%d rails nominal (%s)", len(rollups), window),
 		Extras:  extras,
 	}
 }
 // psuSampleInterval is the default tick for post-Burn rail sampling.
 // Five seconds is slow enough to stay under the HTTP budget and fast
 // enough to catch rail recovery transients. PSU only uses this value
 // as-is when the sampling window spans at least two ticks; for shorter
 // windows it ticks at half the window, floored at one second.
 const psuSampleInterval = 5 * time.Second
 // resolvePSUWindow derives the PSU sampling window from the stage
 // timeout. A zero or negative timeout (tests / pre-Phase-2
 // orchestrator) yields a snapshot-like 30 s window. Any other timeout
 // gives up 5 s of headroom for the sensor flush + result post, and the
 // remainder is clamped to [30 s, 10 min] so a 24 h soak doesn't spend
 // all day in PSU.
 func resolvePSUWindow(stageTimeout time.Duration) time.Duration {
 	const (
 		shortest = 30 * time.Second
 		longest  = 10 * time.Minute
 		headroom = 5 * time.Second
 	)
 	if stageTimeout <= 0 {
 		return shortest
 	}
 	window := stageTimeout - headroom
 	switch {
 	case window < shortest:
 		return shortest
 	case window > longest:
 		return longest
 	default:
 		return window
 	}
 }
 type psuRail struct {
 	Label string  `json:"label"`
 	Volts float64 `json:"volts"`
@@ -0,0 +1,112 @@
 package tests
 import (
 	"testing"
 	"time"
 )
 // TestIsPSULabel pins the PSU label allowlist: the common 12V / 5V /
 // 3.3V spellings and VCCIN (in either case) must be accepted, while CPU
 // VRM style labels (Vcore, CPU VCORE, AVCC) and the empty label must be
 // rejected so they are never reported as PSU out-of-range failures.
 func TestIsPSULabel(t *testing.T) {
 	accepted := []string{"+12V", "12V", "+5V", "5V", "+3.3V", "3V3", "VCCIN", "vccin"}
 	rejected := []string{"Vcore", "CPU VCORE", "AVCC", ""}
 	verify := func(labels []string, want bool) {
 		for _, label := range labels {
 			got := isPSULabel(label)
 			if got == want {
 				continue
 			}
 			t.Errorf("isPSULabel(%q) = %v, want %v", label, got, want)
 		}
 	}
 	verify(accepted, true)
 	verify(rejected, false)
 }
 // TestNominalFor checks the label → nominal voltage mapping. Labels
 // without a known nominal (VCCIN, arbitrary strings) must map to 0 so
 // voltageInRange short-circuits; an invented nominal would manufacture
 // out-of-range failures.
 func TestNominalFor(t *testing.T) {
 	type expectation struct {
 		label   string
 		nominal float64
 	}
 	table := []expectation{
 		{label: "+12V", nominal: 12.0},
 		{label: "12V", nominal: 12.0},
 		{label: "+5V", nominal: 5.0},
 		{label: "+3.3V", nominal: 3.3},
 		{label: "3V3", nominal: 3.3},
 		{label: "VCCIN", nominal: 0},
 		{label: "unknown", nominal: 0},
 	}
 	for i := range table {
 		e := table[i]
 		got := nominalFor(e.label)
 		if got == e.nominal {
 			continue
 		}
 		t.Errorf("nominalFor(%q) = %v, want %v", e.label, got, e.nominal)
 	}
 }
 // TestVoltageInRange exercises the ±10% band around each nominal: a
 // 12V rail passes anywhere in [10.8, 13.2] (edges included) and fails
 // outside it; a 5V rail passes at 8% low and fails at 12% low. Rails
 // with no known nominal always pass.
 func TestVoltageInRange(t *testing.T) {
 	type probe struct {
 		label   string
 		volts   float64
 		inRange bool
 	}
 	probes := []probe{
 		{"+12V", 12.0, true},
 		{"+12V", 10.8, true},  // lower edge of the band
 		{"+12V", 13.2, true},  // upper edge of the band
 		{"+12V", 10.7, false}, // just under the band
 		{"+12V", 13.3, false}, // just over the band
 		{"+12V", 10.5, false}, // a real sag
 		{"+5V", 4.6, true},    // 8% low, still inside
 		{"+5V", 4.4, false},   // 12% low, outside
 		{"+5V", 5.0, true},
 		{"VCCIN", 1.8, true}, // no nominal known → pass
 	}
 	for _, p := range probes {
 		rail := psuRail{Label: p.label, Volts: p.volts}
 		inRange, _ := voltageInRange(rail)
 		if inRange == p.inRange {
 			continue
 		}
 		t.Errorf("voltageInRange(%+v) = %v, want %v", rail, inRange, p.inRange)
 	}
 }
 // TestResolvePSUWindow pins the stage-timeout → sampling-window mapping:
 // a zero or negative timeout falls back to 30s (test / legacy
 // orchestrator path); timeouts of 35s or less are raised to the 30s
 // floor; a 1m timeout gives a 55s window; a 10m timeout gives 9m55s;
 // longer timeouts (15m, 1h) are capped at 10m.
 func TestResolvePSUWindow(t *testing.T) {
 	type scenario struct {
 		title    string
 		timeout  time.Duration
 		expected time.Duration
 	}
 	scenarios := []scenario{
 		{title: "zero → snapshot fallback", timeout: 0, expected: 30 * time.Second},
 		{title: "negative → snapshot fallback", timeout: -1 * time.Second, expected: 30 * time.Second},
 		{title: "tiny timeout clamps up to 30s floor", timeout: 10 * time.Second, expected: 30 * time.Second},
 		{title: "35s - 5s = 30s", timeout: 35 * time.Second, expected: 30 * time.Second},
 		{title: "1m quick → 55s", timeout: time.Minute, expected: 55 * time.Second},
 		{title: "10m deep → 9m55s", timeout: 10 * time.Minute, expected: 9*time.Minute + 55*time.Second},
 		{title: "15m soak → capped at 10m", timeout: 15 * time.Minute, expected: 10 * time.Minute},
 		{title: "1h → capped at 10m", timeout: time.Hour, expected: 10 * time.Minute},
 	}
 	for i := range scenarios {
 		sc := scenarios[i]
 		t.Run(sc.title, func(t *testing.T) {
 			got := resolvePSUWindow(sc.timeout)
 			if got == sc.expected {
 				return
 			}
 			t.Errorf("resolvePSUWindow(%s) = %s, want %s", sc.timeout, got, sc.expected)
 		})
 	}
 }
@@ -59,6 +59,11 @@ func (o Outcome) MarshalSummary() (json.RawMessage, error) {
 // Deps bundles what stages need without pulling in the whole agent.
 // Logger methods print to stdout + forward to the orchestrator; Sensor
 // drops numeric samples; OverrideFlags carries operator-set bypasses.
 //
 // CPUStressKnobs / StorageKnobs / NetworkKnobs are Phase-2 profile
 // knobs. Zero-valued fields mean "fall back to the compile-time
 // default" — that keeps the stages runnable even when the runner can't
 // materialize a profile (tests, legacy orchestrator, etc).
 type Deps struct {
 	Info           func(string)
 	Warn           func(string)
@@ -68,6 +73,58 @@ type Deps struct {
 	NonDestructive bool           // skip wipe-probe + writes in Storage
 	ExpectedDisks  []ExpectedDisk // serials + sizes from host.expected_spec
 	StageTimeout   time.Duration
 	CPUStressKnobs CPUStressKnobs
 	StorageKnobs   StorageKnobs
 	NetworkKnobs   NetworkKnobs
 	BurnKnobs      BurnKnobs
 	// LookPath is the unit-test seam for swapping a real external
 	// binary (stress-ng, fio, iperf3, dmidecode, …) for a fake. When
 	// nil the stage falls back to os/exec.LookPath — production and
 	// existing tests keep working unchanged. Tests under
 	// agent/tests/fakes/ populate this to redirect lookups to a built
 	// fake binary in a tempdir.
 	LookPath func(name string) (string, error)
 }
 // CPUStressKnobs parameterizes the CPUStress stage. Zero durations fall
 // back to the package's compile-time defaults (cpuPassDuration etc).
 // The stage that consumes these fields is outside this view, so the
 // per-field notes below are read off the field names only.
 type CPUStressKnobs struct {
 	// CPUPass: presumably the length of the CPU stress pass — TODO confirm.
 	CPUPass  time.Duration
 	// MemPass: presumably the length of the memory stress pass — TODO confirm.
 	MemPass  time.Duration
 	// EDACPoll: presumably the EDAC/MCE polling interval — TODO confirm.
 	EDACPoll time.Duration
 }
 // StorageKnobs parameterizes the Storage stage. Mode picks between
 // "fio_sample" (bounded tempfile inside the device, quick profile) and
 // "full_disk" (whole-device write verify, deep/soak). Empty strings
 // fall back to the stage's safe defaults.
 //
 // The fallbacks are applied by resolveFioOpts; each field's default is
 // listed below.
 type StorageKnobs struct {
 	// Mode selects the fio shape; empty → "fio_sample".
 	Mode    string
 	// FioSize is passed to fio as --size in fio_sample mode; empty → "1GiB".
 	FioSize string
 	// FioTime bounds the fio run (--runtime); zero or negative → 3 minutes.
 	FioTime time.Duration
 	// FioBS is the fio block size (--bs); empty → "4k".
 	FioBS   string
 	// FioRW is the fio access pattern (--rw); empty → "randrw".
 	FioRW   string
 	// Verify is the fio --verify method; empty → "md5". Because empty
 	// falls back to md5, this knob cannot switch verification off.
 	Verify  string
 }
 // NetworkKnobs parameterizes the Network stage. Per the Deps contract,
 // a zero value means "fall back to the compile-time default".
 type NetworkKnobs struct {
 	// Duration: presumably how long the network load runs — TODO confirm
 	// against the Network stage, which is outside this view.
 	Duration time.Duration
 }
 // BurnKnobs parameterizes the Burn super-stage. Duration is the total
 // Burn window; sub-workloads run concurrently inside that window.
 // CPUWorkers is "all" (runtime.NumCPU) or a numeric string. MemPct is a
 // percentage of MemAvailable to allocate for the memory burner (clamped
 // 0-90 by the stage). IperfParallel feeds iperf3 -P to generate sustained
 // NIC load. FioOnSpare gates the storage sub-workload: true = fio runs
 // against the allow-listed disks for the same window; false = skip fio.
 type BurnKnobs struct {
 	// Duration is the total Burn window shared by all sub-workloads.
 	Duration      time.Duration
 	// CPUWorkers is "all" or a decimal worker count, kept as a string.
 	CPUWorkers    string
 	// MemPct is the share of MemAvailable for the memory burner, in percent.
 	MemPct        int
 	// FioOnSpare enables the fio sub-workload on the allow-listed disks.
 	FioOnSpare    bool
 	// IperfParallel is the stream count handed to iperf3 -P.
 	IperfParallel int
 }
 // Sample mirrors the server's SensorSample but lives in the tests
@@ -5,24 +5,36 @@ import (
 	"encoding/json"
 	"fmt"
 	"os/exec"
 	"strconv"
 	"strings"
 	"time"
 )
-// Storage is the destructive stage: badblocks (write-mode sample) + fio
+// Storage is the destructive stage. Phase 2 replaced the old
-// random IO, persisting IOPS + latency as measurements. Pre-gates:
+// badblocks + 128 MiB fio combo with a single fio run per disk that
 // writes, verifies md5 of what it wrote, and reports p99 latency.
 // Modes:
 //
 //   - fio_sample (quick): bounded 1 GiB write per disk, ~3 min runtime.
 //   - full_disk (deep/soak): writes the whole device, time-bounded by
 //     the fio_time knob (2 h deep, 6 h soak).
 //
 // Pre-gates kept from Phase 1:
 //
 //  1. Device allowlist: only act on /dev/<X> where the kernel-reported
-//     serial matches one of Deps.ExpectedDisks. This is the operator's
+//     serial matches one of Deps.ExpectedDisks. USB sticks and unexpected
 //     contract for what can be written to. USB sticks and unexpected
 //     drives are excluded.
 //  2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
-//     signatures, partition tables, or LVM metadata → fail with
+//     signature, partition table, or LVM metadata → fail with
 //     UnexpectedData unless Deps.OverrideWipe is set.
 //
-// Only after those pass does the stage run `badblocks -b 4096 -c 64 -w`
+// After fio, the stage captures a SMART diff (start snapshot taken
-// and `fio` in write mode. This matches the plan's "destructive disk
+// before any writes; end snapshot after all writes finish) and posts
-// tests are always-on, gated by layered safety."
+// deltas on attributes like Reallocated_Sector_Ct and Current_Pending_Sector.
 // The threshold evaluator isn't seeded to gate smart_delta out of the
 // box — those samples are diagnostic for the report. Fio's p99 latency
 // posts as fio_p99_us so the per-stage Storage warning threshold can
 // fire on a latency cliff.
 func Storage(ctx context.Context, d Deps) Outcome {
 	if len(d.ExpectedDisks) == 0 {
 		d.Info("Storage: no expected disks in spec — skipping stage")
@@ -44,10 +56,10 @@ func Storage(ctx context.Context, d Deps) Outcome {
 		}
 	}
-	// Non-destructive runs skip wipe-probe (nothing to refuse), badblocks
+	// Non-destructive runs skip wipe-probe (nothing to refuse), fio
-	// -w, and write-mode fio. Every expected disk is still asserted
+	// writes, and SMART delta (nothing changed so no delta to report).
-	// present + readable by listing /sys/block and reading SMART-accessible
+	// Every expected disk is still asserted present so a vanished drive
-	// identity; the per-disk map flags the shortcut so the report is clear.
+	// still fails the stage.
 	if d.NonDestructive {
 		perDisk := map[string]any{}
 		for _, t := range targets {
@@ -89,64 +101,80 @@ func Storage(ctx context.Context, d Deps) Outcome {
 		d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
 	}
-	// Per target: short badblocks write sample + fio random-read/write.
+	// Capture start-of-stage SMART attributes before we write anything
 	// so the delta is attributable to *this* stage's writes and not the
 	// host's prior history. Per-disk failures are tolerated (e.g. the
 	// device doesn't expose SMART); we just can't emit a delta for it.
 	startSMART := captureSMARTAttrs(ctx, targets)
 	fioOpts := resolveFioOpts(d.StorageKnobs)
 	d.Info(fmt.Sprintf("Storage: fio mode=%s size=%s runtime=%s bs=%s rw=%s verify=%s",
 		fioOpts.Mode, fioOpts.Size, fioOpts.Runtime, fioOpts.BS, fioOpts.RW, fioOpts.Verify))
 	var samples []Sample
 	var subs []SubStepReport
 	perDisk := map[string]any{}
 	failed := ""
 	for _, t := range targets {
-		d.Info("Storage: running badblocks write sample on " + t.Device)
+		d.Info(fmt.Sprintf("Storage: running fio %s on %s", fioOpts.Mode, t.Device))
 		bbStart := time.Now()
 		bb := runBadblocks(ctx, t.Device)
 		bbEnd := time.Now()
 		bbSummary, _ := json.Marshal(bb)
 		subs = append(subs, SubStepReport{
 			Name:        fmt.Sprintf("badblocks %s", t.Device),
 			Passed:      bb.OK,
 			StartedAt:   bbStart,
 			CompletedAt: bbEnd,
 			SummaryJSON: bbSummary,
 		})
 		d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
 		fioStart := time.Now()
-		fr := runFio(ctx, t.Device)
+		fr := runFioVerify(ctx, t.Device, fioOpts)
 		fioEnd := time.Now()
 		fioSummary, _ := json.Marshal(fr)
 		subs = append(subs, SubStepReport{
-			Name:        fmt.Sprintf("fio %s", t.Device),
+			Name:        fmt.Sprintf("fio %s %s", fioOpts.Mode, t.Device),
 			Passed:      fr.Error == "",
 			StartedAt:   fioStart,
 			CompletedAt: fioEnd,
 			SummaryJSON: fioSummary,
 		})
 		perDisk[t.Device] = map[string]any{"fio": fr}
-		perDisk[t.Device] = map[string]any{
+		if fr.Error == "" {
 			"badblocks": bb,
 			"fio":       fr,
 		}
 			samples = append(samples,
 				Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
 				Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
 			)
-		if !bb.OK {
+			if fr.ReadP99Us > 0 {
-			return Outcome{
+				samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
-				Passed:   false,
+			}
-				Message:  "badblocks found errors on " + t.Device,
+			if fr.WriteP99Us > 0 {
-				Summary:  "badblocks failed on " + t.Device,
+				samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
-				Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes},
+			}
-				SubSteps: subs,
+		} else if failed == "" {
 			failed = t.Device
 		}
 	}
 	// End-of-stage SMART snapshot + diff. We capture whether or not fio
 	// succeeded — a mid-run failure still produces attributable deltas,
 	// which is often more interesting than the stage outcome itself.
 	endSMART := captureSMARTAttrs(ctx, targets)
 	deltas := diffSMARTAttrs(startSMART, endSMART)
 	for dev, attrs := range deltas {
 		for attr, delta := range attrs {
 			samples = append(samples, Sample{Kind: "smart_delta", Key: dev + "/" + attr, Value: delta, Unit: "count"})
 		}
-	if d.Sensor != nil {
+	}
 	if d.Sensor != nil && len(samples) > 0 {
 		_ = d.Sensor(ctx, samples)
 	}
-	d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets)))
+	if failed != "" {
 		return Outcome{
 			Passed:   false,
 			Message:  "fio verify failed on " + failed,
 			Summary:  "fio failed on " + failed,
 			Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
 			SubSteps: subs,
 		}
 	}
 	d.Info(fmt.Sprintf("Storage: %d disk(s) passed fio --verify", len(targets)))
 	return Outcome{
 		Passed:   true,
-		Summary:  fmt.Sprintf("%d disks passed", len(targets)),
+		Summary:  fmt.Sprintf("%d disks passed (%s)", len(targets), fioOpts.Mode),
-		Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes},
+		Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
 		SubSteps: subs,
 	}
 }
@@ -229,8 +257,8 @@ type wipeProbeResult struct {
 // probeWipe runs blkid + wipefs -n. Any non-empty output from either is
 // a "has data" signal. This is deliberately conservative: we'd rather
-// halt on a bare ext4 signature than hand badblocks a disk with real
+// halt on a bare ext4 signature than hand fio a disk with real bytes on
-// bytes on it.
+// it.
 func probeWipe(ctx context.Context, device string) wipeProbeResult {
 	out := wipeProbeResult{Device: device}
@@ -257,84 +285,269 @@ func probeWipe(ctx context.Context, device string) wipeProbeResult {
 	return out
 }
 // ---------- badblocks ----------
 type badblocksResult struct {
 	OK        bool   `json:"ok"`
 	Elapsed   string `json:"elapsed"`
 	Error     string `json:"error,omitempty"`
 	OutputTail string `json:"output_tail,omitempty"`
 }
 func runBadblocks(ctx context.Context, device string) badblocksResult {
 	// -c 64 blocks per check, -w destructive write, -b 4096 block size,
 	// -t pattern. We only sample 256MiB (65536 × 4k) so the stage stays
 	// bounded. A real burn-in would run the whole disk; that belongs in
 	// a separate "deep" stage.
 	args := []string{"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536"}
 	start := time.Now()
 	runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
 	defer cancel()
 	cmd := exec.CommandContext(runCtx, "badblocks", args...)
 	out, err := cmd.CombinedOutput()
 	r := badblocksResult{Elapsed: time.Since(start).Round(time.Second).String(), OutputTail: tailLines(string(out), 10)}
 	if err != nil {
 		r.Error = err.Error()
 		return r
 	}
 	// badblocks prints each bad block to stdout. Empty output = clean.
 	if strings.TrimSpace(string(out)) == "" {
 		r.OK = true
 	} else {
 		r.Error = "bad blocks found"
 	}
 	return r
 }
 // ---------- fio ----------
 // fioOpts is the resolved fio configuration: the concrete flag values
 // fio needs, produced from StorageKnobs by resolveFioOpts. Defaults
 // match the quick profile's fio_sample shape so callers with zero knobs
 // still run something bounded. The struct is also attached to the stage
 // extras as "fio_opts"; encoding/json writes Runtime as an integer
 // nanosecond count, not as a duration string.
 type fioOpts struct {
 	Mode    string        `json:"mode"`     // "fio_sample" | "full_disk"; any other value runs as fio_sample
 	Size    string        `json:"size"`     // fio --size, e.g. "1GiB"; only used for fio_sample
 	Runtime time.Duration `json:"runtime"`  // fio --runtime (whole seconds); the process timeout adds 30s grace
 	BS      string        `json:"bs"`       // fio --bs, e.g. "4k"
 	RW      string        `json:"rw"`       // fio --rw, e.g. "randrw"
 	Verify  string        `json:"verify"`   // fio --verify method, e.g. "md5"; "" omits the verify flags
 }
 // resolveFioOpts turns the (possibly zero-valued) Storage knobs into a
 // runnable fio configuration. Every empty string knob is replaced by
 // its quick-profile default and a non-positive FioTime becomes three
 // minutes, so a stage that never received knobs still behaves
 // coherently instead of refusing to run.
 func resolveFioOpts(k StorageKnobs) fioOpts {
 	bound := k.FioTime
 	if bound <= 0 {
 		bound = 3 * time.Minute
 	}
 	var resolved fioOpts
 	resolved.Mode = firstNonEmpty(k.Mode, "fio_sample")
 	resolved.Size = firstNonEmpty(k.FioSize, "1GiB")
 	resolved.Runtime = bound
 	resolved.BS = firstNonEmpty(k.FioBS, "4k")
 	resolved.RW = firstNonEmpty(k.FioRW, "randrw")
 	resolved.Verify = firstNonEmpty(k.Verify, "md5")
 	return resolved
 }
 // firstNonEmpty returns the leftmost argument that is not the empty
 // string, or "" when there is none (including when called with no
 // arguments). Whitespace-only strings count as non-empty.
 func firstNonEmpty(vs ...string) string {
 	for i := 0; i < len(vs); i++ {
 		if candidate := vs[i]; len(candidate) > 0 {
 			return candidate
 		}
 	}
 	return ""
 }
 // fioResult is the per-disk outcome of one fio run. It is stored in the
 // stage extras and marshaled into the sub-step summary. A non-empty
 // Error marks the run as failed; the numeric fields are only filled in
 // after fio's JSON output parsed successfully.
 type fioResult struct {
 	// Mode echoes the fio mode the run was started with.
 	Mode        string  `json:"mode"`
 	// ReadIOPS / WriteIOPS come from the first job's read/write "iops".
 	ReadIOPS    float64 `json:"read_iops"`
 	WriteIOPS   float64 `json:"write_iops"`
 	// ReadBWKBps / WriteBWKBps come from the first job's read/write "bw".
 	ReadBWKBps  float64 `json:"read_bw_kbps"`
 	WriteBWKBps float64 `json:"write_bw_kbps"`
 	// ReadP99Us / WriteP99Us are the 99th-percentile completion latency in
 	// microseconds (fio reports nanoseconds); zero when fio gave none.
 	ReadP99Us   float64 `json:"read_p99_us,omitempty"`
 	WriteP99Us  float64 `json:"write_p99_us,omitempty"`
 	// Error is the exec or JSON-parse failure text; empty on success.
 	Error       string  `json:"error,omitempty"`
 	// OutputTail is the tail of fio's stdout, kept for diagnostics.
 	OutputTail  string  `json:"output_tail,omitempty"`
 }
-// runFio kicks off a tiny random-rw job: 2 jobs × 64MB × 4k blocks.
+// runFioVerify invokes fio with md5-verify semantics. fio_sample mode
-// This is a health bar, not a benchmark — we want to know the disk
+// caps the IO at opts.Size; full_disk drives the whole device bounded
-// services IO, not how fast it is at p99.
+// by runtime. Both use direct IO to bypass the page cache — we want
-func runFio(ctx context.Context, device string) fioResult {
+// real disk latency, not Linux' cheerful buffer.
-	runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
+func runFioVerify(ctx context.Context, device string, opts fioOpts) fioResult {
 	// 30s grace over runtime so fio has time to flush + close cleanly.
 	runCtx, cancel := context.WithTimeout(ctx, opts.Runtime+30*time.Second)
 	defer cancel()
 	args := []string{
-		"--name=health", "--filename=" + device, "--rw=randrw",
+		"--name=verify-" + strings.TrimPrefix(device, "/dev/"),
-		"--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0",
+		"--filename=" + device,
-		"--group_reporting", "--output-format=json", "--direct=1",
+		"--rw=" + opts.RW,
 		"--bs=" + opts.BS,
 		"--numjobs=1",
 		"--direct=1",
 		"--group_reporting",
 		"--output-format=json",
 		"--runtime=" + strconv.Itoa(int(opts.Runtime.Seconds())),
 	}
 	if opts.Verify != "" {
 		args = append(args,
 			"--verify="+opts.Verify,
 			"--verify_pattern=random",
 			"--do_verify=1",
 		)
 	}
 	switch opts.Mode {
 	case "full_disk":
 		// Time-bounded across the full device — fio uses the device's
 		// full size when --size is omitted on a block device.
 		args = append(args, "--time_based=1")
 	default:
 		// fio_sample: bounded write. Setting --size= limits the IO
 		// volume regardless of runtime.
 		args = append(args, "--size="+opts.Size, "--time_based=0")
 	}
 	cmd := exec.CommandContext(runCtx, "fio", args...)
 	out, err := cmd.Output()
 	r := fioResult{Mode: opts.Mode, OutputTail: tailLines(string(out), 20)}
 	if err != nil {
-		return fioResult{Error: err.Error()}
+		r.Error = err.Error()
 		return r
 	}
 	parsed, perr := parseFioJSON(out)
 	if perr != nil {
 		r.Error = "parse fio json: " + perr.Error()
 		return r
 	}
 	r.ReadIOPS = parsed.ReadIOPS
 	r.WriteIOPS = parsed.WriteIOPS
 	r.ReadBWKBps = parsed.ReadBWKBps
 	r.WriteBWKBps = parsed.WriteBWKBps
 	r.ReadP99Us = parsed.ReadP99Us
 	r.WriteP99Us = parsed.WriteP99Us
 	return r
 }
 // parseFioJSON extracts the bits we care about from fio's --output-format=json.
 // Latency percentiles live at .jobs[0].read.clat_ns.percentile["99.000000"];
 // we convert nanoseconds to microseconds for the fio_p99_us sample.
 func parseFioJSON(out []byte) (fioResult, error) {
 	var top struct {
 		Jobs []struct {
 			Read struct {
 				IOPS float64 `json:"iops"`
 				BW   float64 `json:"bw"`
 				CLat struct {
 					Percentile map[string]float64 `json:"percentile"`
 				} `json:"clat_ns"`
 			} `json:"read"`
 			Write struct {
 				IOPS float64 `json:"iops"`
 				BW   float64 `json:"bw"`
 				CLat struct {
 					Percentile map[string]float64 `json:"percentile"`
 				} `json:"clat_ns"`
 			} `json:"write"`
 		} `json:"jobs"`
 	}
-	if err := json.Unmarshal(out, &top); err != nil || len(top.Jobs) == 0 {
+	if err := json.Unmarshal(out, &top); err != nil {
-		return fioResult{Error: "parse fio json: " + fmt.Sprint(err)}
+		return fioResult{}, err
 	}
 	if len(top.Jobs) == 0 {
 		return fioResult{}, fmt.Errorf("no jobs in fio output")
 	}
 	j := top.Jobs[0]
-	return fioResult{
+	r := fioResult{
 		ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS,
 		ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW,
 	}
 	if p := j.Read.CLat.Percentile["99.000000"]; p > 0 {
 		r.ReadP99Us = p / 1000.0
 	}
 	if p := j.Write.CLat.Percentile["99.000000"]; p > 0 {
 		r.WriteP99Us = p / 1000.0
 	}
 	return r, nil
 }
 // ---------- SMART delta ----------
 // smartAttrMap: device → attribute → raw counter value. The outer key
 // is the target's device string and the inner key is the attribute
 // name as smartctl reports it. ATA drives populate named attributes
 // (Reallocated_Sector_Ct etc); NVMe drives populate a flatter
 // nvme-specific map. We track a curated whitelist of wear indicators —
 // anything else is diagnostic and drops to the raw report output.
 type smartAttrMap map[string]map[string]float64
 // captureSMARTAttrs queries every target through runSmartctl and keeps
 // the whitelisted attributes that extractSMARTAttrs finds. The result
 // is keyed by device. A device whose query fails (virtio, permission
 // issues) or that yields no whitelisted attributes is left out without
 // an error — the delta step then simply has no data for it.
 func captureSMARTAttrs(ctx context.Context, targets []diskTarget) smartAttrMap {
 	snapshot := make(smartAttrMap)
 	for i := range targets {
 		device := targets[i].Device
 		report, err := runSmartctl(ctx, device)
 		if err != nil {
 			continue
 		}
 		if attrs := extractSMARTAttrs(report); len(attrs) != 0 {
 			snapshot[device] = attrs
 		}
 	}
 	return snapshot
 }
 // smartAttributeWhitelist is the set of attributes we diff across a
 // stage — the wear / error counters we want attributed to this stage's
 // IO. Adding attributes is cheap: one a drive doesn't report is simply
 // absent from the extracted map.
 var smartAttributeWhitelist = func() map[string]bool {
 	names := []string{
 		// ATA SMART attribute names (smartctl normalizes to these)
 		"Reallocated_Sector_Ct",
 		"Current_Pending_Sector",
 		"Offline_Uncorrectable",
 		"UDMA_CRC_Error_Count",
 		"Reported_Uncorrect",
 		"Raw_Read_Error_Rate",
 		// NVMe log fields (flat keys at top of nvme_smart_health_information_log)
 		"media_errors",
 		"num_err_log_entries",
 		"percentage_used",
 	}
 	allowed := make(map[string]bool, len(names))
 	for _, name := range names {
 		allowed[name] = true
 	}
 	return allowed
 }()
 // extractSMARTAttrs pulls the whitelisted attribute values out of a
 // decoded smartctl JSON document. Two shapes are understood: the ATA
 // one (ata_smart_attributes.table[], each row carrying "name" and
 // "raw.value") and the NVMe one (numeric fields directly under
 // nvme_smart_health_information_log). The returned map is keyed by
 // attribute name and is never nil; rows or fields with an unexpected
 // type are skipped.
 func extractSMARTAttrs(raw map[string]any) map[string]float64 {
 	attrs := map[string]float64{}
 	// ATA: a failed assertion leaves a nil map / slice behind, which the
 	// lookups and the range below treat as empty.
 	ataSection, _ := raw["ata_smart_attributes"].(map[string]any)
 	rows, _ := ataSection["table"].([]any)
 	for _, entry := range rows {
 		fields, isObject := entry.(map[string]any)
 		if !isObject {
 			continue
 		}
 		name, _ := fields["name"].(string)
 		if !smartAttributeWhitelist[name] {
 			continue
 		}
 		rawField, _ := fields["raw"].(map[string]any)
 		if value, isNumber := rawField["value"].(float64); isNumber {
 			attrs[name] = value
 		}
 	}
 	// NVMe: whitelisted numeric fields sit flat in the health log.
 	nvmeLog, _ := raw["nvme_smart_health_information_log"].(map[string]any)
 	for key, field := range nvmeLog {
 		number, isNumber := field.(float64)
 		if isNumber && smartAttributeWhitelist[key] {
 			attrs[key] = number
 		}
 	}
 	return attrs
 }
 // diffSMARTAttrs computes end − start for every (device, attribute)
 // pair present in both snapshots. Devices or attributes missing from
 // either side produce no entry (a zero-to-present change can't be
 // attributed safely), and a device with no shared attributes is left
 // out entirely. Negative deltas are kept so a drive that resets a
 // counter stays visible. The result is never nil.
 func diffSMARTAttrs(start, end smartAttrMap) map[string]map[string]float64 {
 	deltas := make(map[string]map[string]float64)
 	for device, after := range end {
 		before, seenAtStart := start[device]
 		if !seenAtStart {
 			continue
 		}
 		perAttr := make(map[string]float64)
 		for name, afterValue := range after {
 			if beforeValue, inBoth := before[name]; inBoth {
 				perAttr[name] = afterValue - beforeValue
 			}
 		}
 		if len(perAttr) == 0 {
 			continue
 		}
 		deltas[device] = perAttr
 	}
 	return deltas
 }
@@ -0,0 +1,218 @@
 package tests
 import (
 	"encoding/json"
 	"testing"
 	"time"
 )
 // TestParseFioJSON_ATAReadWrite parses a one-job fio report with both
 // directions populated and checks the IOPS, bandwidth, and p99 fields.
 // The fixture supplies p99 under clat_ns in nanoseconds; the parsed
 // P99 fields are expected in microseconds (the unit emitted to the
 // threshold evaluator).
 func TestParseFioJSON_ATAReadWrite(t *testing.T) {
 	const fixture = `{
 		"jobs": [{
 			"read":  {"iops": 1234.5, "bw": 5000, "clat_ns": {"percentile": {"99.000000": 250000}}},
 			"write": {"iops": 432.1,  "bw": 2000, "clat_ns": {"percentile": {"99.000000": 500000}}}
 		}]
 	}`
 	res, err := parseFioJSON([]byte(fixture))
 	if err != nil {
 		t.Fatalf("parseFioJSON: %v", err)
 	}
 	if got := res.ReadIOPS; got != 1234.5 {
 		t.Errorf("ReadIOPS = %v, want 1234.5", got)
 	}
 	if got := res.WriteIOPS; got != 432.1 {
 		t.Errorf("WriteIOPS = %v, want 432.1", got)
 	}
 	if got := res.ReadBWKBps; got != 5000 {
 		t.Errorf("ReadBWKBps = %v, want 5000", got)
 	}
 	// Read side: 250000 ns is 250 us.
 	if got := res.ReadP99Us; got != 250 {
 		t.Errorf("ReadP99Us = %v, want 250", got)
 	}
 	// Write side: 500000 ns is 500 us.
 	if got := res.WriteP99Us; got != 500 {
 		t.Errorf("WriteP99Us = %v, want 500", got)
 	}
 }
 // TestParseFioJSON_ReadOnlyJob covers a job whose write side carries no
 // clat_ns percentiles (as a randread job would): the write p99 must
 // stay at zero while the read p99 is still converted and reported.
 func TestParseFioJSON_ReadOnlyJob(t *testing.T) {
 	const fixture = `{
 		"jobs": [{
 			"read":  {"iops": 1000, "bw": 4000, "clat_ns": {"percentile": {"99.000000": 100000}}},
 			"write": {"iops": 0, "bw": 0}
 		}]
 	}`
 	res, err := parseFioJSON([]byte(fixture))
 	if err != nil {
 		t.Fatalf("parseFioJSON: %v", err)
 	}
 	if got := res.WriteP99Us; got != 0 {
 		t.Errorf("WriteP99Us = %v on read-only job, want 0", got)
 	}
 	if got := res.ReadP99Us; got != 100 {
 		t.Errorf("ReadP99Us = %v, want 100", got)
 	}
 }
 // TestParseFioJSON_NoJobs requires an error for a report whose jobs
 // array is empty — fio ran nothing, so all-zero results would be
 // silently misleading.
 func TestParseFioJSON_NoJobs(t *testing.T) {
 	_, err := parseFioJSON([]byte(`{"jobs": []}`))
 	if err != nil {
 		return
 	}
 	t.Errorf("expected error on empty jobs array")
 }
 // TestExtractSMARTAttrs_ATA decodes an ATA-shaped smartctl document and
 // checks that whitelisted rows of ata_smart_attributes.table come back
 // with their raw values, while a non-whitelisted row is omitted.
 func TestExtractSMARTAttrs_ATA(t *testing.T) {
 	const fixture = `{
 		"ata_smart_attributes": {
 			"table": [
 				{"name": "Reallocated_Sector_Ct",   "raw": {"value": 7}},
 				{"name": "Current_Pending_Sector",  "raw": {"value": 3}},
 				{"name": "Spin_Retry_Count",        "raw": {"value": 99}}
 			]
 		}
 	}`
 	doc := map[string]any{}
 	if err := json.Unmarshal([]byte(fixture), &doc); err != nil {
 		t.Fatalf("unmarshal fixture: %v", err)
 	}
 	attrs := extractSMARTAttrs(doc)
 	if got := attrs["Reallocated_Sector_Ct"]; got != 7 {
 		t.Errorf("Reallocated_Sector_Ct = %v, want 7", got)
 	}
 	if got := attrs["Current_Pending_Sector"]; got != 3 {
 		t.Errorf("Current_Pending_Sector = %v, want 3", got)
 	}
 	if _, present := attrs["Spin_Retry_Count"]; present {
 		t.Errorf("Spin_Retry_Count should not appear (not in whitelist)")
 	}
 }
 // TestExtractSMARTAttrs_NVMe decodes an NVMe-shaped smartctl document,
 // where the attributes are flat fields of
 // nvme_smart_health_information_log, and checks that whitelisted
 // fields are returned while a non-whitelisted one is omitted.
 func TestExtractSMARTAttrs_NVMe(t *testing.T) {
 	const fixture = `{
 		"nvme_smart_health_information_log": {
 			"media_errors": 2,
 			"num_err_log_entries": 15,
 			"percentage_used": 7,
 			"temperature": 42
 		}
 	}`
 	doc := map[string]any{}
 	if err := json.Unmarshal([]byte(fixture), &doc); err != nil {
 		t.Fatalf("unmarshal fixture: %v", err)
 	}
 	attrs := extractSMARTAttrs(doc)
 	if got := attrs["media_errors"]; got != 2 {
 		t.Errorf("media_errors = %v, want 2", got)
 	}
 	if got := attrs["num_err_log_entries"]; got != 15 {
 		t.Errorf("num_err_log_entries = %v, want 15", got)
 	}
 	if got := attrs["percentage_used"]; got != 7 {
 		t.Errorf("percentage_used = %v, want 7", got)
 	}
 	if _, present := attrs["temperature"]; present {
 		t.Errorf("temperature should not appear (not in whitelist)")
 	}
 }
 // TestDiffSMARTAttrs: end − start per (device, attr). Only attrs in
 // both snapshots yield a delta. An attribute that first shows up at
 // end has no baseline, and one that disappears by end just drops out
 // instead of showing a misleading negative — both cases are asserted.
 func TestDiffSMARTAttrs(t *testing.T) {
 	start := smartAttrMap{
 		// Offline_Uncorrectable exists only at start (disappearing attribute).
 		"/dev/sda": {"Reallocated_Sector_Ct": 5, "Current_Pending_Sector": 0, "Offline_Uncorrectable": 4},
 	}
 	end := smartAttrMap{
 		// UDMA_CRC_Error_Count exists only at end (no baseline).
 		"/dev/sda": {"Reallocated_Sector_Ct": 8, "Current_Pending_Sector": 2, "UDMA_CRC_Error_Count": 1},
 	}
 	out := diffSMARTAttrs(start, end)
 	if out["/dev/sda"]["Reallocated_Sector_Ct"] != 3 {
 		t.Errorf("Reallocated_Sector_Ct delta = %v, want 3", out["/dev/sda"]["Reallocated_Sector_Ct"])
 	}
 	if out["/dev/sda"]["Current_Pending_Sector"] != 2 {
 		t.Errorf("Current_Pending_Sector delta = %v, want 2", out["/dev/sda"]["Current_Pending_Sector"])
 	}
 	if _, ok := out["/dev/sda"]["UDMA_CRC_Error_Count"]; ok {
 		t.Errorf("UDMA_CRC_Error_Count should not appear (missing at start)")
 	}
 	if _, ok := out["/dev/sda"]["Offline_Uncorrectable"]; ok {
 		t.Errorf("Offline_Uncorrectable should not appear (missing at end)")
 	}
 }
 // TestDiffSMARTAttrs_DeviceNewAtEnd: a device that exists only in the
 // end snapshot (hot-plugged mid-run, or its SMART read only succeeded
 // at end) has no start baseline, so the diff must omit it.
 func TestDiffSMARTAttrs_DeviceNewAtEnd(t *testing.T) {
 	before := smartAttrMap{}
 	after := smartAttrMap{
 		"/dev/sda": {"Reallocated_Sector_Ct": 10},
 	}
 	deltas := diffSMARTAttrs(before, after)
 	if _, present := deltas["/dev/sda"]; present {
 		t.Errorf("/dev/sda should drop from diff when absent at start")
 	}
 }
 // TestResolveFioOpts_Defaults: a zero-valued StorageKnobs must resolve
 // to the quick profile's fio_sample shape. A stage that arrives
 // without per-profile knobs (legacy claim response, test harness)
 // therefore still gets coherent, bounded settings rather than
 // unbounded writes.
 func TestResolveFioOpts_Defaults(t *testing.T) {
 	opts := resolveFioOpts(StorageKnobs{})
 	if got := opts.Mode; got != "fio_sample" {
 		t.Errorf("Mode = %q, want fio_sample", got)
 	}
 	if got := opts.Size; got != "1GiB" {
 		t.Errorf("Size = %q, want 1GiB", got)
 	}
 	if got := opts.Runtime; got != 3*time.Minute {
 		t.Errorf("Runtime = %v, want 3m", got)
 	}
 	if got := opts.BS; got != "4k" {
 		t.Errorf("BS = %q, want 4k", got)
 	}
 	if got := opts.RW; got != "randrw" {
 		t.Errorf("RW = %q, want randrw", got)
 	}
 	if got := opts.Verify; got != "md5" {
 		t.Errorf("Verify = %q, want md5", got)
 	}
 }
 // TestResolveFioOpts_FullDiskOverride confirms the deep/soak shape
 // round-trips. FioTime as 2h overrides the 3-minute default.
 func TestResolveFioOpts_FullDiskOverride(t *testing.T) {
 	k := StorageKnobs{
 		Mode:    "full_disk",
 		FioTime: 2 * time.Hour,
 		FioBS:   "64k",
 		FioRW:   "write",
 	}
 	o := resolveFioOpts(k)
 	if o.Mode != "full_disk" {
 		t.Errorf("Mode = %q, want full_disk", o.Mode)
 	}
 	if o.Runtime != 2*time.Hour {
 		t.Errorf("Runtime = %v, want 2h", o.Runtime)
 	}
 	if o.BS != "64k" {
 		t.Errorf("BS = %q, want 64k", o.BS)
 	}
 	if o.RW != "write" {
 		t.Errorf("RW = %q, want write", o.RW)
 	}
 	// Verify should fall back to md5 default since knob was empty.
 	if o.Verify != "md5" {
 		t.Errorf("Verify = %q, want md5 (default)", o.Verify)
 	}
 }
@@ -60,6 +60,8 @@ func main() {
 	artifactStore := &store.Artifacts{DB: conn}
 	specDiffStore := &store.SpecDiffs{DB: conn}
 	measurementStore := &store.Measurements{DB: conn}
 	thresholdStore := &store.Thresholds{DB: conn}
 	firmwareStore := &store.Firmware{DB: conn}
 	hub := events.NewHub()
@@ -105,6 +107,8 @@ func main() {
 		SubSteps:   subStepStore,
 		SpecDiffs:  specDiffStore,
 		Artifacts:  artifactStore,
 		Thresholds: thresholdStore,
 		Profiles:   cfg.Profiles,
 		EventHub:   hub,
 		Logs:       logHub,
 		Runner:     runner,
@@ -157,6 +161,9 @@ func main() {
 		Artifacts:       artifactStore,
 		SpecDiffs:       specDiffStore,
 		Measurements:    measurementStore,
 		Thresholds:      thresholdStore,
 		Firmware:        firmwareStore,
 		Profiles:        cfg.Profiles,
 		Runner:          runner,
 		EventHub:        hub,
 		Logs:            logHub,
@@ -85,3 +85,54 @@ agent:
 notifiers: []
 routes: []
 # Vetting pipeline shared defaults. Every profile (quick/deep/soak)
 # walks the same stage list; only per-stage durations differ.
 # Thresholds here apply to every profile — a 92°C CPU fails a
 # 2-minute quick run and a 12-hour soak run alike.
 vetting:
  stages: [Inventory, SpecValidate, SMART, CPUStress, Storage, Network, GPU, PSU, Reporting]
  thresholds:
    - { stage: "*",       kind: temp,        key: "cpu/*",           op: lt,         value: 92,   unit: C, severity: critical }
    - { stage: PSU,       kind: psu_volt,    key: "+12V",            op: within_pct, value: 5,  nominal: 12.0, severity: critical }
    - { stage: PSU,       kind: psu_volt,    key: "+5V",             op: within_pct, value: 5,  nominal: 5.0,  severity: critical }
    - { stage: PSU,       kind: psu_volt,    key: "+3.3V",           op: within_pct, value: 5,  nominal: 3.3,  severity: critical }
    - { stage: Storage,   kind: fio_p99_us,  key: "*",               op: lt,         value: 50000,                 severity: warning }
    - { stage: Network,   kind: iperf,       key: throughput_mbps,   op: gte,        value: 900,                   severity: critical }
    - { stage: Network,   kind: nic_retrans, key: "*/rate",          op: lt,         value: 0.001,                 severity: warning }
    - { stage: CPUStress, kind: edac_ue,     key: "*",               op: lte,        value: 0,                     severity: critical }
    - { stage: CPUStress, kind: mce,         key: "*",               op: lte,        value: 0,                     severity: critical }
 # Per-profile durations + probe knobs. Only the *durations* scale across
 # profiles — every profile exercises every probe and gate. Quick is a
 # ~10-minute same-day sanity check; deep is the 8–12 h overnight run;
 # soak is the opt-in 36–40 h extreme run.
 profiles:
  quick:
    stage_timeouts:
      CPUStress: 5m
      Storage:   5m
      Network:   2m
    defaults:
      cpustress: { cpu_pass: 2m, mem_pass: 2m, edac_poll: 10s }
      storage:   { mode: fio_sample, fio_size: 1GiB, fio_time: 3m, fio_bs: 4k, fio_rw: randrw, verify: md5 }
      network:   { duration: 60s }
  deep:
    stage_timeouts:
      CPUStress: 2h
      Storage:   4h
      Network:   35m
    defaults:
      cpustress: { cpu_pass: 60m, mem_pass: 60m, edac_poll: 10s }
      storage:   { mode: full_disk, fio_time: 2h, fio_bs: 4k, fio_rw: randrw, verify: md5 }
      network:   { duration: 30m }
  soak:
    inherit: deep
    stage_timeouts:
      CPUStress: 14h
      Storage:   8h
      Network:   2h30m
    defaults:
      cpustress: { cpu_pass: 12h }
      storage:   { mode: full_disk, fio_time: 6h }
      network:   { duration: 2h }
@@ -75,3 +75,41 @@ agent:
 notifiers: []
 routes: []
 # Vetting pipeline shared defaults. Every profile (quick/deep/soak)
 # walks the same stage list; only per-stage durations differ.
 # Thresholds apply to every profile — critical breaches fail a run
 # regardless of which profile the operator picked.
 vetting:
  stages: [Inventory, SpecValidate, SMART, CPUStress, Storage, Network, GPU, PSU, Reporting]
  thresholds:
    - { stage: "*",       kind: temp,        key: "cpu/*",           op: lt,         value: 92,   unit: C, severity: critical }
    - { stage: PSU,       kind: psu_volt,    key: "+12V",            op: within_pct, value: 5,  nominal: 12.0, severity: critical }
    - { stage: PSU,       kind: psu_volt,    key: "+5V",             op: within_pct, value: 5,  nominal: 5.0,  severity: critical }
    - { stage: PSU,       kind: psu_volt,    key: "+3.3V",           op: within_pct, value: 5,  nominal: 3.3,  severity: critical }
    - { stage: Storage,   kind: fio_p99_us,  key: "*",               op: lt,         value: 50000,                 severity: warning }
    - { stage: Network,   kind: iperf,       key: throughput_mbps,   op: gte,        value: 900,                   severity: critical }
    - { stage: Network,   kind: nic_retrans, key: "*/rate",          op: lt,         value: 0.001,                 severity: warning }
    - { stage: CPUStress, kind: edac_ue,     key: "*",               op: lte,        value: 0,                     severity: critical }
    - { stage: CPUStress, kind: mce,         key: "*",               op: lte,        value: 0,                     severity: critical }
 profiles:
  quick:
    stage_timeouts: { CPUStress: 5m, Storage: 5m, Network: 2m }
    defaults:
      cpustress: { cpu_pass: 2m, mem_pass: 2m, edac_poll: 10s }
      storage:   { mode: fio_sample, fio_size: 1GiB, fio_time: 3m, fio_bs: 4k, fio_rw: randrw, verify: md5 }
      network:   { duration: 60s }
  deep:
    stage_timeouts: { CPUStress: 2h, Storage: 4h, Network: 35m }
    defaults:
      cpustress: { cpu_pass: 60m, mem_pass: 60m, edac_poll: 10s }
      storage:   { mode: full_disk, fio_time: 2h, fio_bs: 4k, fio_rw: randrw, verify: md5 }
      network:   { duration: 30m }
  soak:
    inherit: deep
    stage_timeouts: { CPUStress: 14h, Storage: 8h, Network: 2h30m }
    defaults:
      cpustress: { cpu_pass: 12h }
      storage:   { mode: full_disk, fio_time: 6h }
      network:   { duration: 2h }
@@ -19,6 +19,7 @@ import (
 	"github.com/go-chi/chi/v5"
 	"vetting/internal/config"
 	"vetting/internal/events"
 	"vetting/internal/hold"
 	"vetting/internal/logs"
@@ -41,6 +42,9 @@ type Agent struct {
 	Artifacts       *store.Artifacts
 	SpecDiffs       *store.SpecDiffs
 	Measurements    *store.Measurements
 	Thresholds      *store.Thresholds // Phase 1: seeded per run; consulted on each /sensor batch
 	Firmware        *store.Firmware   // Phase 4: firmware snapshots (unused before then)
 	Profiles        *config.ProfileRegistry // Phase 2: /claim resolves the run's profile → stage knobs
 	Runner          *orchestrator.Runner
 	EventHub        *events.Hub
 	Logs            *logs.Hub
@@ -216,6 +220,21 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
 	if iperfPort == 0 {
 		iperfPort = 5201
 	}
 	// Resolve the run's profile → agent-visible stage knobs. The agent
 	// reads these to size CPUStress / Storage / Network work. An empty
 	// profile (legacy runs seeded before Phase 1) falls back to "quick".
 	profileName := run.Profile
 	if profileName == "" {
 		profileName = config.ProfileQuick
 	}
 	var stageCfg config.StageConfig
 	if a.Profiles != nil {
 		stageCfg = a.Profiles.ResolveStageConfig(profileName)
 	} else {
 		stageCfg = config.StageConfig{Profile: profileName}
 	}
 	writeJSON(w, http.StatusOK, map[string]any{
 		"ok":              true,
 		"run_id":          runID,
@@ -224,6 +243,7 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
 		"iperf_port":      iperfPort,
 		"non_destructive": run.NonDestructive,
 		"current_state":   string(currentState),
 		"stage_config":    stageCfg,
 	})
 }
@@ -398,10 +418,24 @@ type StageResult struct {
 	Passed    bool                `json:"passed"`
 	Summary   json.RawMessage     `json:"summary,omitempty"`
 	Inventory *spec.Inventory     `json:"inventory,omitempty"`
 	Firmware  []FirmwareLine      `json:"firmware,omitempty"`
 	Message   string              `json:"message,omitempty"`
 	SubSteps  []SubStepResultLine `json:"sub_steps,omitempty"`
 }
 // FirmwareLine is a single firmware snapshot POSTed alongside the
 // Firmware stage's /result body. Mirrors agent/probes.FirmwareSnapshot.
 // The server converts each line to a store.FirmwareSnapshot and persists
 // it under the run — SpecValidate reads these back to diff against the
 // host's expected_firmware.
 type FirmwareLine struct {
 	// Component, Identifier and Version are the three fields carried
 	// into spec.FirmwareObserved when SpecValidate diffs firmware.
 	Component  string            `json:"component"`
 	Identifier string            `json:"identifier"`
 	Version    string            `json:"version"`
 	// Vendor is optional and omitted from the JSON when empty.
 	Vendor     string            `json:"vendor,omitempty"`
 	// Raw is optional free-form key/value detail; when persisted it is
 	// re-encoded as a JSON object string ("{}" if empty).
 	Raw        map[string]string `json:"raw,omitempty"`
 }
 // SubStepResultLine is one entry in StageResult.SubSteps. Ordinal is
 // assigned from slice index server-side; the agent doesn't set it.
 type SubStepResultLine struct {
@@ -476,6 +510,20 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 	// Aggregate threshold gate: flip Passed=false server-side when any
 	// critical breach landed for this stage. The agent's verdict is
 	// advisory — a stage-executor can miss a runaway sample that the
 	// sidecar caught. We check this *before* writing the stage state
 	// so the DB reflects the server-side decision.
 	thresholdDetail := ""
 	if body.Passed {
 		if breached, detail := a.stageHadCriticalBreach(r.Context(), runID, body.Stage); breached {
 			body.Passed = false
 			thresholdDetail = detail
 			a.appendLog(runID, "error", fmt.Sprintf("%s reported passed but %s — flipping to failed", body.Stage, detail))
 		}
 	}
 	stageState := model.StagePassed
 	if !body.Passed {
 		stageState = model.StageFailed
@@ -488,6 +536,9 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
 		http.Error(w, "complete stage: "+err.Error(), http.StatusInternalServerError)
 		return
 	}
 	if thresholdDetail != "" && body.Message == "" {
 		body.Message = thresholdDetail
 	}
 	// Agent-authored sub-steps: persist in slice order (ordinal = index)
 	// and fan out a per-row SSE event each so the detail pane shows them
@@ -502,6 +553,14 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
 		}
 	}
 	// Firmware-specific: persist each snapshot into firmware_snapshots.
 	// SpecValidate reads them back to diff against expected_firmware.
 	if body.Stage == "Firmware" && len(body.Firmware) > 0 {
 		if err := a.persistFirmware(r.Context(), runID, body.Firmware); err != nil {
 			log.Printf("persist firmware run %d: %v", runID, err)
 		}
 	}
 	if !body.Passed {
 		if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
 			log.Printf("set failed stage: %v", err)
@@ -615,6 +674,34 @@ func parseResultTime(s string) *time.Time {
 	return nil
 }
 // persistFirmware converts the agent-reported firmware lines into
 // store.FirmwareSnapshot rows for runID and writes them in one batch.
 // It returns nil without touching the store when a.Firmware is unset
 // (so tests that don't wire it up stay green) or when there are no
 // lines. Each line's Raw map is stored as a JSON object string, with
 // "{}" used when the map is empty or cannot be encoded. The batch
 // write error is returned to the caller, which decides whether it is
 // fatal (Firmware is advisory — SpecValidate is the gate).
 func (a *Agent) persistFirmware(ctx context.Context, runID int64, lines []FirmwareLine) error {
 	if len(lines) == 0 || a.Firmware == nil {
 		return nil
 	}
 	snapshots := make([]store.FirmwareSnapshot, len(lines))
 	for i, line := range lines {
 		rawJSON := "{}"
 		if len(line.Raw) > 0 {
 			if encoded, err := json.Marshal(line.Raw); err == nil {
 				rawJSON = string(encoded)
 			}
 		}
 		snapshots[i] = store.FirmwareSnapshot{
 			RunID:      runID,
 			Component:  line.Component,
 			Identifier: line.Identifier,
 			Version:    line.Version,
 			Vendor:     line.Vendor,
 			RawJSON:    rawJSON,
 		}
 	}
 	return a.Firmware.CreateBatch(ctx, snapshots)
 }
 func (a *Agent) persistInventory(r *http.Request, run *model.Run, inv *spec.Inventory) error {
 	dir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", run.ID))
 	if err := os.MkdirAll(dir, 0o755); err != nil {
@@ -667,6 +754,22 @@ func (a *Agent) resolveSpecValidate(r *http.Request, runID int64) {
 		return
 	}
 	diffs := spec.Diff(expected, inv)
 	if a.Firmware != nil && len(expected.Firmware) > 0 {
 		snaps, err := a.Firmware.ListForRun(r.Context(), runID)
 		if err != nil {
 			log.Printf("specvalidate: list firmware: %v", err)
 		} else {
 			observed := make([]spec.FirmwareObserved, 0, len(snaps))
 			for _, s := range snaps {
 				observed = append(observed, spec.FirmwareObserved{
 					Component:  s.Component,
 					Identifier: s.Identifier,
 					Version:    s.Version,
 				})
 			}
 			diffs = append(diffs, spec.DiffFirmware(expected.Firmware, observed)...)
 		}
 	}
 	if err := a.SpecDiffs.ReplaceForRun(r.Context(), runID, diffs); err != nil {
 		log.Printf("specvalidate: write diffs: %v", err)
 	}
@@ -884,13 +987,17 @@ type SensorSample struct {
 }
 // Sensor persists a batch of numeric samples. The thermal sidecar hits
-// this on a tick; stage executors (iperf, fio) also drop here.
+// this on a tick; stage executors (iperf, fio) also drop here. Each
 // sample is evaluated against the run's seeded thresholds — critical
 // breaches fail the run immediately (thermal runaway, EDAC UE, voltage
 // sag); warning breaches are recorded for the report only.
 func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
 	runID, ok := runIDFromURL(w, r)
 	if !ok {
 		return
 	}
-	if _, ok := a.authenticate(w, r, runID); !ok {
+	run, ok := a.authenticate(w, r, runID)
 	if !ok {
 		return
 	}
 	if a.Measurements == nil {
@@ -903,8 +1010,12 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 	rows := make([]model.Measurement, 0, len(body.Samples))
 	sampleStages := make([]string, 0, len(body.Samples))
 	for _, s := range body.Samples {
 		ts, _ := time.Parse(time.RFC3339Nano, s.TS)
 		if ts.IsZero() {
 			ts = time.Now().UTC()
 		}
 		rows = append(rows, model.Measurement{
 			RunID: runID,
 			TS:    ts,
@@ -913,12 +1024,139 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
 			Value: s.Value,
 			Unit:  s.Unit,
 		})
 		// Stage the sample belongs to drives threshold selector
 		// matching. We use the run's current state — the agent does
 		// not tag samples with a stage.
 		sampleStages = append(sampleStages, orchestrator.StageNameForState(run.State))
 	}
 	if err := a.Measurements.CreateBatch(r.Context(), rows); err != nil {
 		http.Error(w, "write samples: "+err.Error(), http.StatusInternalServerError)
 		return
 	}
-	writeJSON(w, http.StatusOK, map[string]any{"ok": true, "written": len(rows)})
+	critical := a.evaluateSensorBatch(r.Context(), runID, rows, sampleStages)
 	writeJSON(w, http.StatusOK, map[string]any{
 		"ok":          true,
 		"written":     len(rows),
 		"breach":      critical != "",
 		"breach_kind": critical,
 	})
 	if critical != "" {
 		a.failRunOnCriticalBreach(r, run, critical)
 	}
 }
 // evaluateSensorBatch checks every measurement in rows against the
 // thresholds stored for runID and records one evaluation per matching
 // rule. sampleStages is parallel to rows: sampleStages[i] is the stage
 // name used when matching rows[i] against rule selectors.
 //
 // The return value is a short human-readable description of the first
 // critical breach encountered, or "" when nothing critical tripped
 // (all samples passed, or only warning-severity rules were hit). It is
 // also "" when no threshold store is wired, the batch is empty, the
 // run has no rules, or the rules could not be listed (logged). A
 // failure to record the evaluations is logged and does not change the
 // result.
 func (a *Agent) evaluateSensorBatch(ctx context.Context, runID int64, rows []model.Measurement, sampleStages []string) string {
 	if len(rows) == 0 || a.Thresholds == nil {
 		return ""
 	}
 	stored, err := a.Thresholds.ListForRun(ctx, runID)
 	switch {
 	case err != nil:
 		log.Printf("sensor: list thresholds run %d: %v", runID, err)
 		return ""
 	case len(stored) == 0:
 		return ""
 	}
 	// Translate the persisted rules into the evaluator's own type once,
 	// up front, rather than per sample.
 	evalRules := make([]orchestrator.Threshold, len(stored))
 	for i := range stored {
 		evalRules[i] = orchestrator.Threshold{
 			ID:       stored[i].ID,
 			Stage:    stored[i].Stage,
 			Kind:     stored[i].Kind,
 			Key:      stored[i].Key,
 			Op:       orchestrator.ThresholdOp(stored[i].Op),
 			Value:    stored[i].Threshold,
 			Nominal:  stored[i].Nominal,
 			Severity: orchestrator.ThresholdSeverity(stored[i].Severity),
 		}
 	}
 	var firstCritical string
 	recorded := make([]store.ThresholdEvaluation, 0, len(rows))
 	for idx := range rows {
 		row := &rows[idx]
 		probe := orchestrator.Sample{
 			Stage: sampleStages[idx],
 			Kind:  row.Kind,
 			Key:   row.Key,
 			Value: row.Value,
 		}
 		for _, verdict := range orchestrator.Evaluate(probe, evalRules) {
 			recorded = append(recorded, store.ThresholdEvaluation{
 				RunID:       runID,
 				ThresholdID: verdict.Threshold.ID,
 				Stage:       probe.Stage,
 				Kind:        probe.Kind,
 				Key:         probe.Key,
 				TS:          row.TS,
 				Observed:    verdict.Observed,
 				Passed:      verdict.Passed,
 			})
 			// Only the first critical breach is described; later ones
 			// are still recorded above.
 			if firstCritical != "" || !verdict.CriticalBreach() {
 				continue
 			}
 			firstCritical = fmt.Sprintf("%s %s=%g breached %s %g",
 				verdict.Threshold.Kind, probe.Key, verdict.Observed, verdict.Threshold.Op, verdict.Threshold.Value)
 		}
 	}
 	if err := a.Thresholds.RecordBatch(ctx, recorded); err != nil {
 		log.Printf("sensor: record evals run %d: %v", runID, err)
 	}
 	return firstCritical
 }
 // stageHadCriticalBreach reports whether the run has a recorded
 // critical breach that counts against the given stage: one whose
 // recorded stage equals stage, is empty, or is "*". The second return
 // value describes the first such breach and is "" otherwise. Called at
 // /result close so an agent that reports Passed=true is still
 // overridden when a gate tripped. With no threshold store wired, or
 // when the breaches cannot be listed (logged), it reports no breach.
 func (a *Agent) stageHadCriticalBreach(ctx context.Context, runID int64, stage string) (bool, string) {
 	if a.Thresholds == nil {
 		return false, ""
 	}
 	recorded, err := a.Thresholds.CriticalBreaches(ctx, runID)
 	if err != nil {
 		log.Printf("result: list breaches run %d: %v", runID, err)
 		return false, ""
 	}
 	for i := range recorded {
 		switch recorded[i].Stage {
 		case stage, "", "*":
 			return true, fmt.Sprintf("critical threshold breach: %s %s=%g", recorded[i].Kind, recorded[i].Key, recorded[i].Observed)
 		}
 	}
 	return false, ""
 }
 // failRunOnCriticalBreach flips the run to FailedHolding in response
 // to a live threshold breach (thermal runaway, EDAC UE, rail sag).
 // It stamps failed_stage as "<stage> (threshold)", drives the
 // stage-failed transition, then sends a critical notification and
 // appends a log line carrying detail. The agent's pending /result for
 // the current stage may still arrive — the silent-skip guard handles
 // that by refusing to double-transition.
 //
 // run is the snapshot loaded when the request was authenticated. If
 // that snapshot already shows FailedHolding the call is a no-op: the
 // first failure wins, and failed_stage must not be overwritten by
 // breaches that keep arriving after the run has been parked.
 func (a *Agent) failRunOnCriticalBreach(r *http.Request, run *model.Run, detail string) {
 	if run.State == model.StateFailedHolding {
 		// Already parked. Returning here, before SetFailedStage, keeps
 		// the original failed_stage intact.
 		return
 	}
 	stage := orchestrator.StageNameForState(run.State)
 	if stage == "" {
 		stage = "threshold"
 	}
 	if err := a.Runs.SetFailedStage(r.Context(), run.ID, stage+" (threshold)"); err != nil {
 		log.Printf("sensor: set failed stage run %d: %v", run.ID, err)
 	}
 	if _, err := a.Runner.Transition(r.Context(), run.ID, orchestrator.TriggerStageFailed); err != nil {
 		// The run may have been parked concurrently after our snapshot
 		// was read; the transition then errors — that's fine, the
 		// first breach wins and no duplicate notification is sent.
 		log.Printf("sensor: fail-transition run %d: %v", run.ID, err)
 		return
 	}
 	hostName := a.hostNameFor(r.Context(), run.HostID)
 	a.dispatchEvent(notify.Event{
 		Kind:     notify.KindStageFailed,
 		Severity: notify.SeverityCritical,
 		RunID:    run.ID,
 		HostName: hostName,
 		Title:    fmt.Sprintf("[vetting] %s FAILED: %s (threshold)", hostName, stage),
 		Body:     fmt.Sprintf("Run %d on %s tripped a critical threshold during %s: %s", run.ID, hostName, stage, detail),
 		URL:      a.runLinkURL(run.ID),
 	})
 	a.appendLog(run.ID, "error", fmt.Sprintf("threshold breach during %s: %s — run parked in FailedHolding", stage, detail))
 }
 // resolveReporting runs when the pipeline advances into StateReporting.
@@ -956,12 +1194,20 @@ func (a *Agent) resolveReporting(r *http.Request, runID int64) {
 			log.Printf("reporting: list measurements: %v", err)
 		}
 	}
 	var firmware []store.FirmwareSnapshot
 	if a.Firmware != nil {
 		firmware, err = a.Firmware.ListForRun(ctx, runID)
 		if err != nil {
 			log.Printf("reporting: list firmware: %v", err)
 		}
 	}
 	bundle := map[string]any{
 		"run":          run,
 		"host":         host,
 		"stages":       stages,
 		"spec_diffs":   diffs,
 		"measurements": measurements,
 		"firmware":     firmware,
 		"generated_at": time.Now().UTC().Format(time.RFC3339),
 	}
 	buf, err := json.MarshalIndent(bundle, "", "  ")
@@ -993,6 +1239,15 @@ func (a *Agent) resolveReporting(r *http.Request, runID int64) {
 	// Also render the operator-facing HTML summary alongside the JSON.
 	// Failures here are non-fatal — the JSON is the source of truth.
 	if host != nil {
 		fwRows := make([]report.FirmwareSnapshot, 0, len(firmware))
 		for _, f := range firmware {
 			fwRows = append(fwRows, report.FirmwareSnapshot{
 				Component:  f.Component,
 				Identifier: f.Identifier,
 				Version:    f.Version,
 				Vendor:     f.Vendor,
 			})
 		}
 		htmlData := report.Data{
 			GeneratedAt: time.Now().UTC(),
 			Run:         *run,
@@ -1000,6 +1255,7 @@ func (a *Agent) resolveReporting(r *http.Request, runID int64) {
 			Stages:      stages,
 			SpecDiffs:   diffs,
 			Aggregates:  report.AggregateMeasurements(measurements),
 			Firmware:    fwRows,
 		}
 		if htmlBuf, err := report.RenderHTML(htmlData); err != nil {
 			log.Printf("reporting: render html: %v", err)
@@ -108,7 +108,7 @@ func TestRunPage_DefaultStep_Running(t *testing.T) {
 	})
 	runID, _ := runs.Create(ctx, id, "rr", false)
 	_ = ui.Stages.Seed(ctx, runID)
-	for _, name := range []string{"Inventory", "SpecValidate"} {
+	for _, name := range []string{"Inventory", "Firmware", "SpecValidate"} {
 		_ = ui.Stages.StartByName(ctx, runID, name)
 		_ = ui.Stages.CompleteByName(ctx, runID, name, model.StagePassed, "")
 	}
@@ -135,7 +135,7 @@ func TestRunPage_DefaultStep_Failed(t *testing.T) {
 	})
 	runID, _ := runs.Create(ctx, id, "rf", false)
 	_ = ui.Stages.Seed(ctx, runID)
-	for _, name := range []string{"Inventory", "SpecValidate", "SMART"} {
+	for _, name := range []string{"Inventory", "Firmware", "SpecValidate", "SMART"} {
 		_ = ui.Stages.StartByName(ctx, runID, name)
 		_ = ui.Stages.CompleteByName(ctx, runID, name, model.StagePassed, "")
 	}
@@ -0,0 +1,169 @@
 package api_test
 import (
 	"context"
 	"encoding/json"
 	"net/http"
 	"net/http/httptest"
 	"path/filepath"
 	"strconv"
 	"testing"
 	"vetting/internal/api"
 	"vetting/internal/db"
 	"vetting/internal/events"
 	"vetting/internal/model"
 	"vetting/internal/orchestrator"
 	"vetting/internal/store"
 )
 // setupAgentWithThresholds returns an Agent backed by a fresh temp-dir
 // database, wired with the thresholds store and a Runner so the
 // /sensor handler can drive the state machine. It creates one host and
 // one run, seeds the run's stages, parks the run in CPUStress, and
 // seeds a single critical thermal rule (temp cpu/* < 92 C, any stage).
 // Returns the agent, the run ID, and the plaintext run token to use as
 // the bearer credential.
 func setupAgentWithThresholds(t *testing.T) (*api.Agent, int64, string) {
 	t.Helper()
 	ctx := context.Background()
 	conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
 	if err != nil {
 		t.Fatalf("open db: %v", err)
 	}
 	t.Cleanup(func() { _ = conn.Close() })
 	hostStore := &store.Hosts{DB: conn}
 	runStore := &store.Runs{DB: conn}
 	stageStore := &store.Stages{DB: conn}
 	measurementStore := &store.Measurements{DB: conn}
 	thresholdStore := &store.Thresholds{DB: conn}
 	runner := &orchestrator.Runner{
 		Runs:     runStore,
 		Hosts:    hostStore,
 		Stages:   stageStore,
 		EventHub: events.NewHub(),
 	}
 	hostID, err := hostStore.Create(ctx, model.Host{
 		Name:             "thresh-host",
 		MAC:              "aa:bb:cc:dd:ee:aa",
 		WoLBroadcastIP:   "10.0.0.255",
 		WoLPort:          9,
 		ExpectedSpecYAML: "memory:\n  total_gib: 16\n",
 	})
 	if err != nil {
 		t.Fatalf("create host: %v", err)
 	}
 	plainToken, tokenHash, err := orchestrator.IssueRunToken()
 	if err != nil {
 		t.Fatalf("issue token: %v", err)
 	}
 	runID, err := runStore.Create(ctx, hostID, tokenHash, false)
 	if err != nil {
 		t.Fatalf("create run: %v", err)
 	}
 	if err := stageStore.Seed(ctx, runID); err != nil {
 		t.Fatalf("seed stages: %v", err)
 	}
 	// CPUStress is where a real thermal sidecar would be posting samples.
 	if err := runStore.SetState(ctx, runID, model.StateCPUStress); err != nil {
 		t.Fatalf("set state: %v", err)
 	}
 	// The only rule: a critical CPU temperature ceiling.
 	thermalRule := store.ThresholdSpec{
 		Stage: "*", Kind: "temp", Key: "cpu/*", Op: "lt", Value: 92, Unit: "C", Severity: "critical", Source: "profile",
 	}
 	if _, err := thresholdStore.SeedForRun(ctx, runID, []store.ThresholdSpec{thermalRule}); err != nil {
 		t.Fatalf("seed thresholds: %v", err)
 	}
 	agent := &api.Agent{
 		Hosts:        hostStore,
 		Runs:         runStore,
 		Stages:       stageStore,
 		Measurements: measurementStore,
 		Thresholds:   thresholdStore,
 		Runner:       runner,
 	}
 	return agent, runID, plainToken
 }
 // TestSensor_ThermalRunawayFailsRun posts a 95.3 C CPU sample against
 // the seeded critical <92 C rule and expects: the response flags a
 // breach, the run lands in FailedHolding with a non-empty failed_stage,
 // and exactly one evaluation is recorded as not passed. This is the
 // Phase-1 behavior gate — without the evaluator the sample would only
 // sit in measurements and the run would carry on.
 func TestSensor_ThermalRunawayFailsRun(t *testing.T) {
 	agent, runID, bearer := setupAgentWithThresholds(t)
 	ctx := context.Background()
 	payload, _ := json.Marshal(api.SensorBatch{Samples: []api.SensorSample{
 		{Kind: "temp", Key: "cpu/0", Value: 95.3, Unit: "C"},
 	}})
 	target := "/api/v1/runs/" + strconv.FormatInt(runID, 10) + "/sensor"
 	req := routedRequest(runID, http.MethodPost, target, payload)
 	req.Header.Set("Authorization", "Bearer "+bearer)
 	req.Header.Set("Content-Type", "application/json")
 	rec := httptest.NewRecorder()
 	agent.Sensor(rec, req)
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status = %d, body = %q", rec.Code, rec.Body.String())
 	}
 	var resp struct {
 		OK     bool   `json:"ok"`
 		Breach bool   `json:"breach"`
 		Kind   string `json:"breach_kind"`
 	}
 	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
 		t.Fatalf("decode: %v", err)
 	}
 	if !resp.Breach {
 		t.Fatalf("expected breach=true, got %+v", resp)
 	}
 	// The breach must have parked the run and named a failed stage.
 	run, err := agent.Runs.Get(ctx, runID)
 	if err != nil {
 		t.Fatalf("get run: %v", err)
 	}
 	if run.State != model.StateFailedHolding {
 		t.Fatalf("state = %s, want FailedHolding", run.State)
 	}
 	if run.FailedStage == "" {
 		t.Fatalf("failed_stage empty; want stage-named breach")
 	}
 	// One sample against one rule gives one failing evaluation row.
 	evals, err := agent.Thresholds.ListEvaluations(ctx, runID)
 	if err != nil {
 		t.Fatalf("list evaluations: %v", err)
 	}
 	if n := len(evals); n != 1 {
 		t.Fatalf("want 1 evaluation recorded, got %d", n)
 	}
 	if evals[0].Passed {
 		t.Fatalf("evaluation recorded as passed for 95.3C sample against <92C rule")
 	}
 }
 // TestSensor_WithinThresholdPasses posts a 55 C CPU sample, well under
 // the seeded <92 C rule, and expects a single evaluation recorded as
 // passed with the run still in CPUStress.
 func TestSensor_WithinThresholdPasses(t *testing.T) {
 	agent, runID, bearer := setupAgentWithThresholds(t)
 	ctx := context.Background()
 	payload, _ := json.Marshal(api.SensorBatch{Samples: []api.SensorSample{
 		{Kind: "temp", Key: "cpu/0", Value: 55.0, Unit: "C"},
 	}})
 	target := "/api/v1/runs/" + strconv.FormatInt(runID, 10) + "/sensor"
 	req := routedRequest(runID, http.MethodPost, target, payload)
 	req.Header.Set("Authorization", "Bearer "+bearer)
 	req.Header.Set("Content-Type", "application/json")
 	rec := httptest.NewRecorder()
 	agent.Sensor(rec, req)
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status = %d, body = %q", rec.Code, rec.Body.String())
 	}
 	// A passing sample must leave the state machine where it was.
 	run, err := agent.Runs.Get(ctx, runID)
 	if err != nil {
 		t.Fatalf("get run: %v", err)
 	}
 	if run.State != model.StateCPUStress {
 		t.Fatalf("state = %s, want CPUStress unchanged", run.State)
 	}
 	evals, err := agent.Thresholds.ListEvaluations(ctx, runID)
 	if err != nil {
 		t.Fatalf("list evaluations: %v", err)
 	}
 	if onePassed := len(evals) == 1 && evals[0].Passed; !onePassed {
 		t.Fatalf("want 1 passed evaluation, got %+v", evals)
 	}
 }
@@ -75,6 +75,12 @@ func newCaptureRegistry(c *captureNotifier) *notify.Registry {
 // (agent, runID, plainTokenForBearer). Caller is responsible for
 // transitioning the run out of Queued.
 func fullAgent(t *testing.T) (*api.Agent, int64, string) {
 	// Delegates to fullAgentWithSpec with an empty expected-spec YAML;
 	// all of the actual wiring lives there.
 	return fullAgentWithSpec(t, "")
 }
 // fullAgentWithSpec is the same as fullAgent but seeds the host with
 // an ExpectedSpecYAML so SpecValidate can pick up diffs in the test.
 func fullAgentWithSpec(t *testing.T, expectedSpecYAML string) (*api.Agent, int64, string) {
 	t.Helper()
 	tmp := t.TempDir()
 	conn, err := db.Open(filepath.Join(tmp, "vetting.db"))
@@ -89,6 +95,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
 	artifactStore := &store.Artifacts{DB: conn}
 	specDiffStore := &store.SpecDiffs{DB: conn}
 	measurementStore := &store.Measurements{DB: conn}
 	firmwareStore := &store.Firmware{DB: conn}
 	hub := events.NewHub()
 	logHub, err := logs.NewHub(filepath.Join(tmp, "logs"), hub)
@@ -109,7 +116,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
 		MAC:              "aa:bb:cc:dd:ee:10",
 		WoLBroadcastIP:   "10.0.0.255",
 		WoLPort:          9,
-		ExpectedSpecYAML: "", // empty spec → no diffs
+		ExpectedSpecYAML: expectedSpecYAML,
 	})
 	if err != nil {
 		t.Fatalf("create host: %v", err)
@@ -132,6 +139,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
 		Artifacts:    artifactStore,
 		SpecDiffs:    specDiffStore,
 		Measurements: measurementStore,
 		Firmware:     firmwareStore,
 		Runner:       runner,
 		EventHub:     hub,
 		Logs:         logHub,
@@ -195,20 +203,24 @@ func TestFullPipelineToCompleted(t *testing.T) {
 		Memory: spec.MemorySpec{TotalGiB: 16},
 	}
 	next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv})
-	// After Inventory → SpecValidate resolves inline → SMART
+	// After Inventory → Firmware
-	if next != "SMART" {
+	if next != "Firmware" {
-		t.Fatalf("after Inventory, next_state = %q, want SMART", next)
+		t.Fatalf("after Inventory, next_state = %q, want Firmware", next)
 	}
-	// The remaining stages advance one-for-one in order.
+	// The remaining stages advance one-for-one in order. After Firmware
 	// the inline SpecValidate resolver advances through SpecValidate to
 	// SMART without a dedicated /result POST for SpecValidate.
 	walkPlan := []struct {
 		stage    string
 		expected string
 	}{
 		{"Firmware", "SMART"},
 		{"SMART", "CPUStress"},
 		{"CPUStress", "Storage"},
 		{"Storage", "Network"},
-		{"Network", "GPU"},
+		{"Network", "Burn"},
 		{"Burn", "GPU"},
 		{"GPU", "PSU"},
 		{"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed
 	}
@@ -287,8 +299,11 @@ func TestFaultInjectionSMART(t *testing.T) {
 	}
 	inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
-	if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "SMART" {
+	if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "Firmware" {
-		t.Fatalf("after Inventory, next = %q want SMART", next)
+		t.Fatalf("after Inventory, next = %q want Firmware", next)
 	}
 	if next := walkStage(t, a, runID, token, "Firmware", true, nil); next != "SMART" {
 		t.Fatalf("after Firmware, next = %q want SMART (inline SpecValidate)", next)
 	}
 	// Fake SMART failure → expect FailedHolding.
@@ -316,3 +331,76 @@ func TestFaultInjectionSMART(t *testing.T) {
 		t.Errorf("StageFailed severity = %q, want critical", ev.Severity)
 	}
 }
 // TestFirmwarePersistAndSpecMismatch walks a run through Inventory and
 // Firmware with a host spec that pins a BIOS version the agent does not
 // report. It checks that the Firmware result is persisted, that the
 // inline SpecValidate step parks the run in FailedHolding with
 // FailedStage=SpecValidate, and that a critical firmware[...] spec diff
 // was recorded.
 func TestFirmwarePersistAndSpecMismatch(t *testing.T) {
 	ctx := context.Background()
 	// The host spec demands BIOS 3.3; the agent reports 3.2 below.
 	const expectedSpec = "firmware:\n  - component: bios\n    version: \"3.3\"\n"
 	a, runID, token := fullAgentWithSpec(t, expectedSpec)
 	a.Notify = newCaptureRegistry(&captureNotifier{name: "capture"})
 	if err := a.Runs.SetState(ctx, runID, model.StateInventoryCheck); err != nil {
 		t.Fatalf("set state: %v", err)
 	}
 	inventory := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
 	afterInventory := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inventory})
 	if afterInventory != "Firmware" {
 		t.Fatalf("after Inventory, next = %q want Firmware", afterInventory)
 	}
 	// Firmware stage: a single BIOS snapshot carrying the mismatching version.
 	reported := map[string]any{"component": "bios", "identifier": "system", "version": "3.2", "vendor": "AMI"}
 	afterFirmware := walkStage(t, a, runID, token, "Firmware", true,
 		map[string]any{"firmware": []map[string]any{reported}})
 	// No SpecValidate POST is made by this test; the transition to
 	// FailedHolding has to come out of the Firmware result itself.
 	if afterFirmware != "FailedHolding" {
 		t.Fatalf("after Firmware mismatch, next = %q want FailedHolding", afterFirmware)
 	}
 	run, err := a.Runs.Get(ctx, runID)
 	if err != nil {
 		t.Fatalf("get run: %v", err)
 	}
 	if run.State != model.StateFailedHolding {
 		t.Fatalf("run.State = %q, want FailedHolding", run.State)
 	}
 	if run.FailedStage != "SpecValidate" {
 		t.Fatalf("run.FailedStage = %q, want SpecValidate", run.FailedStage)
 	}
 	// Persistence: exactly one snapshot row, holding what the agent sent.
 	snaps, err := a.Firmware.ListForRun(ctx, runID)
 	if err != nil {
 		t.Fatalf("ListForRun firmware: %v", err)
 	}
 	if n := len(snaps); n != 1 {
 		t.Fatalf("firmware rows = %d, want 1: %+v", n, snaps)
 	}
 	if matches := snaps[0].Component == "bios" && snaps[0].Version == "3.2"; !matches {
 		t.Errorf("persisted snapshot = %+v", snaps[0])
 	}
 	// Diff rows: at least one firmware[...] entry must exist, and every
 	// such entry must be critical.
 	diffs, err := a.SpecDiffs.ListForRun(ctx, runID)
 	if err != nil {
 		t.Fatalf("ListForRun specdiffs: %v", err)
 	}
 	firmwareDiffs := 0
 	for i := range diffs {
 		if !strings.HasPrefix(diffs[i].Field, "firmware[") {
 			continue
 		}
 		firmwareDiffs++
 		if diffs[i].Severity != "critical" {
 			t.Errorf("firmware diff severity = %q, want critical", diffs[i].Severity)
 		}
 	}
 	if firmwareDiffs == 0 {
 		t.Fatalf("no firmware[...] entry in spec diffs: %+v", diffs)
 	}
 }
@@ -16,6 +16,7 @@ import (
 	"github.com/go-chi/chi/v5"
 	"gopkg.in/yaml.v3"
 	"vetting/internal/config"
 	"vetting/internal/events"
 	"vetting/internal/logs"
 	"vetting/internal/model"
@@ -32,6 +33,8 @@ type UI struct {
 	SubSteps   *store.SubSteps
 	SpecDiffs  *store.SpecDiffs
 	Artifacts  *store.Artifacts
 	Thresholds *store.Thresholds // Phase 1: seeded at StartRun from Profiles
 	Profiles   *config.ProfileRegistry
 	EventHub   *events.Hub
 	Logs       *logs.Hub
 	Runner     *orchestrator.Runner
@@ -316,23 +319,71 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
 	}
 	nonDestructive := r.PostFormValue("non_destructive") == "1"
 	profile := strings.TrimSpace(r.PostFormValue("profile"))
 	if profile == "" {
 		profile = config.ProfileQuick
 	}
 	if !config.IsValidProfile(profile) {
 		http.Error(w, "unknown profile: "+profile, http.StatusBadRequest)
 		return
 	}
 	_, hash, err := orchestrator.IssueRunToken()
 	if err != nil {
 		http.Error(w, "token: "+err.Error(), http.StatusInternalServerError)
 		return
 	}
-	runID, err := u.Runs.Create(r.Context(), hostID, hash, nonDestructive)
+	runID, err := u.Runs.CreateWithProfile(r.Context(), hostID, hash, nonDestructive, profile)
 	if err != nil {
 		http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError)
 		return
 	}
-	log.Printf("ui: created run %d for host %d (state=Queued)", runID, hostID)
+	if err := u.seedThresholds(r.Context(), runID, host, profile); err != nil {
 		// A threshold-seed failure shouldn't orphan a run row — log
 		// and continue. Samples will just accumulate without a gate
 		// until the operator retries, same as before Phase 1.
 		log.Printf("ui: seed thresholds run %d: %v", runID, err)
 	}
 	log.Printf("ui: created run %d for host %d profile=%s (state=Queued)", runID, hostID, profile)
 	// Send the operator straight to the new run — the button they clicked
 	// was "Start vetting", the thing they want next is to watch it.
 	http.Redirect(w, r, fmt.Sprintf("/runs/%d", runID), http.StatusSeeOther)
 }
 // seedThresholds copies the shared vetting threshold defaults from the
 // ProfileRegistry into the per-run threshold table, tagging every row
 // with source "profile". The host and profile arguments are accepted
 // but not consulted yet; they are placeholders for the per-host and
 // per-profile override layers.
 //
 // It is a no-op returning nil when either Thresholds or Profiles is not
 // wired, or when no defaults are declared. Otherwise it returns whatever
 // error SeedForRun reports.
 func (u *UI) seedThresholds(ctx context.Context, runID int64, host *model.Host, profile string) error {
 	// Reserved for the per-host / per-profile override layers.
 	_, _ = host, profile
 	if u.Thresholds == nil || u.Profiles == nil {
 		return nil
 	}
 	shared := u.Profiles.Vetting.Thresholds
 	if len(shared) == 0 {
 		return nil
 	}
 	specs := make([]store.ThresholdSpec, len(shared))
 	for i, def := range shared {
 		specs[i] = store.ThresholdSpec{
 			Stage:    def.Stage,
 			Kind:     def.Kind,
 			Key:      def.Key,
 			Op:       def.Op,
 			Value:    def.Value,
 			Nominal:  def.Nominal,
 			Unit:     def.Unit,
 			Severity: def.Severity,
 			Source:   "profile",
 		}
 	}
 	_, err := u.Thresholds.SeedForRun(ctx, runID, specs)
 	return err
 }
 func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) {
 	_ = templates.Registration(templates.RegistrationForm{
 		QuickRegisterURL: u.baseURL(r),
@@ -20,6 +20,13 @@ type Config struct {
 	Agent      Agent      `yaml:"agent"`
 	Notifiers  []Notifier `yaml:"notifiers"`
 	Routes     []Route    `yaml:"routes"`
 	// Profiles holds the Phase-1 quick/deep/soak registry (stage order,
 	// threshold defaults, per-profile stage timeouts + probe knobs).
 	// Populated from the `vetting:` and `profiles:` top-level blocks
 	// during Load. Nil is never returned — Load installs a default
 	// registry when those blocks are absent.
 	Profiles *ProfileRegistry `yaml:"-"`
 }
 type Server struct {
@@ -111,6 +118,20 @@ func Load(path string) (*Config, error) {
 	if err := yaml.Unmarshal(b, &c); err != nil {
 		return nil, fmt.Errorf("parse config: %w", err)
 	}
 	// The `vetting:` + `profiles:` blocks live alongside the existing
 	// fields but we decode them into the raw shape because YAML
 	// durations arrive as strings. Reusing the same byte buffer is
 	// safe: yaml.Unmarshal is happy to ignore keys the target doesn't
 	// know about.
 	var rawProfiles rawProfilesBlock
 	if err := yaml.Unmarshal(b, &rawProfiles); err != nil {
 		return nil, fmt.Errorf("parse profiles: %w", err)
 	}
 	reg, err := buildProfileRegistry(rawProfiles)
 	if err != nil {
 		return nil, fmt.Errorf("profiles: %w", err)
 	}
 	c.Profiles = reg
 	if c.Server.Bind == "" {
 		c.Server.Bind = "127.0.0.1:8080"
 	}
@@ -0,0 +1,441 @@
 package config
 import (
 	"fmt"
 	"sort"
 	"strconv"
 	"strings"
 	"time"
 )
 // Profile identifiers: the legal values for a Run's profile column.
 // Declared as constants so callers (UI handler, tests, agent) share one
 // spelling instead of repeating string literals.
 const (
 	ProfileQuick = "quick"
 	ProfileDeep  = "deep"
 	ProfileSoak  = "soak"
 )
 // AllProfiles lists the known profiles in picker order: the default
 // comes first and the longest-running profile last.
 var AllProfiles = []string{ProfileQuick, ProfileDeep, ProfileSoak}
 // IsValidProfile reports whether name exactly matches one of the
 // entries in AllProfiles. The comparison is case-sensitive and does no
 // trimming, so callers normalize input before asking.
 func IsValidProfile(name string) bool {
 	known := false
 	for i := 0; i < len(AllProfiles) && !known; i++ {
 		known = AllProfiles[i] == name
 	}
 	return known
 }
 // Vetting holds the stage order + threshold defaults that are shared
 // across all profiles. Only the per-stage durations/concurrency differ
 // between quick/deep/soak; gates like "CPU > 92C fails the run" apply
 // to a 2-minute quick run and a 12-hour soak alike.
 //
 // Stages is decoded from the YAML `stages` key and Thresholds from the
 // YAML `thresholds` key of the `vetting:` block.
 type Vetting struct {
 	Stages     []string            `yaml:"stages"`
 	Thresholds []ThresholdDefaults `yaml:"thresholds"`
 }
 // ThresholdDefaults is the YAML shape of a threshold declaration. One
 // stanza can declare a per-stage rule ("stage: Network") or a global
 // rule ("stage: *") — the threshold evaluator applies both to samples
 // with matching (stage, kind, key).
 //
 // Value is the number the observed sample is compared against using Op;
 // Nominal is only meaningful for within_pct. Unit is a free-form label
 // carried alongside the rule.
 // NOTE(review): Key presumably accepts "*" and simple glob forms like
 // the thresholds.key column it is seeded into — confirm against the
 // evaluator.
 type ThresholdDefaults struct {
 	Stage    string  `yaml:"stage"`
 	Kind     string  `yaml:"kind"`
 	Key      string  `yaml:"key"`
 	Op       string  `yaml:"op"`       // lt|lte|gt|gte|within_pct
 	Value    float64 `yaml:"value"`
 	Nominal  float64 `yaml:"nominal"`  // only used by within_pct (e.g. 12.0 for +12V rail)
 	Unit     string  `yaml:"unit"`
 	Severity string  `yaml:"severity"` // critical|warning
 }
 // ProfileRegistry is the in-memory view of the `profiles:` block in
 // vetting.yaml. The orchestrator queries it at run creation time to
 // seed thresholds and (in Phase 3+) to scale per-stage durations.
 //
 // Several methods dereference the receiver without a nil check
 // (Lookup, Names, Stages); only ResolveStageConfig tolerates a nil
 // *ProfileRegistry.
 type ProfileRegistry struct {
 	// Shared stage ordering + threshold defaults. Every profile walks
 	// the same list; only durations/concurrency differ.
 	Vetting Vetting
 	// Profiles is keyed by name ("quick"/"deep"/"soak"). Inherit is
 	// already resolved at load time — a caller sees a flattened view.
 	Profiles map[string]Profile
 }
 // Profile is a loaded profile. StageTimeouts is keyed by stage name.
 // Defaults carries the free-form knobs each probe reads.
 //
 // Defaults is keyed first by knob group ("cpustress", "storage",
 // "network", "burn") and then by knob name; values keep whatever type
 // the YAML decoder produced. Inherit records the parent profile named
 // in the config, or "" when there was none; by the time a Profile is
 // built the parent's entries have already been merged in.
 type Profile struct {
 	Name          string
 	Inherit       string
 	StageTimeouts map[string]time.Duration
 	Defaults      map[string]map[string]any
 }
 // StageConfig is the flat view of a profile's knobs, shipped on the
 // claim response so the agent can size CPUStress/Storage/Network/Burn
 // work without parsing YAML. Empty values mean "fall back to the
 // agent's compile-time default" — an older orchestrator that doesn't
 // set these fields keeps working unchanged.
 //
 // StageTimeouts carries each timeout as a duration string keyed by
 // stage name, and is omitted from the JSON when empty. The four knob
 // structs are always present as JSON objects; their individual fields
 // are omitted when empty.
 type StageConfig struct {
 	Profile       string            `json:"profile"`
 	StageTimeouts map[string]string `json:"stage_timeouts,omitempty"`
 	CPUStress     CPUStressKnobs    `json:"cpustress"`
 	Storage       StorageKnobs      `json:"storage"`
 	Network       NetworkKnobs      `json:"network"`
 	Burn          BurnKnobs         `json:"burn"`
 }
 // CPUStressKnobs parallels the `cpustress:` block under `profiles.<name>.defaults`.
 // Durations are YAML duration strings ("2m", "60m", "12h").
 // Each field is passed through as text and omitted from the JSON when
 // empty; nothing here parses or validates the duration.
 type CPUStressKnobs struct {
 	CPUPass  string `json:"cpu_pass,omitempty"`
 	MemPass  string `json:"mem_pass,omitempty"`
 	EDACPoll string `json:"edac_poll,omitempty"`
 }
 // StorageKnobs parallels `storage:` defaults. Mode is "fio_sample" (quick)
 // or "full_disk" (deep/soak). Verify names the integrity mode ("md5" or "").
 // All fields are free-form strings taken from the profile and omitted
 // from the JSON when empty; the built-in defaults use values such as
 // "1GiB" (FioSize), "3m" (FioTime), "4k" (FioBS) and "randrw" (FioRW).
 type StorageKnobs struct {
 	Mode    string `json:"mode,omitempty"`
 	FioSize string `json:"fio_size,omitempty"`
 	FioTime string `json:"fio_time,omitempty"`
 	FioBS   string `json:"fio_bs,omitempty"`
 	FioRW   string `json:"fio_rw,omitempty"`
 	Verify  string `json:"verify,omitempty"`
 }
 // NetworkKnobs parallels `network:` defaults. Duration is a YAML string.
 // It is forwarded as written (e.g. "60s", "30m") and omitted from the
 // JSON when empty.
 type NetworkKnobs struct {
 	Duration string `json:"duration,omitempty"`
 }
 // BurnKnobs parallels `burn:` defaults. Duration is the total Burn window.
 // CPUWorkers is "all" (agent picks runtime.NumCPU) or a numeric string.
 // MemPct is a percentage of MemAvailable to stress. FioOnSpare gates
 // whether fio runs inside Burn (set false if operator lacks a spare
 // partition). IperfParallel is the parallel stream count fed to iperf3 -P.
 //
 // NOTE(review): every field is tagged omitempty, so FioOnSpare=false,
 // MemPct=0 and IperfParallel=0 are dropped from the JSON and the
 // receiver cannot tell an explicit false/0 from an unset knob. Confirm
 // that the agent's fallback for a missing fio_on_spare is not "true".
 type BurnKnobs struct {
 	Duration      string `json:"duration,omitempty"`
 	CPUWorkers    string `json:"cpu_workers,omitempty"`
 	MemPct        int    `json:"mem_pct,omitempty"`
 	FioOnSpare    bool   `json:"fio_on_spare,omitempty"`
 	IperfParallel int    `json:"iperf_parallel,omitempty"`
 }
 // ResolveStageConfig builds the wire-format StageConfig for the named
 // profile. A nil registry or a failed Lookup yields a StageConfig that
 // carries only the requested name, leaving every knob at its zero value
 // so the agent falls back to its own defaults. Knobs missing from the
 // profile likewise come out as "", 0 or false.
 func (pr *ProfileRegistry) ResolveStageConfig(name string) StageConfig {
 	nameOnly := StageConfig{Profile: name}
 	if pr == nil {
 		return nameOnly
 	}
 	prof, err := pr.Lookup(name)
 	if err != nil {
 		return nameOnly
 	}
 	// Leave the map nil when there are no timeouts so omitempty drops it.
 	var timeouts map[string]string
 	if n := len(prof.StageTimeouts); n > 0 {
 		timeouts = make(map[string]string, n)
 		for stage, limit := range prof.StageTimeouts {
 			timeouts[stage] = limit.String()
 		}
 	}
 	cpuKnobs := prof.Defaults["cpustress"]
 	diskKnobs := prof.Defaults["storage"]
 	netKnobs := prof.Defaults["network"]
 	burnKnobs := prof.Defaults["burn"]
 	return StageConfig{
 		Profile:       prof.Name,
 		StageTimeouts: timeouts,
 		CPUStress: CPUStressKnobs{
 			CPUPass:  yamlString(cpuKnobs, "cpu_pass"),
 			MemPass:  yamlString(cpuKnobs, "mem_pass"),
 			EDACPoll: yamlString(cpuKnobs, "edac_poll"),
 		},
 		Storage: StorageKnobs{
 			Mode:    yamlString(diskKnobs, "mode"),
 			FioSize: yamlString(diskKnobs, "fio_size"),
 			FioTime: yamlString(diskKnobs, "fio_time"),
 			FioBS:   yamlString(diskKnobs, "fio_bs"),
 			FioRW:   yamlString(diskKnobs, "fio_rw"),
 			Verify:  yamlString(diskKnobs, "verify"),
 		},
 		Network: NetworkKnobs{
 			Duration: yamlString(netKnobs, "duration"),
 		},
 		Burn: BurnKnobs{
 			Duration:      yamlString(burnKnobs, "duration"),
 			CPUWorkers:    yamlString(burnKnobs, "cpu_workers"),
 			MemPct:        yamlInt(burnKnobs, "mem_pct"),
 			FioOnSpare:    yamlBool(burnKnobs, "fio_on_spare"),
 			IperfParallel: yamlInt(burnKnobs, "iperf_parallel"),
 		},
 	}
 }
 // yamlInt coerces a map[string]any entry to int. Accepts native int,
 // int64, float64 (JSON numbers round-trip as float; the fractional part
 // is truncated), or a string holding a base-10 integer with optional
 // surrounding whitespace. A missing key, a nil value, an unsupported
 // type, or a string that is not entirely an integer ("50%", "12abc",
 // "2.5", "") returns 0 so the agent falls back to its default.
 func yamlInt(m map[string]any, key string) int {
 	v, ok := m[key]
 	if !ok || v == nil {
 		return 0
 	}
 	switch x := v.(type) {
 	case int:
 		return x
 	case int64:
 		return int(x)
 	case float64:
 		return int(x)
 	case string:
 		// Atoi rejects trailing garbage, unlike Sscanf("%d"), which
 		// silently accepts a numeric prefix and would turn a typo such
 		// as "50%" into 50 instead of the documented zero.
 		n, err := strconv.Atoi(strings.TrimSpace(x))
 		if err != nil {
 			return 0
 		}
 		return n
 	}
 	return 0
 }
 // yamlBool reads a boolean knob out of a decoded YAML map. A native
 // bool is returned as-is and a string counts as true only when it
 // equals "true" ignoring case. Everything else — missing key, nil,
 // numbers, misspellings — yields false, the conservative choice for a
 // destructive knob like fio_on_spare.
 func yamlBool(m map[string]any, key string) bool {
 	// Indexing a nil map or a missing key gives a nil interface, which
 	// falls through to the default branch.
 	switch raw := m[key].(type) {
 	case bool:
 		return raw
 	case string:
 		return strings.EqualFold(raw, "true")
 	default:
 		return false
 	}
 }
 // yamlString reads a knob out of a decoded YAML map as text. Strings
 // (such as the duration "2m") are returned unchanged; any other non-nil
 // scalar (a bare 5, a bool) is rendered with fmt.Sprint so the agent
 // can still interpret it. A missing key or nil value gives "".
 func yamlString(m map[string]any, key string) string {
 	switch val := m[key].(type) {
 	case nil:
 		// Covers both an absent key and an explicit null.
 		return ""
 	case string:
 		return val
 	default:
 		return fmt.Sprint(val)
 	}
 }
 // Lookup fetches a profile by name, treating the empty string as a
 // request for the default profile (quick). It returns an error naming
 // the profile when the registry has no entry for it, so the caller can
 // surface the problem. The receiver must be non-nil.
 func (pr *ProfileRegistry) Lookup(name string) (Profile, error) {
 	key := name
 	if key == "" {
 		key = ProfileQuick
 	}
 	if found, ok := pr.Profiles[key]; ok {
 		return found, nil
 	}
 	return Profile{}, fmt.Errorf("unknown profile %q", key)
 }
 // Names returns the registry's profile names in the canonical
 // picker order (quick/deep/soak). Profiles present in the config but
 // unknown to AllProfiles are appended after, alphabetically, so the
 // result is deterministic from call to call. The receiver must be
 // non-nil.
 func (pr *ProfileRegistry) Names() []string {
 	out := make([]string, 0, len(pr.Profiles))
 	seen := map[string]bool{}
 	for _, n := range AllProfiles {
 		if _, ok := pr.Profiles[n]; ok {
 			out = append(out, n)
 			seen[n] = true
 		}
 	}
 	// Map iteration order is random, so gather the extra names first
 	// and then sort just that tail; the canonical prefix keeps its
 	// picker order.
 	extrasStart := len(out)
 	for n := range pr.Profiles {
 		if !seen[n] {
 			out = append(out, n)
 		}
 	}
 	sort.Strings(out[extrasStart:])
 	return out
 }
 // Stages returns the shared stage order. When the registry declares
 // none it hands back DefaultStages(); otherwise it returns a fresh copy
 // of the configured list, so callers can modify the result without
 // touching the registry. The receiver must be non-nil.
 func (pr *ProfileRegistry) Stages() []string {
 	declared := pr.Vetting.Stages
 	if len(declared) == 0 {
 		return DefaultStages()
 	}
 	return append(make([]string, 0, len(declared)), declared...)
 }
 // DefaultStages returns the canonical stage list the orchestrator walks
 // when no config is loaded. The vetting.yaml shipped with the repo
 // mirrors this list; keep the two in sync when editing either one.
 // Every call returns a new slice, so callers are free to modify it.
 func DefaultStages() []string {
 	order := [...]string{
 		"Inventory",
 		"Firmware",
 		"SpecValidate",
 		"SMART",
 		"CPUStress",
 		"Storage",
 		"Network",
 		"Burn",
 		"GPU",
 		"PSU",
 		"Reporting",
 	}
 	return order[:]
 }
 // rawProfile is the YAML shape before inherit resolution. Durations
 // arrive as strings (e.g. "2h") so we can parse them with
 // time.ParseDuration instead of rolling our own.
 //
 // Inherit names the parent profile ("" for none). StageTimeouts maps a
 // stage name to its unparsed duration string. Defaults maps a knob
 // group to its knob name/value pairs exactly as decoded.
 type rawProfile struct {
 	Inherit       string                       `yaml:"inherit"`
 	StageTimeouts map[string]string            `yaml:"stage_timeouts"`
 	Defaults      map[string]map[string]any    `yaml:"defaults"`
 }
 // rawProfilesBlock is the decode target for the two top-level YAML keys
 // the profile registry is built from: `vetting` (shared stage order and
 // threshold defaults) and `profiles` (per-profile settings keyed by
 // profile name, prior to inherit resolution).
 type rawProfilesBlock struct {
 	Vetting  Vetting               `yaml:"vetting"`
 	Profiles map[string]rawProfile `yaml:"profiles"`
 }
 // buildProfileRegistry turns the decoded YAML blocks into a ready-to-use
 // ProfileRegistry. If no profiles were declared, the built-in set from
 // defaultRawProfiles is used; if no stage order was declared,
 // DefaultStages fills it in. Each profile is then flattened through
 // resolveProfile, and the first error it reports (inherit cycle, unknown
 // parent, unparsable duration) aborts the build.
 func buildProfileRegistry(raw rawProfilesBlock) (*ProfileRegistry, error) {
 	declared := raw.Profiles
 	if len(declared) == 0 {
 		declared = defaultRawProfiles()
 	}
 	shared := raw.Vetting
 	if len(shared.Stages) == 0 {
 		shared.Stages = DefaultStages()
 	}
 	flattened := make(map[string]Profile, len(declared))
 	for name := range declared {
 		prof, err := resolveProfile(declared, name, nil)
 		if err != nil {
 			return nil, err
 		}
 		flattened[name] = prof
 	}
 	return &ProfileRegistry{Vetting: shared, Profiles: flattened}, nil
 }
 // resolveProfile flattens one profile by first resolving its parent (if
 // any) and then layering the profile's own settings on top, so child
 // keys win. visited lists the names already on the current inherit
 // path; meeting name in it again means the chain loops and is reported
 // as an error. Errors are also returned for a name that is not in all
 // and for a stage timeout that time.ParseDuration rejects.
 func resolveProfile(all map[string]rawProfile, name string, visited []string) (Profile, error) {
 	for i := range visited {
 		if visited[i] == name {
 			return Profile{}, fmt.Errorf("profile inherit cycle: %s -> %s", strings.Join(visited, " -> "), name)
 		}
 	}
 	raw, known := all[name]
 	if !known {
 		return Profile{}, fmt.Errorf("unknown profile %q", name)
 	}
 	timeouts := map[string]time.Duration{}
 	knobs := map[string]map[string]any{}
 	if raw.Inherit != "" {
 		parent, err := resolveProfile(all, raw.Inherit, append(visited, name))
 		if err != nil {
 			return Profile{}, err
 		}
 		for stage, limit := range parent.StageTimeouts {
 			timeouts[stage] = limit
 		}
 		// Clone each inherited group so that the child's overrides
 		// below never write into the parent's maps.
 		for group, inherited := range parent.Defaults {
 			clone := make(map[string]any, len(inherited))
 			for k, v := range inherited {
 				clone[k] = v
 			}
 			knobs[group] = clone
 		}
 	}
 	for stage, text := range raw.StageTimeouts {
 		limit, err := time.ParseDuration(text)
 		if err != nil {
 			return Profile{}, fmt.Errorf("profile %s stage_timeouts[%s]: %w", name, stage, err)
 		}
 		timeouts[stage] = limit
 	}
 	for group, own := range raw.Defaults {
 		if knobs[group] == nil {
 			knobs[group] = map[string]any{}
 		}
 		for k, v := range own {
 			knobs[group][k] = v
 		}
 	}
 	return Profile{
 		Name:          name,
 		Inherit:       raw.Inherit,
 		StageTimeouts: timeouts,
 		Defaults:      knobs,
 	}, nil
 }
 // defaultRawProfiles returns sane per-profile durations + probe knobs
 // used when vetting.yaml omits the `profiles:` block entirely. Matches
 // the plan's per-stage budget table so the agent still gets coherent
 // CPUStress/Storage/Network knobs without any operator-visible config.
 //
 // The result is in the pre-resolution rawProfile shape: durations are
 // still strings and soak's `Inherit` has not been applied yet. A new
 // map is built on every call.
 func defaultRawProfiles() map[string]rawProfile {
 	return map[string]rawProfile{
 		// quick: minute-scale stages with a sampled fio run.
 		ProfileQuick: {
 			StageTimeouts: map[string]string{
 				"CPUStress": "5m",
 				"Storage":   "5m",
 				"Network":   "2m",
 				"Burn":      "3m",
 				"PSU":       "1m",
 			},
 			Defaults: map[string]map[string]any{
 				"cpustress": {"cpu_pass": "2m", "mem_pass": "2m", "edac_poll": "10s"},
 				"storage":   {"mode": "fio_sample", "fio_size": "1GiB", "fio_time": "3m", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"},
 				"network":   {"duration": "60s"},
 				"burn":      {"duration": "2m", "cpu_workers": "all", "mem_pct": 50, "fio_on_spare": true, "iperf_parallel": 2},
 			},
 		},
 		// deep: hour-scale stages with full-disk storage mode.
 		ProfileDeep: {
 			StageTimeouts: map[string]string{
 				"CPUStress": "2h",
 				"Storage":   "4h",
 				"Network":   "35m",
 				"Burn":      "3h",
 				"PSU":       "10m",
 			},
 			Defaults: map[string]map[string]any{
 				"cpustress": {"cpu_pass": "60m", "mem_pass": "60m", "edac_poll": "10s"},
 				"storage":   {"mode": "full_disk", "fio_time": "2h", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"},
 				"network":   {"duration": "30m"},
 				"burn":      {"duration": "2h", "cpu_workers": "all", "mem_pct": 70, "fio_on_spare": true, "iperf_parallel": 4},
 			},
 		},
 		// soak: inherits deep and overrides only the keys listed here;
 		// knobs not repeated below (mem_pass, edac_poll, fio_bs, fio_rw,
 		// verify, cpu_workers, mem_pct, fio_on_spare) come from deep
 		// once the inherit chain is resolved.
 		ProfileSoak: {
 			Inherit: ProfileDeep,
 			StageTimeouts: map[string]string{
 				"CPUStress": "14h",
 				"Storage":   "8h",
 				"Network":   "2h30m",
 				"Burn":      "20h",
 				"PSU":       "15m",
 			},
 			Defaults: map[string]map[string]any{
 				"cpustress": {"cpu_pass": "12h"},
 				"storage":   {"mode": "full_disk", "fio_time": "6h"},
 				"network":   {"duration": "2h"},
 				"burn":      {"duration": "18h", "iperf_parallel": 8},
 			},
 		},
 	}
 }
@@ -0,0 +1,57 @@
 -- Phase-1 groundwork for profile-aware, threshold-gated vetting.
 --
 -- Adds:
 --   * runs.profile             — which profile the run is executing
 --                                (quick|deep|soak; defaults to quick for
 --                                backfill of older rows + tests).
 --   * thresholds               — seeded per run at creation from the
 --                                ProfileRegistry + per-host overrides;
 --                                immutable for that run so a late config
 --                                edit can't retroactively pass/fail it.
 --   * threshold_evaluations    — one row per observed sample vs threshold;
 --                                drives the report + pipeline badges.
 --   * firmware_snapshots       — per-run BIOS/BMC/NIC/HBA/microcode/NVMe
 --                                version captures used by SpecValidate
 --                                diffing in Phase 4.
 -- Rows that exist before this migration pick up 'quick' through the
 -- column default, which also satisfies the NOT NULL constraint.
 ALTER TABLE runs ADD COLUMN profile TEXT NOT NULL DEFAULT 'quick';
 -- One row per threshold rule attached to a run. Rows are deleted with
 -- their run (ON DELETE CASCADE).
 CREATE TABLE IF NOT EXISTS thresholds (
     id         INTEGER PRIMARY KEY AUTOINCREMENT,
     run_id     INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
     stage_name TEXT    NOT NULL,                  -- "*" matches any stage
     kind       TEXT    NOT NULL,                  -- temp|psu_volt|iperf|fio_p99_us|nic_retrans|edac_ce|edac_ue|mce|...
     key        TEXT    NOT NULL,                  -- "*" or glob-ish match (prefix* / *suffix / exact)
     op         TEXT    NOT NULL,                  -- lt|lte|gt|gte|within_pct
     threshold  REAL    NOT NULL,
     nominal    REAL    NOT NULL DEFAULT 0,         -- used by within_pct; 0 elsewhere
     unit       TEXT    NOT NULL DEFAULT '',
     severity   TEXT    NOT NULL,                  -- critical|warning
     source     TEXT    NOT NULL                   -- profile|host_override
 );
 -- Lookup by run, and by (run, stage, kind).
 CREATE INDEX IF NOT EXISTS idx_thresholds_run  ON thresholds(run_id);
 CREATE INDEX IF NOT EXISTS idx_thresholds_kind ON thresholds(run_id, stage_name, kind);
 -- One row per observed sample checked against a threshold row. Rows are
 -- deleted when either the run or the referenced threshold is deleted
 -- (both foreign keys cascade).
 CREATE TABLE IF NOT EXISTS threshold_evaluations (
     id           INTEGER PRIMARY KEY AUTOINCREMENT,
     run_id       INTEGER   NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
     threshold_id INTEGER   NOT NULL REFERENCES thresholds(id) ON DELETE CASCADE,
     stage_name   TEXT      NOT NULL,
     kind         TEXT      NOT NULL,
     key          TEXT      NOT NULL,
     ts           TIMESTAMP NOT NULL,
     observed     REAL      NOT NULL,
     passed       INTEGER   NOT NULL                -- 1 = sample within threshold, 0 = breach
 );
 -- Indexed on (run_id, passed) so a run's breaches can be selected directly.
 CREATE INDEX IF NOT EXISTS idx_threshold_evals_run ON threshold_evaluations(run_id, passed);
 -- One row per firmware component captured for a run. Rows are deleted
 -- with their run (ON DELETE CASCADE). No uniqueness is enforced on
 -- (run_id, component, identifier).
 CREATE TABLE IF NOT EXISTS firmware_snapshots (
     id         INTEGER PRIMARY KEY AUTOINCREMENT,
     run_id     INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
     component  TEXT    NOT NULL,                   -- bios|bmc|nic|hba|microcode|nvme_fw
     identifier TEXT    NOT NULL,                   -- slot/serial/device path that distinguishes this component
     version    TEXT    NOT NULL,
     vendor     TEXT    NOT NULL DEFAULT '',
     raw_json   TEXT    NOT NULL DEFAULT '{}'
 );
 -- Lookup by run, optionally narrowed to one component type.
 CREATE INDEX IF NOT EXISTS idx_firmware_run ON firmware_snapshots(run_id, component);
@@ -26,11 +26,13 @@ const (
 	StateWaitingReboot  RunState = "WaitingReboot"
 	StateBooting        RunState = "Booting"
 	StateInventoryCheck RunState = "InventoryCheck"
 	StateFirmware       RunState = "Firmware"
 	StateSpecValidate   RunState = "SpecValidate"
 	StateSMART          RunState = "SMART"
 	StateCPUStress      RunState = "CPUStress"
 	StateStorage        RunState = "Storage"
 	StateNetwork        RunState = "Network"
 	StateBurn           RunState = "Burn"
 	StateGPU            RunState = "GPU"
 	StatePSU            RunState = "PSU"
 	StateReporting      RunState = "Reporting"
@@ -63,6 +65,7 @@ type Run struct {
 	HoldIP            string
 	OverrideFlagsJSON string
 	NonDestructive    bool
 	Profile           string // quick|deep|soak; empty is treated as "quick"
 }
 type StageState string
@@ -119,9 +119,9 @@ func (d *Dispatcher) pickNext(ctx context.Context) {
 				queued = &runs[i]
 			}
 		case model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting,
-			model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART,
+			model.StateInventoryCheck, model.StateFirmware, model.StateSpecValidate, model.StateSMART,
 			model.StateCPUStress, model.StateStorage, model.StateNetwork,
-			model.StateGPU, model.StatePSU, model.StateReporting:
+			model.StateBurn, model.StateGPU, model.StatePSU, model.StateReporting:
 			inFlight++
 		}
 	}
@@ -30,11 +30,13 @@ const (
 // "InventoryCheck". Later stages share a name with their state.
 var stageStates = map[string]model.RunState{
 	"Inventory":    model.StateInventoryCheck,
 	"Firmware":     model.StateFirmware,
 	"SpecValidate": model.StateSpecValidate,
 	"SMART":        model.StateSMART,
 	"CPUStress":    model.StateCPUStress,
 	"Storage":      model.StateStorage,
 	"Network":      model.StateNetwork,
 	"Burn":         model.StateBurn,
 	"GPU":          model.StateGPU,
 	"PSU":          model.StatePSU,
 	"Reporting":    model.StateReporting,
@@ -44,11 +46,13 @@ var stageStates = map[string]model.RunState{
 // first stage to Completed. Kept in sync with store.DefaultStageOrder.
 var stageOrder = []model.RunState{
 	model.StateInventoryCheck,
 	model.StateFirmware,
 	model.StateSpecValidate,
 	model.StateSMART,
 	model.StateCPUStress,
 	model.StateStorage,
 	model.StateNetwork,
 	model.StateBurn,
 	model.StateGPU,
 	model.StatePSU,
 	model.StateReporting,
@@ -143,9 +147,9 @@ func nextStageState(current model.RunState) (model.RunState, error) {
 func allActiveStates() []model.RunState {
 	return []model.RunState{
 		model.StateQueued, model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting,
-		model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART,
+		model.StateInventoryCheck, model.StateFirmware, model.StateSpecValidate, model.StateSMART,
 		model.StateCPUStress, model.StateStorage, model.StateNetwork,
-		model.StateGPU, model.StatePSU, model.StateReporting,
+		model.StateBurn, model.StateGPU, model.StatePSU, model.StateReporting,
 	}
 }
@@ -80,11 +80,13 @@ func TestTriggerAgentClaimedFromWaitingReboot(t *testing.T) {
 func TestTriggerStageMismatch(t *testing.T) {
 	stageStates := []model.RunState{
 		model.StateInventoryCheck,
 		model.StateFirmware,
 		model.StateSpecValidate,
 		model.StateSMART,
 		model.StateCPUStress,
 		model.StateStorage,
 		model.StateNetwork,
 		model.StateBurn,
 		model.StateGPU,
 		model.StatePSU,
 		model.StateReporting,
@@ -114,11 +116,13 @@ func TestTriggerStageMismatch(t *testing.T) {
 func TestStageNameForState(t *testing.T) {
 	pairs := map[string]model.RunState{
 		"Inventory":    model.StateInventoryCheck,
 		"Firmware":     model.StateFirmware,
 		"SpecValidate": model.StateSpecValidate,
 		"SMART":        model.StateSMART,
 		"CPUStress":    model.StateCPUStress,
 		"Storage":      model.StateStorage,
 		"Network":      model.StateNetwork,
 		"Burn":         model.StateBurn,
 		"GPU":          model.StateGPU,
 		"PSU":          model.StatePSU,
 		"Reporting":    model.StateReporting,
@@ -143,11 +147,13 @@ func TestNextStageWalk(t *testing.T) {
 	// one in the canonical order, and from Reporting onto Completed.
 	chain := []model.RunState{
 		model.StateInventoryCheck,
 		model.StateFirmware,
 		model.StateSpecValidate,
 		model.StateSMART,
 		model.StateCPUStress,
 		model.StateStorage,
 		model.StateNetwork,
 		model.StateBurn,
 		model.StateGPU,
 		model.StatePSU,
 		model.StateReporting,
@@ -0,0 +1,182 @@
 package orchestrator
 import (
 	"fmt"
 	"strings"
 )
 // ThresholdOp names the comparison operator a threshold applies to an
 // observed value. within_pct is the only one that cares about a
 // "nominal" value for the key — used for PSU rails ("+12V within 5% of
 // 12.0"); the other operators compare against the threshold value alone.
 type ThresholdOp string
 // Operator identifiers accepted by evaluateOp. Each reads as
 // "observed <op> threshold passes": lt/lte/gt/gte are plain numeric
 // comparisons, within_pct checks the distance from the nominal value.
 const (
 	OpLT        ThresholdOp = "lt"
 	OpLTE       ThresholdOp = "lte"
 	OpGT        ThresholdOp = "gt"
 	OpGTE       ThresholdOp = "gte"
 	OpWithinPct ThresholdOp = "within_pct"
 )
 // ThresholdSeverity routes a breach to either "fail the run" or "just
 // surface a warning in the report". It travels on the Threshold carried
 // inside each EvalResult, next to the Passed flag, so the caller can
 // decide whether to transition the run.
 type ThresholdSeverity string
 // Severity levels. EvalResult.CriticalBreach reports true only for
 // breaches of SeverityCritical thresholds.
 const (
 	SeverityCritical ThresholdSeverity = "critical"
 	SeverityWarning  ThresholdSeverity = "warning"
 )
 // Threshold is the evaluator's view of a stored threshold row. It's a
 // flat, already-parsed value-object — the evaluator doesn't look at
 // the DB and the store doesn't look at the evaluator.
 type Threshold struct {
 	ID        int64
 	Stage     string // "*" or "" matches any stage; anything else must equal the sample's stage
 	Kind      string // matched literally against Sample.Kind
 	Key       string // glob-ish: "*" / "prefix*" / "*suffix" / "*infix*" / exact
 	Op        ThresholdOp
 	Value     float64 // comparison operand; for within_pct, the allowed percentage of Nominal
 	Nominal   float64 // for within_pct (nominal voltage/frequency)
 	Severity  ThresholdSeverity
 }
 // Sample is a single observation the evaluator tests against matching
 // thresholds. Stage may be empty when the agent doesn't know which
 // stage posted it (e.g. the thermal sidecar running across stages) —
 // empty-stage samples only match thresholds whose Stage is "*" (or
 // left empty, which stageMatches treats the same way).
 type Sample struct {
 	Stage string
 	Kind  string
 	Key   string
 	Value float64
 }
 // EvalResult is the per-sample outcome of a threshold evaluation:
 // which threshold was consulted, whether the sample passed, and the
 // value that was observed. The severity is read from Threshold.Severity
 // so the caller can fast-fail on critical breaches.
 type EvalResult struct {
 	Threshold Threshold
 	Passed    bool
 	Observed  float64
 }
 // Breached returns true when the sample violated the threshold, i.e.
 // it is the negation of Passed regardless of severity.
 func (r EvalResult) Breached() bool { return !r.Passed }
 // CriticalBreach reports whether this result is a breach of a
 // critical-severity threshold — the "fail the run right now" case.
 // Passing results and warning-severity breaches both yield false.
 func (r EvalResult) CriticalBreach() bool {
 	if r.Passed {
 		return false
 	}
 	return r.Threshold.Severity == SeverityCritical
 }
 // Evaluate tests one sample against every threshold that applies to it
 // and returns one EvalResult per applicable threshold, in threshold
 // order. A sample may match more than one threshold (a generic "*"
 // rule + a stage-specific override); each match is reported separately
 // so both get persisted. A sample that matches nothing yields an empty,
 // non-nil slice.
 func Evaluate(sample Sample, thresholds []Threshold) []EvalResult {
 	results := make([]EvalResult, 0, 1)
 	for i := range thresholds {
 		rule := thresholds[i]
 		if thresholdMatchesSample(rule, sample) {
 			ok, err := evaluateOp(rule.Op, sample.Value, rule.Value, rule.Nominal)
 			// An error means the operator is unknown. The caller could
 			// validate on insert; here the rule is dropped rather than
 			// surfacing an error that forces every Sensor write to 500.
 			if err == nil {
 				results = append(results, EvalResult{
 					Threshold: rule,
 					Passed:    ok,
 					Observed:  sample.Value,
 				})
 			}
 		}
 	}
 	return results
 }
 // thresholdMatchesSample reports whether threshold t applies to sample
 // s. All three filters must agree: kind (always a literal comparison —
 // there is no "any kind" threshold), stage, and key. Stage and key go
 // through stageMatches / keyMatches and so support glob-ish selectors.
 func thresholdMatchesSample(t Threshold, s Sample) bool {
 	return t.Kind == s.Kind &&
 		stageMatches(t.Stage, s.Stage) &&
 		keyMatches(t.Key, s.Key)
 }
 // stageMatches reports whether a threshold's stage selector applies to
 // the sample's stage. The selectors "*" and "" both match every stage
 // (an empty selector is treated as "*" so a threshold declared without
 // a stage isn't accidentally inert). Any other selector must equal the
 // sample's stage exactly, so a sample with no stage only matches the
 // catch-all selectors — we don't guess.
 func stageMatches(selector, sampleStage string) bool {
 	switch selector {
 	case "", "*":
 		return true
 	default:
 		return sampleStage == selector
 	}
 }
 // keyMatches reports whether key satisfies pattern. Supported forms:
 //   - "" or "*": matches any key
 //   - "*infix*": key contains infix
 //   - "prefix*": key starts with prefix
 //   - "*suffix": key ends with suffix
 //   - anything else: exact match (a "*" in the middle is literal)
 // We avoid pulling in filepath.Match so Windows `\`-vs-`/` rules don't
 // leak into the sample namespace (key "eth0/rx_errors" is not a path).
 func keyMatches(pattern, key string) bool {
 	if pattern == "" || pattern == "*" {
 		return true
 	}
 	// pattern is non-empty and not the lone "*", so indexing is safe and
 	// a pattern with stars at both ends has at least two bytes.
 	last := len(pattern) - 1
 	leadingStar := pattern[0] == '*'
 	trailingStar := pattern[last] == '*'
 	if leadingStar && trailingStar {
 		return strings.Contains(key, pattern[1:last])
 	}
 	if trailingStar {
 		return strings.HasPrefix(key, pattern[:last])
 	}
 	if leadingStar {
 		return strings.HasSuffix(key, pattern[1:])
 	}
 	return key == pattern
 }
 // evaluateOp performs the numeric comparison for one operator and
 // reports whether the observed value passes. lt/lte/gt/gte compare
 // observed against threshold. within_pct is the oddball: it tests
 // |observed - nominal| <= |(threshold / 100) * nominal|, and passes
 // unconditionally when nominal is 0 so a misconfigured rule doesn't
 // spuriously fail. An unknown operator returns an error so the caller
 // can log + drop.
 func evaluateOp(op ThresholdOp, observed, threshold, nominal float64) (bool, error) {
 	if op == OpWithinPct {
 		if nominal == 0 {
 			// within_pct against a 0 nominal is meaningless.
 			return true, nil
 		}
 		tolerance := (threshold / 100.0) * nominal
 		if tolerance < 0 {
 			tolerance = -tolerance
 		}
 		deviation := observed - nominal
 		if deviation < 0 {
 			deviation = -deviation
 		}
 		return deviation <= tolerance, nil
 	}
 	var pass bool
 	switch op {
 	case OpLT:
 		pass = observed < threshold
 	case OpLTE:
 		pass = observed <= threshold
 	case OpGT:
 		pass = observed > threshold
 	case OpGTE:
 		pass = observed >= threshold
 	default:
 		return false, fmt.Errorf("unknown op %q", op)
 	}
 	return pass, nil
 }
@@ -0,0 +1,152 @@
 package orchestrator
 import "testing"
 // TestEvaluate_Ops drives each operator through a table of observed
 // values on either side of the threshold, including the boundary case
 // (observed equal to the threshold) where the table lists one. Every
 // case runs against a single catch-all rule, so exactly one result is
 // expected per sample.
 func TestEvaluate_Ops(t *testing.T) {
 	type opCase struct {
 		name     string
 		op       ThresholdOp
 		value    float64
 		nominal  float64
 		observed float64
 		want     bool
 	}
 	table := []opCase{
 		{"lt strict below", OpLT, 10, 0, 5, true},
 		{"lt equal fails", OpLT, 10, 0, 10, false},
 		{"lt above fails", OpLT, 10, 0, 15, false},
 		{"lte below", OpLTE, 10, 0, 5, true},
 		{"lte equal passes", OpLTE, 10, 0, 10, true},
 		{"lte above fails", OpLTE, 10, 0, 11, false},
 		{"gt below fails", OpGT, 900, 0, 800, false},
 		{"gt equal fails", OpGT, 900, 0, 900, false},
 		{"gt above passes", OpGT, 900, 0, 950, true},
 		{"gte equal passes", OpGTE, 900, 0, 900, true},
 		{"gte below fails", OpGTE, 900, 0, 800, false},
 		{"within_pct exact", OpWithinPct, 5, 12.0, 12.0, true},
 		{"within_pct inside", OpWithinPct, 5, 12.0, 11.7, true},
 		{"within_pct outside low", OpWithinPct, 5, 12.0, 11.0, false},
 		{"within_pct outside high", OpWithinPct, 5, 12.0, 12.7, false},
 		{"within_pct zero nominal passes", OpWithinPct, 5, 0, 99, true},
 	}
 	for _, c := range table {
 		t.Run(c.name, func(t *testing.T) {
 			rule := Threshold{
 				Stage: "*", Kind: "k", Key: "k", Op: c.op,
 				Value: c.value, Nominal: c.nominal, Severity: SeverityCritical,
 			}
 			sample := Sample{Stage: "Any", Kind: "k", Key: "k", Value: c.observed}
 			got := Evaluate(sample, []Threshold{rule})
 			if len(got) != 1 {
 				t.Fatalf("expected 1 match, got %d", len(got))
 			}
 			if got[0].Passed != c.want {
 				t.Fatalf("op=%s observed=%v want passed=%v got %v", c.op, c.observed, c.want, got[0].Passed)
 			}
 		})
 	}
 }
 // TestEvaluate_StageMatching: a Burn-scoped rule ignores samples
 // stamped with other stages, while the global "*" rule catches
 // everything. A Burn sample therefore yields two results.
 func TestEvaluate_StageMatching(t *testing.T) {
 	const (
 		globalLimit = 92
 		burnLimit   = 88
 	)
 	rules := []Threshold{
 		{Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: globalLimit, Severity: SeverityCritical},
 		{Stage: "Burn", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: burnLimit, Severity: SeverityCritical},
 	}
 	// Sample from CPUStress — only the global rule applies.
 	fromCPUStress := Evaluate(Sample{Stage: "CPUStress", Kind: "temp", Key: "cpu/0", Value: 89}, rules)
 	if len(fromCPUStress) != 1 {
 		t.Fatalf("cpustress sample: expected 1 match, got %d", len(fromCPUStress))
 	}
 	if fromCPUStress[0].Threshold.Value != globalLimit {
 		t.Fatalf("cpustress sample matched wrong rule: %+v", fromCPUStress[0].Threshold)
 	}
 	// Sample from Burn — both rules match. The stricter one breaches.
 	fromBurn := Evaluate(Sample{Stage: "Burn", Kind: "temp", Key: "cpu/0", Value: 89}, rules)
 	if len(fromBurn) != 2 {
 		t.Fatalf("burn sample: expected 2 matches, got %d", len(fromBurn))
 	}
 	// Index the outcomes by the rule's limit; a missing rule reads as
 	// "not passed".
 	passedByLimit := map[float64]bool{}
 	for _, r := range fromBurn {
 		passedByLimit[r.Threshold.Value] = r.Passed
 	}
 	if !passedByLimit[globalLimit] {
 		t.Fatalf("global 92C rule should pass at 89C")
 	}
 	if passedByLimit[burnLimit] {
 		t.Fatalf("burn 88C rule should breach at 89C")
 	}
 }
 // TestEvaluate_KeyWildcards covers every pattern form keyMatches
 // accepts: "*" / "" (match anything), "prefix*", "*suffix", "*infix*",
 // and exact match — including a pattern whose only "*" sits in the
 // middle, which is compared literally rather than treated as a glob.
 func TestEvaluate_KeyWildcards(t *testing.T) {
 	cases := []struct {
 		pattern string
 		key     string
 		match   bool
 	}{
 		{"*", "anything", true},
 		{"", "anything", true},
 		{"cpu/*", "cpu/0", true},
 		{"cpu/*", "gpu/0", false},
 		{"*/rate", "eth0/rate", true},
 		{"*/rate", "eth0/count", false},
 		{"exact", "exact", true},
 		{"exact", "exactly", false},
 		// Stars at both ends: substring match.
 		{"*cpu*", "pkg/cpu/0", true},
 		{"*cpu*", "gpu/0", false},
 		// "**" trims down to an empty infix, which every key contains.
 		{"**", "anything", true},
 		// An inner star is not a wildcard.
 		{"a*b", "a*b", true},
 		{"a*b", "axb", false},
 	}
 	for _, tc := range cases {
 		t.Run(tc.pattern+"_vs_"+tc.key, func(t *testing.T) {
 			got := keyMatches(tc.pattern, tc.key)
 			if got != tc.match {
 				t.Fatalf("keyMatches(%q, %q) = %v, want %v", tc.pattern, tc.key, got, tc.match)
 			}
 		})
 	}
 }
 // TestEvaluate_SeverityDispatch: a breach of a critical rule reports
 // CriticalBreach, while a breach of a warning rule still reports
 // Breached but stays advisory (CriticalBreach is false).
 func TestEvaluate_SeverityDispatch(t *testing.T) {
 	rules := []Threshold{
 		{Stage: "*", Kind: "temp", Key: "cpu", Op: OpLT, Value: 92, Severity: SeverityCritical},
 		{Stage: "*", Kind: "fio", Key: "p99", Op: OpLT, Value: 50000, Severity: SeverityWarning},
 	}
 	hotCPU := Sample{Stage: "CPU", Kind: "temp", Key: "cpu", Value: 95}
 	if res := Evaluate(hotCPU, rules); len(res) != 1 || !res[0].CriticalBreach() {
 		t.Fatalf("critical breach not detected: %+v", res)
 	}
 	slowDisk := Sample{Stage: "Storage", Kind: "fio", Key: "p99", Value: 80000}
 	res := Evaluate(slowDisk, rules)
 	if len(res) != 1 {
 		t.Fatalf("expected 1 match, got %d", len(res))
 	}
 	switch {
 	case res[0].CriticalBreach():
 		t.Fatalf("warning-severity breach should not be critical")
 	case !res[0].Breached():
 		t.Fatalf("warning-severity rule should still show breach=true")
 	}
 }
 // TestEvaluate_NoMatchingThreshold: when no rule's kind/stage/key
 // applies to the sample, Evaluate returns zero results — callers treat
 // that as "advisory".
 func TestEvaluate_NoMatchingThreshold(t *testing.T) {
 	tempOnly := []Threshold{
 		{Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 92, Severity: SeverityCritical},
 	}
 	unrelated := Sample{Stage: "Network", Kind: "iperf", Key: "throughput", Value: 950}
 	if n := len(Evaluate(unrelated, tempOnly)); n != 0 {
 		t.Fatalf("unmatched sample should yield 0 results, got %d", n)
 	}
 }
@@ -29,6 +29,16 @@ type Data struct {
 	Stages      []model.Stage
 	SpecDiffs   []model.SpecDiff
 	Aggregates  []Aggregate        // flattened measurement summary; see Aggregate
 	Firmware    []FirmwareSnapshot // captured firmware versions, empty if none
 }
 // FirmwareSnapshot is the report-facing view of one firmware row.
 // Package-local so the HTML template stays decoupled from store types.
 // The template renders the four fields as the Component / Identifier /
 // Version / Vendor columns of the report's Firmware table.
 type FirmwareSnapshot struct {
 	Component  string
 	Identifier string
 	Version    string
 	Vendor     string
 }
 // Aggregate is a per (kind, key) summary of a run's measurements. Min/
@@ -196,6 +206,27 @@ const htmlTemplate = `<!doctype html>
 </table>
 </section>
 <section>
 <h2>Firmware ({{len .Firmware}})</h2>
 {{if .Firmware}}
 <table>
  <thead><tr><th>Component</th><th>Identifier</th><th>Version</th><th>Vendor</th></tr></thead>
  <tbody>
  {{range .Firmware}}
    <tr>
      <td>{{.Component}}</td>
      <td><code>{{.Identifier}}</code></td>
      <td><code>{{.Version}}</code></td>
      <td>{{.Vendor}}</td>
    </tr>
  {{end}}
  </tbody>
 </table>
 {{else}}
 <p>No firmware snapshots captured.</p>
 {{end}}
 </section>
 <section>
 <h2>Spec diffs ({{len .SpecDiffs}})</h2>
 {{if .SpecDiffs}}
@@ -26,6 +26,31 @@ type Spec struct {
 	Disks    []DiskSpec     `yaml:"disks,omitempty"`
 	NICs     []NICSpec      `yaml:"nics,omitempty"`
 	GPUs     []GPUSpec      `yaml:"gpus,omitempty"`
 	Firmware []FirmwareSpec `yaml:"firmware,omitempty"`
 }
 // FirmwareSpec is one row in the expected-spec YAML's `firmware:` block.
 // Component is one of bios|bmc|nic|hba|microcode|nvme_fw (matches the
 // on-wire value from agent/probes.FirmwareSnapshot.Component). Identifier
 // is optional — when empty the rule applies to every observed snapshot
 // of that component (use for single-instance things like BIOS/microcode);
 // when set it pins the check to a specific NIC port / NVMe controller /
 // PCI address. Version is the string expected; DiffFirmware compares it
 // case-insensitively (strings.EqualFold) after trimming whitespace, and
 // skips rules whose Component or Version is blank.
 type FirmwareSpec struct {
 	Component  string `yaml:"component"`
 	Identifier string `yaml:"identifier,omitempty"`
 	Version    string `yaml:"version"`
 }
 // FirmwareObserved is what the agent reported, in a spec-package-local
 // shape so callers don't need to thread store types through the diff.
 // The server converts store.FirmwareSnapshot → FirmwareObserved before
 // calling DiffFirmware. The fields mirror FirmwareSpec: Component and
 // Identifier select which rule applies, Version is what gets compared.
 type FirmwareObserved struct {
 	Component  string
 	Identifier string
 	Version    string
 }
 type CPUSpec struct {
@@ -175,6 +200,73 @@ func diffNICs(expected, actual []NICSpec) []model.SpecDiff {
 	return out
 }
 // DiffFirmware returns a SpecDiff per firmware expectation that is not
 // satisfied by the observed snapshots. Matching rules:
 //   - Rules with a blank Component or blank Version are skipped.
 //   - Component is matched case-insensitively after trimming whitespace,
 //     on both the pinned and the fan-out path.
 //   - An expected rule with Identifier set matches by (component, id),
 //     case-insensitively; a missing observed snapshot yields a
 //     "present=false" diff.
 //   - An expected rule with Identifier empty applies to every observed
 //     snapshot of that component — useful for "all NICs must run fw
 //     8.30" without listing each port. Zero observed snapshots of the
 //     component yields a single "present=false" diff, not N.
 //   - Versions are compared case-insensitively after trimming
 //     whitespace (ethtool's "FW1234" satisfies an expected "fw1234").
 //     A mismatch emits an expected→actual diff carrying the original,
 //     untrimmed strings.
 // Returns nil when there are no expectations or none is violated.
 func DiffFirmware(expected []FirmwareSpec, actual []FirmwareObserved) []model.SpecDiff {
 	if len(expected) == 0 {
 		return nil
 	}
 	byCompIdent := make(map[string]FirmwareObserved, len(actual))
 	byComp := make(map[string][]FirmwareObserved)
 	for _, o := range actual {
 		// Normalise the component the same way for both indexes so the
 		// pinned and fan-out lookups agree on what "same component" means.
 		comp := strings.TrimSpace(o.Component)
 		byCompIdent[fwKey(comp, o.Identifier)] = o
 		group := strings.ToLower(comp)
 		byComp[group] = append(byComp[group], o)
 	}
 	var out []model.SpecDiff
 	for _, exp := range expected {
 		comp := strings.TrimSpace(exp.Component)
 		want := strings.TrimSpace(exp.Version)
 		if comp == "" || want == "" {
 			continue
 		}
 		if exp.Identifier != "" {
 			// Pinned rule: exactly one (component, identifier) slot.
 			label := "firmware[" + comp + "/" + exp.Identifier + "]"
 			got, ok := byCompIdent[fwKey(comp, exp.Identifier)]
 			if !ok {
 				out = append(out, diff(label+".present", "true", "false"))
 				continue
 			}
 			if !strings.EqualFold(strings.TrimSpace(got.Version), want) {
 				out = append(out, diff(label+".version", exp.Version, got.Version))
 			}
 			continue
 		}
 		// No identifier: fan out across every observed snapshot of this
 		// component. Missing is one diff; a mismatching port/controller
 		// emits one diff per mismatch.
 		observed := byComp[strings.ToLower(comp)]
 		if len(observed) == 0 {
 			out = append(out, diff("firmware["+comp+"].present", "true", "false"))
 			continue
 		}
 		for _, got := range observed {
 			if strings.EqualFold(strings.TrimSpace(got.Version), want) {
 				continue
 			}
 			// Label the diff with the observed slot; "*" stands in for
 			// snapshots that carry no identifier.
 			slot := got.Identifier
 			if slot == "" {
 				slot = "*"
 			}
 			out = append(out, diff("firmware["+comp+"/"+slot+"].version", exp.Version, got.Version))
 		}
 	}
 	return out
 }
 // fwKey builds the case-insensitive lookup key for a (component,
 // identifier) pair: both parts lower-cased and joined with "|".
 func fwKey(component, identifier string) string {
 	return strings.ToLower(component + "|" + identifier)
 }
 func diffGPUs(expected, actual []GPUSpec) []model.SpecDiff {
 	if len(expected) == 0 {
 		return nil
@@ -119,3 +119,96 @@ func TestDiffSeverityAlwaysCritical(t *testing.T) {
 		}
 	}
 }
 // TestDiffFirmwareIdentifierMatch: an unpinned bios rule whose version
 // equals the one observed snapshot's version produces no diff.
 func TestDiffFirmwareIdentifierMatch(t *testing.T) {
 	rule := FirmwareSpec{Component: "bios", Version: "3.2"}
 	seen := FirmwareObserved{Component: "bios", Identifier: "system", Version: "3.2"}
 	diffs := DiffFirmware([]FirmwareSpec{rule}, []FirmwareObserved{seen})
 	if len(diffs) != 0 {
 		t.Fatalf("matching bios version should produce no diff, got %+v", diffs)
 	}
 }
 // TestDiffFirmwareVersionMismatch: a differing version yields exactly
 // one diff carrying the expected and actual strings, at critical
 // severity.
 func TestDiffFirmwareVersionMismatch(t *testing.T) {
 	rule := FirmwareSpec{Component: "bios", Version: "3.3"}
 	seen := FirmwareObserved{Component: "bios", Identifier: "system", Version: "3.2"}
 	diffs := DiffFirmware([]FirmwareSpec{rule}, []FirmwareObserved{seen})
 	if n := len(diffs); n != 1 {
 		t.Fatalf("want 1 diff, got %d: %+v", n, diffs)
 	}
 	first := diffs[0]
 	if first.Expected != "3.3" || first.Actual != "3.2" {
 		t.Fatalf("diff expected/actual = %q/%q, want 3.3/3.2", first.Expected, first.Actual)
 	}
 	if first.Severity != "critical" {
 		t.Errorf("severity = %q, want critical", first.Severity)
 	}
 }
 // TestDiffFirmwareMissingComponentPresent: an unpinned rule with zero
 // observed snapshots yields a single "present=false" diff, not N.
 func TestDiffFirmwareMissingComponentPresent(t *testing.T) {
 	rules := []FirmwareSpec{{Component: "bmc", Version: "1.74"}}
 	diffs := DiffFirmware(rules, nil)
 	if n := len(diffs); n != 1 {
 		t.Fatalf("want 1 diff for missing BMC, got %d: %+v", n, diffs)
 	}
 	got := diffs[0]
 	fieldOK := got.Field == "firmware[bmc].present"
 	valuesOK := got.Expected == "true" && got.Actual == "false"
 	if !fieldOK || !valuesOK {
 		t.Fatalf("missing-BMC diff = %+v", got)
 	}
 }
 // TestDiffFirmwareWildcardFanOut: a rule with an empty identifier is
 // checked against every observed snapshot of the component. With one
 // matching port and one mismatching port, only the mismatch is
 // reported, labelled with that port's identifier.
 func TestDiffFirmwareWildcardFanOut(t *testing.T) {
 	const wantVersion = "16.32.1010"
 	rules := []FirmwareSpec{{Component: "nic", Version: wantVersion}}
 	ports := []FirmwareObserved{
 		{Component: "nic", Identifier: "eth0", Version: wantVersion},
 		{Component: "nic", Identifier: "eth1", Version: "14.28.0000"},
 	}
 	diffs := DiffFirmware(rules, ports)
 	if n := len(diffs); n != 1 {
 		t.Fatalf("want 1 diff (mismatched eth1 only), got %d: %+v", n, diffs)
 	}
 	if field := diffs[0].Field; field != "firmware[nic/eth1].version" {
 		t.Errorf("field = %q, want firmware[nic/eth1].version", field)
 	}
 }
 // TestDiffFirmwareIdentifierPin: a rule with an identifier is pinned to
 // that port. Other ports of the same component are not evaluated by
 // it, even when their firmware differs.
 func TestDiffFirmwareIdentifierPin(t *testing.T) {
 	pinned := []FirmwareSpec{{Component: "nic", Identifier: "eth0", Version: "1.0"}}
 	ports := []FirmwareObserved{
 		{Component: "nic", Identifier: "eth0", Version: "1.0"},
 		{Component: "nic", Identifier: "eth1", Version: "9.9"},
 	}
 	diffs := DiffFirmware(pinned, ports)
 	if len(diffs) != 0 {
 		t.Fatalf("pinned rule should ignore other ports, got %+v", diffs)
 	}
 }
 // TestDiffFirmwareIdentifierPinMissing: a pinned rule with no matching
 // observed snapshot yields one present=false diff labelled with the
 // component and identifier.
 func TestDiffFirmwareIdentifierPinMissing(t *testing.T) {
 	pinned := []FirmwareSpec{{Component: "nic", Identifier: "eth0", Version: "1.0"}}
 	diffs := DiffFirmware(pinned, nil)
 	if len(diffs) != 1 || diffs[0].Field != "firmware[nic/eth0].present" {
 		t.Fatalf("want present=false for pinned rule, got %+v", diffs)
 	}
 }
 // TestDiffFirmwareEmptyRuleSkipped: rules with an empty component or an
 // empty version are silently skipped rather than panicking or
 // producing a diff.
 func TestDiffFirmwareEmptyRuleSkipped(t *testing.T) {
 	incomplete := []FirmwareSpec{
 		{Component: "", Version: "x"},
 		{Component: "bios", Version: ""},
 	}
 	seen := []FirmwareObserved{{Component: "bios", Identifier: "system", Version: "3.2"}}
 	diffs := DiffFirmware(incomplete, seen)
 	if len(diffs) != 0 {
 		t.Fatalf("empty rules should skip, got %+v", diffs)
 	}
 }
 // TestDiffFirmwareCaseInsensitive: versions compare case-insensitively
 // after trim, which avoids a spurious diff from ethtool's "FW1234" vs
 // the expected YAML's "fw1234".
 func TestDiffFirmwareCaseInsensitive(t *testing.T) {
 	rule := FirmwareSpec{Component: "nvme_fw", Identifier: "nvme0", Version: "fw1234"}
 	seen := FirmwareObserved{Component: "nvme_fw", Identifier: "nvme0", Version: "FW1234"}
 	diffs := DiffFirmware([]FirmwareSpec{rule}, []FirmwareObserved{seen})
 	if len(diffs) != 0 {
 		t.Fatalf("case-insensitive match expected, got %+v", diffs)
 	}
 }
@@ -0,0 +1,97 @@
 package store
 import (
 	"context"
 	"database/sql"
 	"fmt"
 )
 // FirmwareSnapshot is one row in firmware_snapshots. A run captures
 // many (one per BIOS/BMC/NIC/HBA/microcode/NVMe) so SpecValidate can
 // diff them against the host's expected spec in Phase 4.
 type FirmwareSnapshot struct {
 	ID         int64  // row id; scanned back by ListForRun, not written by Create
 	RunID      int64  // run this snapshot belongs to
 	Component  string // bios|bmc|nic|hba|microcode|nvme_fw
 	Identifier string // slot/serial/device path
 	Version    string
 	Vendor     string
 	RawJSON    string // stored as "{}" when empty on insert
 }
 // Firmware is the CRUD seam for the firmware_snapshots table. The
 // agent's Phase-4 probe POSTs captured rows; the orchestrator stores
 // them. SpecValidate reads them back.
 type Firmware struct {
 	DB *sql.DB // handle every method runs its statements against
 }
 // Create inserts one firmware snapshot and returns the new row's id.
 // Intended to be called once per (run, component, identifier) — the
 // agent probe owns dedup/formatting. An empty RawJSON is stored as
 // "{}". Insert failures are wrapped with "insert firmware".
 func (f *Firmware) Create(ctx context.Context, s FirmwareSnapshot) (int64, error) {
 	payload := s.RawJSON
 	if len(payload) == 0 {
 		payload = "{}"
 	}
 	result, err := f.DB.ExecContext(ctx, `
 		INSERT INTO firmware_snapshots(run_id, component, identifier, version, vendor, raw_json)
 		VALUES(?,?,?,?,?,?)
 	`, s.RunID, s.Component, s.Identifier, s.Version, s.Vendor, payload)
 	if err == nil {
 		return result.LastInsertId()
 	}
 	return 0, fmt.Errorf("insert firmware: %w", err)
 }
 // CreateBatch persists a slice of snapshots under one transaction:
 // either every row is inserted or none is. Agent probe enumerates all
 // components in one pass, so batching wins. An empty slice is a no-op.
 // Rows with an empty RawJSON are stored with "{}".
 //
 // Every failure is wrapped with the step that failed (begin, prepare,
 // the offending component/identifier, or commit); the underlying error
 // remains reachable through errors.Is / errors.As.
 func (f *Firmware) CreateBatch(ctx context.Context, rows []FirmwareSnapshot) error {
 	if len(rows) == 0 {
 		return nil
 	}
 	tx, err := f.DB.BeginTx(ctx, nil)
 	if err != nil {
 		return fmt.Errorf("begin firmware batch: %w", err)
 	}
 	// Rolls back on every early return. After a successful Commit the
 	// rollback reports sql.ErrTxDone, which is deliberately ignored.
 	defer func() { _ = tx.Rollback() }()
 	stmt, err := tx.PrepareContext(ctx, `
 		INSERT INTO firmware_snapshots(run_id, component, identifier, version, vendor, raw_json)
 		VALUES(?,?,?,?,?,?)
 	`)
 	if err != nil {
 		return fmt.Errorf("prepare firmware insert: %w", err)
 	}
 	defer func() { _ = stmt.Close() }()
 	for _, s := range rows {
 		raw := s.RawJSON
 		if raw == "" {
 			raw = "{}"
 		}
 		if _, err := stmt.ExecContext(ctx, s.RunID, s.Component, s.Identifier, s.Version, s.Vendor, raw); err != nil {
 			return fmt.Errorf("insert firmware %s/%s: %w", s.Component, s.Identifier, err)
 		}
 	}
 	if err := tx.Commit(); err != nil {
 		return fmt.Errorf("commit firmware batch: %w", err)
 	}
 	return nil
 }
 // ListForRun returns every firmware snapshot recorded for runID,
 // ordered by row id so the result is stable across calls. Report page
 // + SpecValidate both read this. A run with no snapshots yields a nil
 // slice and a nil error.
 func (f *Firmware) ListForRun(ctx context.Context, runID int64) ([]FirmwareSnapshot, error) {
 	cursor, err := f.DB.QueryContext(ctx, `
 		SELECT id, run_id, component, identifier, version, vendor, raw_json
 		FROM firmware_snapshots WHERE run_id = ? ORDER BY id
 	`, runID)
 	if err != nil {
 		return nil, err
 	}
 	defer cursor.Close()
 	var snapshots []FirmwareSnapshot
 	for cursor.Next() {
 		var snap FirmwareSnapshot
 		scanErr := cursor.Scan(
 			&snap.ID, &snap.RunID, &snap.Component, &snap.Identifier,
 			&snap.Version, &snap.Vendor, &snap.RawJSON,
 		)
 		if scanErr != nil {
 			return nil, scanErr
 		}
 		snapshots = append(snapshots, snap)
 	}
 	// Surface any error that ended iteration early.
 	return snapshots, cursor.Err()
 }
@@ -14,16 +14,30 @@ type Runs struct {
 	DB *sql.DB
 }
 // Create inserts a new run using the default "quick" profile by
 // delegating to CreateWithProfile, and returns whatever that call
 // returns. Older call sites (and most tests) target this form — the
 // profile column's DEFAULT 'quick' on runs takes care of the backfill.
 func (r *Runs) Create(ctx context.Context, hostID int64, tokenHash string, nonDestructive bool) (int64, error) {
 	return r.CreateWithProfile(ctx, hostID, tokenHash, nonDestructive, "quick")
 }
 // CreateWithProfile inserts a new run with an explicit profile
 // ("quick"|"deep"|"soak"). The UI handler is the authoritative caller;
 // empty profile falls back to "quick" so a misconfigured form doesn't
 // leave a row with a blank profile column.
 func (r *Runs) CreateWithProfile(ctx context.Context, hostID int64, tokenHash string, nonDestructive bool, profile string) (int64, error) {
 	if profile == "" {
 		profile = "quick"
 	}
 	now := time.Now().UTC()
 	nd := 0
 	if nonDestructive {
 		nd = 1
 	}
 	res, err := r.DB.ExecContext(ctx, `
-		INSERT INTO runs(host_id, state, agent_token_hash, next_boot_target, started_at, non_destructive)
+		INSERT INTO runs(host_id, state, agent_token_hash, next_boot_target, started_at, non_destructive, profile)
-		VALUES(?,?,?,?,?,?)
+		VALUES(?,?,?,?,?,?,?)
-	`, hostID, string(model.StateQueued), tokenHash, "linux", now, nd)
+	`, hostID, string(model.StateQueued), tokenHash, "linux", now, nd, profile)
 	if err != nil {
 		return 0, fmt.Errorf("insert run: %w", err)
 	}
@@ -107,14 +121,15 @@ func (r *Runs) Get(ctx context.Context, id int64) (*model.Run, error) {
 		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
 		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
 		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
-		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0)
+		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0),
 		       COALESCE(profile,'quick')
 		FROM runs WHERE id = ?
 	`, id)
 	var run model.Run
 	var completedAt sql.NullTime
 	err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
 		&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
-		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive)
+		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile)
 	if errors.Is(err, sql.ErrNoRows) {
 		return nil, ErrNotFound
 	}
@@ -133,7 +148,8 @@ func (r *Runs) LatestForHost(ctx context.Context, hostID int64) (*model.Run, err
 		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
 		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
 		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
-		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0)
+		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0),
 		       COALESCE(profile,'quick')
 		FROM runs WHERE host_id = ?
 		ORDER BY id DESC LIMIT 1
 	`, hostID)
@@ -141,7 +157,7 @@ func (r *Runs) LatestForHost(ctx context.Context, hostID int64) (*model.Run, err
 	var completedAt sql.NullTime
 	err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
 		&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
-		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive)
+		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile)
 	if errors.Is(err, sql.ErrNoRows) {
 		return nil, nil
 	}
@@ -165,7 +181,8 @@ func (r *Runs) ListForHost(ctx context.Context, hostID int64, limit int) ([]mode
 		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
 		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
 		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
-		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0)
+		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0),
 		       COALESCE(profile,'quick')
 		FROM runs
 		WHERE host_id = ?
 		ORDER BY id DESC
@@ -181,7 +198,7 @@ func (r *Runs) ListForHost(ctx context.Context, hostID int64, limit int) ([]mode
 		var completedAt sql.NullTime
 		if err := rows.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
 			&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
-			&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive); err != nil {
+			&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile); err != nil {
 			return nil, err
 		}
 		if completedAt.Valid {
@@ -206,7 +223,8 @@ func (r *Runs) Active(ctx context.Context) ([]model.Run, error) {
 		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
 		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
 		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
-		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0)
+		       COALESCE(override_flags_json,''), COALESCE(non_destructive,0),
 		       COALESCE(profile,'quick')
 		FROM runs
 		WHERE state NOT IN ('Completed','Released','Cancelled')
 		ORDER BY id
@@ -221,7 +239,7 @@ func (r *Runs) Active(ctx context.Context) ([]model.Run, error) {
 		var completedAt sql.NullTime
 		if err := rows.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
 			&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
-			&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive); err != nil {
+			&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile); err != nil {
 			return nil, err
 		}
 		if completedAt.Valid {
@@ -275,7 +293,7 @@ func (r *Runs) FindActiveByMAC(ctx context.Context, mac string) (*model.Run, err
 	var completedAt sql.NullTime
 	err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
 		&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
-		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive)
+		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile)
 	if errors.Is(err, sql.ErrNoRows) {
 		return nil, nil
 	}
@@ -17,11 +17,13 @@ type Stages struct {
 // reaches Inventory; later phases add more executors but the list is fixed.
 var DefaultStageOrder = []string{
 	"Inventory",
 	"Firmware",
 	"SpecValidate",
 	"SMART",
 	"CPUStress",
 	"Storage",
 	"Network",
 	"Burn",
 	"GPU",
 	"PSU",
 	"Reporting",
@@ -0,0 +1,280 @@
 package store
 import (
 	"context"
 	"database/sql"
 	"fmt"
 	"time"
 )
 // Threshold is one per-run threshold row as the store package sees it.
 // It carries the same data as the orchestrator.Threshold value-object,
 // but Op and Severity stay plain strings so that this package never has
 // to import orchestrator on behalf of its callers.
 type Threshold struct {
 	// ID is the row id assigned on insert; RunID is the run the rule
 	// was seeded for.
 	ID, RunID int64
 	// Stage is persisted in the stage_name column. Kind and Key are
 	// stored alongside it, and Op is the comparison in string form.
 	Stage, Kind, Key, Op string
 	// Threshold is the configured bound; Nominal is stored with it.
 	Threshold, Nominal float64
 	// Unit and Severity are kept as strings. Source records where the
 	// rule came from: profile|host_override.
 	Unit, Severity, Source string
 }
 // ThresholdEvaluation is a single recorded comparison of an observed
 // sample against a threshold. One is written for every sample that
 // matched a threshold, whether it passed or breached, and the report
 // page aggregates them to show the operator why a run failed (or was
 // flagged as warning-only).
 type ThresholdEvaluation struct {
 	// ID is the evaluation row id; RunID and ThresholdID link it to
 	// the run and to the threshold it was compared against.
 	ID, RunID, ThresholdID int64
 	// Stage is persisted in the stage_name column; Kind and Key are
 	// stored next to it.
 	Stage, Kind, Key string
 	// TS is the sample timestamp. The record methods substitute the
 	// current UTC time when it is left zero.
 	TS time.Time
 	// Observed is the sampled value that was compared.
 	Observed float64
 	// Passed is persisted as 1 (true) or 0 (false).
 	Passed bool
 }
 // Thresholds is the store-layer access point for the thresholds and
 // threshold_evaluations tables. The surface is kept intentionally
 // narrow: seed rules at run creation, list them for evaluation on each
 // sensor batch, record evaluation results, and aggregate for the report.
 type Thresholds struct {
 	// DB is the handle every method runs its statements against.
 	DB *sql.DB
 }
 // ThresholdSpec is the flat, caller-supplied description of a rule to
 // seed. It bundles the rule with its source so that the
 // ProfileRegistry-driven seed and per-host overrides share one insert
 // path. It is declared here rather than in config so the store layer
 // does not have to import config.
 type ThresholdSpec struct {
 	// Stage, Kind, Key and Op are copied verbatim onto the seeded row.
 	Stage, Kind, Key, Op string
 	// Value becomes the row's Threshold field; Nominal is copied as is.
 	Value, Nominal float64
 	// Unit, Severity and Source are copied verbatim onto the seeded row.
 	Unit, Severity, Source string
 }
 // SeedForRun turns each spec into a Threshold row owned by runID and
 // hands the whole set to CreateBatch. The result is whatever
 // CreateBatch returns: the rows with their IDs filled in, so the
 // evaluator can pin evaluations to them without reading them back.
 func (t *Thresholds) SeedForRun(ctx context.Context, runID int64, specs []ThresholdSpec) ([]Threshold, error) {
 	pending := make([]Threshold, len(specs))
 	for i := range specs {
 		spec := &specs[i]
 		pending[i] = Threshold{
 			RunID:     runID,
 			Stage:     spec.Stage,
 			Kind:      spec.Kind,
 			Key:       spec.Key,
 			Op:        spec.Op,
 			Threshold: spec.Value, // the spec calls the bound "Value"
 			Nominal:   spec.Nominal,
 			Unit:      spec.Unit,
 			Severity:  spec.Severity,
 			Source:    spec.Source,
 		}
 	}
 	return t.CreateBatch(ctx, pending)
 }
 // Create writes one threshold row and returns the id the database
 // assigned to it, so the evaluator can pin evaluations to that row.
 // It is the single-row counterpart of CreateBatch, used on the seed
 // path when the orchestrator materializes per-run rules from the
 // ProfileRegistry.
 func (t *Thresholds) Create(ctx context.Context, th Threshold) (int64, error) {
 	const insertSQL = `
 		INSERT INTO thresholds(run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source)
 		VALUES(?,?,?,?,?,?,?,?,?,?)
 	`
 	result, execErr := t.DB.ExecContext(ctx, insertSQL,
 		th.RunID, th.Stage, th.Kind, th.Key, th.Op,
 		th.Threshold, th.Nominal, th.Unit, th.Severity, th.Source)
 	if execErr != nil {
 		return 0, fmt.Errorf("insert threshold: %w", execErr)
 	}
 	return result.LastInsertId()
 }
 // CreateBatch is the fast path for run seeding: it inserts every row
 // inside a single transaction using one prepared statement.
 //
 // It returns copies of the input rows with ID set to the id assigned
 // on insert, so the caller can drop them into the in-memory evaluator
 // without a follow-up read. An empty input returns (nil, nil) without
 // touching the database. On any failure the transaction is rolled back
 // and no rows are returned; every error is wrapped with the step that
 // failed and still matches the underlying error via errors.Is/As.
 func (t *Thresholds) CreateBatch(ctx context.Context, rows []Threshold) ([]Threshold, error) {
 	if len(rows) == 0 {
 		return nil, nil
 	}
 	tx, err := t.DB.BeginTx(ctx, nil)
 	if err != nil {
 		return nil, fmt.Errorf("begin threshold batch: %w", err)
 	}
 	// After a successful Commit this Rollback only reports that the
 	// transaction is already finished, which is safe to discard.
 	defer func() { _ = tx.Rollback() }()
 	stmt, err := tx.PrepareContext(ctx, `
 		INSERT INTO thresholds(run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source)
 		VALUES(?,?,?,?,?,?,?,?,?,?)
 	`)
 	if err != nil {
 		return nil, fmt.Errorf("prepare threshold insert: %w", err)
 	}
 	defer func() { _ = stmt.Close() }()
 	out := make([]Threshold, 0, len(rows))
 	for _, th := range rows {
 		res, err := stmt.ExecContext(ctx, th.RunID, th.Stage, th.Kind, th.Key, th.Op,
 			th.Threshold, th.Nominal, th.Unit, th.Severity, th.Source)
 		if err != nil {
 			return nil, fmt.Errorf("insert threshold %s/%s: %w", th.Stage, th.Key, err)
 		}
 		id, err := res.LastInsertId()
 		if err != nil {
 			return nil, fmt.Errorf("threshold %s/%s insert id: %w", th.Stage, th.Key, err)
 		}
 		// th is the loop's copy, so the caller's slice is left untouched.
 		th.ID = id
 		out = append(out, th)
 	}
 	if err := tx.Commit(); err != nil {
 		return nil, fmt.Errorf("commit threshold batch: %w", err)
 	}
 	return out, nil
 }
 // ListForRun loads all thresholds stored for runID, ordered by id so
 // the result is stable between calls. The evaluator pulls this on each
 // /sensor batch and expects it to stay cheap (a few tens of rows per
 // run). A run without thresholds yields a nil slice.
 func (t *Thresholds) ListForRun(ctx context.Context, runID int64) ([]Threshold, error) {
 	const query = `
 		SELECT id, run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source
 		FROM thresholds WHERE run_id = ? ORDER BY id
 	`
 	cursor, err := t.DB.QueryContext(ctx, query, runID)
 	if err != nil {
 		return nil, err
 	}
 	defer cursor.Close()
 	var list []Threshold
 	for cursor.Next() {
 		var row Threshold
 		scanErr := cursor.Scan(
 			&row.ID, &row.RunID, &row.Stage, &row.Kind, &row.Key,
 			&row.Op, &row.Threshold, &row.Nominal, &row.Unit, &row.Severity, &row.Source,
 		)
 		if scanErr != nil {
 			return nil, scanErr
 		}
 		list = append(list, row)
 	}
 	// Err reports any failure that ended the iteration early.
 	return list, cursor.Err()
 }
 // RecordEvaluation stores the outcome of one comparison. It is called
 // per matching sample so the run's report keeps a full audit trail
 // ("temp hit 95 at 14:22:03" rather than just "temp failed"). A zero
 // ev.TS is replaced with the current UTC time, and Passed is written
 // as 1 or 0.
 func (t *Thresholds) RecordEvaluation(ctx context.Context, ev ThresholdEvaluation) error {
 	const insertSQL = `
 		INSERT INTO threshold_evaluations(run_id, threshold_id, stage_name, kind, key, ts, observed, passed)
 		VALUES(?,?,?,?,?,?,?,?)
 	`
 	var passedFlag int
 	if ev.Passed {
 		passedFlag = 1
 	}
 	stamp := ev.TS
 	if stamp.IsZero() {
 		stamp = time.Now().UTC()
 	}
 	if _, execErr := t.DB.ExecContext(ctx, insertSQL,
 		ev.RunID, ev.ThresholdID, ev.Stage, ev.Kind, ev.Key, stamp, ev.Observed, passedFlag); execErr != nil {
 		return fmt.Errorf("record evaluation: %w", execErr)
 	}
 	return nil
 }
 // RecordBatch stores a slice of evaluations inside one transaction
 // using a single prepared statement. The agent-handler hot path builds
 // one evaluation per sample and batches them under the same Sensor
 // POST, so the database sees one round-trip instead of N.
 //
 // An empty slice is a no-op. A zero TS on any entry is replaced with
 // the current UTC time, and Passed is written as 1 or 0. If any step
 // fails the transaction is rolled back, nothing from the batch is
 // persisted, and the returned error is wrapped with the failing step
 // (still matching the underlying error via errors.Is/As).
 func (t *Thresholds) RecordBatch(ctx context.Context, evals []ThresholdEvaluation) error {
 	if len(evals) == 0 {
 		return nil
 	}
 	tx, err := t.DB.BeginTx(ctx, nil)
 	if err != nil {
 		return fmt.Errorf("begin eval batch: %w", err)
 	}
 	// After a successful Commit this Rollback only reports that the
 	// transaction is already finished, which is safe to discard.
 	defer func() { _ = tx.Rollback() }()
 	stmt, err := tx.PrepareContext(ctx, `
 		INSERT INTO threshold_evaluations(run_id, threshold_id, stage_name, kind, key, ts, observed, passed)
 		VALUES(?,?,?,?,?,?,?,?)
 	`)
 	if err != nil {
 		return fmt.Errorf("prepare eval insert: %w", err)
 	}
 	defer func() { _ = stmt.Close() }()
 	for _, ev := range evals {
 		passed := 0
 		if ev.Passed {
 			passed = 1
 		}
 		// ev is the loop's copy, so defaulting TS does not modify the
 		// caller's slice.
 		if ev.TS.IsZero() {
 			ev.TS = time.Now().UTC()
 		}
 		if _, err := stmt.ExecContext(ctx, ev.RunID, ev.ThresholdID, ev.Stage, ev.Kind, ev.Key, ev.TS, ev.Observed, passed); err != nil {
 			return fmt.Errorf("insert eval: %w", err)
 		}
 	}
 	if err := tx.Commit(); err != nil {
 		return fmt.Errorf("commit eval batch: %w", err)
 	}
 	return nil
 }
 // ListEvaluations loads the recorded evaluations for a run in ascending
 // id order, so the most recently inserted row comes last. The query is
 // capped at 5000 rows so that a pathological run with a
 // sample-per-second sidecar cannot blow up the report page.
 // NOTE(review): with ORDER BY id LIMIT 5000 the cap keeps the lowest
 // ids, so evaluations past the first 5000 are not returned — confirm
 // that this is the intended side to truncate.
 func (t *Thresholds) ListEvaluations(ctx context.Context, runID int64) ([]ThresholdEvaluation, error) {
 	const query = `
 		SELECT id, run_id, threshold_id, stage_name, kind, key, ts, observed, passed
 		FROM threshold_evaluations WHERE run_id = ?
 		ORDER BY id LIMIT 5000
 	`
 	cursor, err := t.DB.QueryContext(ctx, query, runID)
 	if err != nil {
 		return nil, err
 	}
 	defer cursor.Close()
 	var history []ThresholdEvaluation
 	for cursor.Next() {
 		var (
 			item ThresholdEvaluation
 			flag int // passed column as stored: 1 means passed
 		)
 		scanErr := cursor.Scan(
 			&item.ID, &item.RunID, &item.ThresholdID, &item.Stage, &item.Kind,
 			&item.Key, &item.TS, &item.Observed, &flag,
 		)
 		if scanErr != nil {
 			return nil, scanErr
 		}
 		item.Passed = flag == 1
 		history = append(history, item)
 	}
 	return history, cursor.Err()
 }
 // CriticalBreaches loads the evaluations that trip the "fail the run"
 // gate: rows with passed=0 whose joined threshold has severity
 // 'critical', in ascending id order. The agent-handler calls it at
 // /result close so that an aggregate breach (p99 latency > bound)
 // still flips the run to FailedHolding even when no single sample
 // tripped the fast-fail path.
 func (t *Thresholds) CriticalBreaches(ctx context.Context, runID int64) ([]ThresholdEvaluation, error) {
 	const query = `
 		SELECT e.id, e.run_id, e.threshold_id, e.stage_name, e.kind, e.key, e.ts, e.observed, e.passed
 		FROM threshold_evaluations e
 		JOIN thresholds t ON t.id = e.threshold_id
 		WHERE e.run_id = ? AND e.passed = 0 AND t.severity = 'critical'
 		ORDER BY e.id
 	`
 	cursor, err := t.DB.QueryContext(ctx, query, runID)
 	if err != nil {
 		return nil, err
 	}
 	defer cursor.Close()
 	var breaches []ThresholdEvaluation
 	for cursor.Next() {
 		var (
 			hit  ThresholdEvaluation
 			flag int // passed column as stored; the query filters on 0
 		)
 		scanErr := cursor.Scan(
 			&hit.ID, &hit.RunID, &hit.ThresholdID, &hit.Stage, &hit.Kind,
 			&hit.Key, &hit.TS, &hit.Observed, &flag,
 		)
 		if scanErr != nil {
 			return nil, scanErr
 		}
 		hit.Passed = flag == 1
 		breaches = append(breaches, hit)
 	}
 	return breaches, cursor.Err()
 }
@@ -636,6 +636,21 @@ body.bare main { max-width: none; }
 .run-failed-stage { color: var(--danger); }
 .run-failed-stage strong { font-family: var(--mono); }
 .run-diffs { color: var(--danger); }
 /* Run profile chip: small uppercase monospace badge with fully rounded
    corners. This base rule supplies the neutral (dimmed) appearance. */
 .run-profile-chip {
  display: inline-block;
  font-family: var(--mono);
  font-size: 11px;
  text-transform: uppercase;
  letter-spacing: .04em;
  padding: 2px 8px;
  border-radius: 999px;
  border: 1px solid rgba(255,255,255,.15);
  background: rgba(255,255,255,.05);
  color: var(--text-dim);
 }
 /* Per-profile modifiers: each overrides the chip's text colour, border
    colour and background tint with its own hue. */
 .run-profile-quick { color: var(--accent); border-color: rgba(60,130,246,.45); background: rgba(60,130,246,.08); }
 .run-profile-deep  { color: #e5b94f;       border-color: rgba(229,185,79,.45); background: rgba(229,185,79,.08); }
 .run-profile-soak  { color: #d97a57;       border-color: rgba(217,122,87,.45); background: rgba(217,122,87,.08); }
 .hold-banner {
  background: rgba(229,100,102,.1);
@@ -890,6 +905,17 @@ body.bare main { max-width: none; }
 .host-actions { padding: 0; }
 .host-actions-row { display: flex; gap: 10px; flex-wrap: wrap; align-items: center; }
 .host-nd-toggle { display: inline-flex; gap: 6px; align-items: center; color: var(--text-dim); font-size: 13px; }
 /* Profile picker container: a bordered inline flex row that lays its
    options out side by side, with a right margin separating it from
    whatever follows in the row. */
 .host-profile-picker {
  border: 1px solid var(--border);
  border-radius: var(--radius);
  padding: 6px 10px;
  display: inline-flex;
  gap: 12px;
  align-items: center;
  margin: 0 8px 0 0;
 }
 /* Legend: small dimmed uppercase caption. */
 .host-profile-picker legend { font-size: 11px; color: var(--text-dim); text-transform: uppercase; letter-spacing: .05em; padding: 0 4px; }
 /* Each option label keeps its input and text on one line and shows a
    pointer cursor over the whole label. */
 .host-profile-picker label { display: inline-flex; gap: 4px; align-items: center; font-family: var(--mono); font-size: 13px; cursor: pointer; }
 .in-flight-banner-wrap { display: contents; }
 .in-flight-banner {
@@ -65,7 +65,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
 		var templ_7745c5c3_Var3 string
 		templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String())
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 1, Col: 0}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 1, Col: 0}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
 		if templ_7745c5c3_Err != nil {
@@ -88,7 +88,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
 		var templ_7745c5c3_Var4 string
 		templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(d.Stage.Name)
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 28, Col: 102}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 28, Col: 102}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
 		if templ_7745c5c3_Err != nil {
@@ -110,7 +110,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
 		var templ_7745c5c3_Var6 string
 		templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var5).String())
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 1, Col: 0}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 1, Col: 0}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6))
 		if templ_7745c5c3_Err != nil {
@@ -123,7 +123,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
 		var templ_7745c5c3_Var7 string
 		templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(stageMarker(string(d.Stage.State)))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 30, Col: 105}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 30, Col: 105}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
 		if templ_7745c5c3_Err != nil {
@@ -136,7 +136,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
 		var templ_7745c5c3_Var8 string
 		templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(d.Stage.Name)
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 31, Col: 41}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 31, Col: 41}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
 		if templ_7745c5c3_Err != nil {
@@ -149,7 +149,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
 		var templ_7745c5c3_Var9 string
 		templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(stageDurationFromStage(d.Stage))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 32, Col: 64}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 32, Col: 64}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9))
 		if templ_7745c5c3_Err != nil {
@@ -182,7 +182,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
 		var templ_7745c5c3_Var10 string
 		templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(d.Stage.Name)
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 43, Col: 99}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 43, Col: 99}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10))
 		if templ_7745c5c3_Err != nil {
@@ -195,7 +195,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
 		var templ_7745c5c3_Var11 string
 		templ_7745c5c3_Var11, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("log-%d-%s", d.RunID, d.Stage.Name))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 47, Col: 56}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 47, Col: 56}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var11))
 		if templ_7745c5c3_Err != nil {
@@ -208,7 +208,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
 		var templ_7745c5c3_Var12 string
 		templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("log-%d-%s", d.RunID, d.Stage.Name))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 48, Col: 62}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 48, Col: 62}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12))
 		if templ_7745c5c3_Err != nil {
@@ -102,6 +102,21 @@ templ HostActions(d HostPageData) {
 		<div class="host-actions-row">
 			if hostCanStart(d) {
 				<form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/start", d.Host.ID)) } class="inline host-start-form">
 					<fieldset class="host-profile-picker">
 						<legend>Profile</legend>
 						<label title="~10 min — post-repair sanity: all probes + gates, short budgets">
 							<input type="radio" name="profile" value="quick" checked/>
 							quick
 						</label>
 						<label title="~8–12 h — overnight soak: long CPU/RAM, full-disk fio verify, 30 min network">
 							<input type="radio" name="profile" value="deep"/>
 							deep
 						</label>
 						<label title="≥24 h — week-long burn-in; opt-in when you suspect intermittent faults">
 							<input type="radio" name="profile" value="soak"/>
 							soak
 						</label>
 					</fieldset>
 					<label class="host-nd-toggle">
 						<input type="checkbox" name="non_destructive" value="1"/>
 						Non-destructive (skip wipe-probe + disk writes)
@@ -258,6 +273,16 @@ func hostCanStartIfOnline(d HostPageData) bool {
 	return d.ActiveRun == nil
 }
 // profileChipValue picks the text shown on the run page's profile
 // chip. A non-empty profile is displayed unchanged; an empty one —
 // older runs whose column predates Phase 1 — is displayed as "quick",
 // the prior implicit default.
 func profileChipValue(p string) string {
 	if p != "" {
 		return p
 	}
 	return "quick"
 }
 // runDuration formats the elapsed time for a run using the same buckets
 // as stageDuration. In-flight runs clock from StartedAt to now so the
 // run-page header + runs-table row keep ticking on each SSE push.
@@ -361,7 +361,7 @@ func HostActions(d HostPageData) templ.Component {
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 21, "\" class=\"inline host-start-form\"><label class=\"host-nd-toggle\"><input type=\"checkbox\" name=\"non_destructive\" value=\"1\"> Non-destructive (skip wipe-probe + disk writes)</label> <button type=\"submit\" class=\"btn-primary\">Start vetting</button></form>")
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 21, "\" class=\"inline host-start-form\"><fieldset class=\"host-profile-picker\"><legend>Profile</legend> <label title=\"~10 min — post-repair sanity: all probes + gates, short budgets\"><input type=\"radio\" name=\"profile\" value=\"quick\" checked> quick</label> <label title=\"~8–12 h — overnight soak: long CPU/RAM, full-disk fio verify, 30 min network\"><input type=\"radio\" name=\"profile\" value=\"deep\"> deep</label> <label title=\"≥24 h — week-long burn-in; opt-in when you suspect intermittent faults\"><input type=\"radio\" name=\"profile\" value=\"soak\"> soak</label></fieldset><label class=\"host-nd-toggle\"><input type=\"checkbox\" name=\"non_destructive\" value=\"1\"> Non-destructive (skip wipe-probe + disk writes)</label> <button type=\"submit\" class=\"btn-primary\">Start vetting</button></form>")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
@@ -383,7 +383,7 @@ func HostActions(d HostPageData) templ.Component {
 		var templ_7745c5c3_Var19 templ.SafeURL
 		templ_7745c5c3_Var19, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/delete", d.Host.ID)))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 116, Col: 89}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 131, Col: 89}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var19))
 		if templ_7745c5c3_Err != nil {
@@ -428,7 +428,7 @@ func InFlightBanner(d HostPageData) templ.Component {
 		var templ_7745c5c3_Var21 string
 		templ_7745c5c3_Var21, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-inflight-%d", d.Host.ID))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 128, Col: 51}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 143, Col: 51}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var21))
 		if templ_7745c5c3_Err != nil {
@@ -441,7 +441,7 @@ func InFlightBanner(d HostPageData) templ.Component {
 		var templ_7745c5c3_Var22 string
 		templ_7745c5c3_Var22, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-inflight-%d", d.Host.ID))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 130, Col: 57}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 145, Col: 57}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var22))
 		if templ_7745c5c3_Err != nil {
@@ -459,7 +459,7 @@ func InFlightBanner(d HostPageData) templ.Component {
 			var templ_7745c5c3_Var23 templ.SafeURL
 			templ_7745c5c3_Var23, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/runs/%d", d.ActiveRun.ID)))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 134, Col: 92}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 149, Col: 92}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var23))
 			if templ_7745c5c3_Err != nil {
@@ -472,7 +472,7 @@ func InFlightBanner(d HostPageData) templ.Component {
 			var templ_7745c5c3_Var24 string
 			templ_7745c5c3_Var24, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", d.ActiveRun.ID))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 135, Col: 74}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 150, Col: 74}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var24))
 			if templ_7745c5c3_Err != nil {
@@ -485,7 +485,7 @@ func InFlightBanner(d HostPageData) templ.Component {
 			var templ_7745c5c3_Var25 string
 			templ_7745c5c3_Var25, templ_7745c5c3_Err = templ.JoinStringErrs(tileStatus(d.ActiveRun))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 136, Col: 59}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 151, Col: 59}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var25))
 			if templ_7745c5c3_Err != nil {
@@ -541,7 +541,7 @@ func HostEmptyState(d HostPageData) templ.Component {
 			var templ_7745c5c3_Var27 templ.SafeURL
 			templ_7745c5c3_Var27, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/start", d.Host.ID)))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 152, Col: 88}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 167, Col: 88}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var27))
 			if templ_7745c5c3_Err != nil {
@@ -655,7 +655,7 @@ func RunRow(d RunRowData) templ.Component {
 		var templ_7745c5c3_Var31 string
 		templ_7745c5c3_Var31, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("runrow-%d", d.Run.ID))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 204, Col: 41}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 219, Col: 41}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var31))
 		if templ_7745c5c3_Err != nil {
@@ -681,7 +681,7 @@ func RunRow(d RunRowData) templ.Component {
 		var templ_7745c5c3_Var33 string
 		templ_7745c5c3_Var33, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("runrow-%d", d.Run.ID))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 206, Col: 47}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 221, Col: 47}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var33))
 		if templ_7745c5c3_Err != nil {
@@ -694,7 +694,7 @@ func RunRow(d RunRowData) templ.Component {
 		var templ_7745c5c3_Var34 templ.SafeURL
 		templ_7745c5c3_Var34, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/runs/%d", d.Run.ID)))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 210, Col: 61}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 225, Col: 61}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var34))
 		if templ_7745c5c3_Err != nil {
@@ -707,7 +707,7 @@ func RunRow(d RunRowData) templ.Component {
 		var templ_7745c5c3_Var35 string
 		templ_7745c5c3_Var35, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("#%d", d.Run.ID))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 210, Col: 94}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 225, Col: 94}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var35))
 		if templ_7745c5c3_Err != nil {
@@ -742,7 +742,7 @@ func RunRow(d RunRowData) templ.Component {
 		var templ_7745c5c3_Var38 string
 		templ_7745c5c3_Var38, templ_7745c5c3_Err = templ.JoinStringErrs(tileStatus(&d.Run))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 213, Col: 92}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 228, Col: 92}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var38))
 		if templ_7745c5c3_Err != nil {
@@ -755,7 +755,7 @@ func RunRow(d RunRowData) templ.Component {
 		var templ_7745c5c3_Var39 string
 		templ_7745c5c3_Var39, templ_7745c5c3_Err = templ.JoinStringErrs(relativeTime(d.Run.StartedAt))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 215, Col: 62}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 230, Col: 62}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var39))
 		if templ_7745c5c3_Err != nil {
@@ -768,7 +768,7 @@ func RunRow(d RunRowData) templ.Component {
 		var templ_7745c5c3_Var40 string
 		templ_7745c5c3_Var40, templ_7745c5c3_Err = templ.JoinStringErrs(runDuration(&d.Run))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 216, Col: 53}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 231, Col: 53}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var40))
 		if templ_7745c5c3_Err != nil {
@@ -805,7 +805,7 @@ func RunRow(d RunRowData) templ.Component {
 			var templ_7745c5c3_Var43 string
 			templ_7745c5c3_Var43, templ_7745c5c3_Err = templ.JoinStringErrs(name)
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 221, Col: 94}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 236, Col: 94}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var43))
 			if templ_7745c5c3_Err != nil {
@@ -823,7 +823,7 @@ func RunRow(d RunRowData) templ.Component {
 		var templ_7745c5c3_Var44 templ.SafeURL
 		templ_7745c5c3_Var44, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/runs/%d", d.Run.ID)))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 226, Col: 84}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 241, Col: 84}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var44))
 		if templ_7745c5c3_Err != nil {
@@ -867,6 +867,16 @@ func hostCanStartIfOnline(d HostPageData) bool {
 	return d.ActiveRun == nil
 }
 // profileChipValue picks the text shown on the run page's profile
 // chip. A non-empty profile is displayed unchanged; an empty one —
 // older runs whose column predates Phase 1 — is displayed as "quick",
 // the prior implicit default.
 func profileChipValue(p string) string {
 	if p != "" {
 		return p
 	}
 	return "quick"
 }
 // runDuration formats the elapsed time for a run using the same buckets
 // as stageDuration. In-flight runs clock from StartedAt to now so the
 // run-page header + runs-table row keep ticking on each SSE push.
@@ -55,7 +55,7 @@ func HostTile(t TileData) templ.Component {
 		var templ_7745c5c3_Var3 string
 		templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("host-%d", t.Host.ID))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 19, Col: 40}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 19, Col: 40}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
 		if templ_7745c5c3_Err != nil {
@@ -68,7 +68,7 @@ func HostTile(t TileData) templ.Component {
 		var templ_7745c5c3_Var4 string
 		templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String())
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 1, Col: 0}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 1, Col: 0}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
 		if templ_7745c5c3_Err != nil {
@@ -81,7 +81,7 @@ func HostTile(t TileData) templ.Component {
 		var templ_7745c5c3_Var5 string
 		templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("tile-%d", t.Host.ID))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 21, Col: 46}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 21, Col: 46}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5))
 		if templ_7745c5c3_Err != nil {
@@ -94,7 +94,7 @@ func HostTile(t TileData) templ.Component {
 		var templ_7745c5c3_Var6 templ.SafeURL
 		templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d", t.Host.ID)))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 24, Col: 80}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 24, Col: 80}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6))
 		if templ_7745c5c3_Err != nil {
@@ -107,7 +107,7 @@ func HostTile(t TileData) templ.Component {
 		var templ_7745c5c3_Var7 string
 		templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs("Open " + t.Host.Name)
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 24, Col: 117}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 24, Col: 117}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
 		if templ_7745c5c3_Err != nil {
@@ -120,7 +120,7 @@ func HostTile(t TileData) templ.Component {
 		var templ_7745c5c3_Var8 string
 		templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(t.Host.Name)
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 26, Col: 39}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 26, Col: 39}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
 		if templ_7745c5c3_Err != nil {
@@ -142,7 +142,7 @@ func HostTile(t TileData) templ.Component {
 		var templ_7745c5c3_Var10 string
 		templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var9).String())
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 1, Col: 0}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 1, Col: 0}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10))
 		if templ_7745c5c3_Err != nil {
@@ -155,7 +155,7 @@ func HostTile(t TileData) templ.Component {
 		var templ_7745c5c3_Var11 string
 		templ_7745c5c3_Var11, templ_7745c5c3_Err = templ.JoinStringErrs(lastSeenLabel(t.LastSeenAt))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 28, Col: 95}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 28, Col: 95}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var11))
 		if templ_7745c5c3_Err != nil {
@@ -168,7 +168,7 @@ func HostTile(t TileData) templ.Component {
 		var templ_7745c5c3_Var12 string
 		templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(tileStatus(t.Latest))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 29, Col: 51}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 29, Col: 51}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12))
 		if templ_7745c5c3_Err != nil {
@@ -186,7 +186,7 @@ func HostTile(t TileData) templ.Component {
 			var templ_7745c5c3_Var13 templ.SafeURL
 			templ_7745c5c3_Var13, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/start", t.Host.ID)))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 34, Col: 89}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 34, Col: 89}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var13))
 			if templ_7745c5c3_Err != nil {
@@ -209,7 +209,7 @@ func HostTile(t TileData) templ.Component {
 			var templ_7745c5c3_Var14 templ.SafeURL
 			templ_7745c5c3_Var14, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/cancel", t.Host.ID)))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 44, Col: 90}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 44, Col: 90}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var14))
 			if templ_7745c5c3_Err != nil {
@@ -227,7 +227,7 @@ func HostTile(t TileData) templ.Component {
 			var templ_7745c5c3_Var15 templ.SafeURL
 			templ_7745c5c3_Var15, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/reports/%d", t.Latest.ID)))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 48, Col: 88}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 48, Col: 88}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var15))
 			if templ_7745c5c3_Err != nil {
@@ -36,7 +36,7 @@ func Layout(title string) templ.Component {
 		var templ_7745c5c3_Var2 string
 		templ_7745c5c3_Var2, templ_7745c5c3_Err = templ.JoinStringErrs(title)
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/layout.templ`, Line: 9, Col: 17}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `layout.templ`, Line: 9, Col: 17}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var2))
 		if templ_7745c5c3_Err != nil {
@@ -86,7 +86,7 @@ func BareLayout(title string) templ.Component {
 		var templ_7745c5c3_Var4 string
 		templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(title)
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/layout.templ`, Line: 39, Col: 17}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `layout.templ`, Line: 39, Col: 17}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
 		if templ_7745c5c3_Err != nil {
@@ -40,11 +40,13 @@ func runStateRank(s model.RunState) int {
 		model.StateWaitingReboot,
 		model.StateBooting,
 		model.StateInventoryCheck,
 		model.StateFirmware,
 		model.StateSpecValidate,
 		model.StateSMART,
 		model.StateCPUStress,
 		model.StateStorage,
 		model.StateNetwork,
 		model.StateBurn,
 		model.StateGPU,
 		model.StatePSU,
 		model.StateReporting,
@@ -205,11 +207,13 @@ func firstStageState(run *model.Run) model.RunState {
 func stageStateByName(name string) (model.RunState, bool) {
 	m := map[string]model.RunState{
 		"Inventory":    model.StateInventoryCheck,
 		"Firmware":     model.StateFirmware,
 		"SpecValidate": model.StateSpecValidate,
 		"SMART":        model.StateSMART,
 		"CPUStress":    model.StateCPUStress,
 		"Storage":      model.StateStorage,
 		"Network":      model.StateNetwork,
 		"Burn":         model.StateBurn,
 		"GPU":          model.StateGPU,
 		"PSU":          model.StatePSU,
 		"Reporting":    model.StateReporting,
@@ -48,11 +48,13 @@ func runStateRank(s model.RunState) int {
 		model.StateWaitingReboot,
 		model.StateBooting,
 		model.StateInventoryCheck,
 		model.StateFirmware,
 		model.StateSpecValidate,
 		model.StateSMART,
 		model.StateCPUStress,
 		model.StateStorage,
 		model.StateNetwork,
 		model.StateBurn,
 		model.StateGPU,
 		model.StatePSU,
 		model.StateReporting,
@@ -213,11 +215,13 @@ func firstStageState(run *model.Run) model.RunState {
 func stageStateByName(name string) (model.RunState, bool) {
 	m := map[string]model.RunState{
 		"Inventory":    model.StateInventoryCheck,
 		"Firmware":     model.StateFirmware,
 		"SpecValidate": model.StateSpecValidate,
 		"SMART":        model.StateSMART,
 		"CPUStress":    model.StateCPUStress,
 		"Storage":      model.StateStorage,
 		"Network":      model.StateNetwork,
 		"Burn":         model.StateBurn,
 		"GPU":          model.StateGPU,
 		"PSU":          model.StatePSU,
 		"Reporting":    model.StateReporting,
@@ -312,7 +316,7 @@ func Pipeline(nodes []PipelineNode) templ.Component {
 				var templ_7745c5c3_Var3 string
 				templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String())
 				if templ_7745c5c3_Err != nil {
-					return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 1, Col: 0}
+					return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 1, Col: 0}
 				}
 				_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
 				if templ_7745c5c3_Err != nil {
@@ -339,7 +343,7 @@ func Pipeline(nodes []PipelineNode) templ.Component {
 			var templ_7745c5c3_Var5 string
 			templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var4).String())
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 1, Col: 0}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 1, Col: 0}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5))
 			if templ_7745c5c3_Err != nil {
@@ -361,7 +365,7 @@ func Pipeline(nodes []PipelineNode) templ.Component {
 			var templ_7745c5c3_Var7 string
 			templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var6).String())
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 1, Col: 0}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 1, Col: 0}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
 			if templ_7745c5c3_Err != nil {
@@ -374,7 +378,7 @@ func Pipeline(nodes []PipelineNode) templ.Component {
 			var templ_7745c5c3_Var8 string
 			templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(stageMarker(n.State))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 275, Col: 77}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 279, Col: 77}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
 			if templ_7745c5c3_Err != nil {
@@ -387,7 +391,7 @@ func Pipeline(nodes []PipelineNode) templ.Component {
 			var templ_7745c5c3_Var9 string
 			templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(n.Name)
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 276, Col: 36}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 280, Col: 36}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9))
 			if templ_7745c5c3_Err != nil {
@@ -400,7 +404,7 @@ func Pipeline(nodes []PipelineNode) templ.Component {
 			var templ_7745c5c3_Var10 string
 			templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(stageDuration(n))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 277, Col: 50}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 281, Col: 50}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10))
 			if templ_7745c5c3_Err != nil {
@@ -454,7 +458,7 @@ func PipelineSection(run *model.Run, nodes []PipelineNode) templ.Component {
 		var templ_7745c5c3_Var12 string
 		templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("pipeline-%d", run.ID))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 292, Col: 41}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 296, Col: 41}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12))
 		if templ_7745c5c3_Err != nil {
@@ -467,7 +471,7 @@ func PipelineSection(run *model.Run, nodes []PipelineNode) templ.Component {
 		var templ_7745c5c3_Var13 string
 		templ_7745c5c3_Var13, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("pipeline-%d", run.ID))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 294, Col: 47}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 298, Col: 47}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var13))
 		if templ_7745c5c3_Err != nil {
@@ -8,26 +8,28 @@ import (
 )
 // node indexes for the default pipeline layout: pre-stages (3) + stage
-// rows (9) + terminal Completed (1) = 13 nodes.
+// rows (11) + terminal Completed (1) = 15 nodes.
 const (
 	idxQueued        = 0
 	idxWaitingReboot = 1
 	idxBooting       = 2
 	idxInventory     = 3
-	idxSpecValidate  = 4
+	idxFirmware      = 4
-	idxSMART         = 5
+	idxSpecValidate  = 5
-	idxCPUStress     = 6
+	idxSMART         = 6
-	idxStorage       = 7
+	idxCPUStress     = 7
-	idxNetwork       = 8
+	idxStorage       = 8
-	idxGPU           = 9
+	idxNetwork       = 9
-	idxPSU           = 10
+	idxBurn          = 10
-	idxReporting     = 11
+	idxGPU           = 11
-	idxCompleted     = 12
+	idxPSU           = 12
 	idxReporting     = 13
 	idxCompleted     = 14
 )
 // seedStages returns a fresh all-pending stage slice in the canonical order.
 func seedStages() []model.Stage {
-	names := []string{"Inventory", "SpecValidate", "SMART", "CPUStress", "Storage", "Network", "GPU", "PSU", "Reporting"}
+	names := []string{"Inventory", "Firmware", "SpecValidate", "SMART", "CPUStress", "Storage", "Network", "Burn", "GPU", "PSU", "Reporting"}
 	out := make([]model.Stage, len(names))
 	for i, n := range names {
 		out[i] = model.Stage{Name: n, Ordinal: i, State: model.StagePending}
@@ -37,10 +39,10 @@ func seedStages() []model.Stage {
 func TestBuildPipeline_NoRun(t *testing.T) {
 	nodes := BuildPipeline(nil, nil)
-	// Ghost pipeline: 3 pre-stages + 9 stage ghosts + 1 terminal = 13
+	// Ghost pipeline: 3 pre-stages + 11 stage ghosts + 1 terminal = 15
 	// nodes, all pending.
-	if len(nodes) != 13 {
+	if len(nodes) != 15 {
-		t.Fatalf("len = %d, want 13", len(nodes))
+		t.Fatalf("len = %d, want 15", len(nodes))
 	}
 	for i, n := range nodes {
 		if n.State != "pending" {
@@ -56,8 +58,8 @@ func TestBuildPipeline_NoRun(t *testing.T) {
 func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
 	run := &model.Run{State: model.StateWaitingReboot}
 	nodes := BuildPipeline(run, nil)
-	if len(nodes) != 13 {
+	if len(nodes) != 15 {
-		t.Fatalf("len = %d, want 13", len(nodes))
+		t.Fatalf("len = %d, want 15", len(nodes))
 	}
 	if nodes[idxQueued].State != "passed" {
 		t.Errorf("Queued = %q, want passed", nodes[idxQueued].State)
@@ -65,7 +67,7 @@ func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
 	if nodes[idxWaitingReboot].State != "running" {
 		t.Errorf("WaitingReboot = %q, want running", nodes[idxWaitingReboot].State)
 	}
-	// All 9 stage ghosts must be pending — nothing has started yet.
+	// All 11 stage ghosts must be pending — nothing has started yet.
 	for i := idxInventory; i <= idxReporting; i++ {
 		if nodes[i].State != "pending" {
 			t.Errorf("%s (ghost) = %q, want pending", nodes[i].Name, nodes[i].State)
@@ -81,19 +83,20 @@ func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
 // pending ghosts rather than silently disappearing.
 func TestBuildPipeline_GhostStagesDuringStage(t *testing.T) {
 	run := &model.Run{State: model.StateSMART}
-	// Only Inventory + SpecValidate seeded; SMART onwards are ghosts.
+	// Only Inventory + Firmware + SpecValidate seeded; SMART onwards are ghosts.
 	stages := []model.Stage{
 		{Name: "Inventory", Ordinal: 0, State: model.StagePassed},
-		{Name: "SpecValidate", Ordinal: 1, State: model.StagePassed},
+		{Name: "Firmware", Ordinal: 1, State: model.StagePassed},
 		{Name: "SpecValidate", Ordinal: 2, State: model.StagePassed},
 	}
 	nodes := BuildPipeline(run, stages)
-	if len(nodes) != 13 {
+	if len(nodes) != 15 {
-		t.Fatalf("len = %d, want 13", len(nodes))
+		t.Fatalf("len = %d, want 15", len(nodes))
 	}
 	if nodes[idxSMART].State != "running" {
 		t.Errorf("SMART (ghost) = %q, want running", nodes[idxSMART].State)
 	}
-	for _, i := range []int{idxCPUStress, idxStorage, idxNetwork, idxGPU, idxPSU, idxReporting} {
+	for _, i := range []int{idxCPUStress, idxStorage, idxNetwork, idxBurn, idxGPU, idxPSU, idxReporting} {
 		if nodes[i].State != "pending" {
 			t.Errorf("%s (ghost) = %q, want pending", nodes[i].Name, nodes[i].State)
 		}
@@ -103,12 +106,13 @@ func TestBuildPipeline_GhostStagesDuringStage(t *testing.T) {
 func TestBuildPipeline_Running(t *testing.T) {
 	run := &model.Run{State: model.StateSMART}
 	stages := seedStages()
-	stages[0].State = model.StagePassed
+	stages[0].State = model.StagePassed // Inventory
-	stages[1].State = model.StagePassed
+	stages[1].State = model.StagePassed // Firmware
-	stages[2].State = model.StageRunning
+	stages[2].State = model.StagePassed // SpecValidate
 	stages[3].State = model.StageRunning // SMART
 	nodes := BuildPipeline(run, stages)
-	if len(nodes) != 13 {
+	if len(nodes) != 15 {
-		t.Fatalf("len = %d, want 13", len(nodes))
+		t.Fatalf("len = %d, want 15", len(nodes))
 	}
 	// Pre-stages are all past for a run that has reached SMART.
 	for i := idxQueued; i <= idxBooting; i++ {
@@ -136,10 +140,10 @@ func TestBuildPipeline_Running(t *testing.T) {
 func TestBuildPipeline_Failed(t *testing.T) {
 	run := &model.Run{State: model.StateFailedHolding, FailedStage: "Storage"}
 	stages := seedStages()
-	for i := 0; i <= 3; i++ {
+	for i := 0; i <= 4; i++ {
 		stages[i].State = model.StagePassed
 	}
-	stages[4].State = model.StageFailed // Storage
+	stages[5].State = model.StageFailed // Storage
 	nodes := BuildPipeline(run, stages)
 	// Pre-stages are past a run that reached Storage.
 	for i := idxQueued; i <= idxBooting; i++ {
@@ -150,7 +154,7 @@ func TestBuildPipeline_Failed(t *testing.T) {
 	if nodes[idxStorage].State != "failed" {
 		t.Errorf("Storage = %q, want failed", nodes[idxStorage].State)
 	}
-	for _, i := range []int{idxNetwork, idxGPU, idxPSU, idxReporting} {
+	for _, i := range []int{idxNetwork, idxBurn, idxGPU, idxPSU, idxReporting} {
 		if nodes[i].State != "skipped" {
 			t.Errorf("%s = %q, want skipped", nodes[i].Name, nodes[i].State)
 		}
@@ -64,7 +64,7 @@ func Registration(form RegistrationForm) templ.Component {
 				var templ_7745c5c3_Var3 string
 				templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(form.Error)
 				if templ_7745c5c3_Err != nil {
-					return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 22, Col: 35}
+					return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 22, Col: 35}
 				}
 				_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
 				if templ_7745c5c3_Err != nil {
@@ -83,7 +83,7 @@ func Registration(form RegistrationForm) templ.Component {
 				var templ_7745c5c3_Var4 string
 				templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs("curl -fsSL " + form.QuickRegisterURL + "/register/quick.sh | sudo bash")
 				if templ_7745c5c3_Err != nil {
-					return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 28, Col: 108}
+					return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 28, Col: 108}
 				}
 				_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
 				if templ_7745c5c3_Err != nil {
@@ -101,7 +101,7 @@ func Registration(form RegistrationForm) templ.Component {
 			var templ_7745c5c3_Var5 string
 			templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(form.Name)
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 38, Col: 55}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 38, Col: 55}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5))
 			if templ_7745c5c3_Err != nil {
@@ -114,7 +114,7 @@ func Registration(form RegistrationForm) templ.Component {
 			var templ_7745c5c3_Var6 string
 			templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(form.MAC)
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 42, Col: 53}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 42, Col: 53}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6))
 			if templ_7745c5c3_Err != nil {
@@ -127,7 +127,7 @@ func Registration(form RegistrationForm) templ.Component {
 			var templ_7745c5c3_Var7 string
 			templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(form.WoLBroadcastIP)
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 47, Col: 78}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 47, Col: 78}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
 			if templ_7745c5c3_Err != nil {
@@ -140,7 +140,7 @@ func Registration(form RegistrationForm) templ.Component {
 			var templ_7745c5c3_Var8 string
 			templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(defaultPort(form.WoLPort))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 51, Col: 78}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 51, Col: 78}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
 			if templ_7745c5c3_Err != nil {
@@ -153,7 +153,7 @@ func Registration(form RegistrationForm) templ.Component {
 			var templ_7745c5c3_Var9 string
 			templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(form.ExpectedSpecYAML)
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 56, Col: 127}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 56, Col: 127}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9))
 			if templ_7745c5c3_Err != nil {
@@ -166,7 +166,7 @@ func Registration(form RegistrationForm) templ.Component {
 			var templ_7745c5c3_Var10 string
 			templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(form.Notes)
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 60, Col: 51}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 60, Col: 51}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10))
 			if templ_7745c5c3_Err != nil {
@@ -83,6 +83,7 @@ templ RunHeader(d RunPageData) {
 		<div class="run-header-left">
 			<h1 class="run-header-name">{ fmt.Sprintf("Run #%d", d.Run.ID) }</h1>
 			<span class={ "run-status-badge", "run-status-" + tileMood(&d.Run) }>{ tileStatus(&d.Run) }</span>
 			<span class={ "run-profile-chip", "run-profile-" + profileChipValue(d.Run.Profile) }>{ profileChipValue(d.Run.Profile) }</span>
 			<span class="run-duration">{ runDuration(&d.Run) }</span>
 			if d.Run.FailedStage != "" {
 				<span class="run-failed-stage">failed at <strong>{ d.Run.FailedStage }</strong></span>
@@ -286,142 +286,177 @@ func RunHeader(d RunPageData) templ.Component {
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
-		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 15, "</span> <span class=\"run-duration\">")
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 15, "</span> ")
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
-		var templ_7745c5c3_Var15 string
+		var templ_7745c5c3_Var15 = []any{"run-profile-chip", "run-profile-" + profileChipValue(d.Run.Profile)}
-		templ_7745c5c3_Var15, templ_7745c5c3_Err = templ.JoinStringErrs(runDuration(&d.Run))
+		templ_7745c5c3_Err = templ.RenderCSSItems(ctx, templ_7745c5c3_Buffer, templ_7745c5c3_Var15...)
 		if templ_7745c5c3_Err != nil {
 			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 86, Col: 51}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var15))
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
-		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 16, "</span> ")
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 16, "<span class=\"")
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
 		if d.Run.FailedStage != "" {
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 17, "<span class=\"run-failed-stage\">failed at <strong>")
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
 		var templ_7745c5c3_Var16 string
-			templ_7745c5c3_Var16, templ_7745c5c3_Err = templ.JoinStringErrs(d.Run.FailedStage)
+		templ_7745c5c3_Var16, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var15).String())
 		if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 88, Col: 72}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 1, Col: 0}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var16))
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 18, "</strong></span> ")
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 17, "\">")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 		}
 		if d.SpecDiffCritical > 0 {
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 19, "<span class=\"run-diffs bad\">")
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
 		var templ_7745c5c3_Var17 string
-			templ_7745c5c3_Var17, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d critical diff", d.SpecDiffCritical))
+		templ_7745c5c3_Var17, templ_7745c5c3_Err = templ.JoinStringErrs(profileChipValue(d.Run.Profile))
 		if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 91, Col: 85}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 86, Col: 121}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var17))
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 20, "</span>")
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 18, "</span> <span class=\"run-duration\">")
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
-		}
+		var templ_7745c5c3_Var18 string
-		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 21, "</div><div class=\"run-header-right\">")
+		templ_7745c5c3_Var18, templ_7745c5c3_Err = templ.JoinStringErrs(runDuration(&d.Run))
 		if templ_7745c5c3_Err != nil {
-			return templ_7745c5c3_Err
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 87, Col: 51}
 		}
 		if canCancel(&d.Run) {
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 22, "<form method=\"post\" action=\"")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var18 templ.SafeURL
 			templ_7745c5c3_Var18, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/cancel", d.Host.ID)))
 			if templ_7745c5c3_Err != nil {
 				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 96, Col: 90}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var18))
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 23, "\" class=\"inline\" onsubmit=\"return confirm('Cancel run? Destructive stages may leave the host in an intermediate state requiring manual cleanup.');\"><button type=\"submit\" class=\"btn-danger\">Cancel run</button></form>")
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 19, "</span> ")
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
-		}
+		if d.Run.FailedStage != "" {
-		if canOverrideWipe(&d.Run) {
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 20, "<span class=\"run-failed-stage\">failed at <strong>")
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 24, "<form method=\"post\" action=\"")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
-			var templ_7745c5c3_Var19 templ.SafeURL
+			var templ_7745c5c3_Var19 string
-			templ_7745c5c3_Var19, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/override-wipe", d.Host.ID)))
+			templ_7745c5c3_Var19, templ_7745c5c3_Err = templ.JoinStringErrs(d.Run.FailedStage)
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 101, Col: 97}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 89, Col: 72}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var19))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 25, "\" class=\"inline\"><button type=\"submit\" class=\"btn-danger\">Override wipe-probe</button></form>")
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 21, "</strong></span> ")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 		}
-		if hasReport(&d.Run) {
+		if d.SpecDiffCritical > 0 {
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 26, "<a class=\"button-like\" href=\"")
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 22, "<span class=\"run-diffs bad\">")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
-			var templ_7745c5c3_Var20 templ.SafeURL
+			var templ_7745c5c3_Var20 string
-			templ_7745c5c3_Var20, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/reports/%d", d.Run.ID)))
+			templ_7745c5c3_Var20, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d critical diff", d.SpecDiffCritical))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 106, Col: 85}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 92, Col: 85}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var20))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 27, "\" target=\"_blank\" rel=\"noopener\">View report</a> ")
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 23, "</span>")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 		}
-		if d.Run.State.IsTerminal() {
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 24, "</div><div class=\"run-header-right\">")
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 28, "<form method=\"post\" action=\"")
+		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
 		if canCancel(&d.Run) {
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 25, "<form method=\"post\" action=\"")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var21 templ.SafeURL
-			templ_7745c5c3_Var21, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/start", d.Host.ID)))
+			templ_7745c5c3_Var21, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/cancel", d.Host.ID)))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 109, Col: 89}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 97, Col: 90}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var21))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 29, "\" class=\"inline\"><button type=\"submit\" class=\"btn-primary\">Start new run</button></form>")
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 26, "\" class=\"inline\" onsubmit=\"return confirm('Cancel run? Destructive stages may leave the host in an intermediate state requiring manual cleanup.');\"><button type=\"submit\" class=\"btn-danger\">Cancel run</button></form>")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 		}
-		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 30, "</div></header>")
+		if canOverrideWipe(&d.Run) {
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 27, "<form method=\"post\" action=\"")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var22 templ.SafeURL
 			templ_7745c5c3_Var22, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/override-wipe", d.Host.ID)))
 			if templ_7745c5c3_Err != nil {
 				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 102, Col: 97}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var22))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 28, "\" class=\"inline\"><button type=\"submit\" class=\"btn-danger\">Override wipe-probe</button></form>")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 		}
 		if hasReport(&d.Run) {
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 29, "<a class=\"button-like\" href=\"")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var23 templ.SafeURL
 			templ_7745c5c3_Var23, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/reports/%d", d.Run.ID)))
 			if templ_7745c5c3_Err != nil {
 				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 107, Col: 85}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var23))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 30, "\" target=\"_blank\" rel=\"noopener\">View report</a> ")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 		}
 		if d.Run.State.IsTerminal() {
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 31, "<form method=\"post\" action=\"")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var24 templ.SafeURL
 			templ_7745c5c3_Var24, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/start", d.Host.ID)))
 			if templ_7745c5c3_Err != nil {
 				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 110, Col: 89}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var24))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 32, "\" class=\"inline\"><button type=\"submit\" class=\"btn-primary\">Start new run</button></form>")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 		}
 		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 33, "</div></header>")
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
@@ -449,83 +484,83 @@ func HoldBanner(d RunPageData) templ.Component {
 			}()
 		}
 		ctx = templ.InitializeContext(ctx)
-		templ_7745c5c3_Var22 := templ.GetChildren(ctx)
+		templ_7745c5c3_Var25 := templ.GetChildren(ctx)
-		if templ_7745c5c3_Var22 == nil {
+		if templ_7745c5c3_Var25 == nil {
-			templ_7745c5c3_Var22 = templ.NopComponent
+			templ_7745c5c3_Var25 = templ.NopComponent
 		}
 		ctx = templ.ClearChildren(ctx)
 		if d.Run.State == model.StateFailedHolding && d.Run.HoldIP != "" {
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 31, "<section id=\"")
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 34, "<section id=\"")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var23 string
 			templ_7745c5c3_Var23, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
 			if templ_7745c5c3_Err != nil {
 				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 124, Col: 47}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var23))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 32, "\" class=\"hold-banner\" sse-swap=\"")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var24 string
 			templ_7745c5c3_Var24, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
 			if templ_7745c5c3_Err != nil {
 				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 126, Col: 53}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var24))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 33, "\" hx-swap=\"outerHTML\"><span class=\"hold-banner-label\">Host is holding — SSH available:</span> <code class=\"hold-ssh\">")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var25 string
 			templ_7745c5c3_Var25, templ_7745c5c3_Err = templ.JoinStringErrs(sshInvocation(d.HoldKeyPath, d.Run.HoldIP))
 			if templ_7745c5c3_Err != nil {
 				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 130, Col: 70}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var25))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 34, "</code></section>")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 		} else {
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 35, "<section id=\"")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var26 string
 			templ_7745c5c3_Var26, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 134, Col: 47}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 125, Col: 47}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var26))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 36, "\" class=\"detail-hold-placeholder\" sse-swap=\"")
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 35, "\" class=\"hold-banner\" sse-swap=\"")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var27 string
 			templ_7745c5c3_Var27, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 136, Col: 53}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 127, Col: 53}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var27))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 37, "\" hx-swap=\"outerHTML\"></section>")
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 36, "\" hx-swap=\"outerHTML\"><span class=\"hold-banner-label\">Host is holding — SSH available:</span> <code class=\"hold-ssh\">")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var28 string
 			templ_7745c5c3_Var28, templ_7745c5c3_Err = templ.JoinStringErrs(sshInvocation(d.HoldKeyPath, d.Run.HoldIP))
 			if templ_7745c5c3_Err != nil {
 				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 131, Col: 70}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var28))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 37, "</code></section>")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 		} else {
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 38, "<section id=\"")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var29 string
 			templ_7745c5c3_Var29, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
 			if templ_7745c5c3_Err != nil {
 				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 135, Col: 47}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var29))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 39, "\" class=\"detail-hold-placeholder\" sse-swap=\"")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var30 string
 			templ_7745c5c3_Var30, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
 			if templ_7745c5c3_Err != nil {
 				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 137, Col: 53}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var30))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 40, "\" hx-swap=\"outerHTML\"></section>")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
@@ -553,138 +588,138 @@ func RunSpecDiffs(d RunPageData) templ.Component {
 			}()
 		}
 		ctx = templ.InitializeContext(ctx)
-		templ_7745c5c3_Var28 := templ.GetChildren(ctx)
+		templ_7745c5c3_Var31 := templ.GetChildren(ctx)
-		if templ_7745c5c3_Var28 == nil {
+		if templ_7745c5c3_Var31 == nil {
-			templ_7745c5c3_Var28 = templ.NopComponent
+			templ_7745c5c3_Var31 = templ.NopComponent
 		}
 		ctx = templ.ClearChildren(ctx)
-		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 38, "<section id=\"")
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 41, "<section id=\"")
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
-		var templ_7745c5c3_Var29 string
+		var templ_7745c5c3_Var32 string
-		templ_7745c5c3_Var29, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-specdiffs-%d", d.Run.ID))
+		templ_7745c5c3_Var32, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-specdiffs-%d", d.Run.ID))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 147, Col: 51}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 148, Col: 51}
 		}
-		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var29))
+		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var32))
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
-		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 39, "\" class=\"detail-section detail-diffs\" sse-swap=\"")
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 42, "\" class=\"detail-section detail-diffs\" sse-swap=\"")
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
 		var templ_7745c5c3_Var30 string
 		templ_7745c5c3_Var30, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-specdiffs-%d", d.Run.ID))
 		if templ_7745c5c3_Err != nil {
 			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 149, Col: 57}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var30))
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
 		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 40, "\" hx-swap=\"outerHTML\">")
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
 		if len(d.SpecDiffs) > 0 {
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 41, "<details")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			if hasCriticalDiff(d.SpecDiffs) {
 				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 42, " open")
 				if templ_7745c5c3_Err != nil {
 					return templ_7745c5c3_Err
 				}
 			}
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 43, "><summary><h2>Spec diffs (")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var31 string
 			templ_7745c5c3_Var31, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", len(d.SpecDiffs)))
 			if templ_7745c5c3_Err != nil {
 				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 154, Col: 66}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var31))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 44, ")</h2></summary><ul class=\"diff-list\">")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			for _, diff := range d.SpecDiffs {
 				var templ_7745c5c3_Var32 = []any{"diff-row", "diff-" + diff.Severity}
 				templ_7745c5c3_Err = templ.RenderCSSItems(ctx, templ_7745c5c3_Buffer, templ_7745c5c3_Var32...)
 				if templ_7745c5c3_Err != nil {
 					return templ_7745c5c3_Err
 				}
 				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 45, "<li class=\"")
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
 		var templ_7745c5c3_Var33 string
-				templ_7745c5c3_Var33, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var32).String())
+		templ_7745c5c3_Var33, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-specdiffs-%d", d.Run.ID))
 		if templ_7745c5c3_Err != nil {
-					return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 1, Col: 0}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 150, Col: 57}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var33))
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
-				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 46, "\"><div class=\"diff-field\">")
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 43, "\" hx-swap=\"outerHTML\">")
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
 		if len(d.SpecDiffs) > 0 {
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 44, "<details")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			if hasCriticalDiff(d.SpecDiffs) {
 				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 45, " open")
 				if templ_7745c5c3_Err != nil {
 					return templ_7745c5c3_Err
 				}
 			}
 			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 46, "><summary><h2>Spec diffs (")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var34 string
-				templ_7745c5c3_Var34, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Field)
+			templ_7745c5c3_Var34, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", len(d.SpecDiffs)))
 			if templ_7745c5c3_Err != nil {
-					return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 158, Col: 43}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 155, Col: 66}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var34))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
-				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, "</div><div class=\"diff-expected\">expected: <code>")
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, ")</h2></summary><ul class=\"diff-list\">")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
-				var templ_7745c5c3_Var35 string
+			for _, diff := range d.SpecDiffs {
-				templ_7745c5c3_Var35, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Expected)
+				var templ_7745c5c3_Var35 = []any{"diff-row", "diff-" + diff.Severity}
-				if templ_7745c5c3_Err != nil {
+				templ_7745c5c3_Err = templ.RenderCSSItems(ctx, templ_7745c5c3_Buffer, templ_7745c5c3_Var35...)
 					return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 159, Col: 65}
 				}
 				_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var35))
 				if templ_7745c5c3_Err != nil {
 					return templ_7745c5c3_Err
 				}
-				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 48, "</code></div><div class=\"diff-actual\">actual: <code>")
+				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 48, "<li class=\"")
 				if templ_7745c5c3_Err != nil {
 					return templ_7745c5c3_Err
 				}
 				var templ_7745c5c3_Var36 string
-				templ_7745c5c3_Var36, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Actual)
+				templ_7745c5c3_Var36, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var35).String())
 				if templ_7745c5c3_Err != nil {
-					return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 160, Col: 59}
+					return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 1, Col: 0}
 				}
 				_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var36))
 				if templ_7745c5c3_Err != nil {
 					return templ_7745c5c3_Err
 				}
-				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 49, "</code></div></li>")
+				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 49, "\"><div class=\"diff-field\">")
 				if templ_7745c5c3_Err != nil {
 					return templ_7745c5c3_Err
 				}
 				var templ_7745c5c3_Var37 string
 				templ_7745c5c3_Var37, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Field)
 				if templ_7745c5c3_Err != nil {
 					return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 159, Col: 43}
 				}
 				_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var37))
 				if templ_7745c5c3_Err != nil {
 					return templ_7745c5c3_Err
 				}
 				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 50, "</div><div class=\"diff-expected\">expected: <code>")
 				if templ_7745c5c3_Err != nil {
 					return templ_7745c5c3_Err
 				}
 				var templ_7745c5c3_Var38 string
 				templ_7745c5c3_Var38, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Expected)
 				if templ_7745c5c3_Err != nil {
 					return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 160, Col: 65}
 				}
 				_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var38))
 				if templ_7745c5c3_Err != nil {
 					return templ_7745c5c3_Err
 				}
 				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 51, "</code></div><div class=\"diff-actual\">actual: <code>")
 				if templ_7745c5c3_Err != nil {
 					return templ_7745c5c3_Err
 				}
 				var templ_7745c5c3_Var39 string
 				templ_7745c5c3_Var39, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Actual)
 				if templ_7745c5c3_Err != nil {
 					return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 161, Col: 59}
 				}
 				_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var39))
 				if templ_7745c5c3_Err != nil {
 					return templ_7745c5c3_Err
 				}
 				templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 52, "</code></div></li>")
 				if templ_7745c5c3_Err != nil {
 					return templ_7745c5c3_Err
 				}
 			}
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 50, "</ul></details>")
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 53, "</ul></details>")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 		}
-		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 51, "</section>")
+		templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 54, "</section>")
 		if templ_7745c5c3_Err != nil {
 			return templ_7745c5c3_Err
 		}
@@ -99,7 +99,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
 		var templ_7745c5c3_Var3 string
 		templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("substep-%d-%s-%d", ss.RunID, ss.StageName, ss.Ordinal))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 63, Col: 74}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 63, Col: 74}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
 		if templ_7745c5c3_Err != nil {
@@ -112,7 +112,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
 		var templ_7745c5c3_Var4 string
 		templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String())
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 1, Col: 0}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 1, Col: 0}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
 		if templ_7745c5c3_Err != nil {
@@ -125,7 +125,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
 		var templ_7745c5c3_Var5 string
 		templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("substep-%d-%s-%d", ss.RunID, ss.StageName, ss.Ordinal))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 65, Col: 80}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 65, Col: 80}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5))
 		if templ_7745c5c3_Err != nil {
@@ -147,7 +147,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
 		var templ_7745c5c3_Var7 string
 		templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var6).String())
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 1, Col: 0}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 1, Col: 0}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
 		if templ_7745c5c3_Err != nil {
@@ -160,7 +160,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
 		var templ_7745c5c3_Var8 string
 		templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(subStepMarker(ss.State))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 68, Col: 96}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 68, Col: 96}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
 		if templ_7745c5c3_Err != nil {
@@ -173,7 +173,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
 		var templ_7745c5c3_Var9 string
 		templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(ss.Name)
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 69, Col: 38}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 69, Col: 38}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9))
 		if templ_7745c5c3_Err != nil {
@@ -186,7 +186,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
 		var templ_7745c5c3_Var10 string
 		templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(subStepDuration(ss))
 		if templ_7745c5c3_Err != nil {
-			return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 70, Col: 54}
+			return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 70, Col: 54}
 		}
 		_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10))
 		if templ_7745c5c3_Err != nil {