deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -0,0 +1,486 @@
+package tests
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os/exec"
+	"runtime"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	"vetting/agent/probes"
+)
+
+// BurnConfig is what the agent passes to Burn: the orchestrator's iperf3
+// server address and port. Durations + concurrency knobs come from
+// Deps.BurnKnobs so they scale with profile.
+type BurnConfig struct {
+	// OrchestratorURL is the orchestrator base URL the iperf3 target host
+	// is derived from. When empty, Burn records the iperf workload as
+	// skipped instead of failing the stage.
+	OrchestratorURL string
+	IperfPort       int // 0 = 5201
+}
+
+// Burn is the concurrent soak stage. Unlike CPUStress (serial
+// CPU→memory) or Storage (serial per disk) it fans out every workload
+// at once: stress-ng hammers CPU + memory, fio drives the allow-listed
+// disks, iperf3 pushes sustained NIC traffic, and two sidecars poll
+// EDAC + PSU rails for the duration of the window.
+//
+// This is where PSU rails actually matter: 12V sag under simultaneous
+// CPU + disk + NIC load is exactly the failure a thermal/power
+// regression produces, and it's invisible to any stage that loads one
+// subsystem at a time. The PSU stage that follows Burn in the pipeline
+// re-samples rails post-window to confirm they settle back to nominal.
+//
+// Burn stays inside the stage framework — it doesn't spawn a parallel
+// stage runner. The goroutine fan-out is local; the stage converges
+// before returning an Outcome so every invariant the orchestrator
+// relies on (serial stage order, single in-flight stage per run) still
+// holds.
+//
+// cfg supplies the iperf3 target; every duration and concurrency knob
+// is read from d.BurnKnobs, with defaults (2-minute window, 2 iperf
+// streams) applied when a knob is unset or non-positive. The returned
+// Outcome fails when any non-skipped workload reports a failure and
+// carries one sub-step entry per workload slot, skipped ones included.
+func Burn(ctx context.Context, d Deps, cfg BurnConfig) Outcome {
+	duration := d.BurnKnobs.Duration
+	if duration <= 0 {
+		duration = 2 * time.Minute
+	}
+	cpuWorkers := resolveCPUWorkers(d.BurnKnobs.CPUWorkers)
+	memPct := clampMemPct(d.BurnKnobs.MemPct)
+	iperfParallel := d.BurnKnobs.IperfParallel
+	if iperfParallel <= 0 {
+		iperfParallel = 2
+	}
+	d.Info(fmt.Sprintf("Burn: window=%s cpu_workers=%d mem_pct=%d iperf_parallel=%d fio_on_spare=%v",
+		duration, cpuWorkers, memPct, iperfParallel, d.BurnKnobs.FioOnSpare))
+
+	// Sidecars run for the lifetime of the window and are cancelled on
+	// return so the main stage converges cleanly. EDAC catches DIMM
+	// bit-flips that appear only under concurrent load; PSU catches
+	// rail sag that only appears when CPU + disk + NIC pull current
+	// simultaneously.
+	sideCtx, sideCancel := context.WithCancel(ctx)
+	defer sideCancel()
+	var sideWG sync.WaitGroup
+	sideWG.Add(2)
+	go runEDACSidecar(sideCtx, &sideWG, d)
+	go runPSUSidecar(sideCtx, &sideWG, d)
+
+	// 30s of slack over the window so a workload that honours its own
+	// --timeout is not killed by the context while it is wrapping up.
+	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
+	defer cancel()
+
+	// Buffer of 4: each of the four workload slots (CPU, memory, fio,
+	// iperf) sends exactly one result, so no sender ever blocks — not
+	// even the synchronous "skipped" sends below.
+	results := make(chan burnSubResult, 4)
+	var wg sync.WaitGroup
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		results <- runBurnCPU(runCtx, d, duration, cpuWorkers)
+	}()
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		results <- runBurnMemory(runCtx, d, duration, memPct)
+	}()
+
+	// fio runs only when explicitly enabled *and* there are allow-listed
+	// disks *and* the run wasn't marked non-destructive. Any of those
+	// missing records a Skipped sub-step so the operator sees why.
+	if d.BurnKnobs.FioOnSpare && len(d.ExpectedDisks) > 0 && !d.NonDestructive {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			results <- runBurnFio(runCtx, d, duration)
+		}()
+	} else {
+		reason := burnFioSkipReason(d)
+		results <- burnSubResult{Name: "Burn fio", Skipped: true, Reason: reason}
+	}
+
+	// iperf requires an orchestrator host. Lab hosts run with the
+	// bundled iperf3 server; without a base URL we can't derive a
+	// target so we skip rather than fail the stage.
+	if cfg.OrchestratorURL != "" {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			results <- runBurnIperf(runCtx, d, duration, cfg.OrchestratorURL, cfg.IperfPort, iperfParallel)
+		}()
+	} else {
+		results <- burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "no orchestrator host"}
+	}
+
+	// Converge: workloads first, then stop and drain the sidecars, and
+	// only then close the channel so the collector's range terminates.
+	wg.Wait()
+	sideCancel()
+	sideWG.Wait()
+	close(results)
+
+	subs, samples, failures := collectBurnResults(results)
+	if d.Sensor != nil && len(samples) > 0 {
+		// Best-effort: a failed post must not flip an otherwise clean
+		// window, but it is surfaced in the run log the same way the
+		// sidecars surface their own post failures.
+		if err := d.Sensor(ctx, samples); err != nil {
+			d.Warn("Burn: workload sample post: " + err.Error())
+		}
+	}
+
+	extras := map[string]any{
+		"duration":       duration.String(),
+		"cpu_workers":    cpuWorkers,
+		"mem_pct":        memPct,
+		"iperf_parallel": iperfParallel,
+		"fio_on_spare":   d.BurnKnobs.FioOnSpare,
+	}
+	if len(failures) > 0 {
+		msg := "Burn workloads failed: " + strings.Join(failures, ", ")
+		d.Error(msg)
+		return Outcome{
+			Passed:   false,
+			Message:  msg,
+			Summary:  fmt.Sprintf("Burn failed (%d of %d workloads)", len(failures), len(subs)),
+			Extras:   extras,
+			SubSteps: subs,
+		}
+	}
+	d.Info(fmt.Sprintf("Burn: %s window passed; %d workloads converged", duration, len(subs)))
+	return Outcome{
+		Passed:   true,
+		Summary:  fmt.Sprintf("Burn %s passed (%d workloads)", duration, len(subs)),
+		Extras:   extras,
+		SubSteps: subs,
+	}
+}
+
+// burnSubResult is the per-workload return type used by the fan-out
+// goroutines. Sample slice is merged into the stage's final /sensor
+// batch; SubStep becomes a row on the /result sub-steps list.
+type burnSubResult struct {
+	// Name is the workload label; it prefixes the failure message and
+	// names the synthesized row when the workload was skipped.
+	Name string
+	// Passed is only consulted for non-skipped results; a false value
+	// adds the workload to the stage's failure list.
+	Passed bool
+	// Skipped makes the collector synthesize a skipped row from Name and
+	// Reason; Samples and SubStep are ignored in that case.
+	Skipped bool
+	Reason  string // why a workload was skipped
+	Err     string // why a workload failed
+	// Samples are appended to the stage-wide batch for non-skipped results.
+	Samples []Sample
+	// SubStep is the row reported for a non-skipped result.
+	SubStep SubStepReport
+}
+
+// collectBurnResults drains the (already closed) fan-out channel and
+// folds it into the three things the stage reports: the sub-step rows,
+// the merged sample batch, and one "name: reason" string per failed
+// workload. Rows appear in channel-receive order, which depends on the
+// order the workloads finished in.
+//
+// Skipped results get a synthesized row so the /result shape stays
+// stable. Results that failed before their workload could build a
+// SubStep (for example when the binary is missing) also get a
+// synthesized row, so the sub-step list never contains an unnamed,
+// untimestamped entry.
+func collectBurnResults(ch <-chan burnSubResult) ([]SubStepReport, []Sample, []string) {
+	var subs []SubStepReport
+	var samples []Sample
+	var failures []string
+	for r := range ch {
+		if r.Skipped {
+			stamp := time.Now().UTC()
+			subs = append(subs, SubStepReport{
+				Name:        r.Name,
+				Skipped:     true,
+				StartedAt:   stamp,
+				CompletedAt: stamp,
+				SummaryJSON: mustJSON(map[string]any{"skipped": true, "reason": r.Reason}),
+			})
+			continue
+		}
+
+		reason := r.Err
+		if reason == "" {
+			reason = "unknown"
+		}
+
+		row := r.SubStep
+		if row.Name == "" {
+			// The workload returned without populating SubStep (early
+			// exit). Report it under the workload's own name instead of
+			// appending a zero-value row.
+			stamp := time.Now().UTC()
+			row = SubStepReport{
+				Name:        r.Name,
+				Passed:      r.Passed,
+				StartedAt:   stamp,
+				CompletedAt: stamp,
+			}
+			if !r.Passed {
+				row.SummaryJSON = mustJSON(map[string]any{"error": reason})
+			}
+		}
+		subs = append(subs, row)
+		samples = append(samples, r.Samples...)
+		if !r.Passed {
+			failures = append(failures, r.Name+": "+reason)
+		}
+	}
+	return subs, samples, failures
+}
+
+// burnFioSkipReason explains, for the operator, why Burn did not start
+// the fio workload. The checks run in a fixed order — knob off, then
+// non-destructive run, then empty disk allow-list — and the first one
+// that applies wins; "disabled" is the catch-all.
+func burnFioSkipReason(d Deps) string {
+	switch {
+	case !d.BurnKnobs.FioOnSpare:
+		return "fio_on_spare knob disabled"
+	case d.NonDestructive:
+		return "non-destructive run"
+	case len(d.ExpectedDisks) == 0:
+		return "no allowlisted disks"
+	default:
+		return "disabled"
+	}
+}
+
+// runBurnCPU hammers all CPU cores with stress-ng for the window. Same
+// shape as CPUStress pass 1 but with shorter label so the sub-step row
+// doesn't collide with the earlier stage's "CPU pass".
+func runBurnCPU(ctx context.Context, d Deps, duration time.Duration, workers int) burnSubResult {
+	if _, err := exec.LookPath("stress-ng"); err != nil {
+		return burnSubResult{Name: "Burn CPU", Err: "stress-ng missing"}
+	}
+	args := []string{
+		"--cpu", strconv.Itoa(workers),
+		"--cpu-method", "all",
+		"--timeout", durationSeconds(duration),
+		"--metrics-brief",
+		"--verify",
+	}
+	d.Info(fmt.Sprintf("Burn: stress-ng %s", strings.Join(args, " ")))
+	pass := runStressPass(ctx, d, "Burn CPU", duration, args)
+	return burnSubResult{
+		Name:    "Burn CPU",
+		Passed:  pass.Passed,
+		Err:     pass.Err,
+		SubStep: subStepFromPass("Burn CPU", pass),
+	}
+}
+
+// runBurnMemory drives a single --vm worker sized at memPct of
+// MemAvailable, capped so the kernel + agent + other workloads still
+// have headroom. Clamping happens here rather than in resolveBurnKnobs
+// so the cap is computed against real live memory each run.
+func runBurnMemory(ctx context.Context, d Deps, duration time.Duration, memPct int) burnSubResult {
+	if _, err := exec.LookPath("stress-ng"); err != nil {
+		return burnSubResult{Name: "Burn memory", Err: "stress-ng missing"}
+	}
+	avail, err := memAvailableBytes()
+	if err != nil {
+		return burnSubResult{Name: "Burn memory", Err: "read MemAvailable: " + err.Error()}
+	}
+	// Budget = avail * memPct / 100, then subtract the standard headroom.
+	// If the result is below the memory-pass floor we record a skipped
+	// row instead — the window is too tight to be meaningful on this box.
+	budget := int64(float64(avail) * float64(memPct) / 100.0)
+	cap := budget - memHeadroomBytes
+	if cap < memFloorBytes {
+		return burnSubResult{
+			Name:    "Burn memory",
+			Skipped: true,
+			Reason:  fmt.Sprintf("budget %s below floor %s after headroom", humanBytes(budget), humanBytes(memFloorBytes)),
+		}
+	}
+	args := []string{
+		"--vm", "1",
+		"--vm-bytes", strconv.FormatInt(cap, 10),
+		"--vm-keep",
+		"--timeout", durationSeconds(duration),
+		"--metrics-brief",
+		"--verify",
+	}
+	d.Info(fmt.Sprintf("Burn: stress-ng memory cap=%s (%d%% of MemAvailable)", humanBytes(cap), memPct))
+	pass := runStressPass(ctx, d, "Burn memory", duration, args)
+	return burnSubResult{
+		Name:    "Burn memory",
+		Passed:  pass.Passed,
+		Err:     pass.Err,
+		SubStep: subStepFromPass(fmt.Sprintf("Burn memory (cap %s)", humanBytes(cap)), pass),
+	}
+}
+
+// runBurnFio runs fio_sample against the first allow-listed disk for
+// the window. Reuses runFioVerify + parseFioJSON so the samples line
+// up with what Storage emits. Using fio_sample (bounded by --size)
+// keeps Burn's write volume predictable regardless of profile.
+func runBurnFio(ctx context.Context, d Deps, duration time.Duration) burnSubResult {
+	if _, err := exec.LookPath("fio"); err != nil {
+		return burnSubResult{Name: "Burn fio", Err: "fio missing"}
+	}
+	targets := resolveTargets(d.ExpectedDisks)
+	if len(targets) == 0 {
+		return burnSubResult{Name: "Burn fio", Skipped: true, Reason: "no allow-listed disks present"}
+	}
+	t := targets[0]
+	opts := fioOpts{
+		Mode:    "fio_sample",
+		Size:    "512MiB",
+		Runtime: duration,
+		BS:      "4k",
+		RW:      "randrw",
+		Verify:  "md5",
+	}
+	start := time.Now()
+	d.Info(fmt.Sprintf("Burn: fio %s on %s (%s window)", opts.Mode, t.Device, duration))
+	fr := runFioVerify(ctx, t.Device, opts)
+	end := time.Now()
+
+	sub := SubStepReport{
+		Name:        "Burn fio " + t.Device,
+		Passed:      fr.Error == "",
+		StartedAt:   start,
+		CompletedAt: end,
+		SummaryJSON: mustJSON(fr),
+	}
+	out := burnSubResult{Name: "Burn fio", SubStep: sub, Passed: fr.Error == "", Err: fr.Error}
+	if fr.Error == "" {
+		out.Samples = append(out.Samples,
+			Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
+			Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
+		)
+		if fr.ReadP99Us > 0 {
+			out.Samples = append(out.Samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
+		}
+		if fr.WriteP99Us > 0 {
+			out.Samples = append(out.Samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
+		}
+	}
+	return out
+}
+
+// runBurnIperf drives iperf3 -P N for the window. Reuses parseIperfJSON
+// so the same (mbps, retrans, bytesSent) extraction the Network stage
+// uses applies here too. Samples emitted as Burn-scoped keys so the
+// dashboard can tell at-a-glance which window they came from.
+func runBurnIperf(ctx context.Context, d Deps, duration time.Duration, orchestratorURL string, port, parallel int) burnSubResult {
+	if _, err := exec.LookPath("iperf3"); err != nil {
+		return burnSubResult{Name: "Burn iperf", Err: "iperf3 missing"}
+	}
+	host, err := deriveHost(orchestratorURL)
+	if err != nil || host == "" {
+		return burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "can't derive orchestrator host"}
+	}
+	if port == 0 {
+		port = 5201
+	}
+	if parallel < 1 {
+		parallel = 1
+	}
+	args := []string{
+		"-c", host,
+		"-p", strconv.Itoa(port),
+		"-t", strconv.Itoa(int(duration.Seconds())),
+		"-P", strconv.Itoa(parallel),
+		"-J",
+	}
+	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
+	defer cancel()
+	start := time.Now()
+	out, err := exec.CommandContext(runCtx, "iperf3", args...).Output()
+	end := time.Now()
+	if err != nil {
+		return burnSubResult{
+			Name:    "Burn iperf",
+			Err:     "iperf3 client error: " + err.Error(),
+			SubStep: SubStepReport{
+				Name:        "Burn iperf",
+				StartedAt:   start,
+				CompletedAt: end,
+				SummaryJSON: mustJSON(map[string]any{"error": err.Error(), "stderr_tail": tailLines(string(out), 20)}),
+			},
+		}
+	}
+	mbps, retrans, bytesSent, _, perr := parseIperfJSON(out)
+	if perr != nil {
+		return burnSubResult{
+			Name:    "Burn iperf",
+			Err:     "parse iperf3 json: " + perr.Error(),
+			SubStep: SubStepReport{
+				Name:        "Burn iperf",
+				StartedAt:   start,
+				CompletedAt: end,
+				SummaryJSON: mustJSON(map[string]any{"error": perr.Error()}),
+			},
+		}
+	}
+
+	samples := []Sample{{Kind: "iperf", Key: "burn/throughput_mbps", Value: mbps, Unit: "Mbps"}}
+	if bytesSent > 0 {
+		packets := float64(bytesSent) / 1460.0
+		if packets > 0 {
+			samples = append(samples, Sample{
+				Kind: "nic_retrans", Key: "burn/rate",
+				Value: float64(retrans) / packets, Unit: "rate",
+			})
+		}
+	}
+	passed := mbps > 0
+	errMsg := ""
+	if !passed {
+		errMsg = "zero throughput from iperf3"
+	}
+	return burnSubResult{
+		Name:    "Burn iperf",
+		Passed:  passed,
+		Err:     errMsg,
+		Samples: samples,
+		SubStep: SubStepReport{
+			Name:        fmt.Sprintf("Burn iperf (P=%d)", parallel),
+			Passed:      passed,
+			StartedAt:   start,
+			CompletedAt: end,
+			SummaryJSON: mustJSON(map[string]any{
+				"throughput_mbps": mbps,
+				"retransmits":     retrans,
+				"bytes_sent":      bytesSent,
+				"parallel":        parallel,
+			}),
+		},
+	}
+}
+
+// runPSUSidecar samples the PSU rails every 5s until ctx is cancelled
+// and posts each scan through d.Sensor as a batch of psu_volt samples.
+// The threshold evaluator then applies the same within_pct gates used
+// by the PSU stage — a 12V rail sagging to 10.5V under load will fire
+// the critical threshold mid-Burn and the run will flip into
+// FailedHolding without waiting for the post-Burn PSU stage to catch it.
+//
+// The sidecar is a no-op when d.Sensor is nil. It always marks wg done
+// on exit; a failed post is logged as a warning and the loop carries on.
+func runPSUSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
+	defer wg.Done()
+	if d.Sensor == nil {
+		return
+	}
+	ticker := time.NewTicker(5 * time.Second)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+		}
+
+		rails := scanPSURails()
+		if len(rails) == 0 {
+			continue
+		}
+		batch := make([]Sample, len(rails))
+		for i := range rails {
+			batch[i] = Sample{Kind: "psu_volt", Key: rails[i].Label, Value: rails[i].Volts, Unit: "V"}
+		}
+		// Each post gets its own 5s budget so a stalled orchestrator
+		// cannot wedge the sidecar past one tick.
+		postCtx, postCancel := context.WithTimeout(ctx, 5*time.Second)
+		if postErr := d.Sensor(postCtx, batch); postErr != nil {
+			d.Warn("Burn: PSU sample post: " + postErr.Error())
+		}
+		postCancel()
+	}
+}
+
+// resolveCPUWorkers turns the cpu_workers knob into a worker count. A
+// strictly positive integer is used as given; everything else — empty,
+// "all" in any letter case, zero, negatives, unparseable text — means
+// one worker per CPU reported by runtime.NumCPU.
+func resolveCPUWorkers(raw string) int {
+	if raw != "" && !strings.EqualFold(raw, "all") {
+		if parsed, err := strconv.Atoi(raw); err == nil && parsed > 0 {
+			return parsed
+		}
+	}
+	return runtime.NumCPU()
+}
+
+// clampMemPct normalises the mem_pct knob. Zero or a negative value
+// means "not set" and yields the 50% default; any other value is forced
+// into the [10, 90] band. Above 90 the burner would crowd the kernel +
+// agent + fio + iperf3 workers off the page cache.
+func clampMemPct(pct int) int {
+	switch {
+	case pct <= 0:
+		return 50
+	case pct < 10:
+		return 10
+	case pct > 90:
+		return 90
+	default:
+		return pct
+	}
+}
+
+// mustJSON marshals v for a sub-step summary and never fails: when v
+// cannot be encoded, the result is a small JSON object of the form
+// {"marshal_error": "<message>"} instead.
+//
+// The fallback is itself produced by json.Marshal so that an error
+// message containing quotes or backslashes is escaped properly; a
+// hand-concatenated literal would yield invalid JSON in that case.
+func mustJSON(v any) json.RawMessage {
+	b, err := json.Marshal(v)
+	if err == nil {
+		return b
+	}
+	fallback, ferr := json.Marshal(map[string]string{"marshal_error": err.Error()})
+	if ferr != nil {
+		// Encoding a string map is not expected to fail; keep the
+		// "never fails" contract with a fixed, valid payload anyway.
+		return json.RawMessage(`{"marshal_error":"unencodable marshal error"}`)
+	}
+	return fallback
+}
+
+// Keep this file's probes import referenced. Nothing else in this file
+// names the package directly: the EDAC sidecar that Burn starts
+// (runEDACSidecar, which calls probes.EDAC) is defined alongside
+// CPUStress, so without this anchor the import above would be unused.
+var _ = probes.EDAC
@@ -0,0 +1,58 @@
+package tests
+
+import (
+	"runtime"
+	"testing"
+)
+
+// TestResolveCPUWorkers covers the three parse branches: empty/"all"
+// falls back to NumCPU, a valid integer is used verbatim, and garbage
+// also falls back to NumCPU rather than returning zero. Zero workers
+// would make stress-ng a no-op and silently defeat Burn's CPU load.
+func TestResolveCPUWorkers(t *testing.T) {
+	np := runtime.NumCPU()
+	cases := []struct {
+		name string
+		in   string
+		want int
+	}{
+		{"empty defaults to NumCPU", "", np},
+		{"all defaults to NumCPU", "all", np},
+		{"ALL is case-insensitive", "ALL", np},
+		{"explicit integer", "3", 3},
+		{"negative falls back", "-1", np},
+		{"zero falls back", "0", np},
+		{"garbage falls back", "lots", np},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := resolveCPUWorkers(tc.in); got != tc.want {
+				t.Errorf("resolveCPUWorkers(%q) = %d, want %d", tc.in, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestClampMemPct ensures the mem_pct knob never drives the memory
+// burner into OOM territory (upper clamp) or into uselessness (lower
+// clamp). Zero is treated as "use default 50" so a missing knob in an
+// older orchestrator's claim response doesn't collapse the workload.
+func TestClampMemPct(t *testing.T) {
+	cases := []struct {
+		in, want int
+	}{
+		{0, 50},   // default
+		{-10, 50}, // negative treated as default
+		{5, 10},   // below lower band → clamp up
+		{10, 10},
+		{50, 50},
+		{90, 90},
+		{95, 90}, // above upper band → clamp down
+		{1000, 90},
+	}
+	for _, tc := range cases {
+		if got := clampMemPct(tc.in); got != tc.want {
+			t.Errorf("clampMemPct(%d) = %d, want %d", tc.in, got, tc.want)
+		}
+	}
+}
@@ -11,7 +11,10 @@ import (
 	"runtime"
 	"strconv"
 	"strings"
+	"sync"
 	"time"
+
+	"vetting/agent/probes"
 )

 // CPUStress runs stress-ng as two serial passes. The previous shape
@@ -55,11 +58,28 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
 	extras := map[string]any{"cores": cores}
 	var subs []SubStepReport

+	// EDAC sidecar runs for the lifetime of the stage; cancelled on
+	// return. It polls /sys/devices/system/edac/mc/*/{ce,ue}_count and
+	// posts the current counters so the server-side threshold evaluator
+	// can gate edac_ue > 0 → fail the run. Zero-valued poll falls back
+	// to 10s — the same cadence rasdaemon uses by default.
+	sideCtx, sideCancel := context.WithCancel(ctx)
+	defer sideCancel()
+	var sideWG sync.WaitGroup
+	sideWG.Add(1)
+	go runEDACSidecar(sideCtx, &sideWG, d)
+
+	// Per-profile durations come from Deps; zero values (missing knobs
+	// or legacy orchestrator) fall back to the package default so the
+	// stage always has a defined budget.
+	cpuDur := nonzeroDur(d.CPUStressKnobs.CPUPass, cpuPassDuration)
+	memDur := nonzeroDur(d.CPUStressKnobs.MemPass, memPassDuration)
+
 	// Pass 1: CPU
-	cpu := runStressPass(ctx, d, "CPU", cpuPassDuration, []string{
+	cpu := runStressPass(ctx, d, "CPU", cpuDur, []string{
 		"--cpu", strconv.Itoa(cores),
 		"--cpu-method", "all",
-		"--timeout", durationSeconds(cpuPassDuration),
+		"--timeout", durationSeconds(cpuDur),
 		"--metrics-brief",
 		"--verify",
 	})
@@ -104,11 +124,11 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
 			SubSteps: subs,
 		}
 	}
-	mem := runStressPass(ctx, d, "memory", memPassDuration, []string{
+	mem := runStressPass(ctx, d, "memory", memDur, []string{
 		"--vm", "1",
 		"--vm-bytes", strconv.FormatInt(cap, 10),
 		"--vm-keep",
-		"--timeout", durationSeconds(memPassDuration),
+		"--timeout", durationSeconds(memDur),
 		"--metrics-brief",
 		"--verify",
 	})
@@ -133,6 +153,64 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
 	}
 }

+// runEDACSidecar polls /sys EDAC counters on d.CPUStressKnobs.EDACPoll
+// cadence (or 10s fallback) for the lifetime of the stage ctx, emitting
+// one sample per (memory-controller × {ce,ue}) pair on each tick. A
+// single failing read is tolerated: the next tick picks up the counter.
+//
+// This is where the critical edac_ue threshold becomes a hard-fail: as
+// soon as a UE counter advances past 0, the server-side evaluator trips
+// and flips the run into FailedHolding. The sidecar emits whether or
+// not stress-ng is still running; that keeps the signal live during
+// inter-pass gaps.
+//
+// MCE counts are intentionally not sampled here — they require
+// rasdaemon or mcelog and vary by live-image packaging. The threshold
+// rule for mce stays seeded (so the DB shape is stable) but only fires
+// once a matching kind lands, which is a follow-up.
+//
+// The sidecar is shared by more than one stage (CPUStress and Burn both
+// start it), so its warning is not attributed to a particular stage. It
+// is a no-op when d.Sensor is nil and always marks wg done on exit.
+func runEDACSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
+	defer wg.Done()
+	if d.Sensor == nil {
+		return
+	}
+	poll := d.CPUStressKnobs.EDACPoll
+	if poll <= 0 {
+		poll = 10 * time.Second
+	}
+	t := time.NewTicker(poll)
+	defer t.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-t.C:
+			edac := probes.EDAC()
+			if len(edac) == 0 {
+				// Nothing readable this tick; try again on the next one.
+				continue
+			}
+			batch := make([]Sample, 0, len(edac))
+			for _, s := range edac {
+				batch = append(batch, Sample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit})
+			}
+			// Each post gets its own 5s budget so a stalled orchestrator
+			// cannot hold the sidecar past a tick.
+			sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+			if err := d.Sensor(sendCtx, batch); err != nil {
+				d.Warn("EDAC sidecar: sample post: " + err.Error())
+			}
+			cancel()
+		}
+	}
+}
+
+// nonzeroDur returns fallback unless override is strictly positive, in
+// which case override wins. A zero (or negative) override therefore
+// reads as "not set", which spares callers a separate ok flag.
+func nonzeroDur(override, fallback time.Duration) time.Duration {
+	if override <= 0 {
+		return fallback
+	}
+	return override
+}
+
 // subStepFromPass projects a stressPass into a SubStepReport — shared by
 // both passes and by the mid-stage early-return paths so the UI always
 // sees exactly one row per pass, even on failure.
@@ -0,0 +1,24 @@
+// fake_dmidecode simulates `dmidecode -t bios` for unit tests of the
+// firmware probe's BIOS parser. Prints deterministic output modeled on
+// a real Supermicro host; exits 0 regardless of flags.
+package main
+
+import "fmt"
+
+func main() {
+	fmt.Println(`# dmidecode 3.3
+Getting SMBIOS data from sysfs.
+SMBIOS 3.2.0 present.
+
+Handle 0x0000, DMI type 0, 26 bytes
+BIOS Information
+	Vendor: American Megatrends Inc.
+	Version: 3.2
+	Release Date: 07/15/2021
+	Address: 0xF0000
+	Runtime Size: 64 kB
+	ROM Size: 32 MB
+	Characteristics:
+		PCI is supported
+		BIOS is upgradeable`)
+}
@@ -0,0 +1,22 @@
+// Package fakes is the umbrella for deterministic stand-ins for
+// external probe binaries that Vetting's stage code normally shells
+// out to (stress-ng, fio, iperf3, dmidecode, ethtool, nvidia-smi,
+// mcelog, nvme). Each real binary gets its own subpackage under
+// fakes/<name>/ with `package main` and a main() that prints golden
+// output — build with `go build -o <tmp>/<name> ./agent/tests/fakes/<name>`
+// and point a test's tests.Deps.LookPath at <tmp>/<name>.
+//
+// The seam in tests is tests.Deps.LookPath: when non-nil, stage code
+// that is wired to the seam uses it instead of os/exec.LookPath (check
+// the stage under test — some stages may still call exec.LookPath
+// directly). Outside tests, nil LookPath means "use the real binary on
+// $PATH" — stages continue to work on production hosts without the
+// fakes package around.
+//
+// How to add a new fake:
+//  1. Create agent/tests/fakes/<binaryname>/main.go.
+//  2. Write `package main` with a main() that prints exactly the
+//     bytes the real tool would produce for the input you care to
+//     simulate. Determinism > completeness — tests want a known
+//     sample, not a realistic one.
+//  3. In the unit test, compile the fake into t.TempDir() with
+//     `go build -o` before the test body runs, then point
+//     tests.Deps.LookPath at the resulting binary.
+package fakes
@@ -0,0 +1,18 @@
+// fake_stress_ng simulates stress-ng for unit tests. Accepts (and
+// ignores) any flag, sleeps briefly so callers that measure wall-clock
+// see a non-zero elapsed, and prints the "passed" lines CPUStress
+// expects. Exits 0.
+package main
+
+import (
+	"fmt"
+	"os"
+	"time"
+)
+
+// main logs the arguments it was given to stderr, pauses for 50ms so
+// wall-clock measurements see a non-zero elapsed time, then prints the
+// two stress-ng "info" lines to stdout and returns.
+func main() {
+	argv := os.Args[1:]
+	fmt.Fprintln(os.Stderr, "fake_stress_ng invoked:", argv)
+	time.Sleep(50 * time.Millisecond)
+	for _, line := range []string{
+		"stress-ng: info:  [1] dispatching hogs: 1 cpu",
+		"stress-ng: info:  [1] successful run completed in 0.05s",
+	} {
+		fmt.Println(line)
+	}
+}
@@ -9,19 +9,27 @@ import (
 	"strconv"
 	"strings"
 	"time"
+
+	"vetting/agent/probes"
 )

 // NetworkConfig is what the agent passes to Network: the orchestrator's
-// iperf3 server address and port. We derive host from OrchestratorURL.
+// iperf3 server address, port, and the per-profile duration.
 type NetworkConfig struct {
 	OrchestratorURL string
 	IperfPort       int // 0 = 5201
 	Duration        time.Duration
 }

-// Network runs iperf3 against the orchestrator's bundled server. Records
-// bandwidth as a measurement; fails if iperf3 is missing, the server
-// isn't reachable, or throughput is zero.
+// Network runs iperf3 against the orchestrator's bundled server for
+// the profile-configured duration. Records throughput as a measurement;
+// records per-interface rx/tx error-rate deltas as nic_retrans samples
+// so the server-side threshold gate (`nic_retrans rate < 0.001`) fires
+// on a flaky PHY or a wire that drops half its packets under load.
+//
+// Failure cases: iperf3 missing, server unreachable, zero throughput.
+// Zero throughput is treated as a hard failure — an iperf that finished
+// cleanly but pushed zero bytes is indistinguishable from a bad run.
 func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
 	if _, err := exec.LookPath("iperf3"); err != nil {
 		// Live image ships iperf3; absence means packaging regression.
@@ -51,6 +59,11 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
 		duration = 10 * time.Second
 	}

+	// Snapshot /proc/net/dev before the test so we can attribute any
+	// error-count growth to *this stage's* traffic. The same snapshot
+	// taken after iperf returns is the end of the window.
+	netStart := indexNetDev(probes.NetDev())
+
 	args := []string{
 		"-c", host,
 		"-p", strconv.Itoa(port),
@@ -72,7 +85,7 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
 			Extras:  map[string]any{"stderr_tail": tailLines(string(out), 20)},
 		}
 	}
-	mbps, parsed, err := parseIperfJSON(out)
+	mbps, retrans, bytesSent, parsed, err := parseIperfJSON(out)
 	if err != nil {
 		d.Error("Network: parse iperf3 output: " + err.Error())
 		return Outcome{
@@ -82,12 +95,58 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
 			Extras:  map[string]any{"raw": string(out)},
 		}
 	}
+
+	netEnd := indexNetDev(probes.NetDev())
+	netDelta := diffNetDev(netStart, netEnd)
+
+	samples := []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}}
+
+	// iperf-derived retrans rate: retrans_count / packet_count_estimate.
+	// TCP typical MTU 1500; payload ~1460. We divide bytes by 1460 to
+	// approximate packets. This keeps the rate bounded in [0, 1].
+	if bytesSent > 0 {
+		packets := float64(bytesSent) / 1460.0
+		if packets > 0 {
+			samples = append(samples, Sample{
+				Kind:  "nic_retrans",
+				Key:   "iperf/rate",
+				Value: float64(retrans) / packets,
+				Unit:  "rate",
+			})
+		}
+	}
+
+	// Per-interface error-rate deltas. A flaky cable typically surfaces
+	// as tx_errs or tx_drop on the originating interface, not inside
+	// iperf's own tally.
+	for iface, delta := range netDelta {
+		if delta.TxBytes > 0 {
+			packets := float64(delta.TxBytes) / 1460.0
+			if packets > 0 {
+				rate := float64(delta.TxErrs+delta.TxDrop) / packets
+				samples = append(samples, Sample{
+					Kind: "nic_retrans", Key: iface + "/rate", Value: rate, Unit: "rate",
+				})
+			}
+		}
+		// Diagnostic raw counts so the report can show which interface
+		// bled. These don't fire a threshold today but are useful for
+		// post-mortem.
+		samples = append(samples,
+			Sample{Kind: "nic_errs", Key: iface + "/rx", Value: float64(delta.RxErrs + delta.RxDrop), Unit: "count"},
+			Sample{Kind: "nic_errs", Key: iface + "/tx", Value: float64(delta.TxErrs + delta.TxDrop), Unit: "count"},
+		)
+	}
+
 	if d.Sensor != nil {
-		_ = d.Sensor(ctx, []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}})
+		_ = d.Sensor(ctx, samples)
 	}

 	extras := map[string]any{
 		"throughput_mbps": mbps,
+		"retransmits":     retrans,
+		"bytes_sent":      bytesSent,
+		"net_delta":       netDelta,
 		"iperf_end":       parsed,
 	}
 	if mbps <= 0 {
@@ -98,14 +157,55 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
 			Extras:  extras,
 		}
 	}
-	d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps", mbps))
+	d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps (retransmits=%d)", mbps, retrans))
 	return Outcome{
 		Passed:  true,
-		Summary: fmt.Sprintf("%.1f Mbps to %s", mbps, host),
+		Summary: fmt.Sprintf("%.1f Mbps to %s (retransmits=%d)", mbps, host, retrans),
 		Extras:  extras,
 	}
 }

+// indexNetDev turns a NetDev snapshot slice into a lookup keyed by
+// interface name, so diffNetDev can pair a start and an end reading by
+// name instead of scanning one slice for every entry of the other. If
+// a name occurs more than once, the later snapshot wins.
+func indexNetDev(snaps []probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
+	byIface := make(map[string]probes.NetDevSnapshot, len(snaps))
+	for i := range snaps {
+		byIface[snaps[i].Iface] = snaps[i]
+	}
+	return byIface
+}
+
+// diffNetDev computes end − start for each interface present in both
+// snapshots. An interface that dropped away mid-run is dropped from
+// the result (can't compute a delta). Underflow (end < start, rare
+// after a counter reset) is clamped to 0.
+func diffNetDev(start, end map[string]probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
+	out := map[string]probes.NetDevSnapshot{}
+	for iface, e := range end {
+		s, ok := start[iface]
+		if !ok {
+			continue
+		}
+		out[iface] = probes.NetDevSnapshot{
+			Iface:   iface,
+			RxBytes: subU64(e.RxBytes, s.RxBytes),
+			RxErrs:  subU64(e.RxErrs, s.RxErrs),
+			RxDrop:  subU64(e.RxDrop, s.RxDrop),
+			TxBytes: subU64(e.TxBytes, s.TxBytes),
+			TxErrs:  subU64(e.TxErrs, s.TxErrs),
+			TxDrop:  subU64(e.TxDrop, s.TxDrop),
+		}
+	}
+	return out
+}
+
+// subU64 is a saturating subtraction: it returns a − b, or 0 when b is
+// larger than a, so an unsigned counter delta can never wrap around.
+func subU64(a, b uint64) uint64 {
+	if a >= b {
+		return a - b
+	}
+	return 0
+}
+
 // deriveHost pulls the hostname out of an https://host:port base URL.
 func deriveHost(raw string) (string, error) {
 	if raw == "" {
@@ -119,18 +219,22 @@ func deriveHost(raw string) (string, error) {
 	return strings.TrimSpace(h), nil
 }

-// parseIperfJSON pulls end.sum_sent.bits_per_second out of iperf3 -J.
-// Returns (Mbps, full-json-map, err).
-func parseIperfJSON(b []byte) (float64, map[string]any, error) {
+// parseIperfJSON pulls end.sum_sent.bits_per_second and retransmits out
+// of iperf3 -J. Returns (Mbps, retransmits, bytes_sent, full-end-map, err).
+func parseIperfJSON(b []byte) (float64, int64, int64, map[string]any, error) {
 	var top map[string]any
 	if err := json.Unmarshal(b, &top); err != nil {
-		return 0, nil, err
+		return 0, 0, 0, nil, err
 	}
 	end, ok := top["end"].(map[string]any)
 	if !ok {
-		return 0, top, fmt.Errorf("missing end")
+		return 0, 0, 0, nil, fmt.Errorf("missing end")
 	}
-	// iperf3 reports either sum_sent (when -R not set) or sum_received.
+	// Pull the first sum that carries bits_per_second; retransmits +
+	// bytes live there too for TCP.
+	var mbps float64
+	var retrans int64
+	var bytesSent int64
 	for _, key := range []string{"sum_sent", "sum_received", "sum"} {
 		sum, ok := end[key].(map[string]any)
 		if !ok {
@@ -140,7 +244,17 @@ func parseIperfJSON(b []byte) (float64, map[string]any, error) {
 		if !ok {
 			continue
 		}
-		return bps / 1_000_000, end, nil
+		mbps = bps / 1_000_000
+		if r, ok := sum["retransmits"].(float64); ok {
+			retrans = int64(r)
+		}
+		if bs, ok := sum["bytes"].(float64); ok {
+			bytesSent = int64(bs)
+		}
+		break
 	}
-	return 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
+	if mbps == 0 {
+		return 0, 0, 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
+	}
+	return mbps, retrans, bytesSent, end, nil
 }
@@ -0,0 +1,192 @@
+package tests
+
+import (
+	"encoding/json"
+	"testing"
+
+	"vetting/agent/probes"
+)
+
+// TestParseIperfJSON_SumSent checks that throughput (as Mbps),
+// retransmits, and bytes sent are all read from end.sum_sent, the
+// object the fixture models on a TCP run's summary.
+func TestParseIperfJSON_SumSent(t *testing.T) {
+	raw := `{
+		"end": {
+			"sum_sent": {
+				"bits_per_second": 950000000,
+				"retransmits": 42,
+				"bytes": 1187500000
+			}
+		}
+	}`
+	gotMbps, gotRetrans, gotBytes, _, err := parseIperfJSON([]byte(raw))
+	if err != nil {
+		t.Fatalf("parseIperfJSON: %v", err)
+	}
+	if gotMbps != 950 {
+		t.Errorf("mbps = %v, want 950", gotMbps)
+	}
+	if gotRetrans != 42 {
+		t.Errorf("retransmits = %d, want 42", gotRetrans)
+	}
+	if gotBytes != 1187500000 {
+		t.Errorf("bytesSent = %d, want 1187500000", gotBytes)
+	}
+}
+
+// TestParseIperfJSON_MissingEnd: output with no "end" object (e.g. a
+// partial or aborted run) must produce an error, not a zero result.
+func TestParseIperfJSON_MissingEnd(t *testing.T) {
+	_, _, _, _, err := parseIperfJSON([]byte(`{"start": {}}`))
+	if err == nil {
+		t.Errorf("expected error on iperf output missing end block")
+	}
+}
+
+// TestParseIperfJSON_ZeroBps: a parseable result reporting zero
+// throughput must come back as an error so the stage fails fast — an
+// iperf run that moved no bits cannot be told apart from a broken one.
+func TestParseIperfJSON_ZeroBps(t *testing.T) {
+	zero := []byte(`{"end": {"sum_sent": {"bits_per_second": 0}}}`)
+	if _, _, _, _, err := parseIperfJSON(zero); err == nil {
+		t.Errorf("expected error when bits_per_second is 0")
+	}
+}
+
+// TestParseIperfJSON_FallsBackToSumReceived: when end.sum_sent is
+// absent the parser must take throughput from the next candidate it
+// checks (sum_sent → sum_received → sum).
+func TestParseIperfJSON_FallsBackToSumReceived(t *testing.T) {
+	raw := `{
+		"end": {
+			"sum_received": {"bits_per_second": 500000000}
+		}
+	}`
+	got, _, _, _, err := parseIperfJSON([]byte(raw))
+	if err != nil {
+		t.Fatalf("parseIperfJSON: %v", err)
+	}
+	if want := 500.0; got != want {
+		t.Errorf("mbps = %v, want 500", got)
+	}
+}
+
+// TestDiffNetDev_HappyPath: with eth0 in both snapshots the result is
+// end − start per counter — here 1000 rx bytes, 10000 tx bytes and 3
+// tx errors accumulated between the two readings.
+func TestDiffNetDev_HappyPath(t *testing.T) {
+	before := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", RxBytes: 1000, RxErrs: 0, TxBytes: 5000, TxErrs: 1},
+	}
+	after := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", RxBytes: 2000, RxErrs: 0, TxBytes: 15000, TxErrs: 4},
+	}
+	deltas := diffNetDev(before, after)
+	eth0, found := deltas["eth0"]
+	if !found {
+		t.Fatalf("eth0 missing from diff output")
+	}
+	if eth0.RxBytes != 1000 {
+		t.Errorf("RxBytes delta=%d, want 1000", eth0.RxBytes)
+	}
+	if eth0.TxBytes != 10000 {
+		t.Errorf("TxBytes delta=%d, want 10000", eth0.TxBytes)
+	}
+	if eth0.TxErrs != 3 {
+		t.Errorf("TxErrs delta=%d, want 3", eth0.TxErrs)
+	}
+}
+
+// TestDiffNetDev_InterfaceVanished: an interface seen at start but
+// absent at end is left out of the diff instead of carrying a stale
+// or bogus number; interfaces present in both are still diffed.
+func TestDiffNetDev_InterfaceVanished(t *testing.T) {
+	before := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", TxBytes: 1000},
+		"eth1": {Iface: "eth1", TxBytes: 500},
+	}
+	after := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", TxBytes: 2000},
+	}
+	deltas := diffNetDev(before, after)
+	if _, present := deltas["eth1"]; present {
+		t.Errorf("eth1 should have been dropped (gone at end)")
+	}
+	if got := deltas["eth0"].TxBytes; got != 1000 {
+		t.Errorf("eth0 TxBytes delta=%d, want 1000", got)
+	}
+}
+
+// TestDiffNetDev_CounterReset: when an end counter is lower than its
+// start value (a reset or wrap between snapshots) the delta is
+// clamped to 0 rather than underflowing a uint64.
+func TestDiffNetDev_CounterReset(t *testing.T) {
+	before := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", TxBytes: 9999, TxErrs: 5},
+	}
+	after := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", TxBytes: 100, TxErrs: 0},
+	}
+	eth0 := diffNetDev(before, after)["eth0"]
+	if eth0.TxBytes != 0 {
+		t.Errorf("reset TxBytes delta=%d, want 0 (clamped)", eth0.TxBytes)
+	}
+	if eth0.TxErrs != 0 {
+		t.Errorf("reset TxErrs delta=%d, want 0 (clamped)", eth0.TxErrs)
+	}
+}
+
+// TestDeriveHost: the agent derives its iperf3 server target from the
+// orchestrator base URL, so the host must come back without scheme or
+// port for both the scheme://host and scheme://host:port shapes.
+func TestDeriveHost(t *testing.T) {
+	table := []struct {
+		baseURL  string
+		wantHost string
+	}{
+		{"https://orch.local", "orch.local"},
+		{"https://orch.local:8443", "orch.local"},
+		{"http://10.0.0.5:8080", "10.0.0.5"},
+	}
+	for _, row := range table {
+		host, err := deriveHost(row.baseURL)
+		if err != nil {
+			t.Errorf("deriveHost(%q) error: %v", row.baseURL, err)
+			continue
+		}
+		if host != row.wantHost {
+			t.Errorf("deriveHost(%q) = %q, want %q", row.baseURL, host, row.wantHost)
+		}
+	}
+}
+
+func TestDeriveHost_Empty(t *testing.T) {
+	if _, err := deriveHost(""); err == nil { // an empty base URL has no host to derive
+		t.Errorf("deriveHost(\"\") should error")
+	}
+}
+
+// TestParseIperfJSON_ParsesEndMap confirms the full end map is returned
+// so extras can show every field iperf produced, not just the three we
+// extract by hand: both top-level keys must survive and re-marshal.
+func TestParseIperfJSON_ParsesEndMap(t *testing.T) {
+	raw := `{
+		"end": {
+			"sum_sent": {"bits_per_second": 1000000, "retransmits": 0, "bytes": 125000},
+			"cpu_utilization_percent": {"host_total": 12.3}
+		}
+	}`
+	_, _, _, endMap, err := parseIperfJSON([]byte(raw))
+	if err != nil {
+		t.Fatalf("parseIperfJSON: %v", err)
+	}
+	for _, key := range []string{"sum_sent", "cpu_utilization_percent"} {
+		if _, ok := endMap[key]; !ok {
+			t.Errorf("endMap missing key %q", key)
+		}
+	}
+	if _, err := json.Marshal(endMap); err != nil {
+		t.Errorf("endMap does not re-marshal: %v", err)
+	}
+}
@@ -7,12 +7,20 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
+	"time"
 )

 // PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find
-// PSU rails. In home-lab hosts the kernel surfaces a handful of named
-// rails (12V, 5V, 3V3). No rails → auto-skip. Any rail outside a ±10%
-// window of its nominal value → fail.
+// PSU rails, then samples each rail every psuSampleInterval for a
+// window sized by the stage timeout. During Burn a separate sidecar
+// (see burn.go) runs the same probe concurrently with workload — the
+// PSU stage itself catches slow post-load sag that only surfaces once
+// the 12V rail starts recovering from a brownout under concurrent CPU
+// + fio + iperf load.
+//
+// Any rail outside ±10% of its nominal value at any tick fires the
+// critical threshold (server-side) and fails the stage. A host with no
+// PSU rails wired to hwmon auto-skips.
 func PSU(ctx context.Context, d Deps) Outcome {
 	rails := scanPSURails()
 	if len(rails) == 0 {
@@ -24,39 +32,150 @@ func PSU(ctx context.Context, d Deps) Outcome {
 		}
 	}

-	var samples []Sample
-	problems := []string{}
-	for _, rail := range rails {
-		samples = append(samples, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"})
-		if ok, why := voltageInRange(rail); !ok {
-			problems = append(problems, fmt.Sprintf("%s=%.2fV (%s)", rail.Label, rail.Volts, why))
+	window := resolvePSUWindow(d.StageTimeout)
+	deadline := time.Now().Add(window)
+	interval := psuSampleInterval
+	if window < interval*2 {
+		// Tiny window (tests, pathological stage_timeout) — at least two
+		// ticks so aggregate stats are meaningful.
+		interval = window / 2
+		if interval < time.Second {
+			interval = time.Second
 		}
 	}
-	if d.Sensor != nil {
-		_ = d.Sensor(ctx, samples)
+
+	// Per-label tracking: min/max across the window, count of out-of-range
+	// hits, last-observed value (shown in the summary).
+	type railStats struct {
+		label    string
+		minV     float64
+		maxV     float64
+		lastV    float64
+		ticks    int
+		breaches int
+		reason   string
+	}
+	stats := map[string]*railStats{}
+
+	tick := time.NewTicker(interval)
+	defer tick.Stop()
+	// Take one sample immediately: the ticker's first tick only arrives
+	// after a full interval, so this guarantees at least one reading.
+	sampleOnce := func() {
+		cur := scanPSURails()
+		if len(cur) == 0 {
+			return
+		}
+		batch := make([]Sample, 0, len(cur))
+		for _, r := range cur {
+			s, ok := stats[r.Label]
+			if !ok {
+				s = &railStats{label: r.Label, minV: r.Volts, maxV: r.Volts}
+				stats[r.Label] = s
+			}
+			s.ticks++
+			s.lastV = r.Volts
+			if r.Volts < s.minV {
+				s.minV = r.Volts
+			}
+			if r.Volts > s.maxV {
+				s.maxV = r.Volts
+			}
+			if ok, why := voltageInRange(r); !ok {
+				s.breaches++
+				if s.reason == "" {
+					s.reason = why
+				}
+			}
+			batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
+		}
+		if d.Sensor != nil && len(batch) > 0 {
+			sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+			_ = d.Sensor(sendCtx, batch)
+			cancel()
+		}
+	}
+	sampleOnce()
+sampling:
+	for time.Now().Before(deadline) {
+		select {
+		case <-ctx.Done():
+			break sampling
+		case <-tick.C:
+			sampleOnce()
+		}
+	}
+
+	// Build the outcome. Extras carry per-rail rollup so the report can
+	// show "12V min=11.1 max=12.05 (3/120 ticks out of range)".
+	type railRollup struct {
+		Label    string  `json:"label"`
+		MinV     float64 `json:"min_v"`
+		MaxV     float64 `json:"max_v"`
+		LastV    float64 `json:"last_v"`
+		Ticks    int     `json:"ticks"`
+		Breaches int     `json:"breaches"`
+		Reason   string  `json:"reason,omitempty"`
+	}
+	rollups := make([]railRollup, 0, len(stats))
+	problems := []string{}
+	for _, s := range stats {
+		rollups = append(rollups, railRollup{
+			Label: s.label, MinV: s.minV, MaxV: s.maxV, LastV: s.lastV,
+			Ticks: s.ticks, Breaches: s.breaches, Reason: s.reason,
+		})
+		if s.breaches > 0 {
+			problems = append(problems, fmt.Sprintf("%s min=%.2fV max=%.2fV (%s)", s.label, s.minV, s.maxV, s.reason))
+		}
 	}

 	extras := map[string]any{
-		"rails":    rails,
-		"problems": problems,
+		"rails":       rollups,
+		"problems":    problems,
+		"window":      window.String(),
+		"interval":    interval.String(),
 	}
 	if len(problems) > 0 {
-		d.Error("PSU: out-of-range rails: " + strings.Join(problems, ", "))
+		d.Error("PSU: out-of-range rails: " + strings.Join(problems, "; "))
 		return Outcome{
 			Passed:  false,
-			Message: "PSU rails out of range: " + strings.Join(problems, ", "),
-			Summary: fmt.Sprintf("%d rails, %d failing", len(rails), len(problems)),
+			Message: "PSU rails out of range: " + strings.Join(problems, "; "),
+			Summary: fmt.Sprintf("%d rails, %d failing", len(rollups), len(problems)),
 			Extras:  extras,
 		}
 	}
-	d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal", len(rails)))
+	d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal across %s window", len(rollups), window))
 	return Outcome{
 		Passed:  true,
-		Summary: fmt.Sprintf("%d rails nominal", len(rails)),
+		Summary: fmt.Sprintf("%d rails nominal (%s)", len(rollups), window),
 		Extras:  extras,
 	}
 }

+// psuSampleInterval is the default tick for post-Burn rail sampling
+// (PSU shrinks it only for windows under two ticks). Five seconds stays
+// under the HTTP budget yet still catches rail recovery transients.
+const psuSampleInterval = 5 * time.Second
+
+// resolvePSUWindow maps the stage timeout to the sampling window.
+// With no timeout (tests / pre-Phase-2 orchestrator) it returns a
+// snapshot-like 30 s. Otherwise it takes stage_timeout − 5 s (headroom
+// for sensor flush + result post) and clamps that to [30 s, 10 min],
+// so a 24 h soak doesn't spend all day in PSU.
+func resolvePSUWindow(stageTimeout time.Duration) time.Duration {
+	const floor, ceiling = 30 * time.Second, 10 * time.Minute
+	w := stageTimeout - 5*time.Second
+	switch {
+	case stageTimeout <= 0, w < floor:
+		// Unset timeout and too-small timeout share the same floor.
+		return floor
+	case w > ceiling:
+		return ceiling
+	default:
+		return w
+	}
+}
+
 type psuRail struct {
 	Label string  `json:"label"`
 	Volts float64 `json:"volts"`
@@ -0,0 +1,112 @@
+package tests
+
+import (
+	"testing"
+	"time"
+)
+
+// TestIsPSULabel pins the PSU-rail label allowlist: 12V / 5V / 3.3V
+// spellings and VCCIN (any case) count as PSU rails, while CPU core
+// rails (Vcore, CPU VCORE), AVCC, and the empty label do not.
+func TestIsPSULabel(t *testing.T) {
+	table := []struct {
+		label   string
+		wantPSU bool
+	}{
+		{"+12V", true},
+		{"12V", true},
+		{"+5V", true},
+		{"5V", true},
+		{"+3.3V", true},
+		{"3V3", true},
+		{"VCCIN", true},
+		{"vccin", true},
+		{"Vcore", false},
+		{"CPU VCORE", false},
+		{"AVCC", false},
+		{"", false},
+	}
+	for _, row := range table {
+		if got := isPSULabel(row.label); got != row.wantPSU {
+			t.Errorf("isPSULabel(%q) = %v, want %v", row.label, got, row.wantPSU)
+		}
+	}
+}
+
+// TestNominalFor checks the label → nominal-voltage lookup. Labels
+// without a known nominal (VCCIN, arbitrary text) must map to 0 so no
+// out-of-range failure is invented from a made-up nominal.
+func TestNominalFor(t *testing.T) {
+	table := []struct {
+		label       string
+		wantNominal float64
+	}{
+		{"+12V", 12.0},
+		{"12V", 12.0},
+		{"+5V", 5.0},
+		{"+3.3V", 3.3},
+		{"3V3", 3.3},
+		{"VCCIN", 0},
+		{"unknown", 0},
+	}
+	for _, row := range table {
+		if got := nominalFor(row.label); got != row.wantNominal {
+			t.Errorf("nominalFor(%q) = %v, want %v", row.label, got, row.wantNominal)
+		}
+	}
+}
+
+// TestVoltageInRange pins the ±10% band: a 12V rail passes anywhere
+// in [10.8, 13.2] (edges included) and fails outside it. A label with
+// no known nominal (VCCIN here) always passes.
+func TestVoltageInRange(t *testing.T) {
+	table := []struct {
+		rail   psuRail
+		wantOK bool
+	}{
+		{psuRail{Label: "+12V", Volts: 12.0}, true},
+		{psuRail{Label: "+12V", Volts: 10.8}, true},  // exactly at the band
+		{psuRail{Label: "+12V", Volts: 13.2}, true},  // exactly at the band
+		{psuRail{Label: "+12V", Volts: 10.7}, false}, // just below
+		{psuRail{Label: "+12V", Volts: 13.3}, false}, // just above
+		{psuRail{Label: "+12V", Volts: 10.5}, false}, // real sag
+		{psuRail{Label: "+5V", Volts: 4.6}, true},    // 8% low on 5V still in band
+		{psuRail{Label: "+5V", Volts: 4.4}, false},   // 12% low on 5V — out of band
+		{psuRail{Label: "+5V", Volts: 5.0}, true},
+		{psuRail{Label: "VCCIN", Volts: 1.8}, true}, // unknown nominal → pass
+	}
+	for _, row := range table {
+		inRange, _ := voltageInRange(row.rail)
+		if inRange != row.wantOK {
+			t.Errorf("voltageInRange(%+v) = %v, want %v", row.rail, inRange, row.wantOK)
+		}
+	}
+}
+
+// TestResolvePSUWindow maps stage timeouts to the sampling window.
+// Quick's 1m stage_timeout → 55s window; deep's 10m → 9m55s (the 10m
+// cap only bites above 10m5s); missing/zero → 30s (test / legacy
+// orchestrator path); sub-35s → clamped up to the 30s floor.
+func TestResolvePSUWindow(t *testing.T) {
+	table := []struct {
+		name    string
+		timeout time.Duration
+		want    time.Duration
+	}{
+		{"zero → snapshot fallback", 0, 30 * time.Second},
+		{"negative → snapshot fallback", -1 * time.Second, 30 * time.Second},
+		{"tiny timeout clamps up to 30s floor", 10 * time.Second, 30 * time.Second},
+		{"35s - 5s = 30s", 35 * time.Second, 30 * time.Second},
+		{"1m quick → 55s", time.Minute, 55 * time.Second},
+		{"10m deep → 9m55s", 10 * time.Minute, 9*time.Minute + 55*time.Second},
+		{"15m soak → capped at 10m", 15 * time.Minute, 10 * time.Minute},
+		{"1h → capped at 10m", time.Hour, 10 * time.Minute},
+	}
+	for _, row := range table {
+		t.Run(row.name, func(t *testing.T) {
+			if got := resolvePSUWindow(row.timeout); got != row.want {
+				t.Errorf("resolvePSUWindow(%s) = %s, want %s", row.timeout, got, row.want)
+			}
+		})
+	}
+}
@@ -59,6 +59,11 @@ func (o Outcome) MarshalSummary() (json.RawMessage, error) {
 // Deps bundles what stages need without pulling in the whole agent.
 // Logger methods print to stdout + forward to the orchestrator; Sensor
 // drops numeric samples; OverrideFlags carries operator-set bypasses.
+//
+// CPUStressKnobs / StorageKnobs / NetworkKnobs are Phase-2 profile
+// knobs. Zero-valued fields mean "fall back to the compile-time
+// default" — that keeps the stages runnable even when the runner can't
+// materialize a profile (tests, legacy orchestrator, etc).
 type Deps struct {
 	Info           func(string)
 	Warn           func(string)
@@ -68,6 +73,58 @@ type Deps struct {
 	NonDestructive bool           // skip wipe-probe + writes in Storage
 	ExpectedDisks  []ExpectedDisk // serials + sizes from host.expected_spec
 	StageTimeout   time.Duration
+	CPUStressKnobs CPUStressKnobs
+	StorageKnobs   StorageKnobs
+	NetworkKnobs   NetworkKnobs
+	BurnKnobs      BurnKnobs
+	// LookPath is the unit-test seam for swapping a real external
+	// binary (stress-ng, fio, iperf3, dmidecode, …) for a fake. When
+	// nil the stage falls back to os/exec.LookPath — production and
+	// existing tests keep working unchanged. Tests under
+	// agent/tests/fakes/ populate this to redirect lookups to a built
+	// fake binary in a tempdir.
+	LookPath func(name string) (string, error)
+}
+
+// CPUStressKnobs parameterizes the CPUStress stage. Zero durations fall
+// back to the package's compile-time defaults (cpuPassDuration etc).
+type CPUStressKnobs struct {
+	CPUPass  time.Duration // presumably the CPU stress pass length — confirm in the stage
+	MemPass  time.Duration // presumably the memory pass length — confirm in the stage
+	EDACPoll time.Duration // presumably the EDAC/MCE poll period — confirm in the stage
+}
+
+// StorageKnobs parameterizes the Storage stage. Mode picks between
+// "fio_sample" (bounded tempfile inside the device, quick profile) and
+// "full_disk" (whole-device write verify, deep/soak). Empty strings
+// (and a non-positive FioTime) fall back to the stage's safe defaults.
+type StorageKnobs struct {
+	Mode    string        // "fio_sample" | "full_disk"; resolveFioOpts passes "fio_sample" as the fallback
+	FioSize string        // resolveFioOpts passes "1GiB" as the fallback
+	FioTime time.Duration // <= 0 → resolveFioOpts substitutes 3 minutes
+	FioBS   string        // resolveFioOpts passes "4k" as the fallback
+	FioRW   string        // resolveFioOpts passes "randrw" as the fallback
+	Verify  string        // resolveFioOpts passes "md5" as the fallback
+}
+
+// NetworkKnobs parameterizes the Network stage.
+type NetworkKnobs struct {
+	Duration time.Duration // presumably the iperf3 run length (confirm); zero → compile-time default
+}
+
+// BurnKnobs parameterizes the Burn super-stage. Duration is the total
+// Burn window; sub-workloads run concurrently inside that window.
+// CPUWorkers is "all" (runtime.NumCPU) or a numeric string. MemPct is a
+// percentage of MemAvailable to allocate for the memory burner (clamped
+// 0-90 by the stage). IperfParallel feeds iperf3 -P to generate sustained
+// NIC load. FioOnSpare gates the storage sub-workload: true = fio runs
+// against the allow-listed disks for the same window; false = skip fio.
+type BurnKnobs struct {
+	Duration      time.Duration // total Burn window
+	CPUWorkers    string        // "all" or a numeric string
+	MemPct        int           // percent of MemAvailable for the memory burner
+	FioOnSpare    bool          // false (the zero value) skips the fio sub-workload
+	IperfParallel int           // value handed to iperf3 -P
 }

 // Sample mirrors the server's SensorSample but lives in the tests
@@ -5,24 +5,36 @@ import (
 	"encoding/json"
 	"fmt"
 	"os/exec"
+	"strconv"
 	"strings"
 	"time"
 )

-// Storage is the destructive stage: badblocks (write-mode sample) + fio
-// random IO, persisting IOPS + latency as measurements. Pre-gates:
+// Storage is the destructive stage. Phase 2 replaced the old
+// badblocks + 128 MiB fio combo with a single fio run per disk that
+// writes, verifies md5 of what it wrote, and reports p99 latency.
+// Modes:
+//
+//   - fio_sample (quick): bounded 1 GiB write per disk, ~3 min runtime.
+//   - full_disk (deep/soak): writes the whole device, time-bounded by
+//     the fio_time knob (2 h deep, 6 h soak).
+//
+// Pre-gates kept from Phase 1:
 //
 //  1. Device allowlist: only act on /dev/<X> where the kernel-reported
-//     serial matches one of Deps.ExpectedDisks. This is the operator's
-//     contract for what can be written to. USB sticks and unexpected
+//     serial matches one of Deps.ExpectedDisks. USB sticks and unexpected
 //     drives are excluded.
 //  2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
-//     signatures, partition tables, or LVM metadata → fail with
+//     signature, partition table, or LVM metadata → fail with
 //     UnexpectedData unless Deps.OverrideWipe is set.
 //
-// Only after those pass does the stage run `badblocks -b 4096 -c 64 -w`
-// and `fio` in write mode. This matches the plan's "destructive disk
-// tests are always-on, gated by layered safety."
+// After fio, the stage captures a SMART diff (start snapshot taken
+// before any writes; end snapshot after all writes finish) and posts
+// deltas on attributes like Reallocated_Sector_Ct and Current_Pending_Sector.
+// The threshold evaluator isn't seeded to gate smart_delta out of the
+// box — those samples are diagnostic for the report. Fio's p99 latency
+// posts as fio_p99_us so the per-stage Storage warning threshold can
+// fire on a latency cliff.
 func Storage(ctx context.Context, d Deps) Outcome {
 	if len(d.ExpectedDisks) == 0 {
 		d.Info("Storage: no expected disks in spec — skipping stage")
@@ -44,10 +56,10 @@ func Storage(ctx context.Context, d Deps) Outcome {
 		}
 	}

-	// Non-destructive runs skip wipe-probe (nothing to refuse), badblocks
-	// -w, and write-mode fio. Every expected disk is still asserted
-	// present + readable by listing /sys/block and reading SMART-accessible
-	// identity; the per-disk map flags the shortcut so the report is clear.
+	// Non-destructive runs skip wipe-probe (nothing to refuse), fio
+	// writes, and SMART delta (nothing changed so no delta to report).
+	// Every expected disk is still asserted present so a vanished drive
+	// still fails the stage.
 	if d.NonDestructive {
 		perDisk := map[string]any{}
 		for _, t := range targets {
@@ -79,9 +91,9 @@ func Storage(ctx context.Context, d Deps) Outcome {
 			Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)",
 			Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
 			Extras: map[string]any{
-				"wipe_probe":     probes,
-				"override_hint":  "click 'Override wipe & retry' in the held tile",
-				"dirty_devices":  dirty,
+				"wipe_probe":    probes,
+				"override_hint": "click 'Override wipe & retry' in the held tile",
+				"dirty_devices": dirty,
 			},
 		}
 	}
@@ -89,64 +101,80 @@ func Storage(ctx context.Context, d Deps) Outcome {
 		d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
 	}

-	// Per target: short badblocks write sample + fio random-read/write.
+	// Capture start-of-stage SMART attributes before we write anything
+	// so the delta is attributable to *this* stage's writes and not the
+	// host's prior history. Per-disk failures are tolerated (e.g. the
+	// device doesn't expose SMART); we just can't emit a delta for it.
+	startSMART := captureSMARTAttrs(ctx, targets)
+
+	fioOpts := resolveFioOpts(d.StorageKnobs)
+	d.Info(fmt.Sprintf("Storage: fio mode=%s size=%s runtime=%s bs=%s rw=%s verify=%s",
+		fioOpts.Mode, fioOpts.Size, fioOpts.Runtime, fioOpts.BS, fioOpts.RW, fioOpts.Verify))
+
 	var samples []Sample
 	var subs []SubStepReport
 	perDisk := map[string]any{}
+	failed := ""
 	for _, t := range targets {
-		d.Info("Storage: running badblocks write sample on " + t.Device)
-		bbStart := time.Now()
-		bb := runBadblocks(ctx, t.Device)
-		bbEnd := time.Now()
-		bbSummary, _ := json.Marshal(bb)
-		subs = append(subs, SubStepReport{
-			Name:        fmt.Sprintf("badblocks %s", t.Device),
-			Passed:      bb.OK,
-			StartedAt:   bbStart,
-			CompletedAt: bbEnd,
-			SummaryJSON: bbSummary,
-		})
-
-		d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
+		d.Info(fmt.Sprintf("Storage: running fio %s on %s", fioOpts.Mode, t.Device))
 		fioStart := time.Now()
-		fr := runFio(ctx, t.Device)
+		fr := runFioVerify(ctx, t.Device, fioOpts)
 		fioEnd := time.Now()
 		fioSummary, _ := json.Marshal(fr)
 		subs = append(subs, SubStepReport{
-			Name:        fmt.Sprintf("fio %s", t.Device),
+			Name:        fmt.Sprintf("fio %s %s", fioOpts.Mode, t.Device),
 			Passed:      fr.Error == "",
 			StartedAt:   fioStart,
 			CompletedAt: fioEnd,
 			SummaryJSON: fioSummary,
 		})
+		perDisk[t.Device] = map[string]any{"fio": fr}

-		perDisk[t.Device] = map[string]any{
-			"badblocks": bb,
-			"fio":       fr,
-		}
-		samples = append(samples,
-			Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
-			Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
-		)
-		if !bb.OK {
-			return Outcome{
-				Passed:   false,
-				Message:  "badblocks found errors on " + t.Device,
-				Summary:  "badblocks failed on " + t.Device,
-				Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes},
-				SubSteps: subs,
+		if fr.Error == "" {
+			samples = append(samples,
+				Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
+				Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
+			)
+			if fr.ReadP99Us > 0 {
+				samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
 			}
+			if fr.WriteP99Us > 0 {
+				samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
+			}
+		} else if failed == "" {
+			failed = t.Device
 		}
 	}
-	if d.Sensor != nil {
+
+	// End-of-stage SMART snapshot + diff. We capture whether or not fio
+	// succeeded — a mid-run failure still produces attributable deltas,
+	// which is often more interesting than the stage outcome itself.
+	endSMART := captureSMARTAttrs(ctx, targets)
+	deltas := diffSMARTAttrs(startSMART, endSMART)
+	for dev, attrs := range deltas {
+		for attr, delta := range attrs {
+			samples = append(samples, Sample{Kind: "smart_delta", Key: dev + "/" + attr, Value: delta, Unit: "count"})
+		}
+	}
+	if d.Sensor != nil && len(samples) > 0 {
 		_ = d.Sensor(ctx, samples)
 	}

-	d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets)))
+	if failed != "" {
+		return Outcome{
+			Passed:   false,
+			Message:  "fio verify failed on " + failed,
+			Summary:  "fio failed on " + failed,
+			Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
+			SubSteps: subs,
+		}
+	}
+
+	d.Info(fmt.Sprintf("Storage: %d disk(s) passed fio --verify", len(targets)))
 	return Outcome{
 		Passed:   true,
-		Summary:  fmt.Sprintf("%d disks passed", len(targets)),
-		Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes},
+		Summary:  fmt.Sprintf("%d disks passed (%s)", len(targets), fioOpts.Mode),
+		Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
 		SubSteps: subs,
 	}
 }
@@ -229,8 +257,8 @@ type wipeProbeResult struct {

 // probeWipe runs blkid + wipefs -n. Any non-empty output from either is
 // a "has data" signal. This is deliberately conservative: we'd rather
-// halt on a bare ext4 signature than hand badblocks a disk with real
-// bytes on it.
+// halt on a bare ext4 signature than hand fio a disk with real bytes on
+// it.
 func probeWipe(ctx context.Context, device string) wipeProbeResult {
 	out := wipeProbeResult{Device: device}

@@ -257,84 +285,269 @@ func probeWipe(ctx context.Context, device string) wipeProbeResult {
 	return out
 }

-// ---------- badblocks ----------
+// ---------- fio ----------

-type badblocksResult struct {
-	OK        bool   `json:"ok"`
-	Elapsed   string `json:"elapsed"`
-	Error     string `json:"error,omitempty"`
-	OutputTail string `json:"output_tail,omitempty"`
+// fioOpts holds the resolved fio settings for one run: the concrete
+// flag values runFioVerify turns into command-line arguments. It is
+// produced by resolveFioOpts, which fills empty knobs with defaults.
+type fioOpts struct {
+	Mode    string        `json:"mode"`     // "fio_sample" | "full_disk"; anything else runs as fio_sample
+	Size    string        `json:"size"`     // "1GiB"; passed as --size, only used for fio_sample
+	Runtime time.Duration `json:"runtime"`  // bounding time; a Duration, so it marshals as integer nanoseconds
+	BS      string        `json:"bs"`       // "4k"; passed as --bs
+	RW      string        `json:"rw"`       // "randrw"; passed as --rw
+	Verify  string        `json:"verify"`   // "md5" | ""; empty skips the --verify flags
 }

-func runBadblocks(ctx context.Context, device string) badblocksResult {
-	// -c 64 blocks per check, -w destructive write, -b 4096 block size,
-	// -t pattern. We only sample 256MiB (65536 × 4k) so the stage stays
-	// bounded. A real burn-in would run the whole disk; that belongs in
-	// a separate "deep" stage.
-	args := []string{"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536"}
-	start := time.Now()
-	runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
+// resolveFioOpts normalizes the knobs into a runnable config. Empty
+// strings and a non-positive FioTime fall back to the quick defaults
+// (so an empty Verify becomes md5); safer than refusing to run.
+func resolveFioOpts(k StorageKnobs) fioOpts {
+	o := fioOpts{
+		Mode:    firstNonEmpty(k.Mode, "fio_sample"),
+		Size:    firstNonEmpty(k.FioSize, "1GiB"),
+		Runtime: k.FioTime,
+		BS:      firstNonEmpty(k.FioBS, "4k"),
+		RW:      firstNonEmpty(k.FioRW, "randrw"),
+		Verify:  firstNonEmpty(k.Verify, "md5"),
+	}
+	if o.Runtime <= 0 { // unset or negative: use the 3-minute default bound
+		o.Runtime = 3 * time.Minute
+	}
+	return o
+}
+
+func firstNonEmpty(vs ...string) string { // first non-empty argument, in order
+	for _, v := range vs {
+		if v != "" {
+			return v
+		}
+	}
+	return "" // every argument was empty, or none were passed
+}
+
+type fioResult struct { // outcome of one fio run on one device
+	Mode        string  `json:"mode"` // set by runFioVerify from opts.Mode
+	ReadIOPS    float64 `json:"read_iops"` // fio's read "iops"
+	WriteIOPS   float64 `json:"write_iops"` // fio's write "iops"
+	ReadBWKBps  float64 `json:"read_bw_kbps"` // fio's read "bw" (KiB/s per fio docs — TODO confirm)
+	WriteBWKBps float64 `json:"write_bw_kbps"` // fio's write "bw" (KiB/s per fio docs — TODO confirm)
+	ReadP99Us   float64 `json:"read_p99_us,omitempty"` // p99 completion latency in µs; 0 when fio reported none
+	WriteP99Us  float64 `json:"write_p99_us,omitempty"` // p99 completion latency in µs; 0 when fio reported none
+	Error       string  `json:"error,omitempty"` // set when fio fails or its JSON cannot be parsed
+	OutputTail  string  `json:"output_tail,omitempty"` // tailLines(stdout, 20) from the fio run
+}
+
+// runFioVerify runs fio against device. Verification uses opts.Verify
+// (md5 after resolveFioOpts) and is skipped when that is empty.
+// fio_sample caps the IO at opts.Size; full_disk is time-based over the
+// device. Both use direct IO to bypass the page cache for real latency.
+func runFioVerify(ctx context.Context, device string, opts fioOpts) fioResult {
+	// 30s grace over runtime so fio has time to flush + close cleanly.
+	runCtx, cancel := context.WithTimeout(ctx, opts.Runtime+30*time.Second)
 	defer cancel()
-	cmd := exec.CommandContext(runCtx, "badblocks", args...)
-	out, err := cmd.CombinedOutput()
-	r := badblocksResult{Elapsed: time.Since(start).Round(time.Second).String(), OutputTail: tailLines(string(out), 10)}
+
+	args := []string{
+		"--name=verify-" + strings.TrimPrefix(device, "/dev/"),
+		"--filename=" + device,
+		"--rw=" + opts.RW,
+		"--bs=" + opts.BS,
+		"--numjobs=1",
+		"--direct=1",
+		"--group_reporting",
+		"--output-format=json",
+		"--runtime=" + strconv.Itoa(int(opts.Runtime.Seconds())), // truncated to whole seconds
+	}
+	if opts.Verify != "" {
+		args = append(args,
+			"--verify="+opts.Verify,
+			"--verify_pattern=random", // NOTE(review): fio documents verify_pattern as a hex/decimal/quoted-string pattern — confirm "random" is accepted
+			"--do_verify=1",
+		)
+	}
+	switch opts.Mode {
+	case "full_disk":
+		// Time-bounded across the full device — fio uses the device's
+		// full size when --size is omitted on a block device.
+		args = append(args, "--time_based=1")
+	default:
+		// fio_sample (and any unrecognized mode): bounded IO. Setting
+		// --size= limits the IO volume regardless of runtime.
+		args = append(args, "--size="+opts.Size, "--time_based=0")
+	}
+
+	cmd := exec.CommandContext(runCtx, "fio", args...)
+	out, err := cmd.Output() // NOTE(review): stdout only; fio's stderr is not copied into the result, so Error is just the exit status
+	r := fioResult{Mode: opts.Mode, OutputTail: tailLines(string(out), 20)}
 	if err != nil {
 		r.Error = err.Error()
 		return r
 	}
-	// badblocks prints each bad block to stdout. Empty output = clean.
-	if strings.TrimSpace(string(out)) == "" {
-		r.OK = true
-	} else {
-		r.Error = "bad blocks found"
+	parsed, perr := parseFioJSON(out)
+	if perr != nil {
+		r.Error = "parse fio json: " + perr.Error()
+		return r
 	}
+	r.ReadIOPS = parsed.ReadIOPS
+	r.WriteIOPS = parsed.WriteIOPS
+	r.ReadBWKBps = parsed.ReadBWKBps
+	r.WriteBWKBps = parsed.WriteBWKBps
+	r.ReadP99Us = parsed.ReadP99Us
+	r.WriteP99Us = parsed.WriteP99Us
 	return r
 }

-// ---------- fio ----------
-
-type fioResult struct {
-	ReadIOPS   float64 `json:"read_iops"`
-	WriteIOPS  float64 `json:"write_iops"`
-	ReadBWKBps float64 `json:"read_bw_kbps"`
-	WriteBWKBps float64 `json:"write_bw_kbps"`
-	Error      string  `json:"error,omitempty"`
-}
-
-// runFio kicks off a tiny random-rw job: 2 jobs × 64MB × 4k blocks.
-// This is a health bar, not a benchmark — we want to know the disk
-// services IO, not how fast it is at p99.
-func runFio(ctx context.Context, device string) fioResult {
-	runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
-	defer cancel()
-	args := []string{
-		"--name=health", "--filename=" + device, "--rw=randrw",
-		"--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0",
-		"--group_reporting", "--output-format=json", "--direct=1",
-	}
-	cmd := exec.CommandContext(runCtx, "fio", args...)
-	out, err := cmd.Output()
-	if err != nil {
-		return fioResult{Error: err.Error()}
-	}
+// parseFioJSON extracts IOPS, bandwidth and p99 completion latency from
+// fio's --output-format=json. Only jobs[0] is read; an empty jobs array is
+// an error. clat_ns percentile "99.000000" is converted ns → µs when > 0.
+func parseFioJSON(out []byte) (fioResult, error) {
 	var top struct {
 		Jobs []struct {
-			Read  struct {
+			Read struct {
 				IOPS float64 `json:"iops"`
 				BW   float64 `json:"bw"`
+				CLat struct {
+					Percentile map[string]float64 `json:"percentile"`
+				} `json:"clat_ns"`
 			} `json:"read"`
 			Write struct {
 				IOPS float64 `json:"iops"`
 				BW   float64 `json:"bw"`
+				CLat struct {
+					Percentile map[string]float64 `json:"percentile"`
+				} `json:"clat_ns"`
 			} `json:"write"`
 		} `json:"jobs"`
 	}
-	if err := json.Unmarshal(out, &top); err != nil || len(top.Jobs) == 0 {
-		return fioResult{Error: "parse fio json: " + fmt.Sprint(err)}
+	if err := json.Unmarshal(out, &top); err != nil {
+		return fioResult{}, err
+	}
+	if len(top.Jobs) == 0 {
+		return fioResult{}, fmt.Errorf("no jobs in fio output")
 	}
 	j := top.Jobs[0]
-	return fioResult{
+	r := fioResult{
 		ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS,
 		ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW,
 	}
+	if p := j.Read.CLat.Percentile["99.000000"]; p > 0 { // a missing key or nil map reads as 0, leaving the field unset
+		r.ReadP99Us = p / 1000.0 // ns → µs
+	}
+	if p := j.Write.CLat.Percentile["99.000000"]; p > 0 { // same rule for the write side
+		r.WriteP99Us = p / 1000.0 // ns → µs
+	}
+	return r, nil
+}
+
+// ---------- SMART delta ----------
+
+// smartAttrMap: device → attribute → raw counter value. ATA drives
+// populate named attributes (Reallocated_Sector_Ct etc); NVMe drives
+// populate flat health-log fields. Only names in smartAttributeWhitelist
+// are stored (see extractSMARTAttrs) — anything else is diagnostic and
+// drops to the raw report output.
+type smartAttrMap map[string]map[string]float64
+
+// captureSMARTAttrs runs smartctl -aj (via runSmartctl) on each target
+// and pulls the whitelisted attributes. Per-device failures (virtio,
+// permission issues) degrade silently, as do devices with no whitelisted
+// attributes — such devices are simply absent from the returned map.
+func captureSMARTAttrs(ctx context.Context, targets []diskTarget) smartAttrMap {
+	out := smartAttrMap{}
+	for _, t := range targets {
+		parsed, err := runSmartctl(ctx, t.Device)
+		if err != nil {
+			continue // best-effort: skip this device rather than fail the stage
+		}
+		attrs := extractSMARTAttrs(parsed)
+		if len(attrs) > 0 {
+			out[t.Device] = attrs
+		}
+	}
+	return out
+}
+
+// smartAttributeWhitelist is the set of attributes we diff across a
+// stage. They're the ones that reflect *this stage's* IO damage, not
+// cumulative drive history. Adding attributes is cheap — ones a drive
+// doesn't report are skipped and produce no delta.
+var smartAttributeWhitelist = map[string]bool{
+	// ATA SMART attribute names (smartctl normalizes to these)
+	"Reallocated_Sector_Ct":   true,
+	"Current_Pending_Sector":  true,
+	"Offline_Uncorrectable":   true,
+	"UDMA_CRC_Error_Count":    true,
+	"Reported_Uncorrect":      true,
+	"Raw_Read_Error_Rate":     true,
+	// NVMe log fields (flat keys inside nvme_smart_health_information_log)
+	"media_errors":            true,
+	"num_err_log_entries":     true,
+	"percentage_used":         true,
+}
+
+// extractSMARTAttrs walks smartctl's JSON for whitelisted attribute
+// values. Handles both the ATA shape (ata_smart_attributes.table[]) and
+// the NVMe shape (nvme_smart_health_information_log). Returns a map
+// keyed by attribute name; it is empty (never nil) when nothing matches.
+func extractSMARTAttrs(raw map[string]any) map[string]float64 {
+	out := map[string]float64{}
+	// ATA attributes are in ata_smart_attributes.table[] — each element
+	// has {"name": "Reallocated_Sector_Ct", "raw": {"value": N}}.
+	if ata, ok := raw["ata_smart_attributes"].(map[string]any); ok {
+		if tbl, ok := ata["table"].([]any); ok {
+			for _, row := range tbl {
+				rm, ok := row.(map[string]any)
+				if !ok {
+					continue
+				}
+				name, _ := rm["name"].(string) // a non-string name becomes "", which is never whitelisted
+				if !smartAttributeWhitelist[name] {
+					continue
+				}
+				if r, ok := rm["raw"].(map[string]any); ok {
+					if v, ok := r["value"].(float64); ok { // json.Unmarshal into any yields float64 for numbers
+						out[name] = v
+					}
+				}
+			}
+		}
+	}
+	// NVMe attributes live flat under nvme_smart_health_information_log.
+	if nvme, ok := raw["nvme_smart_health_information_log"].(map[string]any); ok {
+		for k, v := range nvme {
+			if !smartAttributeWhitelist[k] {
+				continue
+			}
+			if n, ok := v.(float64); ok { // non-numeric fields are skipped
+				out[k] = n
+			}
+		}
+	}
+	return out
+}
+
+// diffSMARTAttrs subtracts start from end per (device, attribute).
+// Only attributes present in both ends produce a delta; missing
+// attributes drop out (can't attribute a zero-to-present delta safely),
+// and devices left with none are omitted. Negative deltas are kept so a counter reset is visible.
+func diffSMARTAttrs(start, end smartAttrMap) map[string]map[string]float64 {
+	out := map[string]map[string]float64{}
+	for dev, endAttrs := range end {
+		startAttrs, ok := start[dev]
+		if !ok {
+			continue // no baseline for this device
+		}
+		devOut := map[string]float64{}
+		for attr, endV := range endAttrs {
+			startV, ok := startAttrs[attr]
+			if !ok {
+				continue // attribute not in the start snapshot
+			}
+			devOut[attr] = endV - startV
+		}
+		if len(devOut) > 0 {
+			out[dev] = devOut
+		}
+	}
+	return out
 }
@@ -0,0 +1,218 @@
+package tests
+
+import (
+	"encoding/json"
+	"testing"
+	"time"
+)
+
+// TestParseFioJSON_ATAReadWrite confirms we pull IOPS, read BW, and p99
+// latency from a job with both sides populated (write BW is not asserted).
+// P99 is read from clat_ns and converted ns → us (the unit we emit to the threshold evaluator).
+func TestParseFioJSON_ATAReadWrite(t *testing.T) {
+	raw := `{
+		"jobs": [{
+			"read":  {"iops": 1234.5, "bw": 5000, "clat_ns": {"percentile": {"99.000000": 250000}}},
+			"write": {"iops": 432.1,  "bw": 2000, "clat_ns": {"percentile": {"99.000000": 500000}}}
+		}]
+	}`
+	r, err := parseFioJSON([]byte(raw))
+	if err != nil {
+		t.Fatalf("parseFioJSON: %v", err)
+	}
+	if r.ReadIOPS != 1234.5 {
+		t.Errorf("ReadIOPS = %v, want 1234.5", r.ReadIOPS)
+	}
+	if r.WriteIOPS != 432.1 {
+		t.Errorf("WriteIOPS = %v, want 432.1", r.WriteIOPS)
+	}
+	if r.ReadBWKBps != 5000 {
+		t.Errorf("ReadBWKBps = %v, want 5000", r.ReadBWKBps)
+	}
+	// 250000 ns → 250 us
+	if r.ReadP99Us != 250 {
+		t.Errorf("ReadP99Us = %v, want 250", r.ReadP99Us)
+	}
+	// 500000 ns → 500 us
+	if r.WriteP99Us != 500 {
+		t.Errorf("WriteP99Us = %v, want 500", r.WriteP99Us)
+	}
+}
+
+// TestParseFioJSON_ReadOnlyJob: a side without clat_ns percentiles keeps
+// its p99 at zero (not emitted as a sample). Mirrors a randread job.
+func TestParseFioJSON_ReadOnlyJob(t *testing.T) {
+	raw := `{
+		"jobs": [{
+			"read":  {"iops": 1000, "bw": 4000, "clat_ns": {"percentile": {"99.000000": 100000}}},
+			"write": {"iops": 0, "bw": 0}
+		}]
+	}`
+	r, err := parseFioJSON([]byte(raw))
+	if err != nil {
+		t.Fatalf("parseFioJSON: %v", err)
+	}
+	if r.WriteP99Us != 0 {
+		t.Errorf("WriteP99Us = %v on read-only job, want 0", r.WriteP99Us)
+	}
+	if r.ReadP99Us != 100 { // 100000 ns → 100 us
+		t.Errorf("ReadP99Us = %v, want 100", r.ReadP99Us)
+	}
+}
+
+// TestParseFioJSON_NoJobs: an empty jobs array must return an error rather
+// than a silently zeroed result — it means fio didn't run anything.
+func TestParseFioJSON_NoJobs(t *testing.T) {
+	raw := `{"jobs": []}`
+	if _, err := parseFioJSON([]byte(raw)); err == nil {
+		t.Errorf("expected error on empty jobs array")
+	}
+}
+
+// TestExtractSMARTAttrs_ATA picks raw values out of ata_smart_attributes.table.
+// Attributes outside the whitelist (Spin_Retry_Count here) drop out silently.
+func TestExtractSMARTAttrs_ATA(t *testing.T) {
+	raw := map[string]any{}
+	smartJSON := `{
+		"ata_smart_attributes": {
+			"table": [
+				{"name": "Reallocated_Sector_Ct",   "raw": {"value": 7}},
+				{"name": "Current_Pending_Sector",  "raw": {"value": 3}},
+				{"name": "Spin_Retry_Count",        "raw": {"value": 99}}
+			]
+		}
+	}`
+	if err := json.Unmarshal([]byte(smartJSON), &raw); err != nil {
+		t.Fatalf("unmarshal fixture: %v", err)
+	}
+	out := extractSMARTAttrs(raw)
+	if out["Reallocated_Sector_Ct"] != 7 {
+		t.Errorf("Reallocated_Sector_Ct = %v, want 7", out["Reallocated_Sector_Ct"])
+	}
+	if out["Current_Pending_Sector"] != 3 {
+		t.Errorf("Current_Pending_Sector = %v, want 3", out["Current_Pending_Sector"])
+	}
+	if _, ok := out["Spin_Retry_Count"]; ok {
+		t.Errorf("Spin_Retry_Count should not appear (not in whitelist)")
+	}
+}
+
+// TestExtractSMARTAttrs_NVMe picks media_errors and friends from the nvme
+// health log, a flat map under nvme_smart_health_information_log; temperature is not whitelisted.
+func TestExtractSMARTAttrs_NVMe(t *testing.T) {
+	raw := map[string]any{}
+	smartJSON := `{
+		"nvme_smart_health_information_log": {
+			"media_errors": 2,
+			"num_err_log_entries": 15,
+			"percentage_used": 7,
+			"temperature": 42
+		}
+	}`
+	if err := json.Unmarshal([]byte(smartJSON), &raw); err != nil {
+		t.Fatalf("unmarshal fixture: %v", err)
+	}
+	out := extractSMARTAttrs(raw)
+	if out["media_errors"] != 2 {
+		t.Errorf("media_errors = %v, want 2", out["media_errors"])
+	}
+	if out["num_err_log_entries"] != 15 {
+		t.Errorf("num_err_log_entries = %v, want 15", out["num_err_log_entries"])
+	}
+	if out["percentage_used"] != 7 {
+		t.Errorf("percentage_used = %v, want 7", out["percentage_used"])
+	}
+	if _, ok := out["temperature"]; ok {
+		t.Errorf("temperature should not appear (not in whitelist)")
+	}
+}
+
+// TestDiffSMARTAttrs: end − start per (device, attr). Only attrs in
+// both snapshots yield a delta; one missing from either side (here
+// UDMA_CRC_Error_Count, absent at start) drops out of the diff.
+func TestDiffSMARTAttrs(t *testing.T) {
+	start := smartAttrMap{
+		"/dev/sda": {"Reallocated_Sector_Ct": 5, "Current_Pending_Sector": 0},
+	}
+	end := smartAttrMap{
+		"/dev/sda": {"Reallocated_Sector_Ct": 8, "Current_Pending_Sector": 2, "UDMA_CRC_Error_Count": 1},
+	}
+	out := diffSMARTAttrs(start, end)
+	if out["/dev/sda"]["Reallocated_Sector_Ct"] != 3 {
+		t.Errorf("Reallocated_Sector_Ct delta = %v, want 3", out["/dev/sda"]["Reallocated_Sector_Ct"])
+	}
+	if out["/dev/sda"]["Current_Pending_Sector"] != 2 {
+		t.Errorf("Current_Pending_Sector delta = %v, want 2", out["/dev/sda"]["Current_Pending_Sector"])
+	}
+	if _, ok := out["/dev/sda"]["UDMA_CRC_Error_Count"]; ok {
+		t.Errorf("UDMA_CRC_Error_Count should not appear (missing at start)")
+	}
+}
+
+// TestDiffSMARTAttrs_DeviceNewAtEnd: a device only present in the end
+// snapshot (drive hot-plugged mid-run, or SMART read succeeded only at
+// end) is dropped from the diff — no start baseline to subtract from.
+func TestDiffSMARTAttrs_DeviceNewAtEnd(t *testing.T) {
+	start := smartAttrMap{} // no baseline for any device
+	end := smartAttrMap{
+		"/dev/sda": {"Reallocated_Sector_Ct": 10},
+	}
+	out := diffSMARTAttrs(start, end)
+	if _, ok := out["/dev/sda"]; ok {
+		t.Errorf("/dev/sda should drop from diff when absent at start")
+	}
+}
+
+// TestResolveFioOpts_Defaults: zero-valued knobs resolve to the quick
+// profile's fio_sample shape. Any stage that's missing per-profile
+// knobs (legacy claim response, test harness) still has coherent
+// bounded defaults — we won't accidentally fall into unbounded writes.
+func TestResolveFioOpts_Defaults(t *testing.T) {
+	o := resolveFioOpts(StorageKnobs{}) // every knob left at its zero value
+	if o.Mode != "fio_sample" {
+		t.Errorf("Mode = %q, want fio_sample", o.Mode)
+	}
+	if o.Size != "1GiB" {
+		t.Errorf("Size = %q, want 1GiB", o.Size)
+	}
+	if o.Runtime != 3*time.Minute {
+		t.Errorf("Runtime = %v, want 3m", o.Runtime)
+	}
+	if o.BS != "4k" {
+		t.Errorf("BS = %q, want 4k", o.BS)
+	}
+	if o.RW != "randrw" {
+		t.Errorf("RW = %q, want randrw", o.RW)
+	}
+	if o.Verify != "md5" {
+		t.Errorf("Verify = %q, want md5", o.Verify)
+	}
+}
+
+// TestResolveFioOpts_FullDiskOverride confirms the deep/soak shape
+// round-trips: explicit knobs win (2h over the 3-minute default), empty ones still default.
+func TestResolveFioOpts_FullDiskOverride(t *testing.T) {
+	k := StorageKnobs{
+		Mode:    "full_disk",
+		FioTime: 2 * time.Hour,
+		FioBS:   "64k",
+		FioRW:   "write",
+	}
+	o := resolveFioOpts(k)
+	if o.Mode != "full_disk" {
+		t.Errorf("Mode = %q, want full_disk", o.Mode)
+	}
+	if o.Runtime != 2*time.Hour {
+		t.Errorf("Runtime = %v, want 2h", o.Runtime)
+	}
+	if o.BS != "64k" {
+		t.Errorf("BS = %q, want 64k", o.BS)
+	}
+	if o.RW != "write" {
+		t.Errorf("RW = %q, want write", o.RW)
+	}
+	// Verify should fall back to md5 default since knob was empty.
+	if o.Verify != "md5" {
+		t.Errorf("Verify = %q, want md5 (default)", o.Verify)
+	}
+}