Initial commit: full Phases 1-6 implementation

Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
@@ -0,0 +1,97 @@
+package tests
+
+import (
+	"context"
+	"fmt"
+	"os/exec"
+	"runtime"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// CPUStress loads the host with stress-ng: one CPU worker and one
+// memory (--vm) worker per logical core, with --verify enabled. The
+// memory stressors take the place of a Memtest86+ pass — per the plan,
+// running under Linux gives exit-code-based pass/fail and log capture
+// that Memtest cannot provide without IPMI serial redirection.
+//
+// The stage passes only when the stress-ng command returns no error;
+// any error (non-zero exit, process killed when the context expires)
+// fails it. When stress-ng is not on PATH the stage is reported as a
+// skipped pass. d.StageTimeout sets the run length (two minutes when
+// unset) and the process gets a further 30s before the context expires.
+func CPUStress(ctx context.Context, d Deps) Outcome {
+	if _, lookErr := exec.LookPath("stress-ng"); lookErr != nil {
+		d.Warn("CPUStress: stress-ng not found in PATH — skipping stage")
+		return Outcome{
+			Passed:  true,
+			Summary: "skipped (stress-ng missing)",
+			Extras:  map[string]any{"skipped": true, "reason": "stress_ng_missing"},
+		}
+	}
+
+	// Deps.StageTimeout may be zero in tests; fall back to two minutes.
+	budget := 2 * time.Minute
+	if d.StageTimeout > 0 {
+		budget = d.StageTimeout
+	}
+	workers := runtime.NumCPU()
+	workerArg := strconv.Itoa(workers)
+	limitArg := durationSeconds(budget)
+
+	d.Info(fmt.Sprintf("CPUStress: stress-ng --cpu %d --vm %d --vm-bytes 90%% --timeout %s",
+		workers, workers, limitArg))
+
+	// The context outlives stress-ng's own --timeout so a healthy run
+	// ends by itself instead of being killed.
+	runCtx, cancel := context.WithTimeout(ctx, budget+30*time.Second)
+	defer cancel()
+	began := time.Now()
+	output, runErr := exec.CommandContext(runCtx, "stress-ng",
+		"--cpu", workerArg,
+		"--cpu-method", "all",
+		"--vm", workerArg,
+		"--vm-bytes", "90%",
+		"--timeout", limitArg,
+		"--metrics-brief",
+		"--verify",
+	).CombinedOutput()
+	took := time.Since(began).Round(time.Second)
+
+	details := map[string]any{
+		"cores":        workers,
+		"elapsed_secs": took.Seconds(),
+		"output_tail":  tailLines(string(output), 20),
+	}
+	if runErr == nil {
+		d.Info(fmt.Sprintf("CPUStress: stress-ng completed cleanly in %s", took))
+		return Outcome{
+			Passed:  true,
+			Summary: fmt.Sprintf("stress-ng PASSED after %s (%d cores + 90%% RAM)", took, workers),
+			Extras:  details,
+		}
+	}
+	d.Error("CPUStress: stress-ng failed: " + runErr.Error())
+	return Outcome{
+		Passed:  false,
+		Message: "stress-ng returned non-zero: " + runErr.Error(),
+		Summary: fmt.Sprintf("failed after %s", took),
+		Extras:  details,
+	}
+}
+
+// durationSeconds renders d as a whole number of seconds with an "s"
+// suffix (fractions are truncated). Anything shorter than one second,
+// including zero and negative durations, is reported as "1s".
+func durationSeconds(d time.Duration) string {
+	if whole := int(d.Seconds()); whole >= 1 {
+		return strconv.Itoa(whole) + "s"
+	}
+	return "1s"
+}
+
+// tailLines returns the last n non-empty lines of s, for the summary.
+func tailLines(s string, n int) string {
+	lines := strings.Split(strings.TrimRight(s, "\n"), "\n")
+	if len(lines) > n {
+		lines = lines[len(lines)-n:]
+	}
+	return strings.Join(lines, "\n")
+}
@@ -0,0 +1,86 @@
+package tests
+
+import (
+	"context"
+	"os/exec"
+	"strings"
+)
+
+// GPU enumerates VGA / 3D PCI devices. No devices → skip cleanly (a
+// CPU-only server passes this stage by virtue of having nothing to
+// stress). Devices present → the stage passes on PCI presence alone;
+// nvidia-smi output, when available, is attached to the extras and
+// logged but does not influence the verdict.
+func GPU(ctx context.Context, d Deps) Outcome {
+	devices := listGPUPCI(ctx)
+	if len(devices) == 0 {
+		d.Info("GPU: no VGA/3D PCI devices found — skipping stage")
+		return Outcome{
+			Passed:  true,
+			Summary: "skipped (no GPU present)",
+			Extras:  map[string]any{"skipped": true, "reason": "no_gpu_present"},
+		}
+	}
+	d.Info("GPU: found " + joinDevices(devices))
+
+	nvidia := nvidiaSmiList(ctx)
+	extras := map[string]any{
+		"pci_devices": devices,
+		"skipped":     false,
+	}
+	if len(nvidia) > 0 {
+		extras["nvidia"] = nvidia
+		d.Info("GPU: nvidia-smi reports: " + strings.Join(nvidia, ", "))
+	}
+	return Outcome{
+		Passed: true,
+		// Pluralize the noun only: "1 GPU present" / "2 GPUs present".
+		// Passing "GPU present" as the label produced "2 GPU presents".
+		Summary: formatCount(len(devices), "GPU") + " present",
+		Extras:  extras,
+	}
+}
+
+// listGPUPCI runs `lspci -mm` and returns, whitespace-trimmed, every
+// output line that mentions a "VGA compatible controller" or a
+// "3D controller" (case-insensitive). If lspci cannot be run it returns
+// nil, which the caller treats as "no GPU" and auto-skips.
+func listGPUPCI(ctx context.Context) []string {
+	raw, err := exec.CommandContext(ctx, "lspci", "-mm").Output()
+	if err != nil {
+		return nil
+	}
+	looksLikeGPU := func(line string) bool {
+		lowered := strings.ToLower(line)
+		return strings.Contains(lowered, "vga compatible controller") ||
+			strings.Contains(lowered, "3d controller")
+	}
+	var found []string
+	for _, line := range strings.Split(string(raw), "\n") {
+		if !looksLikeGPU(line) {
+			continue
+		}
+		found = append(found, strings.TrimSpace(line))
+	}
+	return found
+}
+
+// nvidiaSmiList returns the non-blank, whitespace-trimmed lines printed
+// by `nvidia-smi -L` (one line per card). The result is nil when
+// nvidia-smi isn't installed, fails, or prints nothing useful.
+func nvidiaSmiList(ctx context.Context) []string {
+	raw, err := exec.CommandContext(ctx, "nvidia-smi", "-L").Output()
+	if err != nil {
+		return nil
+	}
+	var cards []string
+	atNewline := func(r rune) bool { return r == '\n' }
+	for _, entry := range strings.FieldsFunc(string(raw), atNewline) {
+		if entry = strings.TrimSpace(entry); entry == "" {
+			continue
+		}
+		cards = append(cards, entry)
+	}
+	return cards
+}
+
+// joinDevices condenses a device list for a single log line: "" for an
+// empty list, the sole entry for one device, otherwise the first entry
+// followed by a count of the rest, e.g. "<first> (+2 other devices)".
+func joinDevices(devs []string) string {
+	switch len(devs) {
+	case 0:
+		return ""
+	case 1:
+		return devs[0]
+	}
+	// formatCount pluralizes by appending "s", so give it a noun; the
+	// earlier label "more" rendered as "(+2 mores)".
+	return devs[0] + " (+" + formatCount(len(devs)-1, "other device") + ")"
+}
@@ -0,0 +1,144 @@
+package tests
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/url"
+	"os/exec"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// NetworkConfig is what the agent passes to Network: where the
+// orchestrator's iperf3 server lives and how long to test. Network
+// derives the iperf3 target host from OrchestratorURL, uses port 5201
+// when IperfPort is 0, and runs for 10 seconds when Duration is not
+// positive.
+type NetworkConfig struct {
+	OrchestratorURL string
+	IperfPort       int // 0 = 5201
+	Duration        time.Duration
+}
+
+// Network runs iperf3 against the orchestrator's bundled server. Records
+// bandwidth as a measurement; fails if iperf3 is missing, the server
+// isn't reachable, or throughput is zero.
+func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
+	if _, err := exec.LookPath("iperf3"); err != nil {
+		d.Warn("Network: iperf3 not found — skipping stage")
+		return Outcome{
+			Passed:  true,
+			Summary: "skipped (iperf3 missing)",
+			Extras:  map[string]any{"skipped": true, "reason": "iperf3_missing"},
+		}
+	}
+	host, err := deriveHost(cfg.OrchestratorURL)
+	if err != nil || host == "" {
+		d.Warn("Network: can't derive orchestrator host from URL — skipping stage")
+		return Outcome{
+			Passed:  true,
+			Summary: "skipped (no orchestrator host)",
+			Extras:  map[string]any{"skipped": true, "reason": "no_host"},
+		}
+	}
+	port := cfg.IperfPort
+	if port == 0 {
+		port = 5201
+	}
+	duration := cfg.Duration
+	if duration <= 0 {
+		duration = 10 * time.Second
+	}
+
+	args := []string{
+		"-c", host,
+		"-p", strconv.Itoa(port),
+		"-t", strconv.Itoa(int(duration.Seconds())),
+		"-J", // JSON output
+	}
+	d.Info(fmt.Sprintf("Network: iperf3 -c %s -p %d -t %s", host, port, duration))
+
+	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
+	defer cancel()
+	cmd := exec.CommandContext(runCtx, "iperf3", args...)
+	out, err := cmd.Output()
+	if err != nil {
+		d.Error("Network: iperf3 client failed: " + err.Error())
+		return Outcome{
+			Passed:  false,
+			Message: "iperf3 client error: " + err.Error(),
+			Summary: "iperf3 failed",
+			Extras:  map[string]any{"stderr_tail": tailLines(string(out), 20)},
+		}
+	}
+	mbps, parsed, err := parseIperfJSON(out)
+	if err != nil {
+		d.Error("Network: parse iperf3 output: " + err.Error())
+		return Outcome{
+			Passed:  false,
+			Message: "parse iperf3 json: " + err.Error(),
+			Summary: "parse error",
+			Extras:  map[string]any{"raw": string(out)},
+		}
+	}
+	if d.Sensor != nil {
+		_ = d.Sensor(ctx, []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}})
+	}
+
+	extras := map[string]any{
+		"throughput_mbps": mbps,
+		"iperf_end":       parsed,
+	}
+	if mbps <= 0 {
+		return Outcome{
+			Passed:  false,
+			Message: "iperf3 reported zero throughput",
+			Summary: "zero throughput",
+			Extras:  extras,
+		}
+	}
+	d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps", mbps))
+	return Outcome{
+		Passed:  true,
+		Summary: fmt.Sprintf("%.1f Mbps to %s", mbps, host),
+		Extras:  extras,
+	}
+}
+
+// deriveHost pulls the hostname (no port, no IPv6 brackets) out of the
+// orchestrator base URL, e.g. "https://host:8443" → "host".
+//
+// Values written without a scheme ("host:8443", "10.0.0.1:8443", a bare
+// "host") are accepted too: url.Parse either rejects them or reads the
+// host as a scheme/path, so they are re-parsed as "//<value>". It
+// returns an error for an empty value or one that cannot be parsed.
+func deriveHost(raw string) (string, error) {
+	raw = strings.TrimSpace(raw)
+	if raw == "" {
+		return "", fmt.Errorf("empty url")
+	}
+	u, err := url.Parse(raw)
+	if (err != nil || u.Host == "") && !strings.Contains(raw, "://") {
+		// No authority found and no scheme separator present: treat the
+		// whole value as host[:port][/path].
+		u, err = url.Parse("//" + raw)
+	}
+	if err != nil {
+		return "", err
+	}
+	return strings.TrimSpace(u.Hostname()), nil
+}
+
+// parseIperfJSON extracts the throughput from `iperf3 -J` output. It
+// looks inside the "end" object for the first of sum_sent,
+// sum_received or sum that carries a numeric bits_per_second and
+// returns that value in Mbps together with the decoded "end" object.
+//
+// Errors: invalid JSON (nil map returned), a missing "end" object (the
+// whole decoded document is returned instead), or no usable
+// bits_per_second (the "end" object is still returned).
+func parseIperfJSON(b []byte) (float64, map[string]any, error) {
+	var doc map[string]any
+	if err := json.Unmarshal(b, &doc); err != nil {
+		return 0, nil, err
+	}
+	end, hasEnd := doc["end"].(map[string]any)
+	if !hasEnd {
+		return 0, doc, fmt.Errorf("missing end")
+	}
+	// iperf3 reports either sum_sent (when -R not set) or sum_received.
+	candidates := [...]string{"sum_sent", "sum_received", "sum"}
+	for i := range candidates {
+		section, isObject := end[candidates[i]].(map[string]any)
+		if !isObject {
+			continue
+		}
+		if bitsPerSec, isNumber := section["bits_per_second"].(float64); isNumber {
+			return bitsPerSec / 1_000_000, end, nil
+		}
+	}
+	return 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
+}
@@ -0,0 +1,153 @@
+package tests
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+)
+
+// PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find
+// PSU rails. In home-lab hosts the kernel surfaces a handful of named
+// rails (12V, 5V, 3V3). No rails → auto-skip. Any rail outside a ±10%
+// window of its nominal value → fail.
+func PSU(ctx context.Context, d Deps) Outcome {
+	rails := scanPSURails()
+	if len(rails) == 0 {
+		d.Info("PSU: no voltage rails found under /sys/class/hwmon — skipping stage")
+		return Outcome{
+			Passed:  true,
+			Summary: "skipped (no PSU sensors)",
+			Extras:  map[string]any{"skipped": true, "reason": "no_hwmon_voltages"},
+		}
+	}
+
+	var samples []Sample
+	problems := []string{}
+	for _, rail := range rails {
+		samples = append(samples, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"})
+		if ok, why := voltageInRange(rail); !ok {
+			problems = append(problems, fmt.Sprintf("%s=%.2fV (%s)", rail.Label, rail.Volts, why))
+		}
+	}
+	if d.Sensor != nil {
+		_ = d.Sensor(ctx, samples)
+	}
+
+	extras := map[string]any{
+		"rails":    rails,
+		"problems": problems,
+	}
+	if len(problems) > 0 {
+		d.Error("PSU: out-of-range rails: " + strings.Join(problems, ", "))
+		return Outcome{
+			Passed:  false,
+			Message: "PSU rails out of range: " + strings.Join(problems, ", "),
+			Summary: fmt.Sprintf("%d rails, %d failing", len(rails), len(problems)),
+			Extras:  extras,
+		}
+	}
+	d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal", len(rails)))
+	return Outcome{
+		Passed:  true,
+		Summary: fmt.Sprintf("%d rails nominal", len(rails)),
+		Extras:  extras,
+	}
+}
+
+// psuRail is one labelled voltage reading taken from hwmon: Label is
+// the trimmed in*_label text and Volts is the matching in*_input value
+// converted from millivolts to volts by scanPSURails.
+type psuRail struct {
+	Label string  `json:"label"`
+	Volts float64 `json:"volts"`
+}
+
+// scanPSURails visits every chip directory under /sys/class/hwmon and
+// collects each in<N>_input reading whose in<N>_label passes
+// isPSULabel. Unknown labels are skipped rather than flagged —
+// motherboard VRMs report many rails that aren't PSU outputs. Inputs
+// that are unreadable or not an integer millivolt value are ignored.
+// It returns nil when the hwmon root itself cannot be listed.
+func scanPSURails() []psuRail {
+	const hwmonRoot = "/sys/class/hwmon"
+	chips, err := os.ReadDir(hwmonRoot)
+	if err != nil {
+		return nil
+	}
+	var rails []psuRail
+	for _, chip := range chips {
+		chipDir := filepath.Join(hwmonRoot, chip.Name())
+		entries, err := os.ReadDir(chipDir)
+		if err != nil {
+			continue
+		}
+		for _, entry := range entries {
+			fileName := entry.Name()
+			// Keep only in<N>_input and recover <N> to find its label.
+			index, isIn := strings.CutPrefix(fileName, "in")
+			if !isIn {
+				continue
+			}
+			index, isInput := strings.CutSuffix(index, "_input")
+			if !isInput {
+				continue
+			}
+			label := strings.TrimSpace(readFileStr(filepath.Join(chipDir, "in"+index+"_label")))
+			if !isPSULabel(label) {
+				continue
+			}
+			reading := strings.TrimSpace(readFileStr(filepath.Join(chipDir, fileName)))
+			millivolts, err := strconv.Atoi(reading)
+			if err != nil {
+				continue
+			}
+			rails = append(rails, psuRail{Label: label, Volts: float64(millivolts) / 1000})
+		}
+	}
+	return rails
+}
+
+// isPSULabel filters labels that look like PSU rails. Keeps a small
+// allowlist to avoid flagging CPU VRM rails as PSU failures.
+func isPSULabel(label string) bool {
+	l := strings.ToLower(label)
+	switch {
+	case strings.Contains(l, "12v"), strings.Contains(l, "5v"),
+		strings.Contains(l, "3.3v"), strings.Contains(l, "3v3"),
+		strings.Contains(l, "vccin"):
+		return true
+	}
+	return false
+}
+
+// voltageInRange returns (ok, reason). A label like "12V" has a 12.0V
+// nominal; we accept ±10%. Unknown labels pass.
+func voltageInRange(r psuRail) (bool, string) {
+	nom := nominalFor(r.Label)
+	if nom == 0 {
+		return true, ""
+	}
+	delta := r.Volts - nom
+	if delta < 0 {
+		delta = -delta
+	}
+	if delta/nom > 0.10 {
+		return false, fmt.Sprintf("expected ~%.1fV", nom)
+	}
+	return true, ""
+}
+
+func nominalFor(label string) float64 {
+	l := strings.ToLower(label)
+	switch {
+	case strings.Contains(l, "12v"):
+		return 12.0
+	case strings.Contains(l, "5v"):
+		return 5.0
+	case strings.Contains(l, "3.3v"), strings.Contains(l, "3v3"):
+		return 3.3
+	}
+	return 0
+}
+
+// readFileStr returns the contents of the file at p as a string, or ""
+// when the file cannot be read. Callers cannot tell a read error from
+// an empty file.
+func readFileStr(p string) string {
+	if data, err := os.ReadFile(p); err == nil {
+		return string(data)
+	}
+	return ""
+}
@@ -0,0 +1,152 @@
+package tests
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+)
+
+// SMART runs smartctl on each block device the kernel exposes (see
+// listBlockDisks) and keys on smart_status.passed in the JSON output.
+// The full decoded report of every disk is kept under "raw" in the
+// stage extras.
+//
+// Missing smartctl / unsupported device (e.g. QEMU virtio-blk) just
+// surfaces as a per-disk "skipped" entry; the stage only fails if at
+// least one disk reports !passed. Only disks that actually delivered a
+// smart_status verdict count as tested; when none did, the stage is
+// reported as skipped rather than as "all PASSED".
+func SMART(ctx context.Context, d Deps) Outcome {
+	disks, err := listBlockDisks()
+	if err != nil {
+		d.Warn("SMART: failed to enumerate /sys/class/block: " + err.Error())
+		return Outcome{Passed: true, Summary: "skipped (no block devices enumerable)", Extras: map[string]any{"skipped": true}}
+	}
+	if len(disks) == 0 {
+		d.Info("SMART: no physical disks found — skipping stage")
+		return Outcome{Passed: true, Summary: "skipped (no disks)", Extras: map[string]any{"skipped": true}}
+	}
+
+	type diskReport struct {
+		Device  string         `json:"device"`
+		Passed  bool           `json:"passed"`
+		Skipped bool           `json:"skipped,omitempty"`
+		Reason  string         `json:"reason,omitempty"`
+		Raw     map[string]any `json:"raw,omitempty"`
+	}
+
+	var reports []diskReport
+	failed := 0
+	usable := 0
+	for _, dev := range disks {
+		rep := diskReport{Device: dev}
+		out, err := runSmartctl(ctx, dev)
+		if err != nil {
+			rep.Skipped = true
+			rep.Reason = err.Error()
+			reports = append(reports, rep)
+			d.Info("SMART: " + dev + " skipped (" + err.Error() + ")")
+			continue
+		}
+		rep.Raw = out
+		passed, ok := smartPassed(out)
+		if !ok {
+			// smartctl answered but gave no verdict; this disk must not
+			// be counted as SMART-reporting.
+			rep.Skipped = true
+			rep.Reason = "no smart_status in output"
+			reports = append(reports, rep)
+			d.Info("SMART: " + dev + " skipped (" + rep.Reason + ")")
+			continue
+		}
+		usable++
+		rep.Passed = passed
+		if passed {
+			d.Info(fmt.Sprintf("SMART: %s PASSED", dev))
+		} else {
+			failed++
+			d.Error(fmt.Sprintf("SMART: %s reports FAILED", dev))
+		}
+		reports = append(reports, rep)
+	}
+
+	extras := map[string]any{
+		"disks":   reports,
+		"tested":  usable,
+		"failing": failed,
+	}
+	if failed > 0 {
+		return Outcome{
+			Passed:  false,
+			Message: fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
+			Summary: fmt.Sprintf("%d/%d failing", failed, usable),
+			Extras:  extras,
+		}
+	}
+	summary := fmt.Sprintf("%d disks, %d SMART-reporting, all PASSED", len(disks), usable)
+	if usable == 0 {
+		summary = "skipped (no smartctl data on any disk)"
+		extras["skipped"] = true
+	}
+	return Outcome{Passed: true, Summary: summary, Extras: extras}
+}
+
+// listBlockDisks lists /sys/class/block and returns "/dev/<name>" for
+// every entry that isRealBlockDisk accepts. The only error it reports
+// is a failure to read the directory; the slice is nil when nothing
+// qualifies.
+func listBlockDisks() ([]string, error) {
+	entries, err := os.ReadDir("/sys/class/block")
+	if err != nil {
+		return nil, err
+	}
+	var devices []string
+	for i := range entries {
+		if name := entries[i].Name(); isRealBlockDisk(name) {
+			devices = append(devices, "/dev/"+name)
+		}
+	}
+	return devices, nil
+}
+
+// isRealBlockDisk reports whether a /sys/class/block entry should be
+// treated as a whole disk. Names starting with loop, ram, zram or dm-
+// are rejected, as is any entry that has a "partition" file under
+// /sys/class/block/<name>.
+func isRealBlockDisk(name string) bool {
+	for _, virtual := range [...]string{"loop", "ram", "zram", "dm-"} {
+		if strings.HasPrefix(name, virtual) {
+			return false
+		}
+	}
+	// The "partition" attribute is what marks an entry as a partition
+	// here; any stat failure means the entry is kept as a disk.
+	_, statErr := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
+	return statErr != nil
+}
+
+// runSmartctl invokes `smartctl -aj <dev>` and returns the parsed JSON.
+// smartctl's exit status is deliberately not inspected: whenever stdout
+// holds valid JSON it is returned and the caller decides from the
+// structured fields (e.g. a missing smart_status becomes a skip). An
+// error is returned only when stdout is empty — wrapping the exec error
+// if there was one — or when the output is not valid JSON.
+func runSmartctl(ctx context.Context, dev string) (map[string]any, error) {
+	stdout, runErr := exec.CommandContext(ctx, "smartctl", "-aj", dev).Output()
+	switch {
+	case len(stdout) == 0 && runErr != nil:
+		return nil, fmt.Errorf("smartctl: %w", runErr)
+	case len(stdout) == 0:
+		return nil, fmt.Errorf("empty smartctl output")
+	}
+	var report map[string]any
+	if decodeErr := json.Unmarshal(stdout, &report); decodeErr != nil {
+		return nil, fmt.Errorf("parse smartctl output: %w", decodeErr)
+	}
+	// Even with a non-zero exit code, valid JSON is trusted over the
+	// process status.
+	return report, nil
+}
+
+// smartPassed reads smart_status.passed from a decoded smartctl --json
+// document. It returns (passed, present): present is false when
+// smart_status is absent or not an object, or when passed is absent or
+// not a boolean, so callers can tell "passed=false" apart from
+// "attribute missing".
+func smartPassed(out map[string]any) (bool, bool) {
+	if status, isObject := out["smart_status"].(map[string]any); isObject {
+		verdict, isBool := status["passed"].(bool)
+		return verdict, isBool
+	}
+	return false, false
+}
@@ -0,0 +1,67 @@
+// Package tests contains the per-stage executors the agent runs on the
+// host under test. Each stage implements Runner, is called with a
+// Context that carries the client + forwarder + run params, and returns
+// an Outcome that the caller POSTs to /result.
+package tests
+
+import (
+	"context"
+	"encoding/json"
+	"time"
+)
+
+// Outcome is what a stage returns; it maps directly to the /result body.
+//   - Passed=true together with a "skipped" entry in Extras counts as a
+//     pass; stages also say so in Summary so operators can see e.g.
+//     "GPU: skipped (no GPU present)" in the tile.
+//   - Message is only used on failure; the UI displays it in the log.
+//   - Extras is merged into the posted summary so stages can add
+//     their own shape (e.g. Storage returns per-disk probe results).
+type Outcome struct {
+	Passed  bool
+	Message string
+	Summary string         // short human-readable one-liner
+	Extras  map[string]any // merged into posted summary JSON
+}
+
+// MarshalSummary builds the summary JSON body POSTed to /result: a
+// copy of Extras plus, when Summary is non-empty, a "summary" key
+// holding the human-readable line. A "summary" key already present in
+// Extras is overwritten in that case. Extras itself is not modified.
+func (o Outcome) MarshalSummary() (json.RawMessage, error) {
+	payload := make(map[string]any, len(o.Extras)+1)
+	for key := range o.Extras {
+		payload[key] = o.Extras[key]
+	}
+	if o.Summary == "" {
+		return json.Marshal(payload)
+	}
+	payload["summary"] = o.Summary
+	return json.Marshal(payload)
+}
+
+// Deps bundles what stages need without pulling in the whole agent.
+// Logger methods print to stdout + forward to the orchestrator; Sensor
+// drops numeric samples and may be nil (stages check before calling);
+// OverrideWipe is the operator-set bypass for Storage's wipe probe.
+// StageTimeout may be zero, in which case a stage applies its own
+// default.
+type Deps struct {
+	Info          func(string)
+	Warn          func(string)
+	Error         func(string)
+	Sensor        func(ctx context.Context, samples []Sample) error
+	OverrideWipe  bool
+	ExpectedDisks []ExpectedDisk // serials + sizes from host.expected_spec
+	StageTimeout  time.Duration
+}
+
+// Sample mirrors the server's SensorSample but lives in the tests
+// package so probe code doesn't import internal/api. Stages in this
+// package use Kind for the measurement family ("iperf", "psu_volt",
+// "fio"), Key for the specific series within it, and Unit for a
+// display unit such as "Mbps", "V" or "iops".
+type Sample struct {
+	Kind  string
+	Key   string
+	Value float64
+	Unit  string
+}
+
+// ExpectedDisk is the subset of internal/spec.DiskSpec that Storage
+// needs: a device allowlist keyed on serial. Only Serial is consulted
+// when resolving targets; entries with an empty Serial are ignored.
+type ExpectedDisk struct {
+	Serial string
+	SizeGB int
+}
@@ -0,0 +1,298 @@
+package tests
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os/exec"
+	"strings"
+	"time"
+)
+
+// Storage is the destructive stage: badblocks (write-mode sample) + fio
+// random IO, persisting IOPS as measurements. Pre-gates:
+//
+//  1. Device allowlist: only act on /dev/<X> where the kernel-reported
+//     serial matches one of Deps.ExpectedDisks. This is the operator's
+//     contract for what can be written to. USB sticks and unexpected
+//     drives are excluded.
+//  2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
+//     signatures, partition tables, or LVM metadata → fail with
+//     UnexpectedData unless Deps.OverrideWipe is set.
+//
+// Only after those pass does the stage run `badblocks -b 4096 -c 64 -w`
+// and `fio` in write mode. This matches the plan's "destructive disk
+// tests are always-on, gated by layered safety."
+//
+// The stage stops at the first disk on which badblocks or fio fails.
+// fio is not started on a disk that badblocks already rejected, and
+// samples collected before a failure are still forwarded to Sensor.
+func Storage(ctx context.Context, d Deps) Outcome {
+	if len(d.ExpectedDisks) == 0 {
+		d.Info("Storage: no expected disks in spec — skipping stage")
+		return Outcome{
+			Passed:  true,
+			Summary: "skipped (no expected disks)",
+			Extras:  map[string]any{"skipped": true, "reason": "no_expected_disks"},
+		}
+	}
+
+	targets := resolveTargets(d.ExpectedDisks)
+	if len(targets) == 0 {
+		d.Error("Storage: none of the expected disks are present on this host")
+		return Outcome{
+			Passed:  false,
+			Message: "device allowlist matched zero disks",
+			Summary: "no allowed disks present",
+			Extras:  map[string]any{"expected": d.ExpectedDisks},
+		}
+	}
+
+	// Wipe probe on every target. A single dirty disk halts the stage
+	// unless the operator has set OverrideWipe via the UI.
+	probes := map[string]wipeProbeResult{}
+	dirty := []string{}
+	for _, t := range targets {
+		probe := probeWipe(ctx, t.Device)
+		probes[t.Device] = probe
+		if probe.HasData {
+			dirty = append(dirty, t.Device)
+		}
+	}
+	if len(dirty) > 0 && !d.OverrideWipe {
+		d.Error("Storage: wipe probe found existing data on: " + strings.Join(dirty, ", "))
+		return Outcome{
+			Passed:  false,
+			Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)",
+			Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
+			Extras: map[string]any{
+				"wipe_probe":    probes,
+				"override_hint": "click 'Override wipe & retry' in the held tile",
+				"dirty_devices": dirty,
+			},
+		}
+	}
+	if d.OverrideWipe && len(dirty) > 0 {
+		d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
+	}
+
+	// Per target: short badblocks write sample + fio random-read/write.
+	var samples []Sample
+	perDisk := map[string]any{}
+	// flush forwards whatever was measured so far. Best effort: a
+	// failed upload must not change the stage verdict.
+	flush := func() {
+		if d.Sensor != nil && len(samples) > 0 {
+			_ = d.Sensor(ctx, samples)
+		}
+	}
+	fail := func(message, summary string) Outcome {
+		d.Error("Storage: " + message)
+		flush()
+		return Outcome{
+			Passed:  false,
+			Message: message,
+			Summary: summary,
+			Extras:  map[string]any{"per_disk": perDisk, "wipe_probe": probes},
+		}
+	}
+	for _, t := range targets {
+		d.Info("Storage: running badblocks write sample on " + t.Device)
+		bb := runBadblocks(ctx, t.Device)
+		if !bb.OK {
+			perDisk[t.Device] = map[string]any{"badblocks": bb}
+			return fail("badblocks failed on "+t.Device+": "+bb.Error, "badblocks failed on "+t.Device)
+		}
+		d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
+		fr := runFio(ctx, t.Device)
+		perDisk[t.Device] = map[string]any{
+			"badblocks": bb,
+			"fio":       fr,
+		}
+		if fr.Error != "" {
+			// A disk that cannot complete the fio job has not shown it
+			// services IO, so it must not be reported as passed.
+			return fail("fio failed on "+t.Device+": "+fr.Error, "fio failed on "+t.Device)
+		}
+		samples = append(samples,
+			Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
+			Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
+		)
+	}
+	flush()
+
+	d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets)))
+	return Outcome{
+		Passed:  true,
+		Summary: fmt.Sprintf("%d disks passed", len(targets)),
+		Extras:  map[string]any{"per_disk": perDisk, "wipe_probe": probes},
+	}
+}
+
+// diskTarget pairs an expected disk's serial (as written in the spec)
+// with the /dev/<X> path resolveTargets matched it to on this host.
+type diskTarget struct {
+	Serial string
+	Device string
+}
+
+// resolveTargets maps expected-disk serials to /dev/<X> paths. It
+// takes the devices from listBlockDisks, looks up each one's serial via
+// diskSerialFromSys and matches serials case-insensitively. Expected
+// entries with an empty serial, or whose serial is not present on this
+// host, are left out. It returns nil when the block devices cannot be
+// listed.
+func resolveTargets(expected []ExpectedDisk) []diskTarget {
+	present, err := listBlockDisks()
+	if err != nil {
+		return nil
+	}
+	// Lower-cased serial → device path for every disk that reports one.
+	deviceBySerial := make(map[string]string)
+	for _, devPath := range present {
+		serial := diskSerialFromSys(strings.TrimPrefix(devPath, "/dev/"))
+		if serial == "" {
+			continue
+		}
+		deviceBySerial[strings.ToLower(serial)] = devPath
+	}
+	var matched []diskTarget
+	for i := range expected {
+		want := expected[i].Serial
+		if want == "" {
+			continue
+		}
+		devPath, found := deviceBySerial[strings.ToLower(want)]
+		if !found {
+			continue
+		}
+		matched = append(matched, diskTarget{Serial: want, Device: devPath})
+	}
+	return matched
+}
+
+// diskSerialFromSys is a smaller copy of probes.diskSerial; imported
+// from internal/probes would cause a cycle so we duplicate the short
+// lookup. If it drifts from the inventory probe, Storage fails because
+// the serial doesn't match — which is the correct behavior.
+func diskSerialFromSys(name string) string {
+	for _, rel := range []string{
+		"/sys/block/" + name + "/device/serial",
+		"/sys/block/" + name + "/serial",
+	} {
+		b, err := readFileBytes(rel)
+		if err != nil {
+			continue
+		}
+		s := strings.TrimSpace(string(b))
+		if s != "" {
+			return s
+		}
+	}
+	// Fall back to udevadm — ID_SERIAL_SHORT is more reliable on SCSI.
+	out, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output()
+	if err != nil {
+		return ""
+	}
+	for _, line := range strings.Split(string(out), "\n") {
+		if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok {
+			return strings.TrimSpace(v)
+		}
+	}
+	return ""
+}
+
+// readFileBytes returns the raw contents of the file at p. It is a
+// direct pass-through to the package's readFile helper.
+func readFileBytes(p string) ([]byte, error) {
+	return readFile(p)
+}
+
+// ---------- wipe probe ----------
+
+// wipeProbeResult is the outcome of probeWipe for one device. HasData
+// is set as soon as any finding is recorded; Findings holds the
+// human-readable lines, each prefixed with the tool that produced it
+// ("blkid: " or "wipefs: ").
+type wipeProbeResult struct {
+	Device   string   `json:"device"`
+	HasData  bool     `json:"has_data"`
+	Findings []string `json:"findings,omitempty"`
+}
+
+// probeWipe runs blkid + wipefs -n. Any signature reported by either is
+// a "has data" signal. This is deliberately conservative: we'd rather
+// halt on a bare ext4 signature than hand badblocks a disk with real
+// bytes on it.
+//
+// The probe also fails closed: if a tool cannot be run or exits with an
+// unexpected error, that is recorded as a finding and the device is
+// treated as holding data, so a missing or broken probe can never wave
+// a disk through to the destructive tests. blkid exit status 2 is the
+// one expected failure (per blkid(8): nothing could be identified) and
+// is read as "no signature".
+func probeWipe(ctx context.Context, device string) wipeProbeResult {
+	out := wipeProbeResult{Device: device}
+	flag := func(finding string) {
+		out.Findings = append(out.Findings, finding)
+		out.HasData = true
+	}
+
+	b, err := exec.CommandContext(ctx, "blkid", "-o", "full", device).Output()
+	if err == nil {
+		if s := strings.TrimSpace(string(b)); s != "" {
+			flag("blkid: " + s)
+		}
+	} else if ee, ok := err.(*exec.ExitError); !ok || ee.ExitCode() != 2 {
+		flag("blkid: probe failed: " + err.Error())
+	}
+
+	b, err = exec.CommandContext(ctx, "wipefs", "--no-act", device).Output()
+	if err != nil {
+		flag("wipefs: probe failed: " + err.Error())
+		return out
+	}
+	// wipefs prints a header line even on a clean disk; keep only
+	// lines with actual signature data.
+	for _, line := range strings.Split(strings.TrimSpace(string(b)), "\n") {
+		line = strings.TrimSpace(line)
+		if line == "" || strings.HasPrefix(line, "DEVICE") || strings.HasPrefix(line, "offset") {
+			continue
+		}
+		flag("wipefs: " + line)
+	}
+	return out
+}
+
+// ---------- badblocks ----------
+
+// badblocksResult is what runBadblocks reports for one device. OK is
+// true only for a clean run; otherwise Error says why. Elapsed is the
+// wall-clock run time rounded to seconds and OutputTail holds the last
+// lines the tool printed.
+type badblocksResult struct {
+	OK         bool   `json:"ok"`
+	Elapsed    string `json:"elapsed"`
+	Error      string `json:"error,omitempty"`
+	OutputTail string `json:"output_tail,omitempty"`
+}
+
+// runBadblocks runs a bounded destructive badblocks pass on device and
+// reports whether it came back clean. The run is capped at five
+// minutes.
+//
+// Only stdout is inspected for bad blocks: badblocks lists them there,
+// so diagnostics on stderr can no longer be mistaken for a bad-block
+// report. When the command fails, Error carries the exec error (or
+// notes that the run was cut short by the context) and OutputTail
+// prefers the captured stderr.
+func runBadblocks(ctx context.Context, device string) badblocksResult {
+	// -c 64 blocks per check, -w destructive write, -b 4096 block size,
+	// -t pattern. We only sample 256MiB (65536 × 4k) so the stage stays
+	// bounded. A real burn-in would run the whole disk; that belongs in
+	// a separate "deep" stage.
+	args := []string{"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536"}
+	start := time.Now()
+	runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
+	defer cancel()
+	stdout, err := exec.CommandContext(runCtx, "badblocks", args...).Output()
+	r := badblocksResult{
+		Elapsed:    time.Since(start).Round(time.Second).String(),
+		OutputTail: tailLines(string(stdout), 10),
+	}
+	if err != nil {
+		r.Error = err.Error()
+		if ctxErr := runCtx.Err(); ctxErr != nil {
+			r.Error = "badblocks aborted (" + ctxErr.Error() + "): " + err.Error()
+		}
+		// Output() keeps stderr on the ExitError; it explains failures
+		// better than stdout does.
+		if ee, ok := err.(*exec.ExitError); ok && len(ee.Stderr) > 0 {
+			r.OutputTail = tailLines(string(ee.Stderr), 10)
+		}
+		return r
+	}
+	// badblocks prints each bad block to stdout. Empty output = clean.
+	if strings.TrimSpace(string(stdout)) == "" {
+		r.OK = true
+	} else {
+		r.Error = "bad blocks found"
+	}
+	return r
+}
+
+// ---------- fio ----------
+
+// fioResult carries the read/write IOPS and bandwidth figures runFio
+// takes from fio's JSON report (the "iops" and "bw" fields of the
+// first job entry). Error is set, and the figures left at zero, when
+// fio could not be run or its output could not be used.
+type fioResult struct {
+	ReadIOPS    float64 `json:"read_iops"`
+	WriteIOPS   float64 `json:"write_iops"`
+	ReadBWKBps  float64 `json:"read_bw_kbps"`
+	WriteBWKBps float64 `json:"write_bw_kbps"`
+	Error       string  `json:"error,omitempty"`
+}
+
+// runFio kicks off a tiny random-rw job: 2 jobs × 64MB × 4k blocks,
+// direct IO, capped at five minutes. This is a health bar, not a
+// benchmark — we want to know the disk services IO, not how fast it is
+// at p99.
+//
+// With --group_reporting the figures are read from the first entry of
+// the "jobs" array. Error is set when fio fails to run, its output is
+// not valid JSON, or the report contains no jobs.
+func runFio(ctx context.Context, device string) fioResult {
+	runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
+	defer cancel()
+	args := []string{
+		"--name=health", "--filename=" + device, "--rw=randrw",
+		"--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0",
+		"--group_reporting", "--output-format=json", "--direct=1",
+	}
+	cmd := exec.CommandContext(runCtx, "fio", args...)
+	out, err := cmd.Output()
+	if err != nil {
+		return fioResult{Error: err.Error()}
+	}
+	type direction struct {
+		IOPS float64 `json:"iops"`
+		BW   float64 `json:"bw"`
+	}
+	var top struct {
+		Jobs []struct {
+			Read  direction `json:"read"`
+			Write direction `json:"write"`
+		} `json:"jobs"`
+	}
+	if err := json.Unmarshal(out, &top); err != nil {
+		return fioResult{Error: "parse fio json: " + err.Error()}
+	}
+	if len(top.Jobs) == 0 {
+		// Valid JSON without jobs used to surface as "parse fio json: <nil>".
+		return fioResult{Error: "parse fio json: no jobs in output"}
+	}
+	j := top.Jobs[0]
+	return fioResult{
+		ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS,
+		ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW,
+	}
+}
@@ -0,0 +1,21 @@
+package tests
+
+import (
+	"fmt"
+	"os"
+)
+
+// readFile is used by stages that need to peek at /sys files without
+// importing the agent's probes package (which would cycle). It is a
+// direct pass-through to os.ReadFile and returns its result unchanged.
+func readFile(p string) ([]byte, error) {
+	return os.ReadFile(p)
+}
+
+// formatCount renders "<n> <label>", appending a plain "s" to label
+// for every count other than 1: (0, "disk") → "0 disks",
+// (1, "disk") → "1 disk", (n, "disk") → "n disks". label should be a
+// noun whose plural is formed that way. Keeps log lines tidy.
+func formatCount(n int, label string) string {
+	plural := "s"
+	if n == 1 {
+		plural = ""
+	}
+	return fmt.Sprintf("%d %s%s", n, label, plural)
+}