deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -5,24 +5,36 @@ import (
 	"encoding/json"
 	"fmt"
 	"os/exec"
+	"strconv"
 	"strings"
 	"time"
 )

-// Storage is the destructive stage: badblocks (write-mode sample) + fio
-// random IO, persisting IOPS + latency as measurements. Pre-gates:
+// Storage is the destructive stage. Phase 2 replaced the old
+// badblocks + 128 MiB fio combo with a single fio run per disk that
+// writes, verifies md5 of what it wrote, and reports p99 latency.
+// Modes:
+//
+//   - fio_sample (quick): bounded 1 GiB write per disk, ~3 min runtime.
+//   - full_disk (deep/soak): writes the whole device, time-bounded by
+//     the fio_time knob (2 h deep, 6 h soak).
+//
+// Pre-gates kept from Phase 1:
 //
 //  1. Device allowlist: only act on /dev/<X> where the kernel-reported
-//     serial matches one of Deps.ExpectedDisks. This is the operator's
-//     contract for what can be written to. USB sticks and unexpected
+//     serial matches one of Deps.ExpectedDisks. USB sticks and unexpected
 //     drives are excluded.
 //  2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
-//     signatures, partition tables, or LVM metadata → fail with
+//     signature, partition table, or LVM metadata → fail with
 //     UnexpectedData unless Deps.OverrideWipe is set.
 //
-// Only after those pass does the stage run `badblocks -b 4096 -c 64 -w`
-// and `fio` in write mode. This matches the plan's "destructive disk
-// tests are always-on, gated by layered safety."
+// After fio, the stage captures a SMART diff (start snapshot taken
+// before any writes; end snapshot after all writes finish) and posts
+// deltas on attributes like Reallocated_Sector_Ct and Current_Pending_Sector.
+// The threshold evaluator isn't seeded to gate smart_delta out of the
+// box — those samples are diagnostic for the report. Fio's p99 latency
+// posts as fio_p99_us so the per-stage Storage warning threshold can
+// fire on a latency cliff.
 func Storage(ctx context.Context, d Deps) Outcome {
 	if len(d.ExpectedDisks) == 0 {
 		d.Info("Storage: no expected disks in spec — skipping stage")
@@ -44,10 +56,10 @@ func Storage(ctx context.Context, d Deps) Outcome {
 		}
 	}

-	// Non-destructive runs skip wipe-probe (nothing to refuse), badblocks
-	// -w, and write-mode fio. Every expected disk is still asserted
-	// present + readable by listing /sys/block and reading SMART-accessible
-	// identity; the per-disk map flags the shortcut so the report is clear.
+	// Non-destructive runs skip wipe-probe (nothing to refuse), fio
+	// writes, and SMART delta (nothing changed so no delta to report).
+	// Every expected disk is still asserted present so a vanished drive
+	// still fails the stage.
 	if d.NonDestructive {
 		perDisk := map[string]any{}
 		for _, t := range targets {
@@ -79,9 +91,9 @@ func Storage(ctx context.Context, d Deps) Outcome {
 			Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)",
 			Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
 			Extras: map[string]any{
-				"wipe_probe":     probes,
-				"override_hint":  "click 'Override wipe & retry' in the held tile",
-				"dirty_devices":  dirty,
+				"wipe_probe":    probes,
+				"override_hint": "click 'Override wipe & retry' in the held tile",
+				"dirty_devices": dirty,
 			},
 		}
 	}
@@ -89,64 +101,80 @@ func Storage(ctx context.Context, d Deps) Outcome {
 		d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
 	}

-	// Per target: short badblocks write sample + fio random-read/write.
+	// Capture start-of-stage SMART attributes before we write anything
+	// so the delta is attributable to *this* stage's writes and not the
+	// host's prior history. Per-disk failures are tolerated (e.g. the
+	// device doesn't expose SMART); we just can't emit a delta for it.
+	startSMART := captureSMARTAttrs(ctx, targets)
+
+	fioOpts := resolveFioOpts(d.StorageKnobs)
+	d.Info(fmt.Sprintf("Storage: fio mode=%s size=%s runtime=%s bs=%s rw=%s verify=%s",
+		fioOpts.Mode, fioOpts.Size, fioOpts.Runtime, fioOpts.BS, fioOpts.RW, fioOpts.Verify))
+
 	var samples []Sample
 	var subs []SubStepReport
 	perDisk := map[string]any{}
+	failed := ""
 	for _, t := range targets {
-		d.Info("Storage: running badblocks write sample on " + t.Device)
-		bbStart := time.Now()
-		bb := runBadblocks(ctx, t.Device)
-		bbEnd := time.Now()
-		bbSummary, _ := json.Marshal(bb)
-		subs = append(subs, SubStepReport{
-			Name:        fmt.Sprintf("badblocks %s", t.Device),
-			Passed:      bb.OK,
-			StartedAt:   bbStart,
-			CompletedAt: bbEnd,
-			SummaryJSON: bbSummary,
-		})
-
-		d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
+		d.Info(fmt.Sprintf("Storage: running fio %s on %s", fioOpts.Mode, t.Device))
 		fioStart := time.Now()
-		fr := runFio(ctx, t.Device)
+		fr := runFioVerify(ctx, t.Device, fioOpts)
 		fioEnd := time.Now()
 		fioSummary, _ := json.Marshal(fr)
 		subs = append(subs, SubStepReport{
-			Name:        fmt.Sprintf("fio %s", t.Device),
+			Name:        fmt.Sprintf("fio %s %s", fioOpts.Mode, t.Device),
 			Passed:      fr.Error == "",
 			StartedAt:   fioStart,
 			CompletedAt: fioEnd,
 			SummaryJSON: fioSummary,
 		})
+		perDisk[t.Device] = map[string]any{"fio": fr}

-		perDisk[t.Device] = map[string]any{
-			"badblocks": bb,
-			"fio":       fr,
-		}
-		samples = append(samples,
-			Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
-			Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
-		)
-		if !bb.OK {
-			return Outcome{
-				Passed:   false,
-				Message:  "badblocks found errors on " + t.Device,
-				Summary:  "badblocks failed on " + t.Device,
-				Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes},
-				SubSteps: subs,
+		if fr.Error == "" {
+			samples = append(samples,
+				Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
+				Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
+			)
+			if fr.ReadP99Us > 0 {
+				samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
 			}
+			if fr.WriteP99Us > 0 {
+				samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
+			}
+		} else if failed == "" {
+			failed = t.Device
 		}
 	}
-	if d.Sensor != nil {
+
+	// End-of-stage SMART snapshot + diff. We capture whether or not fio
+	// succeeded — a mid-run failure still produces attributable deltas,
+	// which is often more interesting than the stage outcome itself.
+	endSMART := captureSMARTAttrs(ctx, targets)
+	deltas := diffSMARTAttrs(startSMART, endSMART)
+	for dev, attrs := range deltas {
+		for attr, delta := range attrs {
+			samples = append(samples, Sample{Kind: "smart_delta", Key: dev + "/" + attr, Value: delta, Unit: "count"})
+		}
+	}
+	if d.Sensor != nil && len(samples) > 0 {
 		_ = d.Sensor(ctx, samples)
 	}

-	d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets)))
+	if failed != "" {
+		return Outcome{
+			Passed:   false,
+			Message:  "fio verify failed on " + failed,
+			Summary:  "fio failed on " + failed,
+			Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
+			SubSteps: subs,
+		}
+	}
+
+	d.Info(fmt.Sprintf("Storage: %d disk(s) passed fio --verify", len(targets)))
 	return Outcome{
 		Passed:   true,
-		Summary:  fmt.Sprintf("%d disks passed", len(targets)),
-		Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes},
+		Summary:  fmt.Sprintf("%d disks passed (%s)", len(targets), fioOpts.Mode),
+		Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
 		SubSteps: subs,
 	}
 }
@@ -229,8 +257,8 @@ type wipeProbeResult struct {

 // probeWipe runs blkid + wipefs -n. Any non-empty output from either is
 // a "has data" signal. This is deliberately conservative: we'd rather
-// halt on a bare ext4 signature than hand badblocks a disk with real
-// bytes on it.
+// halt on a bare ext4 signature than hand fio a disk with real bytes on
+// it.
 func probeWipe(ctx context.Context, device string) wipeProbeResult {
 	out := wipeProbeResult{Device: device}

@@ -257,84 +285,269 @@ func probeWipe(ctx context.Context, device string) wipeProbeResult {
 	return out
 }

-// ---------- badblocks ----------
+// ---------- fio ----------

-type badblocksResult struct {
-	OK        bool   `json:"ok"`
-	Elapsed   string `json:"elapsed"`
-	Error     string `json:"error,omitempty"`
-	OutputTail string `json:"output_tail,omitempty"`
+// fioOpts holds the concrete flag values fio needs, as resolved from
+// the probe knobs. Defaults match the quick profile's fio_sample shape
+// so callers with zero knobs still run something bounded.
+type fioOpts struct {
+	Mode    string        `json:"mode"`     // "fio_sample" | "full_disk"
+	Size    string        `json:"size"`     // "1GiB"; only used for fio_sample
+	Runtime time.Duration `json:"runtime"`  // bounding time
+	BS      string        `json:"bs"`       // "4k"
+	RW      string        `json:"rw"`       // "randrw"
+	Verify  string        `json:"verify"`   // "md5" | ""
 }

-func runBadblocks(ctx context.Context, device string) badblocksResult {
-	// -c 64 blocks per check, -w destructive write, -b 4096 block size,
-	// -t pattern. We only sample 256MiB (65536 × 4k) so the stage stays
-	// bounded. A real burn-in would run the whole disk; that belongs in
-	// a separate "deep" stage.
-	args := []string{"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536"}
-	start := time.Now()
-	runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
+// resolveFioOpts normalizes the knobs into a runnable config. Zero-
+// valued fields fall back to the quick defaults so a stage that's
+// missing its knobs still has coherent behavior (safer than refusing).
+func resolveFioOpts(k StorageKnobs) fioOpts {
+	o := fioOpts{
+		Mode:    firstNonEmpty(k.Mode, "fio_sample"),
+		Size:    firstNonEmpty(k.FioSize, "1GiB"),
+		Runtime: k.FioTime,
+		BS:      firstNonEmpty(k.FioBS, "4k"),
+		RW:      firstNonEmpty(k.FioRW, "randrw"),
+		Verify:  firstNonEmpty(k.Verify, "md5"),
+	}
+	if o.Runtime <= 0 {
+		o.Runtime = 3 * time.Minute
+	}
+	return o
+}
+
+func firstNonEmpty(vs ...string) string {
+	for _, v := range vs {
+		if v != "" {
+			return v
+		}
+	}
+	return ""
+}
+
+type fioResult struct {
+	Mode        string  `json:"mode"`
+	ReadIOPS    float64 `json:"read_iops"`
+	WriteIOPS   float64 `json:"write_iops"`
+	ReadBWKBps  float64 `json:"read_bw_kbps"`
+	WriteBWKBps float64 `json:"write_bw_kbps"`
+	ReadP99Us   float64 `json:"read_p99_us,omitempty"`
+	WriteP99Us  float64 `json:"write_p99_us,omitempty"`
+	Error       string  `json:"error,omitempty"`
+	OutputTail  string  `json:"output_tail,omitempty"`
+}
+
+// runFioVerify invokes fio with md5-verify semantics. fio_sample mode
+// caps the IO at opts.Size; full_disk drives the whole device bounded
+// by runtime. Both use direct IO to bypass the page cache — we want
+// real disk latency, not Linux's cheerful buffer.
+func runFioVerify(ctx context.Context, device string, opts fioOpts) fioResult {
+	// 30s grace over runtime so fio has time to flush + close cleanly.
+	runCtx, cancel := context.WithTimeout(ctx, opts.Runtime+30*time.Second)
 	defer cancel()
-	cmd := exec.CommandContext(runCtx, "badblocks", args...)
-	out, err := cmd.CombinedOutput()
-	r := badblocksResult{Elapsed: time.Since(start).Round(time.Second).String(), OutputTail: tailLines(string(out), 10)}
+
+	args := []string{
+		"--name=verify-" + strings.TrimPrefix(device, "/dev/"),
+		"--filename=" + device,
+		"--rw=" + opts.RW,
+		"--bs=" + opts.BS,
+		"--numjobs=1",
+		"--direct=1",
+		"--group_reporting",
+		"--output-format=json",
+		"--runtime=" + strconv.Itoa(int(opts.Runtime.Seconds())),
+	}
+	if opts.Verify != "" {
+		args = append(args,
+			"--verify="+opts.Verify,
+			"--verify_pattern=random",
+			"--do_verify=1",
+		)
+	}
+	switch opts.Mode {
+	case "full_disk":
+		// Time-bounded across the full device — fio uses the device's
+		// full size when --size is omitted on a block device.
+		args = append(args, "--time_based=1")
+	default:
+		// fio_sample: bounded write. --size= caps the IO volume; the run
+		// ends at that cap or at --runtime, whichever comes first.
+		args = append(args, "--size="+opts.Size, "--time_based=0")
+	}
+
+	cmd := exec.CommandContext(runCtx, "fio", args...)
+	out, err := cmd.Output()
+	r := fioResult{Mode: opts.Mode, OutputTail: tailLines(string(out), 20)}
 	if err != nil {
 		r.Error = err.Error()
 		return r
 	}
-	// badblocks prints each bad block to stdout. Empty output = clean.
-	if strings.TrimSpace(string(out)) == "" {
-		r.OK = true
-	} else {
-		r.Error = "bad blocks found"
+	parsed, perr := parseFioJSON(out)
+	if perr != nil {
+		r.Error = "parse fio json: " + perr.Error()
+		return r
 	}
+	r.ReadIOPS = parsed.ReadIOPS
+	r.WriteIOPS = parsed.WriteIOPS
+	r.ReadBWKBps = parsed.ReadBWKBps
+	r.WriteBWKBps = parsed.WriteBWKBps
+	r.ReadP99Us = parsed.ReadP99Us
+	r.WriteP99Us = parsed.WriteP99Us
 	return r
 }

-// ---------- fio ----------
-
-type fioResult struct {
-	ReadIOPS   float64 `json:"read_iops"`
-	WriteIOPS  float64 `json:"write_iops"`
-	ReadBWKBps float64 `json:"read_bw_kbps"`
-	WriteBWKBps float64 `json:"write_bw_kbps"`
-	Error      string  `json:"error,omitempty"`
-}
-
-// runFio kicks off a tiny random-rw job: 2 jobs × 64MB × 4k blocks.
-// This is a health bar, not a benchmark — we want to know the disk
-// services IO, not how fast it is at p99.
-func runFio(ctx context.Context, device string) fioResult {
-	runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
-	defer cancel()
-	args := []string{
-		"--name=health", "--filename=" + device, "--rw=randrw",
-		"--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0",
-		"--group_reporting", "--output-format=json", "--direct=1",
-	}
-	cmd := exec.CommandContext(runCtx, "fio", args...)
-	out, err := cmd.Output()
-	if err != nil {
-		return fioResult{Error: err.Error()}
-	}
+// parseFioJSON extracts the bits we care about from fio's --output-format=json.
+// Latency percentiles live at .jobs[0].read.clat_ns.percentile["99.000000"];
+// we convert nanoseconds to microseconds for the fio_p99_us sample.
+func parseFioJSON(out []byte) (fioResult, error) {
 	var top struct {
 		Jobs []struct {
-			Read  struct {
+			Read struct {
 				IOPS float64 `json:"iops"`
 				BW   float64 `json:"bw"`
+				CLat struct {
+					Percentile map[string]float64 `json:"percentile"`
+				} `json:"clat_ns"`
 			} `json:"read"`
 			Write struct {
 				IOPS float64 `json:"iops"`
 				BW   float64 `json:"bw"`
+				CLat struct {
+					Percentile map[string]float64 `json:"percentile"`
+				} `json:"clat_ns"`
 			} `json:"write"`
 		} `json:"jobs"`
 	}
-	if err := json.Unmarshal(out, &top); err != nil || len(top.Jobs) == 0 {
-		return fioResult{Error: "parse fio json: " + fmt.Sprint(err)}
+	if err := json.Unmarshal(out, &top); err != nil {
+		return fioResult{}, err
+	}
+	if len(top.Jobs) == 0 {
+		return fioResult{}, fmt.Errorf("no jobs in fio output")
 	}
 	j := top.Jobs[0]
-	return fioResult{
+	r := fioResult{
 		ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS,
 		ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW,
 	}
+	if p := j.Read.CLat.Percentile["99.000000"]; p > 0 {
+		r.ReadP99Us = p / 1000.0
+	}
+	if p := j.Write.CLat.Percentile["99.000000"]; p > 0 {
+		r.WriteP99Us = p / 1000.0
+	}
+	return r, nil
+}
+
+// ---------- SMART delta ----------
+
+// smartAttrMap: device → attribute → raw counter value. ATA drives
+// populate named attributes (Reallocated_Sector_Ct etc); NVMe drives
+// populate a flatter nvme-specific map. We track a curated whitelist
+// of wear indicators — anything else is diagnostic and drops to the raw
+// report output.
+type smartAttrMap map[string]map[string]float64
+
+// captureSMARTAttrs runs smartctl -aj on each target and pulls the
+// whitelisted attributes. Per-device failures (virtio, permission
+// issues) degrade silently — the delta step just shows no data for
+// that device.
+func captureSMARTAttrs(ctx context.Context, targets []diskTarget) smartAttrMap {
+	out := smartAttrMap{}
+	for _, t := range targets {
+		parsed, err := runSmartctl(ctx, t.Device)
+		if err != nil {
+			continue
+		}
+		attrs := extractSMARTAttrs(parsed)
+		if len(attrs) > 0 {
+			out[t.Device] = attrs
+		}
+	}
+	return out
+}
+
+// smartAttributeWhitelist is the set of attributes we diff across a
+// stage. They're the ones that reflect *this stage's* IO damage, not
+// cumulative drive history. Adding attributes is cheap — ones a drive
+// doesn't report are simply omitted from the delta.
+var smartAttributeWhitelist = map[string]bool{
+	// ATA SMART attribute names (smartctl normalizes to these)
+	"Reallocated_Sector_Ct":   true,
+	"Current_Pending_Sector":  true,
+	"Offline_Uncorrectable":   true,
+	"UDMA_CRC_Error_Count":    true,
+	"Reported_Uncorrect":      true,
+	"Raw_Read_Error_Rate":     true,
+	// NVMe log fields (flat keys at top of nvme_smart_health_information_log)
+	"media_errors":            true,
+	"num_err_log_entries":     true,
+	"percentage_used":         true,
+}
+
+// extractSMARTAttrs walks smartctl's JSON for whitelisted attribute
+// values. Handles both the ATA shape (ata_smart_attributes.table[]) and
+// the NVMe shape (nvme_smart_health_information_log). Returns a map
+// keyed by the canonical attribute name.
+func extractSMARTAttrs(raw map[string]any) map[string]float64 {
+	out := map[string]float64{}
+	// ATA attributes are in ata_smart_attributes.table[] — each element
+	// has {"name": "Reallocated_Sector_Ct", "raw": {"value": N}}.
+	if ata, ok := raw["ata_smart_attributes"].(map[string]any); ok {
+		if tbl, ok := ata["table"].([]any); ok {
+			for _, row := range tbl {
+				rm, ok := row.(map[string]any)
+				if !ok {
+					continue
+				}
+				name, _ := rm["name"].(string)
+				if !smartAttributeWhitelist[name] {
+					continue
+				}
+				if r, ok := rm["raw"].(map[string]any); ok {
+					if v, ok := r["value"].(float64); ok {
+						out[name] = v
+					}
+				}
+			}
+		}
+	}
+	// NVMe attributes live flat under nvme_smart_health_information_log.
+	if nvme, ok := raw["nvme_smart_health_information_log"].(map[string]any); ok {
+		for k, v := range nvme {
+			if !smartAttributeWhitelist[k] {
+				continue
+			}
+			if n, ok := v.(float64); ok {
+				out[k] = n
+			}
+		}
+	}
+	return out
+}
+
+// diffSMARTAttrs subtracts start from end per (device, attribute).
+// Only attributes present in both snapshots produce a delta; missing
+// attributes drop out (can't attribute a zero-to-present delta safely).
+// Negative deltas are kept so a drive that resets a counter is visible.
+func diffSMARTAttrs(start, end smartAttrMap) map[string]map[string]float64 {
+	out := map[string]map[string]float64{}
+	for dev, endAttrs := range end {
+		startAttrs, ok := start[dev]
+		if !ok {
+			continue
+		}
+		devOut := map[string]float64{}
+		for attr, endV := range endAttrs {
+			startV, ok := startAttrs[attr]
+			if !ok {
+				continue
+			}
+			devOut[attr] = endV - startV
+		}
+		if len(devOut) > 0 {
+			out[dev] = devOut
+		}
+	}
+	return out
 }