package tests import ( "context" "encoding/json" "fmt" "os/exec" "strings" "time" ) // Storage is the destructive stage: badblocks (write-mode sample) + fio // random IO, persisting IOPS + latency as measurements. Pre-gates: // // 1. Device allowlist: only act on /dev/ where the kernel-reported // serial matches one of Deps.ExpectedDisks. This is the operator's // contract for what can be written to. USB sticks and unexpected // drives are excluded. // 2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem // signatures, partition tables, or LVM metadata → fail with // UnexpectedData unless Deps.OverrideWipe is set. // // Only after those pass does the stage run `badblocks -b 4096 -c 64 -w` // and `fio` in write mode. This matches the plan's "destructive disk // tests are always-on, gated by layered safety." func Storage(ctx context.Context, d Deps) Outcome { if len(d.ExpectedDisks) == 0 { d.Info("Storage: no expected disks in spec — skipping stage") return Outcome{ Passed: true, Summary: "skipped (no expected disks)", Extras: map[string]any{"skipped": true, "reason": "no_expected_disks"}, } } targets := resolveTargets(d.ExpectedDisks) if len(targets) == 0 { d.Error("Storage: none of the expected disks are present on this host") return Outcome{ Passed: false, Message: "device allowlist matched zero disks", Summary: "no allowed disks present", Extras: map[string]any{"expected": d.ExpectedDisks}, } } // Non-destructive runs skip wipe-probe (nothing to refuse), badblocks // -w, and write-mode fio. Every expected disk is still asserted // present + readable by listing /sys/block and reading SMART-accessible // identity; the per-disk map flags the shortcut so the report is clear. if d.NonDestructive { perDisk := map[string]any{} for _, t := range targets { perDisk[t.Device] = map[string]any{"mode": "non_destructive", "serial": t.Serial} } d.Info(fmt.Sprintf("Storage: non-destructive — verified %d disk(s) present", len(targets))) return Outcome{ Passed: true, Summary: fmt.Sprintf("non-destructive: read-only checks only (%d disks)", len(targets)), Extras: map[string]any{"per_disk": perDisk, "non_destructive": true}, } } // Wipe probe on every target. A single dirty disk halts the stage // unless the operator has set OverrideWipe via the UI. probes := map[string]wipeProbeResult{} dirty := []string{} for _, t := range targets { probe := probeWipe(ctx, t.Device) probes[t.Device] = probe if probe.HasData { dirty = append(dirty, t.Device) } } if len(dirty) > 0 && !d.OverrideWipe { d.Error("Storage: wipe probe found existing data on: " + strings.Join(dirty, ", ")) return Outcome{ Passed: false, Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)", Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)), Extras: map[string]any{ "wipe_probe": probes, "override_hint": "click 'Override wipe & retry' in the held tile", "dirty_devices": dirty, }, } } if d.OverrideWipe && len(dirty) > 0 { d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", ")) } // Per target: short badblocks write sample + fio random-read/write. var samples []Sample var subs []SubStepReport perDisk := map[string]any{} for _, t := range targets { d.Info("Storage: running badblocks write sample on " + t.Device) bbStart := time.Now() bb := runBadblocks(ctx, t.Device) bbEnd := time.Now() bbSummary, _ := json.Marshal(bb) subs = append(subs, SubStepReport{ Name: fmt.Sprintf("badblocks %s", t.Device), Passed: bb.OK, StartedAt: bbStart, CompletedAt: bbEnd, SummaryJSON: bbSummary, }) d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device)) fioStart := time.Now() fr := runFio(ctx, t.Device) fioEnd := time.Now() fioSummary, _ := json.Marshal(fr) subs = append(subs, SubStepReport{ Name: fmt.Sprintf("fio %s", t.Device), Passed: fr.Error == "", StartedAt: fioStart, CompletedAt: fioEnd, SummaryJSON: fioSummary, }) perDisk[t.Device] = map[string]any{ "badblocks": bb, "fio": fr, } samples = append(samples, Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"}, Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"}, ) if !bb.OK { return Outcome{ Passed: false, Message: "badblocks found errors on " + t.Device, Summary: "badblocks failed on " + t.Device, Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes}, SubSteps: subs, } } } if d.Sensor != nil { _ = d.Sensor(ctx, samples) } d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets))) return Outcome{ Passed: true, Summary: fmt.Sprintf("%d disks passed", len(targets)), Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes}, SubSteps: subs, } } type diskTarget struct { Serial string Device string } // resolveTargets maps expected-disk serials to /dev/ paths by reading // /sys/block. Uses the same mechanism as probes.inventory to avoid drift. func resolveTargets(expected []ExpectedDisk) []diskTarget { disks, err := listBlockDisks() if err != nil { return nil } // Build serial → device map from /sys. serialOf := map[string]string{} for _, dev := range disks { name := strings.TrimPrefix(dev, "/dev/") s := diskSerialFromSys(name) if s != "" { serialOf[strings.ToLower(s)] = dev } } var out []diskTarget for _, e := range expected { if e.Serial == "" { continue } if dev, ok := serialOf[strings.ToLower(e.Serial)]; ok { out = append(out, diskTarget{Serial: e.Serial, Device: dev}) } } return out } // diskSerialFromSys is a smaller copy of probes.diskSerial; imported // from internal/probes would cause a cycle so we duplicate the short // lookup. If it drifts from the inventory probe, Storage fails because // the serial doesn't match — which is the correct behavior. func diskSerialFromSys(name string) string { for _, rel := range []string{ "/sys/block/" + name + "/device/serial", "/sys/block/" + name + "/serial", } { b, err := readFileBytes(rel) if err != nil { continue } s := strings.TrimSpace(string(b)) if s != "" { return s } } // Fall back to udevadm — ID_SERIAL_SHORT is more reliable on SCSI. out, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output() if err != nil { return "" } for _, line := range strings.Split(string(out), "\n") { if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok { return strings.TrimSpace(v) } } return "" } func readFileBytes(p string) ([]byte, error) { return readFile(p) } // ---------- wipe probe ---------- type wipeProbeResult struct { Device string `json:"device"` HasData bool `json:"has_data"` Findings []string `json:"findings,omitempty"` } // probeWipe runs blkid + wipefs -n. Any non-empty output from either is // a "has data" signal. This is deliberately conservative: we'd rather // halt on a bare ext4 signature than hand badblocks a disk with real // bytes on it. func probeWipe(ctx context.Context, device string) wipeProbeResult { out := wipeProbeResult{Device: device} if b, err := exec.CommandContext(ctx, "blkid", "-o", "full", device).Output(); err == nil { s := strings.TrimSpace(string(b)) if s != "" { out.Findings = append(out.Findings, "blkid: "+s) out.HasData = true } } if b, err := exec.CommandContext(ctx, "wipefs", "--no-act", device).Output(); err == nil { s := strings.TrimSpace(string(b)) // wipefs prints a header line even on a clean disk; keep only // lines with actual signature data. for _, line := range strings.Split(s, "\n") { line = strings.TrimSpace(line) if line == "" || strings.HasPrefix(line, "DEVICE") || strings.HasPrefix(line, "offset") { continue } out.Findings = append(out.Findings, "wipefs: "+line) out.HasData = true } } return out } // ---------- badblocks ---------- type badblocksResult struct { OK bool `json:"ok"` Elapsed string `json:"elapsed"` Error string `json:"error,omitempty"` OutputTail string `json:"output_tail,omitempty"` } func runBadblocks(ctx context.Context, device string) badblocksResult { // -c 64 blocks per check, -w destructive write, -b 4096 block size, // -t pattern. We only sample 256MiB (65536 × 4k) so the stage stays // bounded. A real burn-in would run the whole disk; that belongs in // a separate "deep" stage. args := []string{"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536"} start := time.Now() runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) defer cancel() cmd := exec.CommandContext(runCtx, "badblocks", args...) out, err := cmd.CombinedOutput() r := badblocksResult{Elapsed: time.Since(start).Round(time.Second).String(), OutputTail: tailLines(string(out), 10)} if err != nil { r.Error = err.Error() return r } // badblocks prints each bad block to stdout. Empty output = clean. if strings.TrimSpace(string(out)) == "" { r.OK = true } else { r.Error = "bad blocks found" } return r } // ---------- fio ---------- type fioResult struct { ReadIOPS float64 `json:"read_iops"` WriteIOPS float64 `json:"write_iops"` ReadBWKBps float64 `json:"read_bw_kbps"` WriteBWKBps float64 `json:"write_bw_kbps"` Error string `json:"error,omitempty"` } // runFio kicks off a tiny random-rw job: 2 jobs × 64MB × 4k blocks. // This is a health bar, not a benchmark — we want to know the disk // services IO, not how fast it is at p99. func runFio(ctx context.Context, device string) fioResult { runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) defer cancel() args := []string{ "--name=health", "--filename=" + device, "--rw=randrw", "--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0", "--group_reporting", "--output-format=json", "--direct=1", } cmd := exec.CommandContext(runCtx, "fio", args...) out, err := cmd.Output() if err != nil { return fioResult{Error: err.Error()} } var top struct { Jobs []struct { Read struct { IOPS float64 `json:"iops"` BW float64 `json:"bw"` } `json:"read"` Write struct { IOPS float64 `json:"iops"` BW float64 `json:"bw"` } `json:"write"` } `json:"jobs"` } if err := json.Unmarshal(out, &top); err != nil || len(top.Jobs) == 0 { return fioResult{Error: "parse fio json: " + fmt.Sprint(err)} } j := top.Jobs[0] return fioResult{ ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS, ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW, } }