f79fe0f0db
Reshapes the detail page into a run-view: hybrid horizontal pipeline
+ expanded active-step pane with sub-steps, a per-step log pane with
line-numbered permalinks and client-side search, and a runs-history
sidebar that navigates via ?run=N. Default step is server-picked
(running → failed → Reporting) so the operator lands on the thing
that's moving.
Adds a sub_steps table + SSE topic (substep-{run}-{stage}-{ordinal})
so per-disk and per-pass work (SMART, CPUStress CPU/RAM, Storage,
GPU) is visible in the UI instead of buried in stage summary JSON.
Agent emits sub-step reports from existing per-iteration loops.
Dashboard tiles become a mini run-view with a 9-dot step strip so
the operator reads run health across the whole grid at a glance.
Register page gets the same card shell + button styling.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
341 lines
11 KiB
Go
341 lines
11 KiB
Go
package tests
|
||
|
||
import (
|
||
"context"
|
||
"encoding/json"
|
||
"fmt"
|
||
"os/exec"
|
||
"strings"
|
||
"time"
|
||
)
|
||
|
||
// Storage is the destructive stage: badblocks (write-mode sample) + fio
|
||
// random IO, persisting IOPS + latency as measurements. Pre-gates:
|
||
//
|
||
// 1. Device allowlist: only act on /dev/<X> where the kernel-reported
|
||
// serial matches one of Deps.ExpectedDisks. This is the operator's
|
||
// contract for what can be written to. USB sticks and unexpected
|
||
// drives are excluded.
|
||
// 2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
|
||
// signatures, partition tables, or LVM metadata → fail with
|
||
// UnexpectedData unless Deps.OverrideWipe is set.
|
||
//
|
||
// Only after those pass does the stage run `badblocks -b 4096 -c 64 -w`
|
||
// and `fio` in write mode. This matches the plan's "destructive disk
|
||
// tests are always-on, gated by layered safety."
|
||
func Storage(ctx context.Context, d Deps) Outcome {
|
||
if len(d.ExpectedDisks) == 0 {
|
||
d.Info("Storage: no expected disks in spec — skipping stage")
|
||
return Outcome{
|
||
Passed: true,
|
||
Summary: "skipped (no expected disks)",
|
||
Extras: map[string]any{"skipped": true, "reason": "no_expected_disks"},
|
||
}
|
||
}
|
||
|
||
targets := resolveTargets(d.ExpectedDisks)
|
||
if len(targets) == 0 {
|
||
d.Error("Storage: none of the expected disks are present on this host")
|
||
return Outcome{
|
||
Passed: false,
|
||
Message: "device allowlist matched zero disks",
|
||
Summary: "no allowed disks present",
|
||
Extras: map[string]any{"expected": d.ExpectedDisks},
|
||
}
|
||
}
|
||
|
||
// Non-destructive runs skip wipe-probe (nothing to refuse), badblocks
|
||
// -w, and write-mode fio. Every expected disk is still asserted
|
||
// present + readable by listing /sys/block and reading SMART-accessible
|
||
// identity; the per-disk map flags the shortcut so the report is clear.
|
||
if d.NonDestructive {
|
||
perDisk := map[string]any{}
|
||
for _, t := range targets {
|
||
perDisk[t.Device] = map[string]any{"mode": "non_destructive", "serial": t.Serial}
|
||
}
|
||
d.Info(fmt.Sprintf("Storage: non-destructive — verified %d disk(s) present", len(targets)))
|
||
return Outcome{
|
||
Passed: true,
|
||
Summary: fmt.Sprintf("non-destructive: read-only checks only (%d disks)", len(targets)),
|
||
Extras: map[string]any{"per_disk": perDisk, "non_destructive": true},
|
||
}
|
||
}
|
||
|
||
// Wipe probe on every target. A single dirty disk halts the stage
|
||
// unless the operator has set OverrideWipe via the UI.
|
||
probes := map[string]wipeProbeResult{}
|
||
dirty := []string{}
|
||
for _, t := range targets {
|
||
probe := probeWipe(ctx, t.Device)
|
||
probes[t.Device] = probe
|
||
if probe.HasData {
|
||
dirty = append(dirty, t.Device)
|
||
}
|
||
}
|
||
if len(dirty) > 0 && !d.OverrideWipe {
|
||
d.Error("Storage: wipe probe found existing data on: " + strings.Join(dirty, ", "))
|
||
return Outcome{
|
||
Passed: false,
|
||
Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)",
|
||
Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
|
||
Extras: map[string]any{
|
||
"wipe_probe": probes,
|
||
"override_hint": "click 'Override wipe & retry' in the held tile",
|
||
"dirty_devices": dirty,
|
||
},
|
||
}
|
||
}
|
||
if d.OverrideWipe && len(dirty) > 0 {
|
||
d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
|
||
}
|
||
|
||
// Per target: short badblocks write sample + fio random-read/write.
|
||
var samples []Sample
|
||
var subs []SubStepReport
|
||
perDisk := map[string]any{}
|
||
for _, t := range targets {
|
||
d.Info("Storage: running badblocks write sample on " + t.Device)
|
||
bbStart := time.Now()
|
||
bb := runBadblocks(ctx, t.Device)
|
||
bbEnd := time.Now()
|
||
bbSummary, _ := json.Marshal(bb)
|
||
subs = append(subs, SubStepReport{
|
||
Name: fmt.Sprintf("badblocks %s", t.Device),
|
||
Passed: bb.OK,
|
||
StartedAt: bbStart,
|
||
CompletedAt: bbEnd,
|
||
SummaryJSON: bbSummary,
|
||
})
|
||
|
||
d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
|
||
fioStart := time.Now()
|
||
fr := runFio(ctx, t.Device)
|
||
fioEnd := time.Now()
|
||
fioSummary, _ := json.Marshal(fr)
|
||
subs = append(subs, SubStepReport{
|
||
Name: fmt.Sprintf("fio %s", t.Device),
|
||
Passed: fr.Error == "",
|
||
StartedAt: fioStart,
|
||
CompletedAt: fioEnd,
|
||
SummaryJSON: fioSummary,
|
||
})
|
||
|
||
perDisk[t.Device] = map[string]any{
|
||
"badblocks": bb,
|
||
"fio": fr,
|
||
}
|
||
samples = append(samples,
|
||
Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
|
||
Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
|
||
)
|
||
if !bb.OK {
|
||
return Outcome{
|
||
Passed: false,
|
||
Message: "badblocks found errors on " + t.Device,
|
||
Summary: "badblocks failed on " + t.Device,
|
||
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes},
|
||
SubSteps: subs,
|
||
}
|
||
}
|
||
}
|
||
if d.Sensor != nil {
|
||
_ = d.Sensor(ctx, samples)
|
||
}
|
||
|
||
d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets)))
|
||
return Outcome{
|
||
Passed: true,
|
||
Summary: fmt.Sprintf("%d disks passed", len(targets)),
|
||
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes},
|
||
SubSteps: subs,
|
||
}
|
||
}
|
||
|
||
// diskTarget pairs an expected disk's serial with the device it resolved
// to on this host.
type diskTarget struct {
	// Serial is the serial number taken from the expected-disk entry.
	Serial string
	// Device is the matching entry returned by listBlockDisks.
	Device string
}
|
||
|
||
// resolveTargets maps expected-disk serials to /dev/<X> paths by reading
|
||
// /sys/block. Uses the same mechanism as probes.inventory to avoid drift.
|
||
func resolveTargets(expected []ExpectedDisk) []diskTarget {
|
||
disks, err := listBlockDisks()
|
||
if err != nil {
|
||
return nil
|
||
}
|
||
// Build serial → device map from /sys.
|
||
serialOf := map[string]string{}
|
||
for _, dev := range disks {
|
||
name := strings.TrimPrefix(dev, "/dev/")
|
||
s := diskSerialFromSys(name)
|
||
if s != "" {
|
||
serialOf[strings.ToLower(s)] = dev
|
||
}
|
||
}
|
||
var out []diskTarget
|
||
for _, e := range expected {
|
||
if e.Serial == "" {
|
||
continue
|
||
}
|
||
if dev, ok := serialOf[strings.ToLower(e.Serial)]; ok {
|
||
out = append(out, diskTarget{Serial: e.Serial, Device: dev})
|
||
}
|
||
}
|
||
return out
|
||
}
|
||
|
||
// diskSerialFromSys is a smaller copy of probes.diskSerial; imported
|
||
// from internal/probes would cause a cycle so we duplicate the short
|
||
// lookup. If it drifts from the inventory probe, Storage fails because
|
||
// the serial doesn't match — which is the correct behavior.
|
||
func diskSerialFromSys(name string) string {
|
||
for _, rel := range []string{
|
||
"/sys/block/" + name + "/device/serial",
|
||
"/sys/block/" + name + "/serial",
|
||
} {
|
||
b, err := readFileBytes(rel)
|
||
if err != nil {
|
||
continue
|
||
}
|
||
s := strings.TrimSpace(string(b))
|
||
if s != "" {
|
||
return s
|
||
}
|
||
}
|
||
// Fall back to udevadm — ID_SERIAL_SHORT is more reliable on SCSI.
|
||
out, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output()
|
||
if err != nil {
|
||
return ""
|
||
}
|
||
for _, line := range strings.Split(string(out), "\n") {
|
||
if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok {
|
||
return strings.TrimSpace(v)
|
||
}
|
||
}
|
||
return ""
|
||
}
|
||
|
||
func readFileBytes(p string) ([]byte, error) {
|
||
return readFile(p)
|
||
}
|
||
|
||
// ---------- wipe probe ----------
|
||
|
||
// wipeProbeResult is the per-device outcome of the pre-write wipe probe.
type wipeProbeResult struct {
	// Device is the path that was probed.
	Device string `json:"device"`
	// HasData is true when the device must not be written without an
	// operator override: a signature was reported, or a probe could not
	// be completed.
	HasData bool `json:"has_data"`
	// Findings holds one "blkid: …" / "wipefs: …" entry per reason.
	Findings []string `json:"findings,omitempty"`
}

// probeWipe runs `blkid -o full` and `wipefs --no-act` against device.
// Any blkid output, or any wipefs line other than its header, is a "has
// data" signal. This is deliberately conservative: we'd rather halt on a
// bare ext4 signature than hand badblocks a disk with real bytes on it.
//
// The probe fails closed. If either tool cannot be run or exits
// abnormally (binary missing, permission denied, ctx cancelled), the
// device is flagged with a "probe failed" finding rather than being
// treated as clean. The single exception is blkid exit status 2, which
// blkid uses when it has nothing to report for the device.
func probeWipe(ctx context.Context, device string) wipeProbeResult {
	out := wipeProbeResult{Device: device}

	blkidOut, err := exec.CommandContext(ctx, "blkid", "-o", "full", device).Output()
	if err != nil {
		// Exit status 2 is the normal "no signature found" answer; every
		// other failure means we learned nothing about the disk.
		if ee, ok := err.(*exec.ExitError); !ok || ee.ExitCode() != 2 {
			out.Findings = append(out.Findings, "blkid: probe failed: "+err.Error())
			out.HasData = true
		}
	} else if s := strings.TrimSpace(string(blkidOut)); s != "" {
		out.Findings = append(out.Findings, "blkid: "+s)
		out.HasData = true
	}

	wipefsOut, err := exec.CommandContext(ctx, "wipefs", "--no-act", device).Output()
	if err != nil {
		out.Findings = append(out.Findings, "wipefs: probe failed: "+err.Error())
		out.HasData = true
		return out
	}
	// Skip blank lines and the column header wipefs prints; keep only
	// lines with actual signature data.
	for _, line := range strings.Split(strings.TrimSpace(string(wipefsOut)), "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "DEVICE") || strings.HasPrefix(line, "offset") {
			continue
		}
		out.Findings = append(out.Findings, "wipefs: "+line)
		out.HasData = true
	}
	return out
}
|
||
|
||
// ---------- badblocks ----------
|
||
|
||
// badblocksResult is the JSON-serialisable outcome of one runBadblocks
// call.
type badblocksResult struct {
	// OK is set only when the command succeeded and produced no output.
	OK bool `json:"ok"`
	// Elapsed is the wall-clock run time, rounded to whole seconds.
	Elapsed string `json:"elapsed"`
	// Error is the command error, or "bad blocks found"; empty when OK.
	Error string `json:"error,omitempty"`
	// OutputTail is tailLines(output, 10) of the combined stdout/stderr.
	OutputTail string `json:"output_tail,omitempty"`
}
|
||
|
||
func runBadblocks(ctx context.Context, device string) badblocksResult {
|
||
// -c 64 blocks per check, -w destructive write, -b 4096 block size,
|
||
// -t pattern. We only sample 256MiB (65536 × 4k) so the stage stays
|
||
// bounded. A real burn-in would run the whole disk; that belongs in
|
||
// a separate "deep" stage.
|
||
args := []string{"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536"}
|
||
start := time.Now()
|
||
runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
|
||
defer cancel()
|
||
cmd := exec.CommandContext(runCtx, "badblocks", args...)
|
||
out, err := cmd.CombinedOutput()
|
||
r := badblocksResult{Elapsed: time.Since(start).Round(time.Second).String(), OutputTail: tailLines(string(out), 10)}
|
||
if err != nil {
|
||
r.Error = err.Error()
|
||
return r
|
||
}
|
||
// badblocks prints each bad block to stdout. Empty output = clean.
|
||
if strings.TrimSpace(string(out)) == "" {
|
||
r.OK = true
|
||
} else {
|
||
r.Error = "bad blocks found"
|
||
}
|
||
return r
|
||
}
|
||
|
||
// ---------- fio ----------
|
||
|
||
// fioResult is the JSON-serialisable outcome of one runFio call. The
// numeric fields are copied from the first job of fio's JSON report; a
// non-empty Error means the run or the parse failed and the numbers are
// zero.
type fioResult struct {
	ReadIOPS  float64 `json:"read_iops"`
	WriteIOPS float64 `json:"write_iops"`
	// ReadBWKBps and WriteBWKBps carry fio's "bw" values unchanged.
	ReadBWKBps  float64 `json:"read_bw_kbps"`
	WriteBWKBps float64 `json:"write_bw_kbps"`
	Error       string  `json:"error,omitempty"`
}

// runFio kicks off a tiny random-rw job: 2 jobs × 64MB × 4k blocks.
// This is a health bar, not a benchmark — we want to know the disk
// services IO, not how fast it is at p99.
//
// The run is limited to 5 minutes on top of whatever deadline ctx carries.
// On failure the returned Error carries the command error plus the tail of
// fio's stderr when there is any, or a description of why the JSON report
// could not be used.
func runFio(ctx context.Context, device string) fioResult {
	runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
	defer cancel()
	args := []string{
		"--name=health", "--filename=" + device, "--rw=randrw",
		"--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0",
		"--group_reporting", "--output-format=json", "--direct=1",
	}
	out, err := exec.CommandContext(runCtx, "fio", args...).Output()
	if err != nil {
		msg := err.Error()
		// Output captures stderr on ExitError; "exit status 1" alone does
		// not tell the operator what fio objected to.
		if ee, ok := err.(*exec.ExitError); ok {
			const maxStderr = 512
			stderr := strings.TrimSpace(string(ee.Stderr))
			if len(stderr) > maxStderr {
				stderr = stderr[len(stderr)-maxStderr:]
			}
			if stderr != "" {
				msg += ": " + stderr
			}
		}
		return fioResult{Error: msg}
	}

	var top struct {
		Jobs []struct {
			Read struct {
				IOPS float64 `json:"iops"`
				BW   float64 `json:"bw"`
			} `json:"read"`
			Write struct {
				IOPS float64 `json:"iops"`
				BW   float64 `json:"bw"`
			} `json:"write"`
		} `json:"jobs"`
	}
	if err := json.Unmarshal(out, &top); err != nil {
		return fioResult{Error: "parse fio json: " + err.Error()}
	}
	if len(top.Jobs) == 0 {
		return fioResult{Error: "parse fio json: no jobs in output"}
	}
	// --group_reporting folds both jobs into a single entry.
	j := top.Jobs[0]
	return fioResult{
		ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS,
		ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW,
	}
}
|