Files
Vetting/agent/tests/storage.go
T
josh f79fe0f0db
CI / Lint + build + test (push) Successful in 1m26s
Release / release (push) Successful in 6m47s
ui: GitHub-Actions-style detail page, sub-steps, mini-tile run-view
Reshapes the detail page into a run-view: hybrid horizontal pipeline
+ expanded active-step pane with sub-steps, a per-step log pane with
line-numbered permalinks and client-side search, and a runs-history
sidebar that navigates via ?run=N. Default step is server-picked
(running → failed → Reporting) so the operator lands on the thing
that's moving.

Adds a sub_steps table + SSE topic (substep-{run}-{stage}-{ordinal})
so per-disk and per-pass work (SMART, CPUStress CPU/RAM, Storage,
GPU) is visible in the UI instead of buried in stage summary JSON.
Agent emits sub-step reports from existing per-iteration loops.

Dashboard tiles become a mini run-view with a 9-dot step strip so
the operator reads run health across the whole grid at a glance.
Register page gets the same card shell + button styling.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 19:00:11 -04:00

341 lines
11 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package tests
import (
"context"
"encoding/json"
"fmt"
"os/exec"
"strings"
"time"
)
// Storage is the destructive stage: a badblocks write-mode sample followed
// by a short fio random read/write job on every allowlisted disk. The fio
// read/write IOPS are handed to Deps.Sensor as measurements. Pre-gates:
//
//  1. Device allowlist: only act on /dev/<X> where the kernel-reported
//     serial matches one of Deps.ExpectedDisks. This is the operator's
//     contract for what can be written to. USB sticks and unexpected
//     drives are excluded.
//  2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
//     signatures, partition tables, or LVM metadata → fail with
//     UnexpectedData unless Deps.OverrideWipe is set.
//
// Only after those pass does the stage run `badblocks -b 4096 -c 64 -w`
// and `fio` in write mode. This matches the plan's "destructive disk
// tests are always-on, gated by layered safety."
//
// The stage fails on the first disk whose badblocks sample or fio job
// reports an error. Measurements gathered up to that point are still
// forwarded to Deps.Sensor, and the returned Outcome carries the per-disk
// results and sub-step reports collected so far.
func Storage(ctx context.Context, d Deps) Outcome {
	if len(d.ExpectedDisks) == 0 {
		d.Info("Storage: no expected disks in spec — skipping stage")
		return Outcome{
			Passed:  true,
			Summary: "skipped (no expected disks)",
			Extras:  map[string]any{"skipped": true, "reason": "no_expected_disks"},
		}
	}

	targets := resolveTargets(d.ExpectedDisks)
	if len(targets) == 0 {
		d.Error("Storage: none of the expected disks are present on this host")
		return Outcome{
			Passed:  false,
			Message: "device allowlist matched zero disks",
			Summary: "no allowed disks present",
			Extras:  map[string]any{"expected": d.ExpectedDisks},
		}
	}

	// Non-destructive runs skip the wipe probe (nothing to refuse),
	// badblocks -w, and write-mode fio. Only the disks that resolved from
	// the allowlist are reported; the per-disk map flags the shortcut so
	// the report is clear about what was (not) exercised.
	if d.NonDestructive {
		perDisk := make(map[string]any, len(targets))
		for _, t := range targets {
			perDisk[t.Device] = map[string]any{"mode": "non_destructive", "serial": t.Serial}
		}
		d.Info(fmt.Sprintf("Storage: non-destructive — verified %d disk(s) present", len(targets)))
		return Outcome{
			Passed:  true,
			Summary: fmt.Sprintf("non-destructive: read-only checks only (%d disks)", len(targets)),
			Extras:  map[string]any{"per_disk": perDisk, "non_destructive": true},
		}
	}

	// Wipe probe on every target. A single dirty disk halts the stage
	// unless the operator has set OverrideWipe via the UI.
	probes := make(map[string]wipeProbeResult, len(targets))
	var dirty []string
	for _, t := range targets {
		probe := probeWipe(ctx, t.Device)
		probes[t.Device] = probe
		if probe.HasData {
			dirty = append(dirty, t.Device)
		}
	}
	if len(dirty) > 0 && !d.OverrideWipe {
		d.Error("Storage: wipe probe found existing data on: " + strings.Join(dirty, ", "))
		return Outcome{
			Passed:  false,
			Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)",
			Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
			Extras: map[string]any{
				"wipe_probe":    probes,
				"override_hint": "click 'Override wipe & retry' in the held tile",
				"dirty_devices": dirty,
			},
		}
	}
	if d.OverrideWipe && len(dirty) > 0 {
		d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
	}

	// Per target: short badblocks write sample + fio random-read/write.
	var samples []Sample
	var subs []SubStepReport
	perDisk := make(map[string]any, len(targets))

	// flushSamples forwards whatever has been measured so far. It runs on
	// every exit path below so a failure on a later disk does not discard
	// the measurements of earlier ones.
	flushSamples := func() {
		if d.Sensor == nil || len(samples) == 0 {
			return
		}
		// Best effort: a sensor failure must not change the verdict of
		// the disk tests, so its result is deliberately ignored.
		_ = d.Sensor(ctx, samples)
	}
	// fail builds the failing Outcome with everything collected so far.
	fail := func(message, summary string) Outcome {
		flushSamples()
		return Outcome{
			Passed:   false,
			Message:  message,
			Summary:  summary,
			Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes},
			SubSteps: subs,
		}
	}

	for _, t := range targets {
		// Stop before touching the next disk once the run is cancelled;
		// otherwise the cancellation would surface as a bogus
		// "badblocks found errors" on a disk that was never tested.
		if err := ctx.Err(); err != nil {
			d.Error("Storage: aborted before testing " + t.Device + ": " + err.Error())
			return fail("storage stage aborted: "+err.Error(), "aborted before "+t.Device)
		}

		d.Info("Storage: running badblocks write sample on " + t.Device)
		bbStart := time.Now()
		bb := runBadblocks(ctx, t.Device)
		bbEnd := time.Now()
		// The result structs hold only strings, bools and finite floats
		// decoded from JSON, so Marshal is not expected to fail here.
		bbSummary, _ := json.Marshal(bb)
		subs = append(subs, SubStepReport{
			Name:        fmt.Sprintf("badblocks %s", t.Device),
			Passed:      bb.OK,
			StartedAt:   bbStart,
			CompletedAt: bbEnd,
			SummaryJSON: bbSummary,
		})

		d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
		fioStart := time.Now()
		fr := runFio(ctx, t.Device)
		fioEnd := time.Now()
		fioSummary, _ := json.Marshal(fr)
		subs = append(subs, SubStepReport{
			Name:        fmt.Sprintf("fio %s", t.Device),
			Passed:      fr.Error == "",
			StartedAt:   fioStart,
			CompletedAt: fioEnd,
			SummaryJSON: fioSummary,
		})

		perDisk[t.Device] = map[string]any{
			"badblocks": bb,
			"fio":       fr,
		}

		// Only record IOPS that fio actually measured; a failed run
		// would otherwise be persisted as a 0-IOPS measurement.
		if fr.Error == "" {
			samples = append(samples,
				Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
				Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
			)
		}

		if !bb.OK {
			d.Error("Storage: badblocks failed on " + t.Device)
			return fail("badblocks found errors on "+t.Device, "badblocks failed on "+t.Device)
		}
		// A disk that cannot complete the fio job has not shown that it
		// services IO, so the stage must not report it as passed.
		if fr.Error != "" {
			d.Error("Storage: fio failed on " + t.Device + ": " + fr.Error)
			return fail("fio failed on "+t.Device+": "+fr.Error, "fio failed on "+t.Device)
		}
	}

	flushSamples()
	d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets)))
	return Outcome{
		Passed:   true,
		Summary:  fmt.Sprintf("%d disks passed", len(targets)),
		Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes},
		SubSteps: subs,
	}
}
// diskTarget pairs an allowlisted disk serial (Serial) with the device
// path it resolved to on this host (Device).
type diskTarget struct {
	Serial, Device string
}
// resolveTargets maps expected-disk serials to the block devices reported
// by listBlockDisks, using diskSerialFromSys to read each device's serial.
//
// Matching is case-insensitive and ignores surrounding whitespace on the
// expected serial. Entries with an empty serial, or whose serial is not
// present on this host, are skipped. Each device is returned at most once
// even if the expected list names its serial several times, so a disk is
// never queued for the destructive tests twice. The Serial field of each
// result carries the expected serial exactly as it was supplied.
//
// It returns nil when the block devices cannot be listed; the signature has
// no error channel, so callers see that case as "no targets".
func resolveTargets(expected []ExpectedDisk) []diskTarget {
	disks, err := listBlockDisks()
	if err != nil {
		return nil
	}

	// Build serial → device map from the listed devices.
	serialOf := make(map[string]string, len(disks))
	for _, dev := range disks {
		name := strings.TrimPrefix(dev, "/dev/")
		if s := diskSerialFromSys(name); s != "" {
			serialOf[strings.ToLower(s)] = dev
		}
	}

	var out []diskTarget
	seen := make(map[string]struct{}, len(expected))
	for _, e := range expected {
		key := strings.ToLower(strings.TrimSpace(e.Serial))
		if key == "" {
			continue
		}
		dev, ok := serialOf[key]
		if !ok {
			continue
		}
		if _, dup := seen[dev]; dup {
			continue
		}
		seen[dev] = struct{}{}
		out = append(out, diskTarget{Serial: e.Serial, Device: dev})
	}
	return out
}
// diskSerialFromSys looks up the serial number of block device name
// (e.g. "sda", without the /dev/ prefix). It is a smaller copy of
// probes.diskSerial: importing internal/probes would cause a cycle, so the
// short lookup is duplicated here. If it drifts from the inventory probe,
// Storage fails because the serial doesn't match — which is the correct
// behavior.
//
// Lookup order: /sys/block/<name>/device/serial, then
// /sys/block/<name>/serial, then the ID_SERIAL_SHORT property reported by
// udevadm. It returns "" when none of them yields a serial.
func diskSerialFromSys(name string) string {
	sysDir := "/sys/block/" + name
	candidates := [...]string{
		sysDir + "/device/serial",
		sysDir + "/serial",
	}
	for i := range candidates {
		raw, err := readFileBytes(candidates[i])
		if err != nil {
			continue
		}
		if serial := strings.TrimSpace(string(raw)); serial != "" {
			return serial
		}
	}

	// Fall back to udevadm — ID_SERIAL_SHORT is more reliable on SCSI.
	props, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output()
	if err != nil {
		return ""
	}
	const key = "ID_SERIAL_SHORT="
	for _, prop := range strings.Split(string(props), "\n") {
		if strings.HasPrefix(prop, key) {
			return strings.TrimSpace(prop[len(key):])
		}
	}
	return ""
}
// readFileBytes returns the contents of the file at path p by delegating
// to the package's readFile helper; the helper's result and error are
// passed through unchanged.
func readFileBytes(p string) ([]byte, error) {
	data, err := readFile(p)
	return data, err
}
// ---------- wipe probe ----------

// wipeProbeResult is the outcome of probing one device for existing data.
// HasData is true when at least one finding was recorded; Findings holds
// one "blkid: …" or "wipefs: …" line per signal.
type wipeProbeResult struct {
	Device   string   `json:"device"`
	HasData  bool     `json:"has_data"`
	Findings []string `json:"findings,omitempty"`
}

// probeWipe runs `blkid -o full` and `wipefs --no-act` on device. Any
// signature reported by either tool is a "has data" signal. This is
// deliberately conservative: we'd rather halt on a bare ext4 signature
// than hand badblocks a disk with real bytes on it.
//
// The probe fails closed. If a tool cannot be run or exits abnormally, the
// device is reported as having data and the failure is recorded as a
// finding, because a probe that did not run cannot vouch that the disk is
// empty. The one exception is blkid exit status 2, which blkid uses for
// "nothing identified" and therefore is the normal result on a blank disk.
func probeWipe(ctx context.Context, device string) wipeProbeResult {
	// Exit status blkid returns when no signature could be identified.
	const blkidNothingFound = 2

	out := wipeProbeResult{Device: device}
	flag := func(finding string) {
		out.Findings = append(out.Findings, finding)
		out.HasData = true
	}

	b, err := exec.CommandContext(ctx, "blkid", "-o", "full", device).Output()
	if err != nil {
		ee, exited := err.(*exec.ExitError)
		if !exited || ee.ExitCode() != blkidNothingFound {
			flag("blkid: probe failed: " + err.Error())
		}
	} else if s := strings.TrimSpace(string(b)); s != "" {
		flag("blkid: " + s)
	}

	b, err = exec.CommandContext(ctx, "wipefs", "--no-act", device).Output()
	if err != nil {
		flag("wipefs: probe failed: " + err.Error())
		return out
	}
	// wipefs may print a header line even on a clean disk; keep only
	// lines with actual signature data.
	for _, line := range strings.Split(string(b), "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "DEVICE") || strings.HasPrefix(line, "offset") {
			continue
		}
		flag("wipefs: " + line)
	}
	return out
}
// ---------- badblocks ----------

// badblocksResult is the JSON-serialisable outcome of one runBadblocks
// call.
//
//   - OK is set only when the badblocks command exited without error and
//     produced no output at all.
//   - Elapsed is the wall-clock duration of the run, rounded to the second
//     and formatted with time.Duration.String.
//   - Error is the command's error text, or "bad blocks found" when the
//     command succeeded but printed output; it is omitted from JSON when
//     empty.
//   - OutputTail is tailLines applied to the combined stdout/stderr with an
//     argument of 10 (presumably the last 10 lines — tailLines is defined
//     elsewhere); it is omitted from JSON when empty.
type badblocksResult struct {
OK bool `json:"ok"`
Elapsed string `json:"elapsed"`
Error string `json:"error,omitempty"`
OutputTail string `json:"output_tail,omitempty"`
}
// runBadblocks runs a destructive badblocks write sample against device and
// reports whether it came back clean.
//
// Flags: -b 4096 block size, -c 64 blocks per check, -w destructive write,
// -t random pattern. The trailing "65536" limits the run to roughly the
// first 256 MiB (65536 × 4 KiB blocks) so the stage stays bounded; a real
// burn-in of the whole disk belongs in a separate "deep" stage. The command
// is additionally capped at five minutes via a context timeout.
//
// The result is OK only when the command exits without error and prints
// nothing on stdout or stderr; any output is treated as bad blocks.
func runBadblocks(ctx context.Context, device string) badblocksResult {
	began := time.Now()

	bounded, stop := context.WithTimeout(ctx, 5*time.Minute)
	defer stop()

	raw, runErr := exec.CommandContext(bounded, "badblocks",
		"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536",
	).CombinedOutput()
	text := string(raw)

	res := badblocksResult{
		Elapsed:    time.Since(began).Round(time.Second).String(),
		OutputTail: tailLines(text, 10),
	}
	switch {
	case runErr != nil:
		res.Error = runErr.Error()
	case strings.TrimSpace(text) != "":
		// badblocks lists each bad block it finds; empty output = clean.
		res.Error = "bad blocks found"
	default:
		res.OK = true
	}
	return res
}
// ---------- fio ----------

// fioResult is the JSON-serialisable outcome of one runFio call. The IOPS
// and bandwidth fields are copied from the "iops" and "bw" values of the
// first job in fio's JSON report; Error is non-empty when fio could not be
// run or its report could not be used, in which case the numeric fields
// are zero.
type fioResult struct {
	ReadIOPS    float64 `json:"read_iops"`
	WriteIOPS   float64 `json:"write_iops"`
	ReadBWKBps  float64 `json:"read_bw_kbps"`
	WriteBWKBps float64 `json:"write_bw_kbps"`
	Error       string  `json:"error,omitempty"`
}

// runFio kicks off a tiny random-rw job against device: 2 jobs × 64MB ×
// 4k blocks, direct IO, group-reported as JSON. This is a health bar, not
// a benchmark — we want to know the disk services IO, not how fast it is
// at p99. The command is capped at five minutes via a context timeout.
//
// On failure the returned Error explains what went wrong: the exec error
// (plus the last line fio wrote to stderr, when there is one), a JSON
// decoding error, or the fact that the report contained no jobs.
func runFio(ctx context.Context, device string) fioResult {
	runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
	defer cancel()

	args := []string{
		"--name=health", "--filename=" + device, "--rw=randrw",
		"--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0",
		"--group_reporting", "--output-format=json", "--direct=1",
	}
	out, err := exec.CommandContext(runCtx, "fio", args...).Output()
	if err != nil {
		msg := err.Error()
		// Output() stores stderr on ExitError; "exit status 1" alone
		// tells the operator nothing, so append fio's own last words.
		if ee, ok := err.(*exec.ExitError); ok {
			if detail := strings.TrimSpace(string(ee.Stderr)); detail != "" {
				if i := strings.LastIndexByte(detail, '\n'); i >= 0 {
					detail = strings.TrimSpace(detail[i+1:])
				}
				msg += ": " + detail
			}
		}
		return fioResult{Error: msg}
	}

	var top struct {
		Jobs []struct {
			Read struct {
				IOPS float64 `json:"iops"`
				BW   float64 `json:"bw"`
			} `json:"read"`
			Write struct {
				IOPS float64 `json:"iops"`
				BW   float64 `json:"bw"`
			} `json:"write"`
		} `json:"jobs"`
	}
	if err := json.Unmarshal(out, &top); err != nil {
		return fioResult{Error: "parse fio json: " + err.Error()}
	}
	if len(top.Jobs) == 0 {
		return fioResult{Error: "parse fio json: no jobs in output"}
	}

	// With --group_reporting the first entry carries the aggregate.
	j := top.Jobs[0]
	return fioResult{
		ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS,
		ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW,
	}
}