Files
josh 23c689aa5b
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled
deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00

554 lines
18 KiB
Go

package tests
import (
"context"
"encoding/json"
"fmt"
"os/exec"
"strconv"
"strings"
"time"
)
// Storage is the destructive stage. Phase 2 replaced the old
// badblocks + 128 MiB fio combo with a single fio run per disk that
// writes, verifies md5 of what it wrote, and reports p99 latency.
// Modes:
//
//   - fio_sample (quick): bounded 1 GiB write per disk, ~3 min runtime.
//   - full_disk (deep/soak): writes the whole device, time-bounded by
//     the fio_time knob (2 h deep, 6 h soak).
//
// Pre-gates kept from Phase 1:
//
//  1. Device allowlist: only act on /dev/<X> where the kernel-reported
//     serial matches one of Deps.ExpectedDisks. USB sticks and unexpected
//     drives are excluded.
//  2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
//     signature, partition table, or LVM metadata → fail with
//     UnexpectedData unless Deps.OverrideWipe is set.
//
// After fio, the stage captures a SMART diff (start snapshot taken
// before any writes; end snapshot after all writes finish) and posts
// deltas on attributes like Reallocated_Sector_Ct and Current_Pending_Sector.
// The threshold evaluator isn't seeded to gate smart_delta out of the
// box — those samples are diagnostic for the report. Fio's p99 latency
// posts as fio_p99_us so the per-stage Storage warning threshold can
// fire on a latency cliff.
//
// Outcomes, in the order they are decided:
//
//   - no expected disks        → Passed, summary "skipped (no expected disks)"
//   - allowlist matched nothing → failed, "device allowlist matched zero disks"
//   - d.NonDestructive         → Passed after listing the resolved disks
//   - dirty disk, no override  → failed with an "UnexpectedData: ..." message
//   - any fio error            → failed, naming the first failing device
//   - otherwise                → Passed
func Storage(ctx context.Context, d Deps) Outcome {
// Nothing in the spec to test: report the stage as passed-but-skipped.
if len(d.ExpectedDisks) == 0 {
d.Info("Storage: no expected disks in spec — skipping stage")
return Outcome{
Passed: true,
Summary: "skipped (no expected disks)",
Extras: map[string]any{"skipped": true, "reason": "no_expected_disks"},
}
}
// Allowlist gate: only disks whose serial resolves to a device are
// ever touched below.
targets := resolveTargets(d.ExpectedDisks)
if len(targets) == 0 {
d.Error("Storage: none of the expected disks are present on this host")
return Outcome{
Passed: false,
Message: "device allowlist matched zero disks",
Summary: "no allowed disks present",
Extras: map[string]any{"expected": d.ExpectedDisks},
}
}
// Non-destructive runs skip wipe-probe (nothing to refuse), fio
// writes, and SMART delta (nothing changed so no delta to report).
// The disks that resolved are listed in the outcome.
// NOTE(review): only the zero-match case above fails the stage. If
// some but not all expected disks resolve, the run continues with the
// subset that was found — confirm a vanished drive is caught by
// another stage, or compare len(targets) against the spec here.
if d.NonDestructive {
perDisk := map[string]any{}
for _, t := range targets {
perDisk[t.Device] = map[string]any{"mode": "non_destructive", "serial": t.Serial}
}
d.Info(fmt.Sprintf("Storage: non-destructive — verified %d disk(s) present", len(targets)))
return Outcome{
Passed: true,
Summary: fmt.Sprintf("non-destructive: read-only checks only (%d disks)", len(targets)),
Extras: map[string]any{"per_disk": perDisk, "non_destructive": true},
}
}
// Wipe probe on every target. A single dirty disk halts the stage
// unless the operator has set OverrideWipe via the UI.
// Every target is probed before deciding, so the failure outcome can
// list all dirty devices at once.
probes := map[string]wipeProbeResult{}
dirty := []string{}
for _, t := range targets {
probe := probeWipe(ctx, t.Device)
probes[t.Device] = probe
if probe.HasData {
dirty = append(dirty, t.Device)
}
}
if len(dirty) > 0 && !d.OverrideWipe {
d.Error("Storage: wipe probe found existing data on: " + strings.Join(dirty, ", "))
return Outcome{
Passed: false,
Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)",
Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
Extras: map[string]any{
"wipe_probe": probes,
"override_hint": "click 'Override wipe & retry' in the held tile",
"dirty_devices": dirty,
},
}
}
// Reaching this point with dirty disks means the override is set;
// leave a warning in the log so the destructive write is traceable.
if d.OverrideWipe && len(dirty) > 0 {
d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
}
// Capture start-of-stage SMART attributes before we write anything
// so the delta is attributable to *this* stage's writes and not the
// host's prior history. Per-disk failures are tolerated (e.g. the
// device doesn't expose SMART); we just can't emit a delta for it.
startSMART := captureSMARTAttrs(ctx, targets)
// Note: the local variable below shadows the fioOpts type name for
// the rest of this function.
fioOpts := resolveFioOpts(d.StorageKnobs)
d.Info(fmt.Sprintf("Storage: fio mode=%s size=%s runtime=%s bs=%s rw=%s verify=%s",
fioOpts.Mode, fioOpts.Size, fioOpts.Runtime, fioOpts.BS, fioOpts.RW, fioOpts.Verify))
var samples []Sample
var subs []SubStepReport
perDisk := map[string]any{}
// failed holds the first device whose fio run reported an error;
// the remaining disks are still exercised.
failed := ""
// Disks are exercised one after another, not in parallel.
for _, t := range targets {
d.Info(fmt.Sprintf("Storage: running fio %s on %s", fioOpts.Mode, t.Device))
fioStart := time.Now()
fr := runFioVerify(ctx, t.Device, fioOpts)
fioEnd := time.Now()
// The Marshal error is ignored; SummaryJSON carries whatever
// Marshal returned.
fioSummary, _ := json.Marshal(fr)
subs = append(subs, SubStepReport{
Name: fmt.Sprintf("fio %s %s", fioOpts.Mode, t.Device),
Passed: fr.Error == "",
StartedAt: fioStart,
CompletedAt: fioEnd,
SummaryJSON: fioSummary,
})
perDisk[t.Device] = map[string]any{"fio": fr}
if fr.Error == "" {
samples = append(samples,
Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
)
// p99 samples are only posted when fio reported a positive value.
if fr.ReadP99Us > 0 {
samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
}
if fr.WriteP99Us > 0 {
samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
}
} else if failed == "" {
failed = t.Device
}
}
// End-of-stage SMART snapshot + diff. We capture whether or not fio
// succeeded — a mid-run failure still produces attributable deltas,
// which is often more interesting than the stage outcome itself.
endSMART := captureSMARTAttrs(ctx, targets)
deltas := diffSMARTAttrs(startSMART, endSMART)
// Map iteration order is unspecified, so smart_delta samples are
// appended in no particular order.
for dev, attrs := range deltas {
for attr, delta := range attrs {
samples = append(samples, Sample{Kind: "smart_delta", Key: dev + "/" + attr, Value: delta, Unit: "count"})
}
}
// Sensor's return value is discarded, so a failed sample post does
// not change the stage outcome.
if d.Sensor != nil && len(samples) > 0 {
_ = d.Sensor(ctx, samples)
}
if failed != "" {
return Outcome{
Passed: false,
Message: "fio verify failed on " + failed,
Summary: "fio failed on " + failed,
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
SubSteps: subs,
}
}
d.Info(fmt.Sprintf("Storage: %d disk(s) passed fio --verify", len(targets)))
return Outcome{
Passed: true,
Summary: fmt.Sprintf("%d disks passed (%s)", len(targets), fioOpts.Mode),
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
SubSteps: subs,
}
}
// diskTarget pairs an expected disk's serial with the block device it
// resolved to on this host. Values are produced by resolveTargets.
type diskTarget struct {
// Serial is the serial taken from the expected-disk spec entry.
Serial string
// Device is the device path as returned by listBlockDisks
// (presumably of the form /dev/<name>).
Device string
}
// resolveTargets maps expected-disk serials to /dev/<X> paths by reading
// /sys/block. Uses the same mechanism as probes.inventory to avoid drift.
//
// Matching is case-insensitive and ignores surrounding whitespace in the
// expected serial. Entries with an empty serial, and entries whose serial
// is not present on the host, are skipped. Each device appears at most
// once in the result even when the spec lists the same serial twice, so
// the destructive stage never runs against one disk twice. Targets are
// returned in spec order.
//
// A failure to enumerate block devices yields nil, which callers treat
// the same as "no expected disk present".
func resolveTargets(expected []ExpectedDisk) []diskTarget {
	disks, err := listBlockDisks()
	if err != nil {
		return nil
	}

	// Build serial → device map from /sys.
	serialOf := make(map[string]string, len(disks))
	for _, dev := range disks {
		name := strings.TrimPrefix(dev, "/dev/")
		if s := diskSerialFromSys(name); s != "" {
			serialOf[strings.ToLower(s)] = dev
		}
	}

	var out []diskTarget
	seen := make(map[string]bool, len(expected))
	for _, e := range expected {
		// Kernel-reported serials are whitespace-trimmed, so a padded
		// spec value could never match without trimming here too.
		serial := strings.TrimSpace(e.Serial)
		if serial == "" {
			continue
		}
		dev, ok := serialOf[strings.ToLower(serial)]
		if !ok || seen[dev] {
			continue
		}
		seen[dev] = true
		out = append(out, diskTarget{Serial: serial, Device: dev})
	}
	return out
}
// diskSerialFromSys returns the serial number of block device name
// (e.g. "sda"), or "" when none can be determined.
//
// It is a trimmed-down copy of probes.diskSerial: importing
// internal/probes from here would create a cycle, so the short lookup
// is duplicated. Should the two ever drift, Storage fails because the
// serial no longer matches — which is the correct behavior.
//
// Lookup order: the sysfs serial files first, then udevadm's
// ID_SERIAL_SHORT property.
func diskSerialFromSys(name string) string {
	base := "/sys/block/" + name
	candidates := [...]string{
		base + "/device/serial",
		base + "/serial",
	}
	for i := range candidates {
		raw, err := readFileBytes(candidates[i])
		if err == nil {
			if serial := strings.TrimSpace(string(raw)); serial != "" {
				return serial
			}
		}
	}

	// Fall back to udevadm — ID_SERIAL_SHORT is more reliable on SCSI.
	props, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output()
	if err != nil {
		return ""
	}
	const key = "ID_SERIAL_SHORT="
	for _, line := range strings.Split(string(props), "\n") {
		if strings.HasPrefix(line, key) {
			// The first matching property wins, even if its value is blank.
			return strings.TrimSpace(line[len(key):])
		}
	}
	return ""
}
// readFileBytes returns the contents of the file at path p. It is a
// thin pass-through to readFile, which is not defined in this section
// of the file; the result and error are returned unchanged.
func readFileBytes(p string) ([]byte, error) {
return readFile(p)
}
// ---------- wipe probe ----------
// wipeProbeResult is the per-device verdict of probeWipe. It is
// serialized into the stage outcome under the "wipe_probe" key.
type wipeProbeResult struct {
// Device is the device path that was probed.
Device string `json:"device"`
// HasData is true when any probe tool reported a finding.
HasData bool `json:"has_data"`
// Findings holds one entry per reported signature, prefixed with
// the tool that produced it ("blkid: " or "wipefs: ").
Findings []string `json:"findings,omitempty"`
}
// probeWipe runs blkid + wipefs -n against device and reports whether
// the disk appears to hold data. Any signature reported by either tool
// is a "has data" signal. This is deliberately conservative: we'd
// rather halt on a bare ext4 signature than hand fio a disk with real
// bytes on it.
//
// The probe fails closed. If a tool cannot be run or exits with an
// unexpected status, the disk cannot be shown to be clean, so the
// failure is recorded as a finding and HasData is set; the operator
// override remains the way past it. The one tolerated failure is blkid
// exiting with status 2, which is what it returns when it identifies
// nothing on the device — the normal result for a blank disk.
func probeWipe(ctx context.Context, device string) wipeProbeResult {
	// blkidNothingFound is blkid's documented "nothing identified" status.
	const blkidNothingFound = 2

	out := wipeProbeResult{Device: device}

	b, err := exec.CommandContext(ctx, "blkid", "-o", "full", device).Output()
	if err == nil {
		if s := strings.TrimSpace(string(b)); s != "" {
			out.Findings = append(out.Findings, "blkid: "+s)
			out.HasData = true
		}
	} else if ee, ok := err.(*exec.ExitError); !ok || ee.ExitCode() != blkidNothingFound {
		out.Findings = append(out.Findings, "blkid: probe failed: "+err.Error())
		out.HasData = true
	}

	b, err = exec.CommandContext(ctx, "wipefs", "--no-act", device).Output()
	if err != nil {
		out.Findings = append(out.Findings, "wipefs: probe failed: "+err.Error())
		out.HasData = true
		return out
	}
	// wipefs prints a header line even on a clean disk; keep only
	// lines with actual signature data.
	for _, line := range strings.Split(strings.TrimSpace(string(b)), "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "DEVICE") || strings.HasPrefix(line, "offset") {
			continue
		}
		out.Findings = append(out.Findings, "wipefs: "+line)
		out.HasData = true
	}
	return out
}
// ---------- fio ----------
// fioOpts holds the concrete flag values fio needs, as resolved from
// the probe knobs by resolveFioOpts. Defaults match the quick profile's
// fio_sample shape so callers with zero knobs still run something
// bounded. The struct is also serialized into the stage outcome under
// "fio_opts"; Runtime is a time.Duration, which encoding/json writes as
// an integer number of nanoseconds.
type fioOpts struct {
Mode string `json:"mode"` // "fio_sample" | "full_disk"
Size string `json:"size"` // "1GiB"; only used for fio_sample
Runtime time.Duration `json:"runtime"` // bounding time
BS string `json:"bs"` // "4k"
RW string `json:"rw"` // "randrw"
Verify string `json:"verify"` // "md5" | ""
}
// resolveFioOpts turns the stage knobs into a runnable fio
// configuration. Any knob left at its zero value takes the quick
// profile default (fio_sample, 1GiB, 4k, randrw, md5, 3 minutes), so a
// stage that is missing its knobs still behaves coherently — safer
// than refusing to run.
func resolveFioOpts(k StorageKnobs) fioOpts {
	// A zero or negative time knob means "not configured".
	limit := k.FioTime
	if limit <= 0 {
		limit = 3 * time.Minute
	}

	var resolved fioOpts
	resolved.Mode = firstNonEmpty(k.Mode, "fio_sample")
	resolved.Size = firstNonEmpty(k.FioSize, "1GiB")
	resolved.Runtime = limit
	resolved.BS = firstNonEmpty(k.FioBS, "4k")
	resolved.RW = firstNonEmpty(k.FioRW, "randrw")
	resolved.Verify = firstNonEmpty(k.Verify, "md5")
	return resolved
}
// firstNonEmpty returns the first argument that is not the empty
// string, scanning left to right. It returns "" when every argument is
// empty or when called with no arguments.
func firstNonEmpty(vs ...string) string {
	for i := 0; i < len(vs); i++ {
		if candidate := vs[i]; len(candidate) > 0 {
			return candidate
		}
	}
	return ""
}
// fioResult is the per-disk summary of one fio run. It is produced by
// runFioVerify (with the numeric fields filled in from parseFioJSON)
// and serialized into the stage outcome and the sub-step report.
type fioResult struct {
// Mode echoes the fioOpts.Mode the run was started with.
Mode string `json:"mode"`
// IOPS and bandwidth are copied from the first job in fio's JSON
// output ("iops" and "bw" fields).
ReadIOPS float64 `json:"read_iops"`
WriteIOPS float64 `json:"write_iops"`
ReadBWKBps float64 `json:"read_bw_kbps"`
WriteBWKBps float64 `json:"write_bw_kbps"`
// 99th-percentile completion latency in microseconds; left at zero
// (and omitted from JSON) when fio reported no positive value.
ReadP99Us float64 `json:"read_p99_us,omitempty"`
WriteP99Us float64 `json:"write_p99_us,omitempty"`
// Error is empty on success; callers treat any non-empty value as
// a failed run.
Error string `json:"error,omitempty"`
// OutputTail is the last lines of fio's stdout, kept for diagnostics.
OutputTail string `json:"output_tail,omitempty"`
}
// runFioVerify invokes fio with md5-verify semantics. fio_sample mode
// caps the IO at opts.Size; full_disk drives the whole device bounded
// by runtime. Both use direct IO to bypass the page cache — we want
// real disk latency, not Linux' cheerful buffer.
//
// The run is bounded by opts.Runtime plus a 30 s grace period; when
// that (or the parent context) expires, fio is killed and the context
// error is included in the result's Error field. On a non-zero exit the
// tail of fio's stderr is appended as well, so the report shows why fio
// failed instead of a bare "exit status N".
//
// The returned fioResult always carries Mode and the tail of stdout.
// Error is empty only when fio exited cleanly and its JSON parsed.
func runFioVerify(ctx context.Context, device string, opts fioOpts) fioResult {
	// 30s grace over runtime so fio has time to flush + close cleanly.
	runCtx, cancel := context.WithTimeout(ctx, opts.Runtime+30*time.Second)
	defer cancel()

	// fio takes whole seconds here and a zero runtime means "no time
	// limit", so a sub-second Runtime must not be truncated to 0.
	runtimeSecs := int(opts.Runtime.Seconds())
	if runtimeSecs < 1 {
		runtimeSecs = 1
	}

	args := []string{
		"--name=verify-" + strings.TrimPrefix(device, "/dev/"),
		"--filename=" + device,
		"--rw=" + opts.RW,
		"--bs=" + opts.BS,
		"--numjobs=1",
		"--direct=1",
		"--group_reporting",
		"--output-format=json",
		"--runtime=" + strconv.Itoa(runtimeSecs),
	}
	if opts.Verify != "" {
		// NOTE(review): confirm that fio accepts "random" as a
		// verify_pattern value; the option is documented as taking a
		// literal fill pattern.
		args = append(args,
			"--verify="+opts.Verify,
			"--verify_pattern=random",
			"--do_verify=1",
		)
	}
	switch opts.Mode {
	case "full_disk":
		// Time-bounded across the full device — fio uses the device's
		// full size when --size is omitted on a block device.
		args = append(args, "--time_based=1")
	default:
		// fio_sample: bounded write. Setting --size= limits the IO
		// volume regardless of runtime.
		args = append(args, "--size="+opts.Size, "--time_based=0")
	}

	cmd := exec.CommandContext(runCtx, "fio", args...)
	// Output captures stdout and, because cmd.Stderr is unset, stores
	// stderr on the returned *exec.ExitError.
	out, err := cmd.Output()
	r := fioResult{Mode: opts.Mode, OutputTail: tailLines(string(out), 20)}
	if err != nil {
		r.Error = err.Error()
		// Make a timeout or cancellation visible; the raw error is
		// often just "signal: killed".
		if ctxErr := runCtx.Err(); ctxErr != nil && !strings.Contains(r.Error, ctxErr.Error()) {
			r.Error += " (" + ctxErr.Error() + ")"
		}
		if ee, ok := err.(*exec.ExitError); ok {
			if stderr := strings.TrimSpace(string(ee.Stderr)); stderr != "" {
				r.Error += ": " + tailLines(stderr, 5)
			}
		}
		return r
	}

	parsed, perr := parseFioJSON(out)
	if perr != nil {
		r.Error = "parse fio json: " + perr.Error()
		return r
	}
	r.ReadIOPS = parsed.ReadIOPS
	r.WriteIOPS = parsed.WriteIOPS
	r.ReadBWKBps = parsed.ReadBWKBps
	r.WriteBWKBps = parsed.WriteBWKBps
	r.ReadP99Us = parsed.ReadP99Us
	r.WriteP99Us = parsed.WriteP99Us
	return r
}
// parseFioJSON extracts the bits we care about from fio's --output-format=json.
// Latency percentiles live at .jobs[0].read.clat_ns.percentile["99.000000"];
// we convert nanoseconds to microseconds for the fio_p99_us sample.
//
// Only the first job is read. The returned fioResult has just the
// IOPS, bandwidth and p99 fields populated; a p99 field stays zero when
// fio reported no positive value for it.
//
// If out is not a clean JSON document — for example because
// informational lines were printed around the JSON — parsing is retried
// from the first '{' and anything after the document is ignored. When
// the retry also fails, the error from the first attempt is returned.
// An output with no jobs is an error.
func parseFioJSON(out []byte) (fioResult, error) {
	type ioStats struct {
		IOPS float64 `json:"iops"`
		BW   float64 `json:"bw"`
		CLat struct {
			Percentile map[string]float64 `json:"percentile"`
		} `json:"clat_ns"`
	}
	type fioDoc struct {
		Jobs []struct {
			Read  ioStats `json:"read"`
			Write ioStats `json:"write"`
		} `json:"jobs"`
	}

	var top fioDoc
	if err := json.Unmarshal(out, &top); err != nil {
		text := string(out)
		start := strings.IndexByte(text, '{')
		if start < 0 {
			return fioResult{}, err
		}
		// Start from a clean value so nothing from the failed attempt
		// leaks into the result.
		top = fioDoc{}
		if json.NewDecoder(strings.NewReader(text[start:])).Decode(&top) != nil {
			return fioResult{}, err
		}
	}
	if len(top.Jobs) == 0 {
		return fioResult{}, fmt.Errorf("no jobs in fio output")
	}

	// p99Us reads the 99th percentile (nanoseconds) and converts it to
	// microseconds; a missing or non-positive entry yields 0.
	p99Us := func(s ioStats) float64 {
		if p := s.CLat.Percentile["99.000000"]; p > 0 {
			return p / 1000.0
		}
		return 0
	}

	j := top.Jobs[0]
	return fioResult{
		ReadIOPS:    j.Read.IOPS,
		WriteIOPS:   j.Write.IOPS,
		ReadBWKBps:  j.Read.BW,
		WriteBWKBps: j.Write.BW,
		ReadP99Us:   p99Us(j.Read),
		WriteP99Us:  p99Us(j.Write),
	}, nil
}
// ---------- SMART delta ----------
// smartAttrMap: device → attribute → raw counter value. ATA drives
// populate named attributes (Reallocated_Sector_Ct etc); NVMe drives
// populate a flatter nvme-specific map. We track a curated whitelist
// of wear indicators (smartAttributeWhitelist) — anything else is
// diagnostic and drops to the raw report output.
//
// The outer key is the device path of the target; a device with no
// whitelisted attributes has no entry at all.
type smartAttrMap map[string]map[string]float64
// captureSMARTAttrs takes a SMART snapshot of every target: it runs
// smartctl -aj per device and keeps only the whitelisted attributes.
// A device whose smartctl call fails (virtio, permission issues), or
// that exposes none of the whitelisted attributes, is left out of the
// result without an error — the delta step then simply has no data
// for it.
func captureSMARTAttrs(ctx context.Context, targets []diskTarget) smartAttrMap {
	snapshot := make(smartAttrMap, len(targets))
	for i := range targets {
		dev := targets[i].Device
		report, err := runSmartctl(ctx, dev)
		if err != nil {
			// Best effort: skip devices that cannot be queried.
			continue
		}
		if attrs := extractSMARTAttrs(report); len(attrs) != 0 {
			snapshot[dev] = attrs
		}
	}
	return snapshot
}
// smartAttributeWhitelist is the set of attributes we diff across a
// stage. They're meant to be the ones that reflect *this stage's* IO
// damage, not cumulative drive history. Adding attributes is cheap — an
// attribute a drive doesn't report is simply absent from its snapshot,
// and diffSMARTAttrs emits no delta for it.
//
// NOTE(review): percentage_used looks like a lifetime wear figure
// rather than a per-stage error counter — confirm it belongs here.
var smartAttributeWhitelist = map[string]bool{
// ATA SMART attribute names (smartctl normalizes to these)
"Reallocated_Sector_Ct": true,
"Current_Pending_Sector": true,
"Offline_Uncorrectable": true,
"UDMA_CRC_Error_Count": true,
"Reported_Uncorrect": true,
"Raw_Read_Error_Rate": true,
// NVMe log fields (flat keys at top of nvme_smart_health_information_log)
"media_errors": true,
"num_err_log_entries": true,
"percentage_used": true,
}
// extractSMARTAttrs pulls the whitelisted attribute values out of a
// decoded smartctl JSON document. Two shapes are understood:
//
//   - ATA: ata_smart_attributes.table[], where each row looks like
//     {"name": "Reallocated_Sector_Ct", "raw": {"value": N}}.
//   - NVMe: flat numeric fields directly under
//     nvme_smart_health_information_log.
//
// The result is keyed by attribute name. Rows or fields that are
// missing, malformed, non-numeric or not whitelisted are skipped. The
// returned map is never nil.
func extractSMARTAttrs(raw map[string]any) map[string]float64 {
	found := map[string]float64{}

	// A failed type assertion leaves a nil map or slice behind; indexing
	// and ranging over those is a harmless no-op, so no nesting is needed.
	ata, _ := raw["ata_smart_attributes"].(map[string]any)
	rows, _ := ata["table"].([]any)
	for _, row := range rows {
		entry, isObject := row.(map[string]any)
		if !isObject {
			continue
		}
		name, _ := entry["name"].(string)
		if !smartAttributeWhitelist[name] {
			continue
		}
		rawField, _ := entry["raw"].(map[string]any)
		if value, isNumber := rawField["value"].(float64); isNumber {
			found[name] = value
		}
	}

	nvme, _ := raw["nvme_smart_health_information_log"].(map[string]any)
	for key, field := range nvme {
		if value, isNumber := field.(float64); isNumber && smartAttributeWhitelist[key] {
			found[key] = value
		}
	}
	return found
}
// diffSMARTAttrs computes end − start for every (device, attribute)
// pair that appears in both snapshots. A device or attribute missing
// from either side produces no entry, because a zero-to-present change
// cannot be attributed safely. Negative deltas are preserved so that a
// drive resetting a counter stays visible. Devices left with no
// comparable attributes are omitted; the returned map is never nil.
func diffSMARTAttrs(start, end smartAttrMap) map[string]map[string]float64 {
	deltas := make(map[string]map[string]float64)
	for dev, after := range end {
		before, known := start[dev]
		if !known {
			continue
		}
		// Allocated lazily so a device with nothing to compare never
		// gets an entry.
		var perAttr map[string]float64
		for attr, afterVal := range after {
			beforeVal, present := before[attr]
			if !present {
				continue
			}
			if perAttr == nil {
				perAttr = make(map[string]float64)
			}
			perAttr[attr] = afterVal - beforeVal
		}
		if perAttr != nil {
			deltas[dev] = perAttr
		}
	}
	return deltas
}