deep profile + threshold gating + firmware stage + Burn super-stage
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled

Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
+486
View File
@@ -0,0 +1,486 @@
package tests
import (
"context"
"encoding/json"
"fmt"
"os/exec"
"runtime"
"strconv"
"strings"
"sync"
"time"
"vetting/agent/probes"
)
// BurnConfig is what the agent passes to Burn: the orchestrator's iperf3
// server address and port. Durations + concurrency knobs come from
// Deps.BurnKnobs so they scale with profile.
type BurnConfig struct {
// OrchestratorURL is the orchestrator base URL. Burn derives the iperf3
// target host from it and records the iperf workload as skipped when it
// is empty.
OrchestratorURL string
// IperfPort is the iperf3 server port; zero selects the default 5201.
IperfPort int // 0 = 5201
}
// Burn is the concurrent soak stage. Unlike CPUStress (serial
// CPU→memory) or Storage (serial per disk) it fans out every workload
// at once: stress-ng hammers CPU + memory, fio drives the allow-listed
// disks, iperf3 pushes sustained NIC traffic, and two sidecars poll
// EDAC + PSU rails for the duration of the window.
//
// This is where PSU rails actually matter: 12V sag under simultaneous
// CPU + disk + NIC load is exactly the failure a thermal/power
// regression produces, and it's invisible to any stage that loads one
// subsystem at a time. The PSU stage that follows Burn in the pipeline
// re-samples rails post-window to confirm they settle back to nominal.
//
// Burn stays inside the stage framework — it doesn't spawn a parallel
// stage runner. The goroutine fan-out is local; the stage converges
// before returning an Outcome so every invariant the orchestrator
// relies on (serial stage order, single in-flight stage per run) still
// holds.
//
// Knob defaults applied here: a non-positive Duration becomes 2 minutes
// and a non-positive IperfParallel becomes 2. The returned Outcome
// carries one sub-step per workload (skipped workloads included) and
// fails when any workload that actually ran reports a failure.
func Burn(ctx context.Context, d Deps, cfg BurnConfig) Outcome {
	duration := d.BurnKnobs.Duration
	if duration <= 0 {
		duration = 2 * time.Minute
	}
	cpuWorkers := resolveCPUWorkers(d.BurnKnobs.CPUWorkers)
	memPct := clampMemPct(d.BurnKnobs.MemPct)
	iperfParallel := d.BurnKnobs.IperfParallel
	if iperfParallel <= 0 {
		iperfParallel = 2
	}

	d.Info(fmt.Sprintf("Burn: window=%s cpu_workers=%d mem_pct=%d iperf_parallel=%d fio_on_spare=%v",
		duration, cpuWorkers, memPct, iperfParallel, d.BurnKnobs.FioOnSpare))

	// Sidecars run for the lifetime of the window and are cancelled on
	// return so the main stage converges cleanly. EDAC catches DIMM
	// bit-flips that appear only under concurrent load; PSU catches
	// rail sag that only appears when CPU + disk + NIC pull current
	// simultaneously.
	sideCtx, sideCancel := context.WithCancel(ctx)
	defer sideCancel()
	var sideWG sync.WaitGroup
	sideWG.Add(2)
	go runEDACSidecar(sideCtx, &sideWG, d)
	go runPSUSidecar(sideCtx, &sideWG, d)

	// Workloads get the window plus 30s of slack before being cancelled.
	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
	defer cancel()

	// Capacity 4 = one slot per workload (CPU, memory, fio, iperf), so
	// neither the goroutines nor the synchronous "skipped" sends below
	// can block before the channel is drained.
	results := make(chan burnSubResult, 4)
	var wg sync.WaitGroup

	wg.Add(1)
	go func() {
		defer wg.Done()
		results <- runBurnCPU(runCtx, d, duration, cpuWorkers)
	}()

	wg.Add(1)
	go func() {
		defer wg.Done()
		results <- runBurnMemory(runCtx, d, duration, memPct)
	}()

	// fio runs only when explicitly enabled *and* there are allow-listed
	// disks *and* the run wasn't marked non-destructive. Any of those
	// missing records a Skipped sub-step so the operator sees why.
	if d.BurnKnobs.FioOnSpare && len(d.ExpectedDisks) > 0 && !d.NonDestructive {
		wg.Add(1)
		go func() {
			defer wg.Done()
			results <- runBurnFio(runCtx, d, duration)
		}()
	} else {
		reason := burnFioSkipReason(d)
		results <- burnSubResult{Name: "Burn fio", Skipped: true, Reason: reason}
	}

	// iperf requires an orchestrator host. Lab hosts run with the
	// bundled iperf3 server; without a base URL we can't derive a
	// target so we skip rather than fail the stage.
	if cfg.OrchestratorURL != "" {
		wg.Add(1)
		go func() {
			defer wg.Done()
			results <- runBurnIperf(runCtx, d, duration, cfg.OrchestratorURL, cfg.IperfPort, iperfParallel)
		}()
	} else {
		results <- burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "no orchestrator host"}
	}

	// Converge: workloads first, then the sidecars, and only then close
	// the channel so the collector's range loop terminates.
	wg.Wait()
	sideCancel()
	sideWG.Wait()
	close(results)

	subs, samples, failures := collectBurnResults(results)
	if d.Sensor != nil && len(samples) > 0 {
		// Posting samples is best-effort and never changes the stage
		// verdict, but a dropped batch is surfaced as a warning (same
		// treatment the sidecars give their own posts).
		if err := d.Sensor(ctx, samples); err != nil {
			d.Warn("Burn: workload sample post: " + err.Error())
		}
	}

	extras := map[string]any{
		"duration":       duration.String(),
		"cpu_workers":    cpuWorkers,
		"mem_pct":        memPct,
		"iperf_parallel": iperfParallel,
		"fio_on_spare":   d.BurnKnobs.FioOnSpare,
	}

	if len(failures) > 0 {
		msg := "Burn workloads failed: " + strings.Join(failures, ", ")
		d.Error(msg)
		return Outcome{
			Passed:   false,
			Message:  msg,
			Summary:  fmt.Sprintf("Burn failed (%d of %d workloads)", len(failures), len(subs)),
			Extras:   extras,
			SubSteps: subs,
		}
	}

	d.Info(fmt.Sprintf("Burn: %s window passed; %d workloads converged", duration, len(subs)))
	return Outcome{
		Passed:   true,
		Summary:  fmt.Sprintf("Burn %s passed (%d workloads)", duration, len(subs)),
		Extras:   extras,
		SubSteps: subs,
	}
}
// burnSubResult is the per-workload return type used by the fan-out
// goroutines. Sample slice is merged into the stage's final /sensor
// batch; SubStep becomes a row on the /result sub-steps list.
type burnSubResult struct {
Name string // workload label (e.g. "Burn CPU"); used for skipped rows and failure messages
Passed bool // true when the workload ran and succeeded
Skipped bool // true when the workload did not run; collectBurnResults synthesizes its row
Reason string // why a workload was skipped
Err string // why a workload failed
Samples []Sample // sensor samples produced by the workload, if any
SubStep SubStepReport // sub-step row for a workload that actually ran
}
// collectBurnResults drains ch until it is closed and folds every
// workload result into three parallel views: the sub-step rows for
// /result, the merged sensor samples, and one "name: reason" string per
// failed workload.
//
// Skipped workloads contribute a synthesized Skipped row and nothing
// else. Workloads that failed before they could build their own row
// (e.g. the binary is missing, so SubStep is still the zero value) also
// get a synthesized failed row, so the sub-step list never contains a
// nameless, zero-timestamp entry.
func collectBurnResults(ch <-chan burnSubResult) ([]SubStepReport, []Sample, []string) {
	var (
		subs     []SubStepReport
		samples  []Sample
		failures []string
	)
	for r := range ch {
		// Skipped slots get a synthesized row here so the /result shape
		// stays stable.
		if r.Skipped {
			stamp := time.Now().UTC()
			subs = append(subs, SubStepReport{
				Name:        r.Name,
				Skipped:     true,
				StartedAt:   stamp,
				CompletedAt: stamp,
				SummaryJSON: mustJSON(map[string]any{"skipped": true, "reason": r.Reason}),
			})
			continue
		}

		reason := r.Err
		if reason == "" {
			reason = "unknown"
		}

		step := r.SubStep
		if step.Name == "" {
			// Early-exit paths return only Name + Err; build the row
			// they would otherwise have reported.
			stamp := time.Now().UTC()
			step = SubStepReport{
				Name:        r.Name,
				Passed:      r.Passed,
				StartedAt:   stamp,
				CompletedAt: stamp,
				SummaryJSON: mustJSON(map[string]any{"error": reason}),
			}
		}
		subs = append(subs, step)
		samples = append(samples, r.Samples...)
		if !r.Passed {
			failures = append(failures, r.Name+": "+reason)
		}
	}
	return subs, samples, failures
}
// burnFioSkipReason explains why Burn is not running its fio workload.
// The checks are ordered: the fio_on_spare knob first, then the
// non-destructive flag, then an empty disk allow-list. "disabled" is
// the fallback when none of those conditions holds.
func burnFioSkipReason(d Deps) string {
	switch {
	case !d.BurnKnobs.FioOnSpare:
		return "fio_on_spare knob disabled"
	case d.NonDestructive:
		return "non-destructive run"
	case len(d.ExpectedDisks) == 0:
		return "no allowlisted disks"
	default:
		return "disabled"
	}
}
// runBurnCPU loads the CPU with stress-ng for the whole Burn window,
// using the requested worker count and every --cpu-method with
// --verify enabled. The sub-step is labelled "Burn CPU" so it does not
// collide with the earlier CPUStress stage's own CPU row.
//
// A missing stress-ng binary is reported as a failure, not a skip.
func runBurnCPU(ctx context.Context, d Deps, duration time.Duration, workers int) burnSubResult {
	const label = "Burn CPU"

	if _, lookErr := exec.LookPath("stress-ng"); lookErr != nil {
		return burnSubResult{Name: label, Err: "stress-ng missing"}
	}

	stressArgs := []string{
		"--cpu", strconv.Itoa(workers),
		"--cpu-method", "all",
		"--timeout", durationSeconds(duration),
		"--metrics-brief",
		"--verify",
	}
	d.Info("Burn: stress-ng " + strings.Join(stressArgs, " "))

	result := runStressPass(ctx, d, label, duration, stressArgs)
	return burnSubResult{
		Name:    label,
		Passed:  result.Passed,
		Err:     result.Err,
		SubStep: subStepFromPass(label, result),
	}
}
// runBurnMemory runs one stress-ng --vm worker for the Burn window. The
// worker's size is memPct percent of the live MemAvailable reading
// minus memHeadroomBytes, computed fresh on every call so the figure
// reflects the machine's current state.
//
// Outcomes: a missing stress-ng binary or an unreadable MemAvailable is
// a failure; a size that lands below memFloorBytes is recorded as a
// skipped workload because the window would be too tight to mean
// anything on this box.
func runBurnMemory(ctx context.Context, d Deps, duration time.Duration, memPct int) burnSubResult {
	const label = "Burn memory"

	if _, lookErr := exec.LookPath("stress-ng"); lookErr != nil {
		return burnSubResult{Name: label, Err: "stress-ng missing"}
	}

	avail, err := memAvailableBytes()
	if err != nil {
		return burnSubResult{Name: label, Err: "read MemAvailable: " + err.Error()}
	}

	// budget is the raw percentage share; vmBytes is what stress-ng is
	// actually allowed to touch once the standard headroom is removed.
	budget := int64(float64(avail) * float64(memPct) / 100.0)
	vmBytes := budget - memHeadroomBytes
	if vmBytes < memFloorBytes {
		why := fmt.Sprintf("budget %s below floor %s after headroom", humanBytes(budget), humanBytes(memFloorBytes))
		return burnSubResult{Name: label, Skipped: true, Reason: why}
	}

	stressArgs := []string{
		"--vm", "1",
		"--vm-bytes", strconv.FormatInt(vmBytes, 10),
		"--vm-keep",
		"--timeout", durationSeconds(duration),
		"--metrics-brief",
		"--verify",
	}
	d.Info(fmt.Sprintf("Burn: stress-ng memory cap=%s (%d%% of MemAvailable)", humanBytes(vmBytes), memPct))

	result := runStressPass(ctx, d, label, duration, stressArgs)
	rowName := fmt.Sprintf("Burn memory (cap %s)", humanBytes(vmBytes))
	return burnSubResult{
		Name:    label,
		Passed:  result.Passed,
		Err:     result.Err,
		SubStep: subStepFromPass(rowName, result),
	}
}
// runBurnFio runs a bounded fio_sample job (512MiB, 4k randrw, md5
// verify) against the first allow-listed disk that resolveTargets
// returns, with the Burn window as its runtime. The fixed --size keeps
// Burn's write volume predictable regardless of profile.
//
// A missing fio binary is a failure; an empty target list is a skip. On
// success the result carries read/write IOPS samples, plus p99 latency
// samples for whichever direction reported a positive value.
func runBurnFio(ctx context.Context, d Deps, duration time.Duration) burnSubResult {
	const label = "Burn fio"

	if _, lookErr := exec.LookPath("fio"); lookErr != nil {
		return burnSubResult{Name: label, Err: "fio missing"}
	}

	disks := resolveTargets(d.ExpectedDisks)
	if len(disks) == 0 {
		return burnSubResult{Name: label, Skipped: true, Reason: "no allow-listed disks present"}
	}
	target := disks[0]

	opts := fioOpts{
		Mode:    "fio_sample",
		Size:    "512MiB",
		Runtime: duration,
		BS:      "4k",
		RW:      "randrw",
		Verify:  "md5",
	}

	startedAt := time.Now()
	d.Info(fmt.Sprintf("Burn: fio %s on %s (%s window)", opts.Mode, target.Device, duration))
	fr := runFioVerify(ctx, target.Device, opts)
	finishedAt := time.Now()

	ok := fr.Error == ""
	res := burnSubResult{
		Name:   label,
		Passed: ok,
		Err:    fr.Error,
		SubStep: SubStepReport{
			Name:        "Burn fio " + target.Device,
			Passed:      ok,
			StartedAt:   startedAt,
			CompletedAt: finishedAt,
			SummaryJSON: mustJSON(fr),
		},
	}
	if !ok {
		// Failed runs report no samples.
		return res
	}

	dev := target.Device
	res.Samples = append(res.Samples,
		Sample{Kind: "fio", Key: dev + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
		Sample{Kind: "fio", Key: dev + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
	)
	if fr.ReadP99Us > 0 {
		res.Samples = append(res.Samples, Sample{Kind: "fio_p99_us", Key: dev + "/read", Value: fr.ReadP99Us, Unit: "us"})
	}
	if fr.WriteP99Us > 0 {
		res.Samples = append(res.Samples, Sample{Kind: "fio_p99_us", Key: dev + "/write", Value: fr.WriteP99Us, Unit: "us"})
	}
	return res
}
// runBurnIperf runs `iperf3 -c <host> -P <parallel> -J` for the Burn
// window and parses the JSON with parseIperfJSON, the same extractor
// the Network stage uses. Samples are emitted under burn/-prefixed keys
// so the dashboard can tell which window they came from.
//
// Defaults: port 0 becomes 5201 and parallel below 1 becomes 1.
// Outcomes: a missing iperf3 binary, a client error, unparseable JSON,
// or zero throughput is a failure; an underivable orchestrator host is
// a skip.
func runBurnIperf(ctx context.Context, d Deps, duration time.Duration, orchestratorURL string, port, parallel int) burnSubResult {
	const label = "Burn iperf"

	if _, lookErr := exec.LookPath("iperf3"); lookErr != nil {
		return burnSubResult{Name: label, Err: "iperf3 missing"}
	}

	host, err := deriveHost(orchestratorURL)
	if err != nil || host == "" {
		return burnSubResult{Name: label, Skipped: true, Reason: "can't derive orchestrator host"}
	}
	if port == 0 {
		port = 5201
	}
	if parallel < 1 {
		parallel = 1
	}

	clientArgs := []string{
		"-c", host,
		"-p", strconv.Itoa(port),
		"-t", strconv.Itoa(int(duration.Seconds())),
		"-P", strconv.Itoa(parallel),
		"-J",
	}

	// The client gets the window plus 30s of slack before it is killed.
	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
	defer cancel()

	startedAt := time.Now()
	stdout, err := exec.CommandContext(runCtx, "iperf3", clientArgs...).Output()
	finishedAt := time.Now()

	// failed builds the result shared by both error exits below.
	failed := func(errMsg string, summary map[string]any) burnSubResult {
		return burnSubResult{
			Name: label,
			Err:  errMsg,
			SubStep: SubStepReport{
				Name:        label,
				StartedAt:   startedAt,
				CompletedAt: finishedAt,
				SummaryJSON: mustJSON(summary),
			},
		}
	}

	if err != nil {
		return failed("iperf3 client error: "+err.Error(),
			map[string]any{"error": err.Error(), "stderr_tail": tailLines(string(stdout), 20)})
	}

	mbps, retrans, bytesSent, _, parseErr := parseIperfJSON(stdout)
	if parseErr != nil {
		return failed("parse iperf3 json: "+parseErr.Error(),
			map[string]any{"error": parseErr.Error()})
	}

	samples := []Sample{{Kind: "iperf", Key: "burn/throughput_mbps", Value: mbps, Unit: "Mbps"}}
	if bytesSent > 0 {
		// Approximate the packet count from a 1460-byte payload so the
		// retransmit figure can be expressed as a rate.
		packets := float64(bytesSent) / 1460.0
		samples = append(samples, Sample{
			Kind:  "nic_retrans",
			Key:   "burn/rate",
			Value: float64(retrans) / packets,
			Unit:  "rate",
		})
	}

	ok := mbps > 0
	var errMsg string
	if !ok {
		errMsg = "zero throughput from iperf3"
	}

	summary := map[string]any{
		"throughput_mbps": mbps,
		"retransmits":     retrans,
		"bytes_sent":      bytesSent,
		"parallel":        parallel,
	}
	return burnSubResult{
		Name:    label,
		Passed:  ok,
		Err:     errMsg,
		Samples: samples,
		SubStep: SubStepReport{
			Name:        fmt.Sprintf("Burn iperf (P=%d)", parallel),
			Passed:      ok,
			StartedAt:   startedAt,
			CompletedAt: finishedAt,
			SummaryJSON: mustJSON(summary),
		},
	}
}
// runPSUSidecar samples the PSU rails via scanPSURails every 5s until
// ctx is cancelled, posting each non-empty reading through d.Sensor as
// psu_volt samples (one per rail, keyed by the rail label, in volts).
// Streaming the readings during the window lets the threshold
// evaluator react to rail sag mid-Burn instead of waiting for the PSU
// stage that follows.
//
// The function always signals wg on exit. With no Sensor configured it
// returns immediately. A failed post is logged as a warning and the
// loop carries on with the next tick.
func runPSUSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
	defer wg.Done()
	if d.Sensor == nil {
		return
	}

	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	// post sends one batch with its own 5s budget.
	post := func(batch []Sample) {
		sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
		defer cancel()
		if err := d.Sensor(sendCtx, batch); err != nil {
			d.Warn("Burn: PSU sample post: " + err.Error())
		}
	}

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}

		rails := scanPSURails()
		if len(rails) == 0 {
			continue
		}
		batch := make([]Sample, len(rails))
		for i, rail := range rails {
			batch[i] = Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"}
		}
		post(batch)
	}
}
// resolveCPUWorkers turns the cpu_workers knob into a worker count. An
// empty string or "all" (any case) means one worker per logical CPU. A
// positive integer is used as-is. Anything else — zero, negatives,
// non-numeric text — falls back to the logical CPU count, so the result
// is never zero.
func resolveCPUWorkers(raw string) int {
	fallback := runtime.NumCPU()
	if raw == "" || strings.EqualFold(raw, "all") {
		return fallback
	}
	n, err := strconv.Atoi(raw)
	if err != nil || n <= 0 {
		return fallback
	}
	return n
}
// clampMemPct normalizes the mem_pct knob. Zero or a negative value
// means "not set" and yields the default of 50. Any other value is
// clamped into [10, 90]: the lower bound keeps the workload meaningful
// and the upper bound leaves room for the kernel, the agent, and the
// other Burn workloads.
func clampMemPct(pct int) int {
	switch {
	case pct <= 0:
		return 50
	case pct < 10:
		return 10
	case pct > 90:
		return 90
	default:
		return pct
	}
}
// mustJSON marshals v and never fails: when v cannot be encoded it
// returns a small JSON object of the form {"marshal_error": "<message>"}
// instead.
//
// The fallback is produced by the encoder rather than by string
// concatenation, so an error message containing quotes, backslashes, or
// control characters still yields valid JSON.
func mustJSON(v any) json.RawMessage {
	b, err := json.Marshal(v)
	if err == nil {
		return b
	}
	fallback, ferr := json.Marshal(map[string]string{"marshal_error": err.Error()})
	if ferr != nil {
		// Encoding a map of plain strings is not expected to fail; keep
		// a constant, known-valid payload as the last resort.
		return json.RawMessage(`{"marshal_error":"unencodable error"}`)
	}
	return fallback
}
// Keep the probes import referenced from this file. The blank
// assignment below gives the import a use so it is not rejected as
// unused; the EDAC and PSU sidecars that Burn starts are the consumers
// the import exists for.
// NOTE(review): runEDACSidecar calls probes.EDAC from the CPU stress
// file — confirm whether this file still needs the import at all.
var _ = probes.EDAC
+58
View File
@@ -0,0 +1,58 @@
package tests
import (
"runtime"
"testing"
)
// TestResolveCPUWorkers exercises every parse branch of
// resolveCPUWorkers: empty and "all" (case-insensitive) resolve to
// NumCPU, a positive integer is returned verbatim, and zero, negative,
// or non-numeric input falls back to NumCPU. Returning zero workers
// would turn stress-ng into a no-op and silently defeat Burn's CPU
// load, so the fallback cases matter as much as the happy path.
func TestResolveCPUWorkers(t *testing.T) {
	numCPU := runtime.NumCPU()
	table := []struct {
		label  string
		raw    string
		expect int
	}{
		{label: "empty defaults to NumCPU", raw: "", expect: numCPU},
		{label: "all defaults to NumCPU", raw: "all", expect: numCPU},
		{label: "ALL is case-insensitive", raw: "ALL", expect: numCPU},
		{label: "explicit integer", raw: "3", expect: 3},
		{label: "negative falls back", raw: "-1", expect: numCPU},
		{label: "zero falls back", raw: "0", expect: numCPU},
		{label: "garbage falls back", raw: "lots", expect: numCPU},
	}
	for i := range table {
		row := table[i]
		t.Run(row.label, func(t *testing.T) {
			got := resolveCPUWorkers(row.raw)
			if got != row.expect {
				t.Errorf("resolveCPUWorkers(%q) = %d, want %d", row.raw, got, row.expect)
			}
		})
	}
}
// TestClampMemPct pins the mem_pct normalization: values above 90 are
// clamped down (keeps the memory burner out of OOM territory), values
// in 1..9 are clamped up to 10 (keeps the workload meaningful), and
// zero or negative input maps to the default of 50 so a knob missing
// from an older orchestrator's claim response doesn't collapse the
// workload.
func TestClampMemPct(t *testing.T) {
	type row struct {
		input  int
		expect int
	}
	table := []row{
		{input: 0, expect: 50},   // default
		{input: -10, expect: 50}, // negative treated as default
		{input: 5, expect: 10},   // below lower band → clamp up
		{input: 10, expect: 10},
		{input: 50, expect: 50},
		{input: 90, expect: 90},
		{input: 95, expect: 90}, // above upper band → clamp down
		{input: 1000, expect: 90},
	}
	for i := range table {
		got := clampMemPct(table[i].input)
		if got != table[i].expect {
			t.Errorf("clampMemPct(%d) = %d, want %d", table[i].input, got, table[i].expect)
		}
	}
}
+82 -4
View File
@@ -11,7 +11,10 @@ import (
"runtime"
"strconv"
"strings"
"sync"
"time"
"vetting/agent/probes"
)
// CPUStress runs stress-ng as two serial passes. The previous shape
@@ -55,11 +58,28 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
extras := map[string]any{"cores": cores}
var subs []SubStepReport
// EDAC sidecar runs for the lifetime of the stage; cancelled on
// return. It polls /sys/devices/system/edac/mc/*/{ce,ue}_count and
// posts the current counters so the server-side threshold evaluator
// can gate edac_ue > 0 → fail the run. Zero-valued poll falls back
// to 10s — the same cadence rasdaemon uses by default.
sideCtx, sideCancel := context.WithCancel(ctx)
defer sideCancel()
var sideWG sync.WaitGroup
sideWG.Add(1)
go runEDACSidecar(sideCtx, &sideWG, d)
// Per-profile durations come from Deps; zero values (missing knobs
// or legacy orchestrator) fall back to the package default so the
// stage always has a defined budget.
cpuDur := nonzeroDur(d.CPUStressKnobs.CPUPass, cpuPassDuration)
memDur := nonzeroDur(d.CPUStressKnobs.MemPass, memPassDuration)
// Pass 1: CPU
cpu := runStressPass(ctx, d, "CPU", cpuPassDuration, []string{
cpu := runStressPass(ctx, d, "CPU", cpuDur, []string{
"--cpu", strconv.Itoa(cores),
"--cpu-method", "all",
"--timeout", durationSeconds(cpuPassDuration),
"--timeout", durationSeconds(cpuDur),
"--metrics-brief",
"--verify",
})
@@ -104,11 +124,11 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
SubSteps: subs,
}
}
mem := runStressPass(ctx, d, "memory", memPassDuration, []string{
mem := runStressPass(ctx, d, "memory", memDur, []string{
"--vm", "1",
"--vm-bytes", strconv.FormatInt(cap, 10),
"--vm-keep",
"--timeout", durationSeconds(memPassDuration),
"--timeout", durationSeconds(memDur),
"--metrics-brief",
"--verify",
})
@@ -133,6 +153,64 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
}
}
// runEDACSidecar polls /sys EDAC counters on d.CPUStressKnobs.EDACPoll
// cadence (or 10s fallback) for the lifetime of the stage ctx, emitting
// one sample per (memory-controller × {ce,ue}) pair on each tick. A
// single failing read is tolerated: the next tick picks up the counter.
//
// This is where the critical edac_ue threshold becomes a hard-fail: as
// soon as a UE counter advances past 0, the server-side evaluator trips
// and flips the run into FailedHolding. The sidecar emits whether or
// not stress-ng is still running; that keeps the signal live during
// inter-pass gaps.
//
// MCE counts are intentionally not sampled here — they require
// rasdaemon or mcelog and vary by live-image packaging. The threshold
// rule for mce stays seeded (so the DB shape is stable) but only fires
// once a matching kind lands, which is a follow-up.
//
// The function always calls wg.Done on exit and returns immediately
// when no Sensor is configured. It is also started by the Burn stage;
// in that case the cadence still comes from CPUStressKnobs.EDACPoll and
// warnings still carry the "CPUStress:" prefix.
func runEDACSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
defer wg.Done()
if d.Sensor == nil {
return
}
// Zero or negative cadence means "not configured"; use the 10s fallback.
poll := d.CPUStressKnobs.EDACPoll
if poll <= 0 {
poll = 10 * time.Second
}
t := time.NewTicker(poll)
defer t.Stop()
for {
select {
case <-ctx.Done():
return
case <-t.C:
edac := probes.EDAC()
// Nothing read this tick: skip the post and try again next tick.
if len(edac) == 0 {
continue
}
// Copy the probe readings field-by-field into the stage's Sample type.
batch := make([]Sample, 0, len(edac))
for _, s := range edac {
batch = append(batch, Sample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit})
}
// Each post gets its own 5s budget; a failed post is only warned
// about so the loop keeps sampling.
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
if err := d.Sensor(sendCtx, batch); err != nil {
d.Warn("CPUStress: edac sample post: " + err.Error())
}
cancel()
}
}
}
// nonzeroDur returns override when it is strictly positive and
// fallback otherwise. Callers can therefore pass a zero-value duration
// to mean "no override configured" without a separate ok flag.
func nonzeroDur(override, fallback time.Duration) time.Duration {
	if override <= 0 {
		return fallback
	}
	return override
}
// subStepFromPass projects a stressPass into a SubStepReport — shared by
// both passes and by the mid-stage early-return paths so the UI always
// sees exactly one row per pass, even on failure.
+24
View File
@@ -0,0 +1,24 @@
// fake_dmidecode simulates `dmidecode -t bios` for unit tests of the
// firmware probe's BIOS parser. Prints deterministic output modeled on
// a real Supermicro host; exits 0 regardless of flags.
package main
import "fmt"
// main prints one fixed BIOS Information record (DMI type 0) in
// dmidecode's text format and returns normally. Command-line arguments
// are never read, so the output is the same for every invocation.
func main() {
fmt.Println(`# dmidecode 3.3
Getting SMBIOS data from sysfs.
SMBIOS 3.2.0 present.
Handle 0x0000, DMI type 0, 26 bytes
BIOS Information
Vendor: American Megatrends Inc.
Version: 3.2
Release Date: 07/15/2021
Address: 0xF0000
Runtime Size: 64 kB
ROM Size: 32 MB
Characteristics:
PCI is supported
BIOS is upgradeable`)
}
+22
View File
@@ -0,0 +1,22 @@
// Package fakes is the umbrella for deterministic stand-ins for
// external probe binaries that Vetting's stage code normally shells
// out to (stress-ng, fio, iperf3, dmidecode, ethtool, nvidia-smi,
// mcelog, nvme). Each real binary gets its own subpackage under
// fakes/<name>/ with `package main` and a main() that prints golden
// output — build with `go build -o <tmp>/<name> ./agent/tests/fakes/<name>`
// and point a test's tests.Deps.LookPath at <tmp>/<name>.
//
// The seam in tests is tests.Deps.LookPath: when non-nil the stage
// code uses it instead of os/exec.LookPath. Outside tests, nil
// LookPath means "use the real binary on $PATH" — stages continue to
// work on production hosts without the fakes package around.
//
// How to add a new fake:
// 1. Create agent/tests/fakes/<binaryname>/main.go.
// 2. Write `package main` with a main() that prints exactly the
// bytes the real tool would produce for the input you care to
// simulate. Determinism > completeness — tests want a known
// sample, not a realistic one.
// 3. In the unit test, compile the fake with `go build -o` into
// t.TempDir() before the test body runs, then reference the result.
package fakes
+18
View File
@@ -0,0 +1,18 @@
// fake_stress_ng simulates stress-ng for unit tests. Accepts (and
// ignores) any flag, sleeps briefly so callers that measure wall-clock
// see a non-zero elapsed, and prints the "passed" lines CPUStress
// expects. Exits 0.
package main
import (
"fmt"
"os"
"time"
)
// main echoes the arguments it was invoked with to stderr (so a test
// can inspect the flags the stage passed), sleeps 50ms so wall-clock
// measurements are non-zero, then prints two stress-ng style info lines
// to stdout and returns normally.
func main() {
fmt.Fprintln(os.Stderr, "fake_stress_ng invoked:", os.Args[1:])
time.Sleep(50 * time.Millisecond)
fmt.Println("stress-ng: info: [1] dispatching hogs: 1 cpu")
fmt.Println("stress-ng: info: [1] successful run completed in 0.05s")
}
+130 -16
View File
@@ -9,19 +9,27 @@ import (
"strconv"
"strings"
"time"
"vetting/agent/probes"
)
// NetworkConfig is what the agent passes to Network: the orchestrator's
// iperf3 server address and port. We derive host from OrchestratorURL.
// iperf3 server address, port, and the per-profile duration.
type NetworkConfig struct {
OrchestratorURL string
IperfPort int // 0 = 5201
Duration time.Duration
}
// Network runs iperf3 against the orchestrator's bundled server. Records
// bandwidth as a measurement; fails if iperf3 is missing, the server
// isn't reachable, or throughput is zero.
// Network runs iperf3 against the orchestrator's bundled server for
// the profile-configured duration. Records throughput as a measurement;
// records per-interface rx/tx error-rate deltas as nic_retrans samples
// so the server-side threshold gate (`nic_retrans rate < 0.001`) fires
// on a flaky PHY or a wire that drops half its packets under load.
//
// Failure cases: iperf3 missing, server unreachable, zero throughput.
// Zero throughput is treated as a hard failure — an iperf that finished
// cleanly but pushed zero bytes is indistinguishable from a bad run.
func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
if _, err := exec.LookPath("iperf3"); err != nil {
// Live image ships iperf3; absence means packaging regression.
@@ -51,6 +59,11 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
duration = 10 * time.Second
}
// Snapshot /proc/net/dev before the test so we can attribute any
// error-count growth to *this stage's* traffic. The same snapshot
// taken after iperf returns is the end of the window.
netStart := indexNetDev(probes.NetDev())
args := []string{
"-c", host,
"-p", strconv.Itoa(port),
@@ -72,7 +85,7 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
Extras: map[string]any{"stderr_tail": tailLines(string(out), 20)},
}
}
mbps, parsed, err := parseIperfJSON(out)
mbps, retrans, bytesSent, parsed, err := parseIperfJSON(out)
if err != nil {
d.Error("Network: parse iperf3 output: " + err.Error())
return Outcome{
@@ -82,12 +95,58 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
Extras: map[string]any{"raw": string(out)},
}
}
netEnd := indexNetDev(probes.NetDev())
netDelta := diffNetDev(netStart, netEnd)
samples := []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}}
// iperf-derived retrans rate: retrans_count / packet_count_estimate.
// TCP typical MTU 1500; payload ~1460. We divide bytes by 1460 to
// approximate packets. This keeps the rate bounded in [0, 1].
if bytesSent > 0 {
packets := float64(bytesSent) / 1460.0
if packets > 0 {
samples = append(samples, Sample{
Kind: "nic_retrans",
Key: "iperf/rate",
Value: float64(retrans) / packets,
Unit: "rate",
})
}
}
// Per-interface error-rate deltas. A flaky cable typically surfaces
// as tx_errs or tx_drop on the originating interface, not inside
// iperf's own tally.
for iface, delta := range netDelta {
if delta.TxBytes > 0 {
packets := float64(delta.TxBytes) / 1460.0
if packets > 0 {
rate := float64(delta.TxErrs+delta.TxDrop) / packets
samples = append(samples, Sample{
Kind: "nic_retrans", Key: iface + "/rate", Value: rate, Unit: "rate",
})
}
}
// Diagnostic raw counts so the report can show which interface
// bled. These don't fire a threshold today but are useful for
// post-mortem.
samples = append(samples,
Sample{Kind: "nic_errs", Key: iface + "/rx", Value: float64(delta.RxErrs + delta.RxDrop), Unit: "count"},
Sample{Kind: "nic_errs", Key: iface + "/tx", Value: float64(delta.TxErrs + delta.TxDrop), Unit: "count"},
)
}
if d.Sensor != nil {
_ = d.Sensor(ctx, []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}})
_ = d.Sensor(ctx, samples)
}
extras := map[string]any{
"throughput_mbps": mbps,
"retransmits": retrans,
"bytes_sent": bytesSent,
"net_delta": netDelta,
"iperf_end": parsed,
}
if mbps <= 0 {
@@ -98,14 +157,55 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
Extras: extras,
}
}
d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps", mbps))
d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps (retransmits=%d)", mbps, retrans))
return Outcome{
Passed: true,
Summary: fmt.Sprintf("%.1f Mbps to %s", mbps, host),
Summary: fmt.Sprintf("%.1f Mbps to %s (retransmits=%d)", mbps, host, retrans),
Extras: extras,
}
}
// indexNetDev builds a lookup table from interface name to snapshot so
// diffNetDev can pair the start and end readings by name directly. If
// the same interface name appears more than once, the last entry wins.
func indexNetDev(snaps []probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
	byIface := make(map[string]probes.NetDevSnapshot, len(snaps))
	for i := range snaps {
		byIface[snaps[i].Iface] = snaps[i]
	}
	return byIface
}
// diffNetDev computes end - start for every interface present in both
// snapshots. An interface missing from the start snapshot is left out
// of the result because no delta can be computed for it, and one that
// disappeared before the end snapshot is never visited. Each counter
// is subtracted with subU64, so an end value below its start value
// (e.g. after a counter reset) yields 0 rather than wrapping.
func diffNetDev(start, end map[string]probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
	deltas := map[string]probes.NetDevSnapshot{}
	for name, after := range end {
		if before, seen := start[name]; seen {
			deltas[name] = probes.NetDevSnapshot{
				Iface:   name,
				RxBytes: subU64(after.RxBytes, before.RxBytes),
				RxErrs:  subU64(after.RxErrs, before.RxErrs),
				RxDrop:  subU64(after.RxDrop, before.RxDrop),
				TxBytes: subU64(after.TxBytes, before.TxBytes),
				TxErrs:  subU64(after.TxErrs, before.TxErrs),
				TxDrop:  subU64(after.TxDrop, before.TxDrop),
			}
		}
	}
	return deltas
}
// subU64 returns a - b, saturating at 0 instead of wrapping around
// when b is larger than a.
func subU64(a, b uint64) uint64 {
	if a >= b {
		return a - b
	}
	return 0
}
// deriveHost pulls the hostname out of an https://host:port base URL.
func deriveHost(raw string) (string, error) {
if raw == "" {
@@ -119,18 +219,22 @@ func deriveHost(raw string) (string, error) {
return strings.TrimSpace(h), nil
}
// parseIperfJSON pulls end.sum_sent.bits_per_second out of iperf3 -J.
// Returns (Mbps, full-json-map, err).
func parseIperfJSON(b []byte) (float64, map[string]any, error) {
// parseIperfJSON pulls end.sum_sent.bits_per_second and retransmits out
// of iperf3 -J. Returns (Mbps, retransmits, bytes_sent, full-end-map, err).
func parseIperfJSON(b []byte) (float64, int64, int64, map[string]any, error) {
var top map[string]any
if err := json.Unmarshal(b, &top); err != nil {
return 0, nil, err
return 0, 0, 0, nil, err
}
end, ok := top["end"].(map[string]any)
if !ok {
return 0, top, fmt.Errorf("missing end")
return 0, 0, 0, nil, fmt.Errorf("missing end")
}
// iperf3 reports either sum_sent (when -R not set) or sum_received.
// Pull the first sum that carries bits_per_second; retransmits +
// bytes live there too for TCP.
var mbps float64
var retrans int64
var bytesSent int64
for _, key := range []string{"sum_sent", "sum_received", "sum"} {
sum, ok := end[key].(map[string]any)
if !ok {
@@ -140,7 +244,17 @@ func parseIperfJSON(b []byte) (float64, map[string]any, error) {
if !ok {
continue
}
return bps / 1_000_000, end, nil
mbps = bps / 1_000_000
if r, ok := sum["retransmits"].(float64); ok {
retrans = int64(r)
}
if bs, ok := sum["bytes"].(float64); ok {
bytesSent = int64(bs)
}
break
}
return 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
if mbps == 0 {
return 0, 0, 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
}
return mbps, retrans, bytesSent, end, nil
}
+192
View File
@@ -0,0 +1,192 @@
package tests
import (
"encoding/json"
"testing"
"vetting/agent/probes"
)
// TestParseIperfJSON_SumSent confirms we pull throughput, retransmits,
// and bytes_sent from end.sum_sent. Real iperf3 -J output nests these
// three under end.sum_sent for TCP streams.
func TestParseIperfJSON_SumSent(t *testing.T) {
// Minimal fixture: only the three fields the parser reads.
raw := `{
"end": {
"sum_sent": {
"bits_per_second": 950000000,
"retransmits": 42,
"bytes": 1187500000
}
}
}`
// The fourth return value (the parsed end block) is not checked here.
mbps, retrans, bytesSent, _, err := parseIperfJSON([]byte(raw))
if err != nil {
t.Fatalf("parseIperfJSON: %v", err)
}
// 950000000 bits/s is reported as 950 Mbps.
if mbps != 950 {
t.Errorf("mbps = %v, want 950", mbps)
}
if retrans != 42 {
t.Errorf("retransmits = %d, want 42", retrans)
}
if bytesSent != 1187500000 {
t.Errorf("bytesSent = %d, want 1187500000", bytesSent)
}
}
// TestParseIperfJSON_MissingEnd checks that output lacking a top-level
// "end" object (e.g. a partial or aborted run) is rejected with an
// error rather than parsed into zero values.
func TestParseIperfJSON_MissingEnd(t *testing.T) {
	_, _, _, _, err := parseIperfJSON([]byte(`{"start": {}}`))
	if err != nil {
		return // rejected, as required
	}
	t.Errorf("expected error on iperf output missing end block")
}
// TestParseIperfJSON_ZeroBps checks that a well-formed payload reporting
// zero throughput is an error. An iperf run that exits cleanly but moved
// no bits cannot be told apart from a broken run, so it must not pass.
func TestParseIperfJSON_ZeroBps(t *testing.T) {
	_, _, _, _, err := parseIperfJSON([]byte(`{"end": {"sum_sent": {"bits_per_second": 0}}}`))
	if err != nil {
		return // rejected, as required
	}
	t.Errorf("expected error when bits_per_second is 0")
}
// TestParseIperfJSON_FallsBackToSumReceived covers payloads that carry
// no sum_sent block: the parser is expected to fall through to
// sum_received and report its throughput.
func TestParseIperfJSON_FallsBackToSumReceived(t *testing.T) {
	const payload = `{
		"end": {
			"sum_received": {"bits_per_second": 500000000}
		}
	}`

	gotMbps, _, _, _, err := parseIperfJSON([]byte(payload))
	if err != nil {
		t.Fatalf("parseIperfJSON: %v", err)
	}
	if gotMbps != 500 {
		t.Errorf("mbps = %v, want 500", gotMbps)
	}
}
// TestDiffNetDev_HappyPath confirms that (end − start) on an interface
// present in both snapshots yields the expected per-counter deltas.
// In the fixture eth0 receives 1000 bytes, transmits 10000 bytes, and
// accumulates 3 tx errors across the window.
func TestDiffNetDev_HappyPath(t *testing.T) {
	before := map[string]probes.NetDevSnapshot{
		"eth0": {Iface: "eth0", RxBytes: 1000, RxErrs: 0, TxBytes: 5000, TxErrs: 1},
	}
	after := map[string]probes.NetDevSnapshot{
		"eth0": {Iface: "eth0", RxBytes: 2000, RxErrs: 0, TxBytes: 15000, TxErrs: 4},
	}

	eth0, present := diffNetDev(before, after)["eth0"]
	if !present {
		t.Fatalf("eth0 missing from diff output")
	}
	if eth0.RxBytes != 1000 {
		t.Errorf("RxBytes delta=%d, want 1000", eth0.RxBytes)
	}
	if eth0.TxBytes != 10000 {
		t.Errorf("TxBytes delta=%d, want 10000", eth0.TxBytes)
	}
	if eth0.TxErrs != 3 {
		t.Errorf("TxErrs delta=%d, want 3", eth0.TxErrs)
	}
}
// TestDiffNetDev_InterfaceVanished checks that an interface seen in the
// start snapshot but absent from the end snapshot is left out of the
// diff entirely, while interfaces present in both are still diffed.
func TestDiffNetDev_InterfaceVanished(t *testing.T) {
	before := map[string]probes.NetDevSnapshot{
		"eth0": {Iface: "eth0", TxBytes: 1000},
		"eth1": {Iface: "eth1", TxBytes: 500},
	}
	after := map[string]probes.NetDevSnapshot{
		"eth0": {Iface: "eth0", TxBytes: 2000},
	}

	diff := diffNetDev(before, after)

	if _, stillThere := diff["eth1"]; stillThere {
		t.Errorf("eth1 should have been dropped (gone at end)")
	}
	if eth0 := diff["eth0"]; eth0.TxBytes != 1000 {
		t.Errorf("eth0 TxBytes delta=%d, want 1000", eth0.TxBytes)
	}
}
// TestDiffNetDev_CounterReset covers counters that go backwards between
// snapshots (end < start). The expected delta is a clamp to 0 rather
// than an unsigned underflow.
func TestDiffNetDev_CounterReset(t *testing.T) {
	before := map[string]probes.NetDevSnapshot{
		"eth0": {Iface: "eth0", TxBytes: 9999, TxErrs: 5},
	}
	after := map[string]probes.NetDevSnapshot{
		"eth0": {Iface: "eth0", TxBytes: 100, TxErrs: 0},
	}

	eth0 := diffNetDev(before, after)["eth0"]

	if eth0.TxBytes != 0 {
		t.Errorf("reset TxBytes delta=%d, want 0 (clamped)", eth0.TxBytes)
	}
	if eth0.TxErrs != 0 {
		t.Errorf("reset TxErrs delta=%d, want 0 (clamped)", eth0.TxErrs)
	}
}
// TestDeriveHost checks that deriveHost reduces an orchestrator URL to
// its bare host, with or without an explicit port and for both DNS
// names and IP literals.
func TestDeriveHost(t *testing.T) {
	table := []struct {
		raw  string
		want string
	}{
		{"https://orch.local", "orch.local"},
		{"https://orch.local:8443", "orch.local"},
		{"http://10.0.0.5:8080", "10.0.0.5"},
	}

	for _, row := range table {
		host, err := deriveHost(row.raw)
		switch {
		case err != nil:
			t.Errorf("deriveHost(%q) error: %v", row.raw, err)
		case host != row.want:
			t.Errorf("deriveHost(%q) = %q, want %q", row.raw, host, row.want)
		}
	}
}
// TestDeriveHost_Empty checks that an empty URL is reported as an error
// instead of producing an empty host.
func TestDeriveHost_Empty(t *testing.T) {
	_, err := deriveHost("")
	if err != nil {
		return // rejected, as required
	}
	t.Errorf("deriveHost(\"\") should error")
}
// TestParseIperfJSON_ParsesEndMap confirms the full "end" object is
// returned alongside the extracted numbers, so callers can surface
// every field iperf produced and not just the three pulled out by hand.
//
// Both top-level keys of the fixture must be present in the returned
// map and must survive a JSON round trip.
func TestParseIperfJSON_ParsesEndMap(t *testing.T) {
	raw := `{
		"end": {
			"sum_sent": {"bits_per_second": 1000000, "retransmits": 0, "bytes": 125000},
			"cpu_utilization_percent": {"host_total": 12.3}
		}
	}`
	_, _, _, endMap, err := parseIperfJSON([]byte(raw))
	if err != nil {
		t.Fatalf("parseIperfJSON: %v", err)
	}
	if endMap == nil {
		t.Fatalf("endMap is nil")
	}

	wantKeys := []string{"sum_sent", "cpu_utilization_percent"}
	for _, key := range wantKeys {
		if _, ok := endMap[key]; !ok {
			t.Errorf("endMap missing key %q", key)
		}
	}

	// Round-trip through encoding/json: the map must be marshalable and
	// the same keys must come back after decoding.
	b, err := json.Marshal(endMap)
	if err != nil {
		t.Fatalf("marshal endMap: %v", err)
	}
	if len(b) == 0 {
		t.Errorf("endMap marshaled to empty")
	}
	var back map[string]any
	if err := json.Unmarshal(b, &back); err != nil {
		t.Fatalf("unmarshal endMap: %v", err)
	}
	for _, key := range wantKeys {
		if _, ok := back[key]; !ok {
			t.Errorf("round-tripped endMap missing key %q", key)
		}
	}
}
+137 -18
View File
@@ -7,12 +7,20 @@ import (
"path/filepath"
"strconv"
"strings"
"time"
)
// PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find
// PSU rails. In home-lab hosts the kernel surfaces a handful of named
// rails (12V, 5V, 3V3). No rails → auto-skip. Any rail outside a ±10%
// window of its nominal value → fail.
// PSU rails, then samples each rail every psuSampleInterval for a
// window sized by the stage timeout. During Burn a separate sidecar
// (see burn.go) runs the same probe concurrently with workload — the
// PSU stage itself catches slow post-load sag that only surfaces once
// the 12V rail starts recovering from a brownout under concurrent CPU
// + fio + iperf load.
//
// Any rail outside ±10% of its nominal value at any tick fires the
// critical threshold (server-side) and fails the stage. A host with no
// PSU rails wired to hwmon auto-skips.
func PSU(ctx context.Context, d Deps) Outcome {
rails := scanPSURails()
if len(rails) == 0 {
@@ -24,39 +32,150 @@ func PSU(ctx context.Context, d Deps) Outcome {
}
}
var samples []Sample
problems := []string{}
for _, rail := range rails {
samples = append(samples, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"})
if ok, why := voltageInRange(rail); !ok {
problems = append(problems, fmt.Sprintf("%s=%.2fV (%s)", rail.Label, rail.Volts, why))
window := resolvePSUWindow(d.StageTimeout)
deadline := time.Now().Add(window)
interval := psuSampleInterval
if window < interval*2 {
// Tiny window (tests, pathological stage_timeout) — at least two
// ticks so aggregate stats are meaningful.
interval = window / 2
if interval < time.Second {
interval = time.Second
}
}
if d.Sensor != nil {
_ = d.Sensor(ctx, samples)
// Per-label tracking: min/max across the window, count of out-of-range
// hits, last-observed value (shown in the summary).
type railStats struct {
label string
minV float64
maxV float64
lastV float64
ticks int
breaches int
reason string
}
stats := map[string]*railStats{}
tick := time.NewTicker(interval)
defer tick.Stop()
// Start with an immediate sample so a sub-45s window still produces
// at least one reading.
sampleOnce := func() {
cur := scanPSURails()
if len(cur) == 0 {
return
}
batch := make([]Sample, 0, len(cur))
for _, r := range cur {
s, ok := stats[r.Label]
if !ok {
s = &railStats{label: r.Label, minV: r.Volts, maxV: r.Volts}
stats[r.Label] = s
}
s.ticks++
s.lastV = r.Volts
if r.Volts < s.minV {
s.minV = r.Volts
}
if r.Volts > s.maxV {
s.maxV = r.Volts
}
if ok, why := voltageInRange(r); !ok {
s.breaches++
if s.reason == "" {
s.reason = why
}
}
batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
}
if d.Sensor != nil && len(batch) > 0 {
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
_ = d.Sensor(sendCtx, batch)
cancel()
}
}
sampleOnce()
sampling:
for time.Now().Before(deadline) {
select {
case <-ctx.Done():
break sampling
case <-tick.C:
sampleOnce()
}
}
// Build the outcome. Extras carry per-rail rollup so the report can
// show "12V min=11.1 max=12.05 (3/120 ticks out of range)".
type railRollup struct {
Label string `json:"label"`
MinV float64 `json:"min_v"`
MaxV float64 `json:"max_v"`
LastV float64 `json:"last_v"`
Ticks int `json:"ticks"`
Breaches int `json:"breaches"`
Reason string `json:"reason,omitempty"`
}
rollups := make([]railRollup, 0, len(stats))
problems := []string{}
for _, s := range stats {
rollups = append(rollups, railRollup{
Label: s.label, MinV: s.minV, MaxV: s.maxV, LastV: s.lastV,
Ticks: s.ticks, Breaches: s.breaches, Reason: s.reason,
})
if s.breaches > 0 {
problems = append(problems, fmt.Sprintf("%s min=%.2fV max=%.2fV (%s)", s.label, s.minV, s.maxV, s.reason))
}
}
extras := map[string]any{
"rails": rails,
"problems": problems,
"rails": rollups,
"problems": problems,
"window": window.String(),
"interval": interval.String(),
}
if len(problems) > 0 {
d.Error("PSU: out-of-range rails: " + strings.Join(problems, ", "))
d.Error("PSU: out-of-range rails: " + strings.Join(problems, "; "))
return Outcome{
Passed: false,
Message: "PSU rails out of range: " + strings.Join(problems, ", "),
Summary: fmt.Sprintf("%d rails, %d failing", len(rails), len(problems)),
Message: "PSU rails out of range: " + strings.Join(problems, "; "),
Summary: fmt.Sprintf("%d rails, %d failing", len(rollups), len(problems)),
Extras: extras,
}
}
d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal", len(rails)))
d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal across %s window", len(rollups), window))
return Outcome{
Passed: true,
Summary: fmt.Sprintf("%d rails nominal", len(rails)),
Summary: fmt.Sprintf("%d rails nominal (%s)", len(rollups), window),
Extras: extras,
}
}
// psuSampleInterval is the default tick for post-Burn rail sampling.
// Five seconds is slow enough to stay under the HTTP budget and fast
// enough to catch rail recovery transients.
//
// The PSU stage uses this as its starting interval and shortens it
// (to half the window, never below one second) when the sampling
// window is smaller than two intervals.
const psuSampleInterval = 5 * time.Second
// resolvePSUWindow maps the stage timeout to the sampling window.
// With no timeout (tests / pre-Phase-2 orchestrator), stay snapshot-
// like at 30 s. Otherwise take stage_timeout - 5 s to leave headroom
// for sensor flush + result post, capped at 10 min so a 24 h soak
// doesn't spend all day in PSU.
func resolvePSUWindow(stageTimeout time.Duration) time.Duration {
if stageTimeout <= 0 {
return 30 * time.Second
}
w := stageTimeout - 5*time.Second
if w < 30*time.Second {
w = 30 * time.Second
}
if w > 10*time.Minute {
w = 10 * time.Minute
}
return w
}
type psuRail struct {
Label string `json:"label"`
Volts float64 `json:"volts"`
+112
View File
@@ -0,0 +1,112 @@
package tests
import (
"testing"
"time"
)
// TestIsPSULabel pins the label allowlist: the common 12V / 5V / 3.3V
// spellings and VCCIN (case-insensitively) must be accepted, while CPU
// core rails, AVCC, and the empty label must be rejected so they are
// never judged as PSU rails.
func TestIsPSULabel(t *testing.T) {
	type expectation struct {
		label string
		isPSU bool
	}
	table := []expectation{
		{"+12V", true},
		{"12V", true},
		{"+5V", true},
		{"5V", true},
		{"+3.3V", true},
		{"3V3", true},
		{"VCCIN", true},
		{"vccin", true},
		{"Vcore", false},
		{"CPU VCORE", false},
		{"AVCC", false},
		{"", false},
	}

	for _, e := range table {
		got := isPSULabel(e.label)
		if got == e.isPSU {
			continue
		}
		t.Errorf("isPSULabel(%q) = %v, want %v", e.label, got, e.isPSU)
	}
}
// TestNominalFor pins the label → nominal-voltage mapping. Labels with
// no known nominal (VCCIN, anything unrecognised) must map to 0; an
// accidental non-zero nominal would manufacture out-of-range failures.
func TestNominalFor(t *testing.T) {
	type expectation struct {
		label   string
		nominal float64
	}
	table := []expectation{
		{"+12V", 12.0},
		{"12V", 12.0},
		{"+5V", 5.0},
		{"+3.3V", 3.3},
		{"3V3", 3.3},
		{"VCCIN", 0},
		{"unknown", 0},
	}

	for _, e := range table {
		got := nominalFor(e.label)
		if got == e.nominal {
			continue
		}
		t.Errorf("nominalFor(%q) = %v, want %v", e.label, got, e.nominal)
	}
}
// TestVoltageInRange pins the ±10% acceptance band around each rail's
// nominal value: for 12V the inclusive range is [10.8, 13.2]. Rails
// with no known nominal are expected to pass regardless of reading.
func TestVoltageInRange(t *testing.T) {
	table := []struct {
		rail    psuRail
		inRange bool
	}{
		{psuRail{Label: "+12V", Volts: 12.0}, true},
		{psuRail{Label: "+12V", Volts: 10.8}, true},  // lower edge of the band
		{psuRail{Label: "+12V", Volts: 13.2}, true},  // upper edge of the band
		{psuRail{Label: "+12V", Volts: 10.7}, false}, // just below
		{psuRail{Label: "+12V", Volts: 13.3}, false}, // just above
		{psuRail{Label: "+12V", Volts: 10.5}, false}, // real sag
		{psuRail{Label: "+5V", Volts: 4.6}, true},    // 8% low on 5V, still in band
		{psuRail{Label: "+5V", Volts: 4.4}, false},   // 12% low on 5V, out of band
		{psuRail{Label: "+5V", Volts: 5.0}, true},
		{psuRail{Label: "VCCIN", Volts: 1.8}, true}, // no nominal → pass
	}

	for _, row := range table {
		// The second return value (the reason string) is not under test here.
		verdict, _ := voltageInRange(row.rail)
		if verdict == row.inRange {
			continue
		}
		t.Errorf("voltageInRange(%+v) = %v, want %v", row.rail, verdict, row.inRange)
	}
}
// TestResolvePSUWindow pins the stage-timeout → sampling-window mapping:
// zero or negative timeouts fall back to 30s; timeouts under 35s clamp
// up to the 30s floor; a 1m timeout yields 55s; a 10m timeout yields
// 9m55s (timeout minus 5s headroom); anything longer is capped at 10m.
func TestResolvePSUWindow(t *testing.T) {
	table := []struct {
		name    string
		timeout time.Duration
		window  time.Duration
	}{
		{"zero → snapshot fallback", 0, 30 * time.Second},
		{"negative → snapshot fallback", -1 * time.Second, 30 * time.Second},
		{"tiny timeout clamps up to 30s floor", 10 * time.Second, 30 * time.Second},
		{"35s - 5s = 30s", 35 * time.Second, 30 * time.Second},
		{"1m quick → 55s", time.Minute, 55 * time.Second},
		{"10m deep → 9m55s", 10 * time.Minute, 9*time.Minute + 55*time.Second},
		{"15m soak → capped at 10m", 15 * time.Minute, 10 * time.Minute},
		{"1h → capped at 10m", time.Hour, 10 * time.Minute},
	}

	for _, row := range table {
		t.Run(row.name, func(t *testing.T) {
			got := resolvePSUWindow(row.timeout)
			if got == row.window {
				return
			}
			t.Errorf("resolvePSUWindow(%s) = %s, want %s", row.timeout, got, row.window)
		})
	}
}
+57
View File
@@ -59,6 +59,11 @@ func (o Outcome) MarshalSummary() (json.RawMessage, error) {
// Deps bundles what stages need without pulling in the whole agent.
// Logger methods print to stdout + forward to the orchestrator; Sensor
// drops numeric samples; OverrideFlags carries operator-set bypasses.
//
// CPUStressKnobs / StorageKnobs / NetworkKnobs are Phase-2 profile
// knobs. Zero-valued fields mean "fall back to the compile-time
// default" — that keeps the stages runnable even when the runner can't
// materialize a profile (tests, legacy orchestrator, etc).
type Deps struct {
Info func(string)
Warn func(string)
@@ -68,6 +73,58 @@ type Deps struct {
NonDestructive bool // skip wipe-probe + writes in Storage
ExpectedDisks []ExpectedDisk // serials + sizes from host.expected_spec
StageTimeout time.Duration
CPUStressKnobs CPUStressKnobs
StorageKnobs StorageKnobs
NetworkKnobs NetworkKnobs
BurnKnobs BurnKnobs
// LookPath is the unit-test seam for swapping a real external
// binary (stress-ng, fio, iperf3, dmidecode, …) for a fake. When
// nil the stage falls back to os/exec.LookPath — production and
// existing tests keep working unchanged. Tests under
// agent/tests/fakes/ populate this to redirect lookups to a built
// fake binary in a tempdir.
LookPath func(name string) (string, error)
}
// CPUStressKnobs parameterizes the CPUStress stage. Zero durations fall
// back to the package's compile-time defaults (cpuPassDuration etc).
type CPUStressKnobs struct {
	// CPUPass is presumably the length of the CPU stress pass — confirm
	// against the CPUStress stage.
	CPUPass time.Duration
	// MemPass is presumably the length of the memory pass — confirm
	// against the CPUStress stage.
	MemPass time.Duration
	// EDACPoll is presumably the interval between EDAC/MCE polls —
	// confirm against the CPUStress stage.
	EDACPoll time.Duration
}
// StorageKnobs parameterizes the Storage stage. Mode picks between
// "fio_sample" (bounded tempfile inside the device, quick profile) and
// "full_disk" (whole-device write verify, deep/soak). Empty strings
// fall back to the stage's safe defaults.
//
// Defaults are applied by resolveFioOpts. Because an empty Verify also
// falls back to "md5", verification cannot be switched off through
// these knobs.
type StorageKnobs struct {
	Mode    string        // "fio_sample" | "full_disk"; empty → "fio_sample"
	FioSize string        // fio --size value; empty → "1GiB"
	FioTime time.Duration // fio runtime bound; <= 0 → 3 minutes
	FioBS   string        // fio --bs value; empty → "4k"
	FioRW   string        // fio --rw value; empty → "randrw"
	Verify  string        // fio --verify value; empty → "md5"
}
// NetworkKnobs parameterizes the Network stage.
type NetworkKnobs struct {
	// Duration is presumably how long the network throughput run lasts —
	// confirm against the Network stage. The zero value is expected to
	// mean "use the stage default", like the other knob structs.
	Duration time.Duration
}
// BurnKnobs parameterizes the Burn super-stage. Duration is the total
// Burn window; sub-workloads run concurrently inside that window.
// CPUWorkers is "all" (runtime.NumCPU) or a numeric string. MemPct is a
// percentage of MemAvailable to allocate for the memory burner (clamped
// 0-90 by the stage). IperfParallel feeds iperf3 -P to generate sustained
// NIC load. FioOnSpare gates the storage sub-workload: true = fio runs
// against the allow-listed disks for the same window; false = skip fio.
type BurnKnobs struct {
	Duration      time.Duration // total Burn window shared by all sub-workloads
	CPUWorkers    string        // "all" or a numeric worker count, as a string
	MemPct        int           // percent of MemAvailable for the memory burner
	FioOnSpare    bool          // true → run fio on the allow-listed disks
	IperfParallel int           // value passed to iperf3 -P
}
// Sample mirrors the server's SensorSample but lives in the tests
+318 -105
View File
@@ -5,24 +5,36 @@ import (
"encoding/json"
"fmt"
"os/exec"
"strconv"
"strings"
"time"
)
// Storage is the destructive stage: badblocks (write-mode sample) + fio
// random IO, persisting IOPS + latency as measurements. Pre-gates:
// Storage is the destructive stage. Phase 2 replaced the old
// badblocks + 128 MiB fio combo with a single fio run per disk that
// writes, verifies md5 of what it wrote, and reports p99 latency.
// Modes:
//
// - fio_sample (quick): bounded 1 GiB write per disk, ~3 min runtime.
// - full_disk (deep/soak): writes the whole device, time-bounded by
// the fio_time knob (2 h deep, 6 h soak).
//
// Pre-gates kept from Phase 1:
//
// 1. Device allowlist: only act on /dev/<X> where the kernel-reported
// serial matches one of Deps.ExpectedDisks. This is the operator's
// contract for what can be written to. USB sticks and unexpected
// serial matches one of Deps.ExpectedDisks. USB sticks and unexpected
// drives are excluded.
// 2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
// signatures, partition tables, or LVM metadata → fail with
// signature, partition table, or LVM metadata → fail with
// UnexpectedData unless Deps.OverrideWipe is set.
//
// Only after those pass does the stage run `badblocks -b 4096 -c 64 -w`
// and `fio` in write mode. This matches the plan's "destructive disk
// tests are always-on, gated by layered safety."
// After fio, the stage captures a SMART diff (start snapshot taken
// before any writes; end snapshot after all writes finish) and posts
// deltas on attributes like Reallocated_Sector_Ct and Current_Pending_Sector.
// The threshold evaluator isn't seeded to gate smart_delta out of the
// box — those samples are diagnostic for the report. Fio's p99 latency
// posts as fio_p99_us so the per-stage Storage warning threshold can
// fire on a latency cliff.
func Storage(ctx context.Context, d Deps) Outcome {
if len(d.ExpectedDisks) == 0 {
d.Info("Storage: no expected disks in spec — skipping stage")
@@ -44,10 +56,10 @@ func Storage(ctx context.Context, d Deps) Outcome {
}
}
// Non-destructive runs skip wipe-probe (nothing to refuse), badblocks
// -w, and write-mode fio. Every expected disk is still asserted
// present + readable by listing /sys/block and reading SMART-accessible
// identity; the per-disk map flags the shortcut so the report is clear.
// Non-destructive runs skip wipe-probe (nothing to refuse), fio
// writes, and SMART delta (nothing changed so no delta to report).
// Every expected disk is still asserted present so a vanished drive
// still fails the stage.
if d.NonDestructive {
perDisk := map[string]any{}
for _, t := range targets {
@@ -79,9 +91,9 @@ func Storage(ctx context.Context, d Deps) Outcome {
Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)",
Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
Extras: map[string]any{
"wipe_probe": probes,
"override_hint": "click 'Override wipe & retry' in the held tile",
"dirty_devices": dirty,
"wipe_probe": probes,
"override_hint": "click 'Override wipe & retry' in the held tile",
"dirty_devices": dirty,
},
}
}
@@ -89,64 +101,80 @@ func Storage(ctx context.Context, d Deps) Outcome {
d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
}
// Per target: short badblocks write sample + fio random-read/write.
// Capture start-of-stage SMART attributes before we write anything
// so the delta is attributable to *this* stage's writes and not the
// host's prior history. Per-disk failures are tolerated (e.g. the
// device doesn't expose SMART); we just can't emit a delta for it.
startSMART := captureSMARTAttrs(ctx, targets)
fioOpts := resolveFioOpts(d.StorageKnobs)
d.Info(fmt.Sprintf("Storage: fio mode=%s size=%s runtime=%s bs=%s rw=%s verify=%s",
fioOpts.Mode, fioOpts.Size, fioOpts.Runtime, fioOpts.BS, fioOpts.RW, fioOpts.Verify))
var samples []Sample
var subs []SubStepReport
perDisk := map[string]any{}
failed := ""
for _, t := range targets {
d.Info("Storage: running badblocks write sample on " + t.Device)
bbStart := time.Now()
bb := runBadblocks(ctx, t.Device)
bbEnd := time.Now()
bbSummary, _ := json.Marshal(bb)
subs = append(subs, SubStepReport{
Name: fmt.Sprintf("badblocks %s", t.Device),
Passed: bb.OK,
StartedAt: bbStart,
CompletedAt: bbEnd,
SummaryJSON: bbSummary,
})
d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
d.Info(fmt.Sprintf("Storage: running fio %s on %s", fioOpts.Mode, t.Device))
fioStart := time.Now()
fr := runFio(ctx, t.Device)
fr := runFioVerify(ctx, t.Device, fioOpts)
fioEnd := time.Now()
fioSummary, _ := json.Marshal(fr)
subs = append(subs, SubStepReport{
Name: fmt.Sprintf("fio %s", t.Device),
Name: fmt.Sprintf("fio %s %s", fioOpts.Mode, t.Device),
Passed: fr.Error == "",
StartedAt: fioStart,
CompletedAt: fioEnd,
SummaryJSON: fioSummary,
})
perDisk[t.Device] = map[string]any{"fio": fr}
perDisk[t.Device] = map[string]any{
"badblocks": bb,
"fio": fr,
}
samples = append(samples,
Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
)
if !bb.OK {
return Outcome{
Passed: false,
Message: "badblocks found errors on " + t.Device,
Summary: "badblocks failed on " + t.Device,
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes},
SubSteps: subs,
if fr.Error == "" {
samples = append(samples,
Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
)
if fr.ReadP99Us > 0 {
samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
}
if fr.WriteP99Us > 0 {
samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
}
} else if failed == "" {
failed = t.Device
}
}
if d.Sensor != nil {
// End-of-stage SMART snapshot + diff. We capture whether or not fio
// succeeded — a mid-run failure still produces attributable deltas,
// which is often more interesting than the stage outcome itself.
endSMART := captureSMARTAttrs(ctx, targets)
deltas := diffSMARTAttrs(startSMART, endSMART)
for dev, attrs := range deltas {
for attr, delta := range attrs {
samples = append(samples, Sample{Kind: "smart_delta", Key: dev + "/" + attr, Value: delta, Unit: "count"})
}
}
if d.Sensor != nil && len(samples) > 0 {
_ = d.Sensor(ctx, samples)
}
d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets)))
if failed != "" {
return Outcome{
Passed: false,
Message: "fio verify failed on " + failed,
Summary: "fio failed on " + failed,
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
SubSteps: subs,
}
}
d.Info(fmt.Sprintf("Storage: %d disk(s) passed fio --verify", len(targets)))
return Outcome{
Passed: true,
Summary: fmt.Sprintf("%d disks passed", len(targets)),
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes},
Summary: fmt.Sprintf("%d disks passed (%s)", len(targets), fioOpts.Mode),
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
SubSteps: subs,
}
}
@@ -229,8 +257,8 @@ type wipeProbeResult struct {
// probeWipe runs blkid + wipefs -n. Any non-empty output from either is
// a "has data" signal. This is deliberately conservative: we'd rather
// halt on a bare ext4 signature than hand badblocks a disk with real
// bytes on it.
// halt on a bare ext4 signature than hand fio a disk with real bytes on
// it.
func probeWipe(ctx context.Context, device string) wipeProbeResult {
out := wipeProbeResult{Device: device}
@@ -257,84 +285,269 @@ func probeWipe(ctx context.Context, device string) wipeProbeResult {
return out
}
// ---------- badblocks ----------
// ---------- fio ----------
type badblocksResult struct {
OK bool `json:"ok"`
Elapsed string `json:"elapsed"`
Error string `json:"error,omitempty"`
OutputTail string `json:"output_tail,omitempty"`
// fioOpts resolves the probe knobs into the concrete flag values fio
// needs. Defaults match the quick profile's fio_sample shape so callers
// with zero knobs still run something bounded.
//
// The struct is also attached to the stage's extras as "fio_opts".
// Runtime is a time.Duration, which encoding/json emits as an integer
// count of nanoseconds, not a human-readable string.
type fioOpts struct {
	Mode    string        `json:"mode"`    // "fio_sample" | "full_disk"
	Size    string        `json:"size"`    // "1GiB"; only used for fio_sample
	Runtime time.Duration `json:"runtime"` // bounding time
	BS      string        `json:"bs"`      // "4k"
	RW      string        `json:"rw"`      // "randrw"
	Verify  string        `json:"verify"`  // "md5" | ""
}
func runBadblocks(ctx context.Context, device string) badblocksResult {
// -c 64 blocks per check, -w destructive write, -b 4096 block size,
// -t pattern. We only sample 256MiB (65536 × 4k) so the stage stays
// bounded. A real burn-in would run the whole disk; that belongs in
// a separate "deep" stage.
args := []string{"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536"}
start := time.Now()
runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
// resolveFioOpts turns the (possibly empty) Storage knobs into a fully
// populated fio configuration. Every unset field takes the quick-profile
// default — mode "fio_sample", size "1GiB", block size "4k", pattern
// "randrw", verify "md5", runtime 3 minutes — so a stage started
// without knobs still runs a coherent, bounded job instead of refusing.
func resolveFioOpts(k StorageKnobs) fioOpts {
	// A non-positive runtime means "not configured".
	dur := k.FioTime
	if dur <= 0 {
		dur = 3 * time.Minute
	}

	var resolved fioOpts
	resolved.Mode = firstNonEmpty(k.Mode, "fio_sample")
	resolved.Size = firstNonEmpty(k.FioSize, "1GiB")
	resolved.Runtime = dur
	resolved.BS = firstNonEmpty(k.FioBS, "4k")
	resolved.RW = firstNonEmpty(k.FioRW, "randrw")
	resolved.Verify = firstNonEmpty(k.Verify, "md5")
	return resolved
}
// firstNonEmpty returns the first argument that is not the empty
// string, or "" when every argument is empty or none are given.
func firstNonEmpty(vs ...string) string {
	for i := range vs {
		if candidate := vs[i]; len(candidate) > 0 {
			return candidate
		}
	}
	return ""
}
// fioResult is the per-device outcome of one fio run. It is stored in
// the stage's per-disk extras and marshaled as the sub-step summary.
// A non-empty Error marks the run as failed; the numeric fields are
// then left at zero.
type fioResult struct {
	Mode        string  `json:"mode"`                   // fio mode the run used
	ReadIOPS    float64 `json:"read_iops"`              // fio's read "iops"
	WriteIOPS   float64 `json:"write_iops"`             // fio's write "iops"
	ReadBWKBps  float64 `json:"read_bw_kbps"`           // fio's read "bw" field, copied as reported
	WriteBWKBps float64 `json:"write_bw_kbps"`          // fio's write "bw" field, copied as reported
	ReadP99Us   float64 `json:"read_p99_us,omitempty"`  // p99 read completion latency, µs
	WriteP99Us  float64 `json:"write_p99_us,omitempty"` // p99 write completion latency, µs
	Error       string  `json:"error,omitempty"`        // exec or JSON-parse failure; empty on success
	OutputTail  string  `json:"output_tail,omitempty"`  // last lines of fio's stdout
}
// runFioVerify invokes fio with md5-verify semantics. fio_sample mode
// caps the IO at opts.Size; full_disk drives the whole device bounded
// by runtime. Both use direct IO to bypass the page cache — we want
// real disk latency, not Linux' cheerful buffer.
func runFioVerify(ctx context.Context, device string, opts fioOpts) fioResult {
// 30s grace over runtime so fio has time to flush + close cleanly.
runCtx, cancel := context.WithTimeout(ctx, opts.Runtime+30*time.Second)
defer cancel()
cmd := exec.CommandContext(runCtx, "badblocks", args...)
out, err := cmd.CombinedOutput()
r := badblocksResult{Elapsed: time.Since(start).Round(time.Second).String(), OutputTail: tailLines(string(out), 10)}
args := []string{
"--name=verify-" + strings.TrimPrefix(device, "/dev/"),
"--filename=" + device,
"--rw=" + opts.RW,
"--bs=" + opts.BS,
"--numjobs=1",
"--direct=1",
"--group_reporting",
"--output-format=json",
"--runtime=" + strconv.Itoa(int(opts.Runtime.Seconds())),
}
if opts.Verify != "" {
args = append(args,
"--verify="+opts.Verify,
"--verify_pattern=random",
"--do_verify=1",
)
}
switch opts.Mode {
case "full_disk":
// Time-bounded across the full device — fio uses the device's
// full size when --size is omitted on a block device.
args = append(args, "--time_based=1")
default:
// fio_sample: bounded write. Setting --size= limits the IO
// volume regardless of runtime.
args = append(args, "--size="+opts.Size, "--time_based=0")
}
cmd := exec.CommandContext(runCtx, "fio", args...)
out, err := cmd.Output()
r := fioResult{Mode: opts.Mode, OutputTail: tailLines(string(out), 20)}
if err != nil {
r.Error = err.Error()
return r
}
// badblocks prints each bad block to stdout. Empty output = clean.
if strings.TrimSpace(string(out)) == "" {
r.OK = true
} else {
r.Error = "bad blocks found"
parsed, perr := parseFioJSON(out)
if perr != nil {
r.Error = "parse fio json: " + perr.Error()
return r
}
r.ReadIOPS = parsed.ReadIOPS
r.WriteIOPS = parsed.WriteIOPS
r.ReadBWKBps = parsed.ReadBWKBps
r.WriteBWKBps = parsed.WriteBWKBps
r.ReadP99Us = parsed.ReadP99Us
r.WriteP99Us = parsed.WriteP99Us
return r
}
// ---------- fio ----------
type fioResult struct {
ReadIOPS float64 `json:"read_iops"`
WriteIOPS float64 `json:"write_iops"`
ReadBWKBps float64 `json:"read_bw_kbps"`
WriteBWKBps float64 `json:"write_bw_kbps"`
Error string `json:"error,omitempty"`
}
// runFio kicks off a tiny random-rw job: 2 jobs × 64MB × 4k blocks.
// This is a health bar, not a benchmark — we want to know the disk
// services IO, not how fast it is at p99.
func runFio(ctx context.Context, device string) fioResult {
runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
defer cancel()
args := []string{
"--name=health", "--filename=" + device, "--rw=randrw",
"--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0",
"--group_reporting", "--output-format=json", "--direct=1",
}
cmd := exec.CommandContext(runCtx, "fio", args...)
out, err := cmd.Output()
if err != nil {
return fioResult{Error: err.Error()}
}
// parseFioJSON extracts the bits we care about from fio's --output-format=json.
// Latency percentiles live at .jobs[0].read.clat_ns.percentile["99.000000"];
// we convert nanoseconds to microseconds for the fio_p99_us sample.
func parseFioJSON(out []byte) (fioResult, error) {
var top struct {
Jobs []struct {
Read struct {
Read struct {
IOPS float64 `json:"iops"`
BW float64 `json:"bw"`
CLat struct {
Percentile map[string]float64 `json:"percentile"`
} `json:"clat_ns"`
} `json:"read"`
Write struct {
IOPS float64 `json:"iops"`
BW float64 `json:"bw"`
CLat struct {
Percentile map[string]float64 `json:"percentile"`
} `json:"clat_ns"`
} `json:"write"`
} `json:"jobs"`
}
if err := json.Unmarshal(out, &top); err != nil || len(top.Jobs) == 0 {
return fioResult{Error: "parse fio json: " + fmt.Sprint(err)}
if err := json.Unmarshal(out, &top); err != nil {
return fioResult{}, err
}
if len(top.Jobs) == 0 {
return fioResult{}, fmt.Errorf("no jobs in fio output")
}
j := top.Jobs[0]
return fioResult{
r := fioResult{
ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS,
ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW,
}
if p := j.Read.CLat.Percentile["99.000000"]; p > 0 {
r.ReadP99Us = p / 1000.0
}
if p := j.Write.CLat.Percentile["99.000000"]; p > 0 {
r.WriteP99Us = p / 1000.0
}
return r, nil
}
// ---------- SMART delta ----------
// smartAttrMap: device → attribute → raw counter value. ATA drives
// populate named attributes (Reallocated_Sector_Ct etc); NVMe drives
// populate a flatter nvme-specific map. We track a curated whitelist
// of wear indicators — attributes outside that whitelist are not
// stored here.
type smartAttrMap map[string]map[string]float64
// captureSMARTAttrs takes a SMART snapshot of every target: it queries
// each device via runSmartctl and keeps the whitelisted attributes that
// extractSMARTAttrs finds. A device whose query fails, or that yields
// no whitelisted attributes, is simply absent from the result — the
// failure is deliberately not surfaced, so a later diff just has no
// data for that device.
func captureSMARTAttrs(ctx context.Context, targets []diskTarget) smartAttrMap {
	snapshot := make(smartAttrMap)
	for i := range targets {
		dev := targets[i].Device

		report, err := runSmartctl(ctx, dev)
		if err != nil {
			// Best effort: skip devices that can't be queried.
			continue
		}

		if attrs := extractSMARTAttrs(report); len(attrs) != 0 {
			snapshot[dev] = attrs
		}
	}
	return snapshot
}
// smartAttributeWhitelist is the set of attributes we diff across a
// stage. They're the ones that reflect *this stage's* IO damage, not
// cumulative drive history. Adding attributes is cheap — ones a drive
// doesn't report are simply absent from the extracted map.
var smartAttributeWhitelist = map[string]bool{
	// ATA SMART attribute names (matched against the "name" field of
	// ata_smart_attributes.table[] entries)
	"Reallocated_Sector_Ct":  true,
	"Current_Pending_Sector": true,
	"Offline_Uncorrectable":  true,
	"UDMA_CRC_Error_Count":   true,
	"Reported_Uncorrect":     true,
	"Raw_Read_Error_Rate":    true,
	// NVMe log fields (flat keys at top of nvme_smart_health_information_log)
	"media_errors":        true,
	"num_err_log_entries": true,
	"percentage_used":     true,
}
// extractSMARTAttrs pulls the whitelisted attribute values out of a
// decoded smartctl JSON report. Two shapes are understood:
//
//   - ATA: ata_smart_attributes.table[], where each row looks like
//     {"name": "Reallocated_Sector_Ct", "raw": {"value": N}}.
//   - NVMe: nvme_smart_health_information_log, a flat key → number map.
//
// The result is keyed by attribute name and is never nil. Rows or
// values of an unexpected type are skipped silently, as is any name
// absent from smartAttributeWhitelist.
func extractSMARTAttrs(raw map[string]any) map[string]float64 {
	attrs := make(map[string]float64)

	// A failed type assertion leaves a nil map/slice behind; indexing or
	// ranging over nil is a no-op, so no explicit nesting is needed.
	ataSection, _ := raw["ata_smart_attributes"].(map[string]any)
	rows, _ := ataSection["table"].([]any)
	for _, entry := range rows {
		fields, isRow := entry.(map[string]any)
		if !isRow {
			continue
		}
		name, _ := fields["name"].(string)
		if !smartAttributeWhitelist[name] {
			continue
		}
		rawField, _ := fields["raw"].(map[string]any)
		if value, isNum := rawField["value"].(float64); isNum {
			attrs[name] = value
		}
	}

	nvmeLog, _ := raw["nvme_smart_health_information_log"].(map[string]any)
	for key, value := range nvmeLog {
		if num, isNum := value.(float64); isNum && smartAttributeWhitelist[key] {
			attrs[key] = num
		}
	}
	return attrs
}
// diffSMARTAttrs computes end − start for every (device, attribute)
// pair that appears in both snapshots. A device or attribute missing
// from either side is dropped, since a zero-to-present change cannot be
// attributed safely. Negative deltas are kept so that a drive resetting
// a counter stays visible. Devices left with no common attributes are
// omitted from the result.
func diffSMARTAttrs(start, end smartAttrMap) map[string]map[string]float64 {
	deltas := make(map[string]map[string]float64)
	for device, after := range end {
		before, seen := start[device]
		if !seen {
			continue
		}
		perAttr := make(map[string]float64, len(after))
		for name, final := range after {
			if initial, had := before[name]; had {
				perAttr[name] = final - initial
			}
		}
		if len(perAttr) == 0 {
			continue
		}
		deltas[device] = perAttr
	}
	return deltas
}
+218
View File
@@ -0,0 +1,218 @@
package tests
import (
"encoding/json"
"testing"
"time"
)
// TestParseFioJSON_ATAReadWrite confirms we pull IOPS, BW, and p99
// latency from both the read and write sides of a fio job. P99 is read
// from clat_ns and converted ns → us (the unit we emit to the threshold
// evaluator). Every numeric field the fixture sets is asserted,
// including write bandwidth, so a read/write mix-up in the parser cannot
// slip through.
func TestParseFioJSON_ATAReadWrite(t *testing.T) {
	raw := `{
"jobs": [{
"read": {"iops": 1234.5, "bw": 5000, "clat_ns": {"percentile": {"99.000000": 250000}}},
"write": {"iops": 432.1, "bw": 2000, "clat_ns": {"percentile": {"99.000000": 500000}}}
}]
}`
	r, err := parseFioJSON([]byte(raw))
	if err != nil {
		t.Fatalf("parseFioJSON: %v", err)
	}
	if r.ReadIOPS != 1234.5 {
		t.Errorf("ReadIOPS = %v, want 1234.5", r.ReadIOPS)
	}
	if r.WriteIOPS != 432.1 {
		t.Errorf("WriteIOPS = %v, want 432.1", r.WriteIOPS)
	}
	if r.ReadBWKBps != 5000 {
		t.Errorf("ReadBWKBps = %v, want 5000", r.ReadBWKBps)
	}
	// Write bandwidth comes from the write side's "bw", not the read side's.
	if r.WriteBWKBps != 2000 {
		t.Errorf("WriteBWKBps = %v, want 2000", r.WriteBWKBps)
	}
	// 250000 ns → 250 us
	if r.ReadP99Us != 250 {
		t.Errorf("ReadP99Us = %v, want 250", r.ReadP99Us)
	}
	// 500000 ns → 500 us
	if r.WriteP99Us != 500 {
		t.Errorf("WriteP99Us = %v, want 500", r.WriteP99Us)
	}
}
// TestParseFioJSON_ReadOnlyJob mirrors a randread job: only the read
// side carries a p99 percentile, so the write-side p99 must stay zero
// (i.e. nothing to emit as a sample) while the read side is still
// converted ns → us.
func TestParseFioJSON_ReadOnlyJob(t *testing.T) {
	const fixture = `{
"jobs": [{
"read": {"iops": 1000, "bw": 4000, "clat_ns": {"percentile": {"99.000000": 100000}}},
"write": {"iops": 0, "bw": 0}
}]
}`
	res, parseErr := parseFioJSON([]byte(fixture))
	if parseErr != nil {
		t.Fatalf("parseFioJSON: %v", parseErr)
	}
	if got := res.WriteP99Us; got != 0 {
		t.Errorf("WriteP99Us = %v on read-only job, want 0", got)
	}
	// 100000 ns → 100 us
	if got := res.ReadP99Us; got != 100 {
		t.Errorf("ReadP99Us = %v, want 100", got)
	}
}
// TestParseFioJSON_NoJobs requires an error for an empty jobs array
// instead of a silently zeroed result: no jobs means fio didn't run
// anything.
func TestParseFioJSON_NoJobs(t *testing.T) {
	const fixture = `{"jobs": []}`
	_, parseErr := parseFioJSON([]byte(fixture))
	if parseErr != nil {
		return
	}
	t.Errorf("expected error on empty jobs array")
}
// TestExtractSMARTAttrs_ATA feeds an ata_smart_attributes.table fixture
// through extractSMARTAttrs and checks that whitelisted rows come back
// with their raw values while a non-whitelisted row (Spin_Retry_Count)
// is dropped silently.
func TestExtractSMARTAttrs_ATA(t *testing.T) {
	const fixture = `{
"ata_smart_attributes": {
"table": [
{"name": "Reallocated_Sector_Ct", "raw": {"value": 7}},
{"name": "Current_Pending_Sector", "raw": {"value": 3}},
{"name": "Spin_Retry_Count", "raw": {"value": 99}}
]
}
}`
	var report map[string]any
	if err := json.Unmarshal([]byte(fixture), &report); err != nil {
		t.Fatalf("unmarshal fixture: %v", err)
	}

	attrs := extractSMARTAttrs(report)
	if got := attrs["Reallocated_Sector_Ct"]; got != 7 {
		t.Errorf("Reallocated_Sector_Ct = %v, want 7", got)
	}
	if got := attrs["Current_Pending_Sector"]; got != 3 {
		t.Errorf("Current_Pending_Sector = %v, want 3", got)
	}
	if _, present := attrs["Spin_Retry_Count"]; present {
		t.Errorf("Spin_Retry_Count should not appear (not in whitelist)")
	}
}
// TestExtractSMARTAttrs_NVMe covers the NVMe shape: whitelisted keys of
// the flat nvme_smart_health_information_log object are extracted, and
// a non-whitelisted key (temperature) is left out.
func TestExtractSMARTAttrs_NVMe(t *testing.T) {
	const fixture = `{
"nvme_smart_health_information_log": {
"media_errors": 2,
"num_err_log_entries": 15,
"percentage_used": 7,
"temperature": 42
}
}`
	var report map[string]any
	if err := json.Unmarshal([]byte(fixture), &report); err != nil {
		t.Fatalf("unmarshal fixture: %v", err)
	}

	attrs := extractSMARTAttrs(report)
	if got := attrs["media_errors"]; got != 2 {
		t.Errorf("media_errors = %v, want 2", got)
	}
	if got := attrs["num_err_log_entries"]; got != 15 {
		t.Errorf("num_err_log_entries = %v, want 15", got)
	}
	if got := attrs["percentage_used"]; got != 7 {
		t.Errorf("percentage_used = %v, want 7", got)
	}
	if _, present := attrs["temperature"]; present {
		t.Errorf("temperature should not appear (not in whitelist)")
	}
}
// TestDiffSMARTAttrs checks the delta is end − start per (device, attr).
// Only attributes present in both snapshots yield a delta; one that is
// missing at the start (UDMA_CRC_Error_Count here) drops out instead of
// being reported against an assumed zero baseline.
func TestDiffSMARTAttrs(t *testing.T) {
	const dev = "/dev/sda"
	before := smartAttrMap{
		dev: {"Reallocated_Sector_Ct": 5, "Current_Pending_Sector": 0},
	}
	after := smartAttrMap{
		dev: {"Reallocated_Sector_Ct": 8, "Current_Pending_Sector": 2, "UDMA_CRC_Error_Count": 1},
	}

	deltas := diffSMARTAttrs(before, after)[dev]
	if got := deltas["Reallocated_Sector_Ct"]; got != 3 {
		t.Errorf("Reallocated_Sector_Ct delta = %v, want 3", got)
	}
	if got := deltas["Current_Pending_Sector"]; got != 2 {
		t.Errorf("Current_Pending_Sector delta = %v, want 2", got)
	}
	if _, present := deltas["UDMA_CRC_Error_Count"]; present {
		t.Errorf("UDMA_CRC_Error_Count should not appear (missing at start)")
	}
}
// TestDiffSMARTAttrs_DeviceNewAtEnd covers a device that exists only in
// the end snapshot (drive hot-plugged mid-run, or SMART read succeeded
// only at end). With no start baseline to subtract from, the device
// must be absent from the diff.
func TestDiffSMARTAttrs_DeviceNewAtEnd(t *testing.T) {
	after := smartAttrMap{
		"/dev/sda": {"Reallocated_Sector_Ct": 10},
	}
	deltas := diffSMARTAttrs(smartAttrMap{}, after)
	if _, present := deltas["/dev/sda"]; present {
		t.Errorf("/dev/sda should drop from diff when absent at start")
	}
}
// TestResolveFioOpts_Defaults pins what zero-valued knobs resolve to:
// the fio_sample shape (1GiB, 3 minutes, 4k randrw, md5 verify). Any
// stage that arrives without per-profile knobs (legacy claim response,
// test harness) therefore still gets coherent, bounded defaults — we
// won't accidentally fall into unbounded writes.
func TestResolveFioOpts_Defaults(t *testing.T) {
	opts := resolveFioOpts(StorageKnobs{})

	if got := opts.Mode; got != "fio_sample" {
		t.Errorf("Mode = %q, want fio_sample", got)
	}
	if got := opts.Size; got != "1GiB" {
		t.Errorf("Size = %q, want 1GiB", got)
	}
	if got := opts.Runtime; got != 3*time.Minute {
		t.Errorf("Runtime = %v, want 3m", got)
	}
	if got := opts.BS; got != "4k" {
		t.Errorf("BS = %q, want 4k", got)
	}
	if got := opts.RW; got != "randrw" {
		t.Errorf("RW = %q, want randrw", got)
	}
	if got := opts.Verify; got != "md5" {
		t.Errorf("Verify = %q, want md5", got)
	}
}
// TestResolveFioOpts_FullDiskOverride confirms the deep/soak shape
// round-trips. FioTime as 2h overrides the 3-minute default.
func TestResolveFioOpts_FullDiskOverride(t *testing.T) {
k := StorageKnobs{
Mode: "full_disk",
FioTime: 2 * time.Hour,
FioBS: "64k",
FioRW: "write",
}
o := resolveFioOpts(k)
if o.Mode != "full_disk" {
t.Errorf("Mode = %q, want full_disk", o.Mode)
}
if o.Runtime != 2*time.Hour {
t.Errorf("Runtime = %v, want 2h", o.Runtime)
}
if o.BS != "64k" {
t.Errorf("BS = %q, want 64k", o.BS)
}
if o.RW != "write" {
t.Errorf("RW = %q, want write", o.RW)
}
// Verify should fall back to md5 default since knob was empty.
if o.Verify != "md5" {
t.Errorf("Verify = %q, want md5 (default)", o.Verify)
}
}