deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,486 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"vetting/agent/probes"
|
||||
)
|
||||
|
||||
// BurnConfig carries the network target the agent hands to Burn: where
// the orchestrator's iperf3 server can be reached. Window length and
// concurrency knobs are deliberately absent — they are read from
// Deps.BurnKnobs so they scale with the run's profile.
type BurnConfig struct {
	// OrchestratorURL is the orchestrator base URL from which the iperf3
	// target host is derived. When empty, Burn skips the iperf workload.
	OrchestratorURL string
	// IperfPort is the iperf3 server port; 0 selects the default, 5201.
	IperfPort int
}
|
||||
|
||||
// Burn is the concurrent soak stage. Unlike CPUStress (serial
|
||||
// CPU→memory) or Storage (serial per disk) it fans out every workload
|
||||
// at once: stress-ng hammers CPU + memory, fio drives the allow-listed
|
||||
// disks, iperf3 pushes sustained NIC traffic, and two sidecars poll
|
||||
// EDAC + PSU rails for the duration of the window.
|
||||
//
|
||||
// This is where PSU rails actually matter: 12V sag under simultaneous
|
||||
// CPU + disk + NIC load is exactly the failure a thermal/power
|
||||
// regression produces, and it's invisible to any stage that loads one
|
||||
// subsystem at a time. The PSU stage that follows Burn in the pipeline
|
||||
// re-samples rails post-window to confirm they settle back to nominal.
|
||||
//
|
||||
// Burn stays inside the stage framework — it doesn't spawn a parallel
|
||||
// stage runner. The goroutine fan-out is local; the stage converges
|
||||
// before returning an Outcome so every invariant the orchestrator
|
||||
// relies on (serial stage order, single in-flight stage per run) still
|
||||
// holds.
|
||||
func Burn(ctx context.Context, d Deps, cfg BurnConfig) Outcome {
|
||||
duration := d.BurnKnobs.Duration
|
||||
if duration <= 0 {
|
||||
duration = 2 * time.Minute
|
||||
}
|
||||
cpuWorkers := resolveCPUWorkers(d.BurnKnobs.CPUWorkers)
|
||||
memPct := clampMemPct(d.BurnKnobs.MemPct)
|
||||
iperfParallel := d.BurnKnobs.IperfParallel
|
||||
if iperfParallel <= 0 {
|
||||
iperfParallel = 2
|
||||
}
|
||||
d.Info(fmt.Sprintf("Burn: window=%s cpu_workers=%d mem_pct=%d iperf_parallel=%d fio_on_spare=%v",
|
||||
duration, cpuWorkers, memPct, iperfParallel, d.BurnKnobs.FioOnSpare))
|
||||
|
||||
// Sidecars run for the lifetime of the window and are cancelled on
|
||||
// return so the main stage converges cleanly. EDAC catches DIMM
|
||||
// bit-flips that appear only under concurrent load; PSU catches
|
||||
// rail sag that only appears when CPU + disk + NIC pull current
|
||||
// simultaneously.
|
||||
sideCtx, sideCancel := context.WithCancel(ctx)
|
||||
defer sideCancel()
|
||||
var sideWG sync.WaitGroup
|
||||
sideWG.Add(2)
|
||||
go runEDACSidecar(sideCtx, &sideWG, d)
|
||||
go runPSUSidecar(sideCtx, &sideWG, d)
|
||||
|
||||
runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
results := make(chan burnSubResult, 4)
|
||||
var wg sync.WaitGroup
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
results <- runBurnCPU(runCtx, d, duration, cpuWorkers)
|
||||
}()
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
results <- runBurnMemory(runCtx, d, duration, memPct)
|
||||
}()
|
||||
|
||||
// fio runs only when explicitly enabled *and* there are allow-listed
|
||||
// disks *and* the run wasn't marked non-destructive. Any of those
|
||||
// missing records a Skipped sub-step so the operator sees why.
|
||||
if d.BurnKnobs.FioOnSpare && len(d.ExpectedDisks) > 0 && !d.NonDestructive {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
results <- runBurnFio(runCtx, d, duration)
|
||||
}()
|
||||
} else {
|
||||
reason := burnFioSkipReason(d)
|
||||
results <- burnSubResult{Name: "Burn fio", Skipped: true, Reason: reason}
|
||||
}
|
||||
|
||||
// iperf requires an orchestrator host. Lab hosts run with the
|
||||
// bundled iperf3 server; without a base URL we can't derive a
|
||||
// target so we skip rather than fail the stage.
|
||||
if cfg.OrchestratorURL != "" {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
results <- runBurnIperf(runCtx, d, duration, cfg.OrchestratorURL, cfg.IperfPort, iperfParallel)
|
||||
}()
|
||||
} else {
|
||||
results <- burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "no orchestrator host"}
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
sideCancel()
|
||||
sideWG.Wait()
|
||||
close(results)
|
||||
|
||||
subs, samples, failures := collectBurnResults(results)
|
||||
if d.Sensor != nil && len(samples) > 0 {
|
||||
_ = d.Sensor(ctx, samples)
|
||||
}
|
||||
|
||||
extras := map[string]any{
|
||||
"duration": duration.String(),
|
||||
"cpu_workers": cpuWorkers,
|
||||
"mem_pct": memPct,
|
||||
"iperf_parallel": iperfParallel,
|
||||
"fio_on_spare": d.BurnKnobs.FioOnSpare,
|
||||
}
|
||||
if len(failures) > 0 {
|
||||
msg := "Burn workloads failed: " + strings.Join(failures, ", ")
|
||||
d.Error(msg)
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: msg,
|
||||
Summary: fmt.Sprintf("Burn failed (%d of %d workloads)", len(failures), len(subs)),
|
||||
Extras: extras,
|
||||
SubSteps: subs,
|
||||
}
|
||||
}
|
||||
d.Info(fmt.Sprintf("Burn: %s window passed; %d workloads converged", duration, len(subs)))
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: fmt.Sprintf("Burn %s passed (%d workloads)", duration, len(subs)),
|
||||
Extras: extras,
|
||||
SubSteps: subs,
|
||||
}
|
||||
}
|
||||
|
||||
// burnSubResult is the per-workload return type used by the fan-out
|
||||
// goroutines. Sample slice is merged into the stage's final /sensor
|
||||
// batch; SubStep becomes a row on the /result sub-steps list.
|
||||
type burnSubResult struct {
|
||||
Name string
|
||||
Passed bool
|
||||
Skipped bool
|
||||
Reason string // why a workload was skipped
|
||||
Err string // why a workload failed
|
||||
Samples []Sample
|
||||
SubStep SubStepReport
|
||||
}
|
||||
|
||||
func collectBurnResults(ch <-chan burnSubResult) ([]SubStepReport, []Sample, []string) {
|
||||
var subs []SubStepReport
|
||||
var samples []Sample
|
||||
var failures []string
|
||||
for r := range ch {
|
||||
// Non-skipped goroutines populate SubStep directly. Skipped slots
|
||||
// get a synthesized row here so the /result shape stays stable.
|
||||
if r.Skipped {
|
||||
stamp := time.Now().UTC()
|
||||
subs = append(subs, SubStepReport{
|
||||
Name: r.Name,
|
||||
Skipped: true,
|
||||
StartedAt: stamp,
|
||||
CompletedAt: stamp,
|
||||
SummaryJSON: mustJSON(map[string]any{"skipped": true, "reason": r.Reason}),
|
||||
})
|
||||
continue
|
||||
}
|
||||
subs = append(subs, r.SubStep)
|
||||
samples = append(samples, r.Samples...)
|
||||
if !r.Passed {
|
||||
reason := r.Err
|
||||
if reason == "" {
|
||||
reason = "unknown"
|
||||
}
|
||||
failures = append(failures, r.Name+": "+reason)
|
||||
}
|
||||
}
|
||||
return subs, samples, failures
|
||||
}
|
||||
|
||||
func burnFioSkipReason(d Deps) string {
|
||||
if !d.BurnKnobs.FioOnSpare {
|
||||
return "fio_on_spare knob disabled"
|
||||
}
|
||||
if d.NonDestructive {
|
||||
return "non-destructive run"
|
||||
}
|
||||
if len(d.ExpectedDisks) == 0 {
|
||||
return "no allowlisted disks"
|
||||
}
|
||||
return "disabled"
|
||||
}
|
||||
|
||||
// runBurnCPU hammers all CPU cores with stress-ng for the window. Same
|
||||
// shape as CPUStress pass 1 but with shorter label so the sub-step row
|
||||
// doesn't collide with the earlier stage's "CPU pass".
|
||||
func runBurnCPU(ctx context.Context, d Deps, duration time.Duration, workers int) burnSubResult {
|
||||
if _, err := exec.LookPath("stress-ng"); err != nil {
|
||||
return burnSubResult{Name: "Burn CPU", Err: "stress-ng missing"}
|
||||
}
|
||||
args := []string{
|
||||
"--cpu", strconv.Itoa(workers),
|
||||
"--cpu-method", "all",
|
||||
"--timeout", durationSeconds(duration),
|
||||
"--metrics-brief",
|
||||
"--verify",
|
||||
}
|
||||
d.Info(fmt.Sprintf("Burn: stress-ng %s", strings.Join(args, " ")))
|
||||
pass := runStressPass(ctx, d, "Burn CPU", duration, args)
|
||||
return burnSubResult{
|
||||
Name: "Burn CPU",
|
||||
Passed: pass.Passed,
|
||||
Err: pass.Err,
|
||||
SubStep: subStepFromPass("Burn CPU", pass),
|
||||
}
|
||||
}
|
||||
|
||||
// runBurnMemory drives a single --vm worker sized at memPct of
|
||||
// MemAvailable, capped so the kernel + agent + other workloads still
|
||||
// have headroom. Clamping happens here rather than in resolveBurnKnobs
|
||||
// so the cap is computed against real live memory each run.
|
||||
func runBurnMemory(ctx context.Context, d Deps, duration time.Duration, memPct int) burnSubResult {
|
||||
if _, err := exec.LookPath("stress-ng"); err != nil {
|
||||
return burnSubResult{Name: "Burn memory", Err: "stress-ng missing"}
|
||||
}
|
||||
avail, err := memAvailableBytes()
|
||||
if err != nil {
|
||||
return burnSubResult{Name: "Burn memory", Err: "read MemAvailable: " + err.Error()}
|
||||
}
|
||||
// Budget = avail * memPct / 100, then subtract the standard headroom.
|
||||
// If the result is below the memory-pass floor we record a skipped
|
||||
// row instead — the window is too tight to be meaningful on this box.
|
||||
budget := int64(float64(avail) * float64(memPct) / 100.0)
|
||||
cap := budget - memHeadroomBytes
|
||||
if cap < memFloorBytes {
|
||||
return burnSubResult{
|
||||
Name: "Burn memory",
|
||||
Skipped: true,
|
||||
Reason: fmt.Sprintf("budget %s below floor %s after headroom", humanBytes(budget), humanBytes(memFloorBytes)),
|
||||
}
|
||||
}
|
||||
args := []string{
|
||||
"--vm", "1",
|
||||
"--vm-bytes", strconv.FormatInt(cap, 10),
|
||||
"--vm-keep",
|
||||
"--timeout", durationSeconds(duration),
|
||||
"--metrics-brief",
|
||||
"--verify",
|
||||
}
|
||||
d.Info(fmt.Sprintf("Burn: stress-ng memory cap=%s (%d%% of MemAvailable)", humanBytes(cap), memPct))
|
||||
pass := runStressPass(ctx, d, "Burn memory", duration, args)
|
||||
return burnSubResult{
|
||||
Name: "Burn memory",
|
||||
Passed: pass.Passed,
|
||||
Err: pass.Err,
|
||||
SubStep: subStepFromPass(fmt.Sprintf("Burn memory (cap %s)", humanBytes(cap)), pass),
|
||||
}
|
||||
}
|
||||
|
||||
// runBurnFio runs fio_sample against the first allow-listed disk for
|
||||
// the window. Reuses runFioVerify + parseFioJSON so the samples line
|
||||
// up with what Storage emits. Using fio_sample (bounded by --size)
|
||||
// keeps Burn's write volume predictable regardless of profile.
|
||||
func runBurnFio(ctx context.Context, d Deps, duration time.Duration) burnSubResult {
|
||||
if _, err := exec.LookPath("fio"); err != nil {
|
||||
return burnSubResult{Name: "Burn fio", Err: "fio missing"}
|
||||
}
|
||||
targets := resolveTargets(d.ExpectedDisks)
|
||||
if len(targets) == 0 {
|
||||
return burnSubResult{Name: "Burn fio", Skipped: true, Reason: "no allow-listed disks present"}
|
||||
}
|
||||
t := targets[0]
|
||||
opts := fioOpts{
|
||||
Mode: "fio_sample",
|
||||
Size: "512MiB",
|
||||
Runtime: duration,
|
||||
BS: "4k",
|
||||
RW: "randrw",
|
||||
Verify: "md5",
|
||||
}
|
||||
start := time.Now()
|
||||
d.Info(fmt.Sprintf("Burn: fio %s on %s (%s window)", opts.Mode, t.Device, duration))
|
||||
fr := runFioVerify(ctx, t.Device, opts)
|
||||
end := time.Now()
|
||||
|
||||
sub := SubStepReport{
|
||||
Name: "Burn fio " + t.Device,
|
||||
Passed: fr.Error == "",
|
||||
StartedAt: start,
|
||||
CompletedAt: end,
|
||||
SummaryJSON: mustJSON(fr),
|
||||
}
|
||||
out := burnSubResult{Name: "Burn fio", SubStep: sub, Passed: fr.Error == "", Err: fr.Error}
|
||||
if fr.Error == "" {
|
||||
out.Samples = append(out.Samples,
|
||||
Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
|
||||
Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
|
||||
)
|
||||
if fr.ReadP99Us > 0 {
|
||||
out.Samples = append(out.Samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
|
||||
}
|
||||
if fr.WriteP99Us > 0 {
|
||||
out.Samples = append(out.Samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// runBurnIperf drives iperf3 -P N for the window. Reuses parseIperfJSON
|
||||
// so the same (mbps, retrans, bytesSent) extraction the Network stage
|
||||
// uses applies here too. Samples emitted as Burn-scoped keys so the
|
||||
// dashboard can tell at-a-glance which window they came from.
|
||||
func runBurnIperf(ctx context.Context, d Deps, duration time.Duration, orchestratorURL string, port, parallel int) burnSubResult {
|
||||
if _, err := exec.LookPath("iperf3"); err != nil {
|
||||
return burnSubResult{Name: "Burn iperf", Err: "iperf3 missing"}
|
||||
}
|
||||
host, err := deriveHost(orchestratorURL)
|
||||
if err != nil || host == "" {
|
||||
return burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "can't derive orchestrator host"}
|
||||
}
|
||||
if port == 0 {
|
||||
port = 5201
|
||||
}
|
||||
if parallel < 1 {
|
||||
parallel = 1
|
||||
}
|
||||
args := []string{
|
||||
"-c", host,
|
||||
"-p", strconv.Itoa(port),
|
||||
"-t", strconv.Itoa(int(duration.Seconds())),
|
||||
"-P", strconv.Itoa(parallel),
|
||||
"-J",
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
|
||||
defer cancel()
|
||||
start := time.Now()
|
||||
out, err := exec.CommandContext(runCtx, "iperf3", args...).Output()
|
||||
end := time.Now()
|
||||
if err != nil {
|
||||
return burnSubResult{
|
||||
Name: "Burn iperf",
|
||||
Err: "iperf3 client error: " + err.Error(),
|
||||
SubStep: SubStepReport{
|
||||
Name: "Burn iperf",
|
||||
StartedAt: start,
|
||||
CompletedAt: end,
|
||||
SummaryJSON: mustJSON(map[string]any{"error": err.Error(), "stderr_tail": tailLines(string(out), 20)}),
|
||||
},
|
||||
}
|
||||
}
|
||||
mbps, retrans, bytesSent, _, perr := parseIperfJSON(out)
|
||||
if perr != nil {
|
||||
return burnSubResult{
|
||||
Name: "Burn iperf",
|
||||
Err: "parse iperf3 json: " + perr.Error(),
|
||||
SubStep: SubStepReport{
|
||||
Name: "Burn iperf",
|
||||
StartedAt: start,
|
||||
CompletedAt: end,
|
||||
SummaryJSON: mustJSON(map[string]any{"error": perr.Error()}),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
samples := []Sample{{Kind: "iperf", Key: "burn/throughput_mbps", Value: mbps, Unit: "Mbps"}}
|
||||
if bytesSent > 0 {
|
||||
packets := float64(bytesSent) / 1460.0
|
||||
if packets > 0 {
|
||||
samples = append(samples, Sample{
|
||||
Kind: "nic_retrans", Key: "burn/rate",
|
||||
Value: float64(retrans) / packets, Unit: "rate",
|
||||
})
|
||||
}
|
||||
}
|
||||
passed := mbps > 0
|
||||
errMsg := ""
|
||||
if !passed {
|
||||
errMsg = "zero throughput from iperf3"
|
||||
}
|
||||
return burnSubResult{
|
||||
Name: "Burn iperf",
|
||||
Passed: passed,
|
||||
Err: errMsg,
|
||||
Samples: samples,
|
||||
SubStep: SubStepReport{
|
||||
Name: fmt.Sprintf("Burn iperf (P=%d)", parallel),
|
||||
Passed: passed,
|
||||
StartedAt: start,
|
||||
CompletedAt: end,
|
||||
SummaryJSON: mustJSON(map[string]any{
|
||||
"throughput_mbps": mbps,
|
||||
"retransmits": retrans,
|
||||
"bytes_sent": bytesSent,
|
||||
"parallel": parallel,
|
||||
}),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// runPSUSidecar polls /sys/class/hwmon rails every 5s for the duration
|
||||
// of the Burn window, piping each read into the stage's sensor channel
|
||||
// as a psu_volt sample. The threshold evaluator then applies the same
|
||||
// within_pct gates used by the PSU stage — a 12V rail sagging to 10.5V
|
||||
// under load will fire the critical threshold mid-Burn and the run
|
||||
// will flip into FailedHolding without waiting for the post-Burn PSU
|
||||
// stage to catch it.
|
||||
func runPSUSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
|
||||
defer wg.Done()
|
||||
if d.Sensor == nil {
|
||||
return
|
||||
}
|
||||
t := time.NewTicker(5 * time.Second)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-t.C:
|
||||
rails := scanPSURails()
|
||||
if len(rails) == 0 {
|
||||
continue
|
||||
}
|
||||
batch := make([]Sample, 0, len(rails))
|
||||
for _, r := range rails {
|
||||
batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
|
||||
}
|
||||
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
if err := d.Sensor(sendCtx, batch); err != nil {
|
||||
d.Warn("Burn: PSU sample post: " + err.Error())
|
||||
}
|
||||
cancel()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// resolveCPUWorkers turns the cpu_workers knob into a worker count. A
// strictly positive integer is used verbatim. Everything else — the
// empty string, "all" in any letter case, zero, a negative number, or
// text that does not parse — resolves to runtime.NumCPU(), so the result
// is never zero.
func resolveCPUWorkers(raw string) int {
	if raw != "" && !strings.EqualFold(raw, "all") {
		n, err := strconv.Atoi(raw)
		if err == nil && n > 0 {
			return n
		}
	}
	return runtime.NumCPU()
}
|
||||
|
||||
// clampMemPct keeps the mem_pct knob inside a sane band. Zero or a
// negative value means "not set" and yields the default of 50. Any
// other value is clamped to [10, 90]: above 90 the memory burner would
// crowd the kernel, the agent and the fio/iperf3 workers; below 10 it
// would be too small to be useful.
func clampMemPct(pct int) int {
	const (
		defaultPct = 50
		minPct     = 10
		maxPct     = 90
	)
	switch {
	case pct <= 0:
		return defaultPct
	case pct < minPct:
		return minPct
	case pct > maxPct:
		return maxPct
	}
	return pct
}
|
||||
|
||||
// mustJSON marshals v for embedding in a report row and never fails: if
// v cannot be marshalled the result is a {"marshal_error": "..."} object
// describing why.
//
// The fallback is itself produced by json.Marshal rather than by string
// concatenation, so an error message containing quotes, backslashes or
// control characters is escaped and the result is always valid JSON.
func mustJSON(v any) json.RawMessage {
	b, err := json.Marshal(v)
	if err == nil {
		return b
	}
	fallback, ferr := json.Marshal(map[string]string{"marshal_error": err.Error()})
	if ferr != nil {
		// Marshalling a map[string]string is not expected to fail; this
		// is a last resort so the function still returns valid JSON.
		return json.RawMessage(`{"marshal_error":"unknown"}`)
	}
	return fallback
}
|
||||
|
||||
// Keep the probes import referenced from this file. Nothing above calls
// into probes directly (the EDAC sidecar that does is defined alongside
// CPUStress), and Go rejects an import that is never used, so this
// blank assignment anchors it.
|
||||
var _ = probes.EDAC
|
||||
@@ -0,0 +1,58 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestResolveCPUWorkers covers the three parse branches: empty/"all"
|
||||
// falls back to NumCPU, a valid integer is used verbatim, and garbage
|
||||
// also falls back to NumCPU rather than returning zero. Zero workers
|
||||
// would make stress-ng a no-op and silently defeat Burn's CPU load.
|
||||
func TestResolveCPUWorkers(t *testing.T) {
|
||||
np := runtime.NumCPU()
|
||||
cases := []struct {
|
||||
name string
|
||||
in string
|
||||
want int
|
||||
}{
|
||||
{"empty defaults to NumCPU", "", np},
|
||||
{"all defaults to NumCPU", "all", np},
|
||||
{"ALL is case-insensitive", "ALL", np},
|
||||
{"explicit integer", "3", 3},
|
||||
{"negative falls back", "-1", np},
|
||||
{"zero falls back", "0", np},
|
||||
{"garbage falls back", "lots", np},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := resolveCPUWorkers(tc.in); got != tc.want {
|
||||
t.Errorf("resolveCPUWorkers(%q) = %d, want %d", tc.in, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestClampMemPct ensures the mem_pct knob never drives the memory
|
||||
// burner into OOM territory (upper clamp) or into uselessness (lower
|
||||
// clamp). Zero is treated as "use default 50" so a missing knob in an
|
||||
// older orchestrator's claim response doesn't collapse the workload.
|
||||
func TestClampMemPct(t *testing.T) {
|
||||
cases := []struct {
|
||||
in, want int
|
||||
}{
|
||||
{0, 50}, // default
|
||||
{-10, 50}, // negative treated as default
|
||||
{5, 10}, // below lower band → clamp up
|
||||
{10, 10},
|
||||
{50, 50},
|
||||
{90, 90},
|
||||
{95, 90}, // above upper band → clamp down
|
||||
{1000, 90},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
if got := clampMemPct(tc.in); got != tc.want {
|
||||
t.Errorf("clampMemPct(%d) = %d, want %d", tc.in, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -11,7 +11,10 @@ import (
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"vetting/agent/probes"
|
||||
)
|
||||
|
||||
// CPUStress runs stress-ng as two serial passes. The previous shape
|
||||
@@ -55,11 +58,28 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
|
||||
extras := map[string]any{"cores": cores}
|
||||
var subs []SubStepReport
|
||||
|
||||
// EDAC sidecar runs for the lifetime of the stage; cancelled on
|
||||
// return. It polls /sys/devices/system/edac/mc/*/{ce,ue}_count and
|
||||
// posts the current counters so the server-side threshold evaluator
|
||||
// can gate edac_ue > 0 → fail the run. Zero-valued poll falls back
|
||||
// to 10s — the same cadence rasdaemon uses by default.
|
||||
sideCtx, sideCancel := context.WithCancel(ctx)
|
||||
defer sideCancel()
|
||||
var sideWG sync.WaitGroup
|
||||
sideWG.Add(1)
|
||||
go runEDACSidecar(sideCtx, &sideWG, d)
|
||||
|
||||
// Per-profile durations come from Deps; zero values (missing knobs
|
||||
// or legacy orchestrator) fall back to the package default so the
|
||||
// stage always has a defined budget.
|
||||
cpuDur := nonzeroDur(d.CPUStressKnobs.CPUPass, cpuPassDuration)
|
||||
memDur := nonzeroDur(d.CPUStressKnobs.MemPass, memPassDuration)
|
||||
|
||||
// Pass 1: CPU
|
||||
cpu := runStressPass(ctx, d, "CPU", cpuPassDuration, []string{
|
||||
cpu := runStressPass(ctx, d, "CPU", cpuDur, []string{
|
||||
"--cpu", strconv.Itoa(cores),
|
||||
"--cpu-method", "all",
|
||||
"--timeout", durationSeconds(cpuPassDuration),
|
||||
"--timeout", durationSeconds(cpuDur),
|
||||
"--metrics-brief",
|
||||
"--verify",
|
||||
})
|
||||
@@ -104,11 +124,11 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
|
||||
SubSteps: subs,
|
||||
}
|
||||
}
|
||||
mem := runStressPass(ctx, d, "memory", memPassDuration, []string{
|
||||
mem := runStressPass(ctx, d, "memory", memDur, []string{
|
||||
"--vm", "1",
|
||||
"--vm-bytes", strconv.FormatInt(cap, 10),
|
||||
"--vm-keep",
|
||||
"--timeout", durationSeconds(memPassDuration),
|
||||
"--timeout", durationSeconds(memDur),
|
||||
"--metrics-brief",
|
||||
"--verify",
|
||||
})
|
||||
@@ -133,6 +153,64 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
|
||||
}
|
||||
}
|
||||
|
||||
// runEDACSidecar polls /sys EDAC counters on d.CPUStressKnobs.EDACPoll
|
||||
// cadence (or 10s fallback) for the lifetime of the stage ctx, emitting
|
||||
// one sample per (memory-controller × {ce,ue}) pair on each tick. A
|
||||
// single failing read is tolerated: the next tick picks up the counter.
|
||||
//
|
||||
// This is where the critical edac_ue threshold becomes a hard-fail: as
|
||||
// soon as a UE counter advances past 0, the server-side evaluator trips
|
||||
// and flips the run into FailedHolding. The sidecar emits whether or
|
||||
// not stress-ng is still running; that keeps the signal live during
|
||||
// inter-pass gaps.
|
||||
//
|
||||
// MCE counts are intentionally not sampled here — they require
|
||||
// rasdaemon or mcelog and vary by live-image packaging. The threshold
|
||||
// rule for mce stays seeded (so the DB shape is stable) but only fires
|
||||
// once a matching kind lands, which is a follow-up.
|
||||
func runEDACSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
|
||||
defer wg.Done()
|
||||
if d.Sensor == nil {
|
||||
return
|
||||
}
|
||||
poll := d.CPUStressKnobs.EDACPoll
|
||||
if poll <= 0 {
|
||||
poll = 10 * time.Second
|
||||
}
|
||||
t := time.NewTicker(poll)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-t.C:
|
||||
edac := probes.EDAC()
|
||||
if len(edac) == 0 {
|
||||
continue
|
||||
}
|
||||
batch := make([]Sample, 0, len(edac))
|
||||
for _, s := range edac {
|
||||
batch = append(batch, Sample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit})
|
||||
}
|
||||
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
if err := d.Sensor(sendCtx, batch); err != nil {
|
||||
d.Warn("CPUStress: edac sample post: " + err.Error())
|
||||
}
|
||||
cancel()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// nonzeroDur returns override when it is strictly positive and fallback
// otherwise. Callers can therefore pass a zero (or negative) duration to
// mean "no override" without needing a separate ok flag.
func nonzeroDur(override, fallback time.Duration) time.Duration {
	if override <= 0 {
		return fallback
	}
	return override
}
|
||||
|
||||
// subStepFromPass projects a stressPass into a SubStepReport — shared by
|
||||
// both passes and by the mid-stage early-return paths so the UI always
|
||||
// sees exactly one row per pass, even on failure.
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
// fake_dmidecode simulates `dmidecode -t bios` for unit tests of the
|
||||
// firmware probe's BIOS parser. Prints deterministic output modeled on
|
||||
// a real Supermicro host; exits 0 regardless of flags.
|
||||
package main
|
||||
|
||||
import "fmt"
|
||||
|
||||
// biosOutput is the canned `dmidecode -t bios` text this fake emits.
// NOTE(review): the literal is reproduced exactly as it appears in the
// reviewed view; confirm its leading whitespace against the committed
// file before applying.
const biosOutput = `# dmidecode 3.3
Getting SMBIOS data from sysfs.
SMBIOS 3.2.0 present.

Handle 0x0000, DMI type 0, 26 bytes
BIOS Information
Vendor: American Megatrends Inc.
Version: 3.2
Release Date: 07/15/2021
Address: 0xF0000
Runtime Size: 64 kB
ROM Size: 32 MB
Characteristics:
PCI is supported
BIOS is upgradeable`

// main prints the golden BIOS block to stdout and returns normally (exit
// status 0). Command-line flags are never inspected.
func main() {
	fmt.Println(biosOutput)
}
|
||||
@@ -0,0 +1,22 @@
|
||||
// Package fakes is the umbrella for deterministic stand-ins for
|
||||
// external probe binaries that Vetting's stage code normally shells
|
||||
// out to (stress-ng, fio, iperf3, dmidecode, ethtool, nvidia-smi,
|
||||
// mcelog, nvme). Each real binary gets its own subpackage under
|
||||
// fakes/<name>/ with `package main` and a main() that prints golden
|
||||
// output — build with `go build -o <tmp>/<name> ./agent/tests/fakes/<name>`
|
||||
// and point a test's tests.Deps.LookPath at <tmp>/<name>.
|
||||
//
|
||||
// The seam in tests is tests.Deps.LookPath: when non-nil the stage
|
||||
// code uses it instead of os/exec.LookPath. Outside tests, nil
|
||||
// LookPath means "use the real binary on $PATH" — stages continue to
|
||||
// work on production hosts without the fakes package around.
|
||||
//
|
||||
// How to add a new fake:
|
||||
// 1. Create agent/tests/fakes/<binaryname>/main.go.
|
||||
// 2. Write `package main` with a main() that prints exactly the
|
||||
// bytes the real tool would produce for the input you care to
|
||||
// simulate. Determinism > completeness — tests want a known
|
||||
// sample, not a realistic one.
|
||||
// 3. In the unit test, compile the fake with `go build -o` into
// t.TempDir() before the test body runs, then use the resulting
// binary in place of the real tool.
|
||||
package fakes
|
||||
@@ -0,0 +1,18 @@
|
||||
// fake_stress_ng simulates stress-ng for unit tests. Accepts (and
|
||||
// ignores) any flag, sleeps briefly so callers that measure wall-clock
|
||||
// see a non-zero elapsed, and prints the "passed" lines CPUStress
|
||||
// expects. Exits 0.
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
// main echoes the flags it was given to stderr (so a test can see how
// the fake was invoked), pauses for 50ms so callers that measure
// wall-clock time see a non-zero elapsed, then prints the two stress-ng
// info lines of a successful run and returns normally (exit status 0).
func main() {
	flags := os.Args[1:]
	fmt.Fprintln(os.Stderr, "fake_stress_ng invoked:", flags)

	time.Sleep(50 * time.Millisecond)

	report := []string{
		"stress-ng: info: [1] dispatching hogs: 1 cpu",
		"stress-ng: info: [1] successful run completed in 0.05s",
	}
	for _, line := range report {
		fmt.Println(line)
	}
}
|
||||
+130
-16
@@ -9,19 +9,27 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"vetting/agent/probes"
|
||||
)
|
||||
|
||||
// NetworkConfig is what the agent passes to Network: the orchestrator's
|
||||
// iperf3 server address and port. We derive host from OrchestratorURL.
|
||||
// iperf3 server address, port, and the per-profile duration.
|
||||
type NetworkConfig struct {
|
||||
OrchestratorURL string
|
||||
IperfPort int // 0 = 5201
|
||||
Duration time.Duration
|
||||
}
|
||||
|
||||
// Network runs iperf3 against the orchestrator's bundled server. Records
|
||||
// bandwidth as a measurement; fails if iperf3 is missing, the server
|
||||
// isn't reachable, or throughput is zero.
|
||||
// Network runs iperf3 against the orchestrator's bundled server for
|
||||
// the profile-configured duration. Records throughput as a measurement;
|
||||
// records per-interface rx/tx error-rate deltas as nic_retrans samples
|
||||
// so the server-side threshold gate (`nic_retrans rate < 0.001`) fires
|
||||
// on a flaky PHY or a wire that drops half its packets under load.
|
||||
//
|
||||
// Failure cases: iperf3 missing, server unreachable, zero throughput.
|
||||
// Zero throughput is treated as a hard failure — an iperf that finished
|
||||
// cleanly but pushed zero bytes is indistinguishable from a bad run.
|
||||
func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
|
||||
if _, err := exec.LookPath("iperf3"); err != nil {
|
||||
// Live image ships iperf3; absence means packaging regression.
|
||||
@@ -51,6 +59,11 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
|
||||
duration = 10 * time.Second
|
||||
}
|
||||
|
||||
// Snapshot /proc/net/dev before the test so we can attribute any
|
||||
// error-count growth to *this stage's* traffic. The same snapshot
|
||||
// taken after iperf returns is the end of the window.
|
||||
netStart := indexNetDev(probes.NetDev())
|
||||
|
||||
args := []string{
|
||||
"-c", host,
|
||||
"-p", strconv.Itoa(port),
|
||||
@@ -72,7 +85,7 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
|
||||
Extras: map[string]any{"stderr_tail": tailLines(string(out), 20)},
|
||||
}
|
||||
}
|
||||
mbps, parsed, err := parseIperfJSON(out)
|
||||
mbps, retrans, bytesSent, parsed, err := parseIperfJSON(out)
|
||||
if err != nil {
|
||||
d.Error("Network: parse iperf3 output: " + err.Error())
|
||||
return Outcome{
|
||||
@@ -82,12 +95,58 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
|
||||
Extras: map[string]any{"raw": string(out)},
|
||||
}
|
||||
}
|
||||
|
||||
netEnd := indexNetDev(probes.NetDev())
|
||||
netDelta := diffNetDev(netStart, netEnd)
|
||||
|
||||
samples := []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}}
|
||||
|
||||
// iperf-derived retrans rate: retrans_count / packet_count_estimate.
|
||||
// TCP typical MTU 1500; payload ~1460. We divide bytes by 1460 to
|
||||
// approximate packets. This keeps the rate bounded in [0, 1].
|
||||
if bytesSent > 0 {
|
||||
packets := float64(bytesSent) / 1460.0
|
||||
if packets > 0 {
|
||||
samples = append(samples, Sample{
|
||||
Kind: "nic_retrans",
|
||||
Key: "iperf/rate",
|
||||
Value: float64(retrans) / packets,
|
||||
Unit: "rate",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Per-interface error-rate deltas. A flaky cable typically surfaces
|
||||
// as tx_errs or tx_drop on the originating interface, not inside
|
||||
// iperf's own tally.
|
||||
for iface, delta := range netDelta {
|
||||
if delta.TxBytes > 0 {
|
||||
packets := float64(delta.TxBytes) / 1460.0
|
||||
if packets > 0 {
|
||||
rate := float64(delta.TxErrs+delta.TxDrop) / packets
|
||||
samples = append(samples, Sample{
|
||||
Kind: "nic_retrans", Key: iface + "/rate", Value: rate, Unit: "rate",
|
||||
})
|
||||
}
|
||||
}
|
||||
// Diagnostic raw counts so the report can show which interface
|
||||
// bled. These don't fire a threshold today but are useful for
|
||||
// post-mortem.
|
||||
samples = append(samples,
|
||||
Sample{Kind: "nic_errs", Key: iface + "/rx", Value: float64(delta.RxErrs + delta.RxDrop), Unit: "count"},
|
||||
Sample{Kind: "nic_errs", Key: iface + "/tx", Value: float64(delta.TxErrs + delta.TxDrop), Unit: "count"},
|
||||
)
|
||||
}
|
||||
|
||||
if d.Sensor != nil {
|
||||
_ = d.Sensor(ctx, []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}})
|
||||
_ = d.Sensor(ctx, samples)
|
||||
}
|
||||
|
||||
extras := map[string]any{
|
||||
"throughput_mbps": mbps,
|
||||
"retransmits": retrans,
|
||||
"bytes_sent": bytesSent,
|
||||
"net_delta": netDelta,
|
||||
"iperf_end": parsed,
|
||||
}
|
||||
if mbps <= 0 {
|
||||
@@ -98,14 +157,55 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps", mbps))
|
||||
d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps (retransmits=%d)", mbps, retrans))
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: fmt.Sprintf("%.1f Mbps to %s", mbps, host),
|
||||
Summary: fmt.Sprintf("%.1f Mbps to %s (retransmits=%d)", mbps, host, retrans),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
|
||||
// indexNetDev flattens a NetDev slice into a map keyed by interface
|
||||
// name so diffNetDev can pair start/end by name without O(n²) scans.
|
||||
func indexNetDev(snaps []probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
|
||||
out := map[string]probes.NetDevSnapshot{}
|
||||
for _, s := range snaps {
|
||||
out[s.Iface] = s
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// diffNetDev computes end − start for each interface present in both
|
||||
// snapshots. An interface that dropped away mid-run is dropped from
|
||||
// the result (can't compute a delta). Underflow (end < start, rare
|
||||
// after a counter reset) is clamped to 0.
|
||||
func diffNetDev(start, end map[string]probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
|
||||
out := map[string]probes.NetDevSnapshot{}
|
||||
for iface, e := range end {
|
||||
s, ok := start[iface]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
out[iface] = probes.NetDevSnapshot{
|
||||
Iface: iface,
|
||||
RxBytes: subU64(e.RxBytes, s.RxBytes),
|
||||
RxErrs: subU64(e.RxErrs, s.RxErrs),
|
||||
RxDrop: subU64(e.RxDrop, s.RxDrop),
|
||||
TxBytes: subU64(e.TxBytes, s.TxBytes),
|
||||
TxErrs: subU64(e.TxErrs, s.TxErrs),
|
||||
TxDrop: subU64(e.TxDrop, s.TxDrop),
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// subU64 returns a − b, clamped to 0 when b exceeds a so the unsigned
// subtraction can never wrap around to a huge value.
func subU64(a, b uint64) uint64 {
	if a < b {
		return 0
	}
	return a - b
}
|
||||
|
||||
// deriveHost pulls the hostname out of an https://host:port base URL.
|
||||
func deriveHost(raw string) (string, error) {
|
||||
if raw == "" {
|
||||
@@ -119,18 +219,22 @@ func deriveHost(raw string) (string, error) {
|
||||
return strings.TrimSpace(h), nil
|
||||
}
|
||||
|
||||
// parseIperfJSON pulls end.sum_sent.bits_per_second out of iperf3 -J.
|
||||
// Returns (Mbps, full-json-map, err).
|
||||
func parseIperfJSON(b []byte) (float64, map[string]any, error) {
|
||||
// parseIperfJSON pulls end.sum_sent.bits_per_second and retransmits out
|
||||
// of iperf3 -J. Returns (Mbps, retransmits, bytes_sent, full-end-map, err).
|
||||
func parseIperfJSON(b []byte) (float64, int64, int64, map[string]any, error) {
|
||||
var top map[string]any
|
||||
if err := json.Unmarshal(b, &top); err != nil {
|
||||
return 0, nil, err
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
end, ok := top["end"].(map[string]any)
|
||||
if !ok {
|
||||
return 0, top, fmt.Errorf("missing end")
|
||||
return 0, 0, 0, nil, fmt.Errorf("missing end")
|
||||
}
|
||||
// iperf3 reports either sum_sent (when -R not set) or sum_received.
|
||||
// Pull the first sum that carries bits_per_second; retransmits +
|
||||
// bytes live there too for TCP.
|
||||
var mbps float64
|
||||
var retrans int64
|
||||
var bytesSent int64
|
||||
for _, key := range []string{"sum_sent", "sum_received", "sum"} {
|
||||
sum, ok := end[key].(map[string]any)
|
||||
if !ok {
|
||||
@@ -140,7 +244,17 @@ func parseIperfJSON(b []byte) (float64, map[string]any, error) {
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
return bps / 1_000_000, end, nil
|
||||
mbps = bps / 1_000_000
|
||||
if r, ok := sum["retransmits"].(float64); ok {
|
||||
retrans = int64(r)
|
||||
}
|
||||
if bs, ok := sum["bytes"].(float64); ok {
|
||||
bytesSent = int64(bs)
|
||||
}
|
||||
break
|
||||
}
|
||||
return 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
|
||||
if mbps == 0 {
|
||||
return 0, 0, 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
|
||||
}
|
||||
return mbps, retrans, bytesSent, end, nil
|
||||
}
|
||||
|
||||
@@ -0,0 +1,192 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
"vetting/agent/probes"
|
||||
)
|
||||
|
||||
// TestParseIperfJSON_SumSent confirms we pull throughput, retransmits,
|
||||
// and bytes_sent from end.sum_sent. Real iperf3 -J output nests these
|
||||
// three under end.sum_sent for TCP streams.
|
||||
func TestParseIperfJSON_SumSent(t *testing.T) {
|
||||
raw := `{
|
||||
"end": {
|
||||
"sum_sent": {
|
||||
"bits_per_second": 950000000,
|
||||
"retransmits": 42,
|
||||
"bytes": 1187500000
|
||||
}
|
||||
}
|
||||
}`
|
||||
mbps, retrans, bytesSent, _, err := parseIperfJSON([]byte(raw))
|
||||
if err != nil {
|
||||
t.Fatalf("parseIperfJSON: %v", err)
|
||||
}
|
||||
if mbps != 950 {
|
||||
t.Errorf("mbps = %v, want 950", mbps)
|
||||
}
|
||||
if retrans != 42 {
|
||||
t.Errorf("retransmits = %d, want 42", retrans)
|
||||
}
|
||||
if bytesSent != 1187500000 {
|
||||
t.Errorf("bytesSent = %d, want 1187500000", bytesSent)
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseIperfJSON_MissingEnd fails cleanly when iperf returned
|
||||
// something without an end block (partial/aborted run).
|
||||
func TestParseIperfJSON_MissingEnd(t *testing.T) {
|
||||
raw := `{"start": {}}`
|
||||
if _, _, _, _, err := parseIperfJSON([]byte(raw)); err == nil {
|
||||
t.Errorf("expected error on iperf output missing end block")
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseIperfJSON_ZeroBps returns an error so the stage can fail
|
||||
// fast. A successful-exit iperf that pushed zero bits is indistinguishable
|
||||
// from a broken run and must not pass.
|
||||
func TestParseIperfJSON_ZeroBps(t *testing.T) {
|
||||
raw := `{"end": {"sum_sent": {"bits_per_second": 0}}}`
|
||||
if _, _, _, _, err := parseIperfJSON([]byte(raw)); err == nil {
|
||||
t.Errorf("expected error when bits_per_second is 0")
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseIperfJSON_FallsBackToSumReceived: UDP tests and some edge
|
||||
// cases don't populate sum_sent. The parser walks sum_sent → sum_received
|
||||
// → sum and picks the first that has a throughput number.
|
||||
func TestParseIperfJSON_FallsBackToSumReceived(t *testing.T) {
|
||||
raw := `{
|
||||
"end": {
|
||||
"sum_received": {"bits_per_second": 500000000}
|
||||
}
|
||||
}`
|
||||
mbps, _, _, _, err := parseIperfJSON([]byte(raw))
|
||||
if err != nil {
|
||||
t.Fatalf("parseIperfJSON: %v", err)
|
||||
}
|
||||
if mbps != 500 {
|
||||
t.Errorf("mbps = %v, want 500", mbps)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDiffNetDev_HappyPath confirms end − start on a shared interface
|
||||
// produces the delta we expect. eth0 pushed 10k bytes and accumulated
|
||||
// 3 tx errors during the window.
|
||||
func TestDiffNetDev_HappyPath(t *testing.T) {
|
||||
start := map[string]probes.NetDevSnapshot{
|
||||
"eth0": {Iface: "eth0", RxBytes: 1000, RxErrs: 0, TxBytes: 5000, TxErrs: 1},
|
||||
}
|
||||
end := map[string]probes.NetDevSnapshot{
|
||||
"eth0": {Iface: "eth0", RxBytes: 2000, RxErrs: 0, TxBytes: 15000, TxErrs: 4},
|
||||
}
|
||||
delta := diffNetDev(start, end)
|
||||
got, ok := delta["eth0"]
|
||||
if !ok {
|
||||
t.Fatalf("eth0 missing from diff output")
|
||||
}
|
||||
if got.RxBytes != 1000 {
|
||||
t.Errorf("RxBytes delta=%d, want 1000", got.RxBytes)
|
||||
}
|
||||
if got.TxBytes != 10000 {
|
||||
t.Errorf("TxBytes delta=%d, want 10000", got.TxBytes)
|
||||
}
|
||||
if got.TxErrs != 3 {
|
||||
t.Errorf("TxErrs delta=%d, want 3", got.TxErrs)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDiffNetDev_InterfaceVanished: an interface present at start but
|
||||
// gone at end drops from the diff rather than carrying a negative or
|
||||
// stale number.
|
||||
func TestDiffNetDev_InterfaceVanished(t *testing.T) {
|
||||
start := map[string]probes.NetDevSnapshot{
|
||||
"eth0": {Iface: "eth0", TxBytes: 1000},
|
||||
"eth1": {Iface: "eth1", TxBytes: 500},
|
||||
}
|
||||
end := map[string]probes.NetDevSnapshot{
|
||||
"eth0": {Iface: "eth0", TxBytes: 2000},
|
||||
}
|
||||
delta := diffNetDev(start, end)
|
||||
if _, ok := delta["eth1"]; ok {
|
||||
t.Errorf("eth1 should have been dropped (gone at end)")
|
||||
}
|
||||
if delta["eth0"].TxBytes != 1000 {
|
||||
t.Errorf("eth0 TxBytes delta=%d, want 1000", delta["eth0"].TxBytes)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDiffNetDev_CounterReset: if a counter resets between snapshots
|
||||
// (kernel restart, wrap-around on a 32-bit counter) we clamp to 0
|
||||
// rather than underflow a uint64.
|
||||
func TestDiffNetDev_CounterReset(t *testing.T) {
|
||||
start := map[string]probes.NetDevSnapshot{
|
||||
"eth0": {Iface: "eth0", TxBytes: 9999, TxErrs: 5},
|
||||
}
|
||||
end := map[string]probes.NetDevSnapshot{
|
||||
"eth0": {Iface: "eth0", TxBytes: 100, TxErrs: 0},
|
||||
}
|
||||
delta := diffNetDev(start, end)
|
||||
if delta["eth0"].TxBytes != 0 {
|
||||
t.Errorf("reset TxBytes delta=%d, want 0 (clamped)", delta["eth0"].TxBytes)
|
||||
}
|
||||
if delta["eth0"].TxErrs != 0 {
|
||||
t.Errorf("reset TxErrs delta=%d, want 0 (clamped)", delta["eth0"].TxErrs)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDeriveHost: orchestrator URL → host extraction is how the agent
|
||||
// picks the iperf3 server target. Handles both https://host and
|
||||
// https://host:port shapes.
|
||||
func TestDeriveHost(t *testing.T) {
|
||||
cases := []struct {
|
||||
raw string
|
||||
want string
|
||||
}{
|
||||
{"https://orch.local", "orch.local"},
|
||||
{"https://orch.local:8443", "orch.local"},
|
||||
{"http://10.0.0.5:8080", "10.0.0.5"},
|
||||
}
|
||||
for _, c := range cases {
|
||||
got, err := deriveHost(c.raw)
|
||||
if err != nil {
|
||||
t.Errorf("deriveHost(%q) error: %v", c.raw, err)
|
||||
continue
|
||||
}
|
||||
if got != c.want {
|
||||
t.Errorf("deriveHost(%q) = %q, want %q", c.raw, got, c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeriveHost_Empty(t *testing.T) {
|
||||
if _, err := deriveHost(""); err == nil {
|
||||
t.Errorf("deriveHost(\"\") should error")
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseIperfJSON_ParsesEndMap confirms the full end map is returned
|
||||
// so extras can show every field iperf produced, not just the three we
|
||||
// extract by hand.
|
||||
func TestParseIperfJSON_ParsesEndMap(t *testing.T) {
|
||||
raw := `{
|
||||
"end": {
|
||||
"sum_sent": {"bits_per_second": 1000000, "retransmits": 0, "bytes": 125000},
|
||||
"cpu_utilization_percent": {"host_total": 12.3}
|
||||
}
|
||||
}`
|
||||
_, _, _, endMap, err := parseIperfJSON([]byte(raw))
|
||||
if err != nil {
|
||||
t.Fatalf("parseIperfJSON: %v", err)
|
||||
}
|
||||
if endMap == nil {
|
||||
t.Fatalf("endMap is nil")
|
||||
}
|
||||
// Sanity: both keys round-trip via json.
|
||||
b, _ := json.Marshal(endMap)
|
||||
if len(b) == 0 {
|
||||
t.Errorf("endMap marshaled to empty")
|
||||
}
|
||||
}
|
||||
+137
-18
@@ -7,12 +7,20 @@ import (
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find
|
||||
// PSU rails. In home-lab hosts the kernel surfaces a handful of named
|
||||
// rails (12V, 5V, 3V3). No rails → auto-skip. Any rail outside a ±10%
|
||||
// window of its nominal value → fail.
|
||||
// PSU rails, then samples each rail every psuSampleInterval for a
|
||||
// window sized by the stage timeout. During Burn a separate sidecar
|
||||
// (see burn.go) runs the same probe concurrently with workload — the
|
||||
// PSU stage itself catches slow post-load sag that only surfaces once
|
||||
// the 12V rail starts recovering from a brownout under concurrent CPU
|
||||
// + fio + iperf load.
|
||||
//
|
||||
// Any rail outside ±10% of its nominal value at any tick fires the
|
||||
// critical threshold (server-side) and fails the stage. A host with no
|
||||
// PSU rails wired to hwmon auto-skips.
|
||||
func PSU(ctx context.Context, d Deps) Outcome {
|
||||
rails := scanPSURails()
|
||||
if len(rails) == 0 {
|
||||
@@ -24,39 +32,150 @@ func PSU(ctx context.Context, d Deps) Outcome {
|
||||
}
|
||||
}
|
||||
|
||||
var samples []Sample
|
||||
problems := []string{}
|
||||
for _, rail := range rails {
|
||||
samples = append(samples, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"})
|
||||
if ok, why := voltageInRange(rail); !ok {
|
||||
problems = append(problems, fmt.Sprintf("%s=%.2fV (%s)", rail.Label, rail.Volts, why))
|
||||
window := resolvePSUWindow(d.StageTimeout)
|
||||
deadline := time.Now().Add(window)
|
||||
interval := psuSampleInterval
|
||||
if window < interval*2 {
|
||||
// Tiny window (tests, pathological stage_timeout) — at least two
|
||||
// ticks so aggregate stats are meaningful.
|
||||
interval = window / 2
|
||||
if interval < time.Second {
|
||||
interval = time.Second
|
||||
}
|
||||
}
|
||||
if d.Sensor != nil {
|
||||
_ = d.Sensor(ctx, samples)
|
||||
|
||||
// Per-label tracking: min/max across the window, count of out-of-range
|
||||
// hits, last-observed value (shown in the summary).
|
||||
type railStats struct {
|
||||
label string
|
||||
minV float64
|
||||
maxV float64
|
||||
lastV float64
|
||||
ticks int
|
||||
breaches int
|
||||
reason string
|
||||
}
|
||||
stats := map[string]*railStats{}
|
||||
|
||||
tick := time.NewTicker(interval)
|
||||
defer tick.Stop()
|
||||
// Start with an immediate sample so a sub-45s window still produces
|
||||
// at least one reading.
|
||||
sampleOnce := func() {
|
||||
cur := scanPSURails()
|
||||
if len(cur) == 0 {
|
||||
return
|
||||
}
|
||||
batch := make([]Sample, 0, len(cur))
|
||||
for _, r := range cur {
|
||||
s, ok := stats[r.Label]
|
||||
if !ok {
|
||||
s = &railStats{label: r.Label, minV: r.Volts, maxV: r.Volts}
|
||||
stats[r.Label] = s
|
||||
}
|
||||
s.ticks++
|
||||
s.lastV = r.Volts
|
||||
if r.Volts < s.minV {
|
||||
s.minV = r.Volts
|
||||
}
|
||||
if r.Volts > s.maxV {
|
||||
s.maxV = r.Volts
|
||||
}
|
||||
if ok, why := voltageInRange(r); !ok {
|
||||
s.breaches++
|
||||
if s.reason == "" {
|
||||
s.reason = why
|
||||
}
|
||||
}
|
||||
batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
|
||||
}
|
||||
if d.Sensor != nil && len(batch) > 0 {
|
||||
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
_ = d.Sensor(sendCtx, batch)
|
||||
cancel()
|
||||
}
|
||||
}
|
||||
sampleOnce()
|
||||
sampling:
|
||||
for time.Now().Before(deadline) {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
break sampling
|
||||
case <-tick.C:
|
||||
sampleOnce()
|
||||
}
|
||||
}
|
||||
|
||||
// Build the outcome. Extras carry per-rail rollup so the report can
|
||||
// show "12V min=11.1 max=12.05 (3/120 ticks out of range)".
|
||||
type railRollup struct {
|
||||
Label string `json:"label"`
|
||||
MinV float64 `json:"min_v"`
|
||||
MaxV float64 `json:"max_v"`
|
||||
LastV float64 `json:"last_v"`
|
||||
Ticks int `json:"ticks"`
|
||||
Breaches int `json:"breaches"`
|
||||
Reason string `json:"reason,omitempty"`
|
||||
}
|
||||
rollups := make([]railRollup, 0, len(stats))
|
||||
problems := []string{}
|
||||
for _, s := range stats {
|
||||
rollups = append(rollups, railRollup{
|
||||
Label: s.label, MinV: s.minV, MaxV: s.maxV, LastV: s.lastV,
|
||||
Ticks: s.ticks, Breaches: s.breaches, Reason: s.reason,
|
||||
})
|
||||
if s.breaches > 0 {
|
||||
problems = append(problems, fmt.Sprintf("%s min=%.2fV max=%.2fV (%s)", s.label, s.minV, s.maxV, s.reason))
|
||||
}
|
||||
}
|
||||
|
||||
extras := map[string]any{
|
||||
"rails": rails,
|
||||
"problems": problems,
|
||||
"rails": rollups,
|
||||
"problems": problems,
|
||||
"window": window.String(),
|
||||
"interval": interval.String(),
|
||||
}
|
||||
if len(problems) > 0 {
|
||||
d.Error("PSU: out-of-range rails: " + strings.Join(problems, ", "))
|
||||
d.Error("PSU: out-of-range rails: " + strings.Join(problems, "; "))
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: "PSU rails out of range: " + strings.Join(problems, ", "),
|
||||
Summary: fmt.Sprintf("%d rails, %d failing", len(rails), len(problems)),
|
||||
Message: "PSU rails out of range: " + strings.Join(problems, "; "),
|
||||
Summary: fmt.Sprintf("%d rails, %d failing", len(rollups), len(problems)),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal", len(rails)))
|
||||
d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal across %s window", len(rollups), window))
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: fmt.Sprintf("%d rails nominal", len(rails)),
|
||||
Summary: fmt.Sprintf("%d rails nominal (%s)", len(rollups), window),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
|
||||
// PSU sampling cadence and the bounds applied to the sampling window.
const (
	// psuSampleInterval is the default tick for post-Burn rail sampling.
	// Five seconds is slow enough to stay under the HTTP budget and fast
	// enough to catch rail recovery transients.
	psuSampleInterval = 5 * time.Second

	// psuWindowHeadroom is subtracted from the stage timeout to leave
	// room for the sensor flush + result post.
	psuWindowHeadroom = 5 * time.Second

	// psuWindowFloor is the shortest window ever returned; it is also
	// the snapshot-like fallback when no stage timeout is configured.
	psuWindowFloor = 30 * time.Second

	// psuWindowCeiling caps the window so a 24 h soak doesn't spend all
	// day in PSU.
	psuWindowCeiling = 10 * time.Minute
)

// resolvePSUWindow maps the stage timeout to the sampling window.
//
//   - stageTimeout <= 0 (tests / pre-Phase-2 orchestrator): 30 s.
//   - otherwise: stageTimeout − 5 s, clamped to [30 s, 10 min].
//
// Note the floor can exceed a very small stage timeout (e.g. 10 s in →
// 30 s out); the result is only ever bounded by the two limits above.
func resolvePSUWindow(stageTimeout time.Duration) time.Duration {
	if stageTimeout <= 0 {
		return psuWindowFloor
	}
	switch w := stageTimeout - psuWindowHeadroom; {
	case w < psuWindowFloor:
		return psuWindowFloor
	case w > psuWindowCeiling:
		return psuWindowCeiling
	default:
		return w
	}
}
|
||||
|
||||
type psuRail struct {
|
||||
Label string `json:"label"`
|
||||
Volts float64 `json:"volts"`
|
||||
|
||||
@@ -0,0 +1,112 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestIsPSULabel keeps the allowlist narrow enough that CPU VRM rails
|
||||
// don't get misclassified as PSU-out-of-range failures but wide enough
|
||||
// that common SuperMicro/Intel hwmon labels land in the Yes bucket.
|
||||
func TestIsPSULabel(t *testing.T) {
|
||||
cases := []struct {
|
||||
label string
|
||||
want bool
|
||||
}{
|
||||
{"+12V", true},
|
||||
{"12V", true},
|
||||
{"+5V", true},
|
||||
{"5V", true},
|
||||
{"+3.3V", true},
|
||||
{"3V3", true},
|
||||
{"VCCIN", true},
|
||||
{"vccin", true},
|
||||
{"Vcore", false},
|
||||
{"CPU VCORE", false},
|
||||
{"AVCC", false},
|
||||
{"", false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
if got := isPSULabel(tc.label); got != tc.want {
|
||||
t.Errorf("isPSULabel(%q) = %v, want %v", tc.label, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestNominalFor maps rail labels back to expected nominal voltages.
|
||||
// Unknown labels must return 0 so voltageInRange short-circuits — an
|
||||
// accidental nominal would invent out-of-range failures.
|
||||
func TestNominalFor(t *testing.T) {
|
||||
cases := []struct {
|
||||
label string
|
||||
want float64
|
||||
}{
|
||||
{"+12V", 12.0},
|
||||
{"12V", 12.0},
|
||||
{"+5V", 5.0},
|
||||
{"+3.3V", 3.3},
|
||||
{"3V3", 3.3},
|
||||
{"VCCIN", 0},
|
||||
{"unknown", 0},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
if got := nominalFor(tc.label); got != tc.want {
|
||||
t.Errorf("nominalFor(%q) = %v, want %v", tc.label, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestVoltageInRange verifies the ±10% band: 12V passes in [10.8,
|
||||
// 13.2], fails anywhere outside. Unknown labels always pass (since
|
||||
// nominalFor returned 0 above).
|
||||
func TestVoltageInRange(t *testing.T) {
|
||||
cases := []struct {
|
||||
rail psuRail
|
||||
ok bool
|
||||
}{
|
||||
{psuRail{Label: "+12V", Volts: 12.0}, true},
|
||||
{psuRail{Label: "+12V", Volts: 10.8}, true}, // exactly at the band
|
||||
{psuRail{Label: "+12V", Volts: 13.2}, true}, // exactly at the band
|
||||
{psuRail{Label: "+12V", Volts: 10.7}, false}, // just below
|
||||
{psuRail{Label: "+12V", Volts: 13.3}, false}, // just above
|
||||
{psuRail{Label: "+12V", Volts: 10.5}, false}, // real sag
|
||||
{psuRail{Label: "+5V", Volts: 4.6}, true}, // 8% low on 5V still in band
|
||||
{psuRail{Label: "+5V", Volts: 4.4}, false}, // 12% low on 5V — out of band
|
||||
{psuRail{Label: "+5V", Volts: 5.0}, true},
|
||||
{psuRail{Label: "VCCIN", Volts: 1.8}, true}, // unknown nominal → pass
|
||||
}
|
||||
for _, tc := range cases {
|
||||
got, _ := voltageInRange(tc.rail)
|
||||
if got != tc.ok {
|
||||
t.Errorf("voltageInRange(%+v) = %v, want %v", tc.rail, got, tc.ok)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolvePSUWindow maps stage timeouts to the sampling window.
|
||||
// Quick's 1m stage_timeout → 55s window; deep's 10m → capped at 10m;
|
||||
// missing/zero → 30s (test / legacy orchestrator path); sub-35s → at
|
||||
// least 30s so aggregates are non-trivial.
|
||||
func TestResolvePSUWindow(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
in time.Duration
|
||||
want time.Duration
|
||||
}{
|
||||
{"zero → snapshot fallback", 0, 30 * time.Second},
|
||||
{"negative → snapshot fallback", -1 * time.Second, 30 * time.Second},
|
||||
{"tiny timeout clamps up to 30s floor", 10 * time.Second, 30 * time.Second},
|
||||
{"35s - 5s = 30s", 35 * time.Second, 30 * time.Second},
|
||||
{"1m quick → 55s", time.Minute, 55 * time.Second},
|
||||
{"10m deep → 9m55s", 10 * time.Minute, 9*time.Minute + 55*time.Second},
|
||||
{"15m soak → capped at 10m", 15 * time.Minute, 10 * time.Minute},
|
||||
{"1h → capped at 10m", time.Hour, 10 * time.Minute},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := resolvePSUWindow(tc.in); got != tc.want {
|
||||
t.Errorf("resolvePSUWindow(%s) = %s, want %s", tc.in, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -59,6 +59,11 @@ func (o Outcome) MarshalSummary() (json.RawMessage, error) {
|
||||
// Deps bundles what stages need without pulling in the whole agent.
|
||||
// Logger methods print to stdout + forward to the orchestrator; Sensor
|
||||
// drops numeric samples; OverrideFlags carries operator-set bypasses.
|
||||
//
|
||||
// CPUStressKnobs / StorageKnobs / NetworkKnobs are Phase-2 profile
|
||||
// knobs. Zero-valued fields mean "fall back to the compile-time
|
||||
// default" — that keeps the stages runnable even when the runner can't
|
||||
// materialize a profile (tests, legacy orchestrator, etc).
|
||||
type Deps struct {
|
||||
Info func(string)
|
||||
Warn func(string)
|
||||
@@ -68,6 +73,58 @@ type Deps struct {
|
||||
NonDestructive bool // skip wipe-probe + writes in Storage
|
||||
ExpectedDisks []ExpectedDisk // serials + sizes from host.expected_spec
|
||||
StageTimeout time.Duration
|
||||
CPUStressKnobs CPUStressKnobs
|
||||
StorageKnobs StorageKnobs
|
||||
NetworkKnobs NetworkKnobs
|
||||
BurnKnobs BurnKnobs
|
||||
// LookPath is the unit-test seam for swapping a real external
|
||||
// binary (stress-ng, fio, iperf3, dmidecode, …) for a fake. When
|
||||
// nil the stage falls back to os/exec.LookPath — production and
|
||||
// existing tests keep working unchanged. Tests under
|
||||
// agent/tests/fakes/ populate this to redirect lookups to a built
|
||||
// fake binary in a tempdir.
|
||||
LookPath func(name string) (string, error)
|
||||
}
|
||||
|
||||
// CPUStressKnobs parameterizes the CPUStress stage. Zero durations fall
// back to the package's compile-time defaults (cpuPassDuration etc).
type CPUStressKnobs struct {
	// CPUPass is presumably the length of the CPU stress pass — confirm
	// against the CPUStress stage.
	CPUPass time.Duration
	// MemPass is presumably the length of the memory stress pass —
	// confirm against the CPUStress stage.
	MemPass time.Duration
	// EDACPoll is presumably the interval between EDAC/MCE polls while
	// the stress passes run — confirm against the CPUStress stage.
	EDACPoll time.Duration
}
|
||||
|
||||
// StorageKnobs parameterizes the Storage stage. Mode picks between
// "fio_sample" (bounded tempfile inside the device, quick profile) and
// "full_disk" (whole-device write verify, deep/soak). Empty strings
// fall back to the stage's safe defaults.
type StorageKnobs struct {
	// Mode is "fio_sample" or "full_disk"; empty selects the default.
	Mode string
	// FioSize, FioBS, FioRW and Verify are passed through as strings;
	// presumably they map onto fio's size, block-size, rw and verify
	// options — confirm against the Storage stage's option resolver.
	FioSize string
	// FioTime presumably bounds the fio runtime — confirm against the
	// Storage stage's option resolver.
	FioTime time.Duration
	FioBS   string
	FioRW   string
	Verify  string
}
|
||||
|
||||
// NetworkKnobs parameterizes the Network stage.
type NetworkKnobs struct {
	// Duration is presumably how long the iperf3 run lasts; a zero
	// value is expected to fall back to the stage default — confirm
	// against the Network stage.
	Duration time.Duration
}
|
||||
|
||||
// BurnKnobs parameterizes the Burn super-stage, in which the
// sub-workloads run concurrently inside a single window.
type BurnKnobs struct {
	// Duration is the total Burn window.
	Duration time.Duration
	// CPUWorkers is "all" (runtime.NumCPU) or a numeric string.
	CPUWorkers string
	// MemPct is a percentage of MemAvailable to allocate for the memory
	// burner (clamped 0-90 by the stage).
	MemPct int
	// FioOnSpare gates the storage sub-workload: true = fio runs against
	// the allow-listed disks for the same window; false = skip fio.
	FioOnSpare bool
	// IperfParallel feeds iperf3 -P to generate sustained NIC load.
	IperfParallel int
}
|
||||
|
||||
// Sample mirrors the server's SensorSample but lives in the tests
|
||||
|
||||
+318
-105
@@ -5,24 +5,36 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Storage is the destructive stage: badblocks (write-mode sample) + fio
|
||||
// random IO, persisting IOPS + latency as measurements. Pre-gates:
|
||||
// Storage is the destructive stage. Phase 2 replaced the old
|
||||
// badblocks + 128 MiB fio combo with a single fio run per disk that
|
||||
// writes, verifies md5 of what it wrote, and reports p99 latency.
|
||||
// Modes:
|
||||
//
|
||||
// - fio_sample (quick): bounded 1 GiB write per disk, ~3 min runtime.
|
||||
// - full_disk (deep/soak): writes the whole device, time-bounded by
|
||||
// the fio_time knob (2 h deep, 6 h soak).
|
||||
//
|
||||
// Pre-gates kept from Phase 1:
|
||||
//
|
||||
// 1. Device allowlist: only act on /dev/<X> where the kernel-reported
|
||||
// serial matches one of Deps.ExpectedDisks. This is the operator's
|
||||
// contract for what can be written to. USB sticks and unexpected
|
||||
// serial matches one of Deps.ExpectedDisks. USB sticks and unexpected
|
||||
// drives are excluded.
|
||||
// 2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
|
||||
// signatures, partition tables, or LVM metadata → fail with
|
||||
// signature, partition table, or LVM metadata → fail with
|
||||
// UnexpectedData unless Deps.OverrideWipe is set.
|
||||
//
|
||||
// Only after those pass does the stage run `badblocks -b 4096 -c 64 -w`
|
||||
// and `fio` in write mode. This matches the plan's "destructive disk
|
||||
// tests are always-on, gated by layered safety."
|
||||
// After fio, the stage captures a SMART diff (start snapshot taken
|
||||
// before any writes; end snapshot after all writes finish) and posts
|
||||
// deltas on attributes like Reallocated_Sector_Ct and Current_Pending_Sector.
|
||||
// The threshold evaluator isn't seeded to gate smart_delta out of the
|
||||
// box — those samples are diagnostic for the report. Fio's p99 latency
|
||||
// posts as fio_p99_us so the per-stage Storage warning threshold can
|
||||
// fire on a latency cliff.
|
||||
func Storage(ctx context.Context, d Deps) Outcome {
|
||||
if len(d.ExpectedDisks) == 0 {
|
||||
d.Info("Storage: no expected disks in spec — skipping stage")
|
||||
@@ -44,10 +56,10 @@ func Storage(ctx context.Context, d Deps) Outcome {
|
||||
}
|
||||
}
|
||||
|
||||
// Non-destructive runs skip wipe-probe (nothing to refuse), badblocks
|
||||
// -w, and write-mode fio. Every expected disk is still asserted
|
||||
// present + readable by listing /sys/block and reading SMART-accessible
|
||||
// identity; the per-disk map flags the shortcut so the report is clear.
|
||||
// Non-destructive runs skip wipe-probe (nothing to refuse), fio
|
||||
// writes, and SMART delta (nothing changed so no delta to report).
|
||||
// Every expected disk is still asserted present so a vanished drive
|
||||
// still fails the stage.
|
||||
if d.NonDestructive {
|
||||
perDisk := map[string]any{}
|
||||
for _, t := range targets {
|
||||
@@ -79,9 +91,9 @@ func Storage(ctx context.Context, d Deps) Outcome {
|
||||
Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)",
|
||||
Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
|
||||
Extras: map[string]any{
|
||||
"wipe_probe": probes,
|
||||
"override_hint": "click 'Override wipe & retry' in the held tile",
|
||||
"dirty_devices": dirty,
|
||||
"wipe_probe": probes,
|
||||
"override_hint": "click 'Override wipe & retry' in the held tile",
|
||||
"dirty_devices": dirty,
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -89,64 +101,80 @@ func Storage(ctx context.Context, d Deps) Outcome {
|
||||
d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
|
||||
}
|
||||
|
||||
// Per target: short badblocks write sample + fio random-read/write.
|
||||
// Capture start-of-stage SMART attributes before we write anything
|
||||
// so the delta is attributable to *this* stage's writes and not the
|
||||
// host's prior history. Per-disk failures are tolerated (e.g. the
|
||||
// device doesn't expose SMART); we just can't emit a delta for it.
|
||||
startSMART := captureSMARTAttrs(ctx, targets)
|
||||
|
||||
fioOpts := resolveFioOpts(d.StorageKnobs)
|
||||
d.Info(fmt.Sprintf("Storage: fio mode=%s size=%s runtime=%s bs=%s rw=%s verify=%s",
|
||||
fioOpts.Mode, fioOpts.Size, fioOpts.Runtime, fioOpts.BS, fioOpts.RW, fioOpts.Verify))
|
||||
|
||||
var samples []Sample
|
||||
var subs []SubStepReport
|
||||
perDisk := map[string]any{}
|
||||
failed := ""
|
||||
for _, t := range targets {
|
||||
d.Info("Storage: running badblocks write sample on " + t.Device)
|
||||
bbStart := time.Now()
|
||||
bb := runBadblocks(ctx, t.Device)
|
||||
bbEnd := time.Now()
|
||||
bbSummary, _ := json.Marshal(bb)
|
||||
subs = append(subs, SubStepReport{
|
||||
Name: fmt.Sprintf("badblocks %s", t.Device),
|
||||
Passed: bb.OK,
|
||||
StartedAt: bbStart,
|
||||
CompletedAt: bbEnd,
|
||||
SummaryJSON: bbSummary,
|
||||
})
|
||||
|
||||
d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
|
||||
d.Info(fmt.Sprintf("Storage: running fio %s on %s", fioOpts.Mode, t.Device))
|
||||
fioStart := time.Now()
|
||||
fr := runFio(ctx, t.Device)
|
||||
fr := runFioVerify(ctx, t.Device, fioOpts)
|
||||
fioEnd := time.Now()
|
||||
fioSummary, _ := json.Marshal(fr)
|
||||
subs = append(subs, SubStepReport{
|
||||
Name: fmt.Sprintf("fio %s", t.Device),
|
||||
Name: fmt.Sprintf("fio %s %s", fioOpts.Mode, t.Device),
|
||||
Passed: fr.Error == "",
|
||||
StartedAt: fioStart,
|
||||
CompletedAt: fioEnd,
|
||||
SummaryJSON: fioSummary,
|
||||
})
|
||||
perDisk[t.Device] = map[string]any{"fio": fr}
|
||||
|
||||
perDisk[t.Device] = map[string]any{
|
||||
"badblocks": bb,
|
||||
"fio": fr,
|
||||
}
|
||||
samples = append(samples,
|
||||
Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
|
||||
Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
|
||||
)
|
||||
if !bb.OK {
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: "badblocks found errors on " + t.Device,
|
||||
Summary: "badblocks failed on " + t.Device,
|
||||
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes},
|
||||
SubSteps: subs,
|
||||
if fr.Error == "" {
|
||||
samples = append(samples,
|
||||
Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
|
||||
Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
|
||||
)
|
||||
if fr.ReadP99Us > 0 {
|
||||
samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
|
||||
}
|
||||
if fr.WriteP99Us > 0 {
|
||||
samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
|
||||
}
|
||||
} else if failed == "" {
|
||||
failed = t.Device
|
||||
}
|
||||
}
|
||||
if d.Sensor != nil {
|
||||
|
||||
// End-of-stage SMART snapshot + diff. We capture whether or not fio
|
||||
// succeeded — a mid-run failure still produces attributable deltas,
|
||||
// which is often more interesting than the stage outcome itself.
|
||||
endSMART := captureSMARTAttrs(ctx, targets)
|
||||
deltas := diffSMARTAttrs(startSMART, endSMART)
|
||||
for dev, attrs := range deltas {
|
||||
for attr, delta := range attrs {
|
||||
samples = append(samples, Sample{Kind: "smart_delta", Key: dev + "/" + attr, Value: delta, Unit: "count"})
|
||||
}
|
||||
}
|
||||
if d.Sensor != nil && len(samples) > 0 {
|
||||
_ = d.Sensor(ctx, samples)
|
||||
}
|
||||
|
||||
d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets)))
|
||||
if failed != "" {
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: "fio verify failed on " + failed,
|
||||
Summary: "fio failed on " + failed,
|
||||
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
|
||||
SubSteps: subs,
|
||||
}
|
||||
}
|
||||
|
||||
d.Info(fmt.Sprintf("Storage: %d disk(s) passed fio --verify", len(targets)))
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: fmt.Sprintf("%d disks passed", len(targets)),
|
||||
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes},
|
||||
Summary: fmt.Sprintf("%d disks passed (%s)", len(targets), fioOpts.Mode),
|
||||
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
|
||||
SubSteps: subs,
|
||||
}
|
||||
}
|
||||
@@ -229,8 +257,8 @@ type wipeProbeResult struct {
|
||||
|
||||
// probeWipe runs blkid + wipefs -n. Any non-empty output from either is
// a "has data" signal. This is deliberately conservative: we'd rather
// halt on a bare ext4 signature than hand fio a disk with real bytes on
// it.
|
||||
func probeWipe(ctx context.Context, device string) wipeProbeResult {
|
||||
out := wipeProbeResult{Device: device}
|
||||
|
||||
@@ -257,84 +285,269 @@ func probeWipe(ctx context.Context, device string) wipeProbeResult {
|
||||
return out
|
||||
}
|
||||
|
||||
// ---------- fio ----------

// fioOpts is the concrete set of flag values fio needs, resolved from
// the probe knobs. Defaults match the quick profile's fio_sample shape
// so callers with zero knobs still run something bounded.
//
// Runtime is a time.Duration, so encoding/json emits it under "runtime"
// as an integer count of nanoseconds.
type fioOpts struct {
	Mode    string        `json:"mode"`    // "fio_sample" | "full_disk"
	Size    string        `json:"size"`    // "1GiB"; only used for fio_sample
	Runtime time.Duration `json:"runtime"` // bounding time
	BS      string        `json:"bs"`      // "4k"
	RW      string        `json:"rw"`      // "randrw"
	Verify  string        `json:"verify"`  // "md5" | ""
}
|
||||
|
||||
func runBadblocks(ctx context.Context, device string) badblocksResult {
|
||||
// -c 64 blocks per check, -w destructive write, -b 4096 block size,
|
||||
// -t pattern. We only sample 256MiB (65536 × 4k) so the stage stays
|
||||
// bounded. A real burn-in would run the whole disk; that belongs in
|
||||
// a separate "deep" stage.
|
||||
args := []string{"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536"}
|
||||
start := time.Now()
|
||||
runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
|
||||
// resolveFioOpts normalizes the knobs into a runnable config. Zero-
|
||||
// valued fields fall back to the quick defaults so a stage that's
|
||||
// missing its knobs still has coherent behavior (safer than refusing).
|
||||
func resolveFioOpts(k StorageKnobs) fioOpts {
|
||||
o := fioOpts{
|
||||
Mode: firstNonEmpty(k.Mode, "fio_sample"),
|
||||
Size: firstNonEmpty(k.FioSize, "1GiB"),
|
||||
Runtime: k.FioTime,
|
||||
BS: firstNonEmpty(k.FioBS, "4k"),
|
||||
RW: firstNonEmpty(k.FioRW, "randrw"),
|
||||
Verify: firstNonEmpty(k.Verify, "md5"),
|
||||
}
|
||||
if o.Runtime <= 0 {
|
||||
o.Runtime = 3 * time.Minute
|
||||
}
|
||||
return o
|
||||
}
|
||||
|
||||
// firstNonEmpty returns the first argument that is not the empty string.
// It returns "" when every argument is empty or when called with no
// arguments at all.
func firstNonEmpty(vs ...string) string {
	for _, v := range vs {
		if v != "" {
			return v
		}
	}
	return ""
}
|
||||
|
||||
// fioResult is the per-device outcome of one fio invocation as recorded
// in the stage report. The IOPS and bandwidth figures are copied from
// fio's JSON "iops" and "bw" fields; the p99 latencies are completion
// latencies converted from nanoseconds to microseconds and are omitted
// from JSON when zero. Error is empty on success.
type fioResult struct {
	Mode        string  `json:"mode"`
	ReadIOPS    float64 `json:"read_iops"`
	WriteIOPS   float64 `json:"write_iops"`
	ReadBWKBps  float64 `json:"read_bw_kbps"`
	WriteBWKBps float64 `json:"write_bw_kbps"`
	ReadP99Us   float64 `json:"read_p99_us,omitempty"`
	WriteP99Us  float64 `json:"write_p99_us,omitempty"`
	Error       string  `json:"error,omitempty"`
	OutputTail  string  `json:"output_tail,omitempty"`
}
|
||||
|
||||
// runFioVerify invokes fio with md5-verify semantics. fio_sample mode
|
||||
// caps the IO at opts.Size; full_disk drives the whole device bounded
|
||||
// by runtime. Both use direct IO to bypass the page cache — we want
|
||||
// real disk latency, not Linux' cheerful buffer.
|
||||
func runFioVerify(ctx context.Context, device string, opts fioOpts) fioResult {
|
||||
// 30s grace over runtime so fio has time to flush + close cleanly.
|
||||
runCtx, cancel := context.WithTimeout(ctx, opts.Runtime+30*time.Second)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(runCtx, "badblocks", args...)
|
||||
out, err := cmd.CombinedOutput()
|
||||
r := badblocksResult{Elapsed: time.Since(start).Round(time.Second).String(), OutputTail: tailLines(string(out), 10)}
|
||||
|
||||
args := []string{
|
||||
"--name=verify-" + strings.TrimPrefix(device, "/dev/"),
|
||||
"--filename=" + device,
|
||||
"--rw=" + opts.RW,
|
||||
"--bs=" + opts.BS,
|
||||
"--numjobs=1",
|
||||
"--direct=1",
|
||||
"--group_reporting",
|
||||
"--output-format=json",
|
||||
"--runtime=" + strconv.Itoa(int(opts.Runtime.Seconds())),
|
||||
}
|
||||
if opts.Verify != "" {
|
||||
args = append(args,
|
||||
"--verify="+opts.Verify,
|
||||
"--verify_pattern=random",
|
||||
"--do_verify=1",
|
||||
)
|
||||
}
|
||||
switch opts.Mode {
|
||||
case "full_disk":
|
||||
// Time-bounded across the full device — fio uses the device's
|
||||
// full size when --size is omitted on a block device.
|
||||
args = append(args, "--time_based=1")
|
||||
default:
|
||||
// fio_sample: bounded write. Setting --size= limits the IO
|
||||
// volume regardless of runtime.
|
||||
args = append(args, "--size="+opts.Size, "--time_based=0")
|
||||
}
|
||||
|
||||
cmd := exec.CommandContext(runCtx, "fio", args...)
|
||||
out, err := cmd.Output()
|
||||
r := fioResult{Mode: opts.Mode, OutputTail: tailLines(string(out), 20)}
|
||||
if err != nil {
|
||||
r.Error = err.Error()
|
||||
return r
|
||||
}
|
||||
// badblocks prints each bad block to stdout. Empty output = clean.
|
||||
if strings.TrimSpace(string(out)) == "" {
|
||||
r.OK = true
|
||||
} else {
|
||||
r.Error = "bad blocks found"
|
||||
parsed, perr := parseFioJSON(out)
|
||||
if perr != nil {
|
||||
r.Error = "parse fio json: " + perr.Error()
|
||||
return r
|
||||
}
|
||||
r.ReadIOPS = parsed.ReadIOPS
|
||||
r.WriteIOPS = parsed.WriteIOPS
|
||||
r.ReadBWKBps = parsed.ReadBWKBps
|
||||
r.WriteBWKBps = parsed.WriteBWKBps
|
||||
r.ReadP99Us = parsed.ReadP99Us
|
||||
r.WriteP99Us = parsed.WriteP99Us
|
||||
return r
|
||||
}
|
||||
|
||||
// ---------- fio ----------
|
||||
|
||||
type fioResult struct {
|
||||
ReadIOPS float64 `json:"read_iops"`
|
||||
WriteIOPS float64 `json:"write_iops"`
|
||||
ReadBWKBps float64 `json:"read_bw_kbps"`
|
||||
WriteBWKBps float64 `json:"write_bw_kbps"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
// runFio kicks off a tiny random-rw job: 2 jobs × 64MB × 4k blocks.
|
||||
// This is a health bar, not a benchmark — we want to know the disk
|
||||
// services IO, not how fast it is at p99.
|
||||
func runFio(ctx context.Context, device string) fioResult {
|
||||
runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
|
||||
defer cancel()
|
||||
args := []string{
|
||||
"--name=health", "--filename=" + device, "--rw=randrw",
|
||||
"--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0",
|
||||
"--group_reporting", "--output-format=json", "--direct=1",
|
||||
}
|
||||
cmd := exec.CommandContext(runCtx, "fio", args...)
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return fioResult{Error: err.Error()}
|
||||
}
|
||||
// parseFioJSON extracts the bits we care about from fio's --output-format=json.
|
||||
// Latency percentiles live at .jobs[0].read.clat_ns.percentile["99.000000"];
|
||||
// we convert nanoseconds to microseconds for the fio_p99_us sample.
|
||||
func parseFioJSON(out []byte) (fioResult, error) {
|
||||
var top struct {
|
||||
Jobs []struct {
|
||||
Read struct {
|
||||
Read struct {
|
||||
IOPS float64 `json:"iops"`
|
||||
BW float64 `json:"bw"`
|
||||
CLat struct {
|
||||
Percentile map[string]float64 `json:"percentile"`
|
||||
} `json:"clat_ns"`
|
||||
} `json:"read"`
|
||||
Write struct {
|
||||
IOPS float64 `json:"iops"`
|
||||
BW float64 `json:"bw"`
|
||||
CLat struct {
|
||||
Percentile map[string]float64 `json:"percentile"`
|
||||
} `json:"clat_ns"`
|
||||
} `json:"write"`
|
||||
} `json:"jobs"`
|
||||
}
|
||||
if err := json.Unmarshal(out, &top); err != nil || len(top.Jobs) == 0 {
|
||||
return fioResult{Error: "parse fio json: " + fmt.Sprint(err)}
|
||||
if err := json.Unmarshal(out, &top); err != nil {
|
||||
return fioResult{}, err
|
||||
}
|
||||
if len(top.Jobs) == 0 {
|
||||
return fioResult{}, fmt.Errorf("no jobs in fio output")
|
||||
}
|
||||
j := top.Jobs[0]
|
||||
return fioResult{
|
||||
r := fioResult{
|
||||
ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS,
|
||||
ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW,
|
||||
}
|
||||
if p := j.Read.CLat.Percentile["99.000000"]; p > 0 {
|
||||
r.ReadP99Us = p / 1000.0
|
||||
}
|
||||
if p := j.Write.CLat.Percentile["99.000000"]; p > 0 {
|
||||
r.WriteP99Us = p / 1000.0
|
||||
}
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// ---------- SMART delta ----------

// smartAttrMap: device → attribute → raw counter value. ATA drives
// populate named attributes (Reallocated_Sector_Ct etc); NVMe drives
// populate a flatter nvme-specific map. We track a curated whitelist
// of wear indicators — anything else is diagnostic and drops to the raw
// report output.
type smartAttrMap map[string]map[string]float64
|
||||
|
||||
// captureSMARTAttrs runs smartctl -aj on each target and pulls the
|
||||
// whitelisted attributes. Per-device failures (virtio, permission
|
||||
// issues) degrade silently — the delta step just shows no data for
|
||||
// that device.
|
||||
func captureSMARTAttrs(ctx context.Context, targets []diskTarget) smartAttrMap {
|
||||
out := smartAttrMap{}
|
||||
for _, t := range targets {
|
||||
parsed, err := runSmartctl(ctx, t.Device)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
attrs := extractSMARTAttrs(parsed)
|
||||
if len(attrs) > 0 {
|
||||
out[t.Device] = attrs
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// smartAttributeWhitelist is the set of attributes we diff across a
// stage. They're the ones that reflect *this stage's* IO damage, not
// cumulative drive history. Adding attributes is cheap — ones a drive
// doesn't report are simply absent from the extracted map.
var smartAttributeWhitelist = map[string]bool{
	// ATA SMART attribute names (smartctl normalizes to these)
	"Reallocated_Sector_Ct":  true,
	"Current_Pending_Sector": true,
	"Offline_Uncorrectable":  true,
	"UDMA_CRC_Error_Count":   true,
	"Reported_Uncorrect":     true,
	"Raw_Read_Error_Rate":    true,
	// NVMe log fields (flat keys at top of nvme_smart_health_information_log)
	"media_errors":        true,
	"num_err_log_entries": true,
	"percentage_used":     true,
}

// extractSMARTAttrs walks smartctl's JSON for whitelisted attribute
// values. Handles both the ATA shape (ata_smart_attributes.table[]) and
// the NVMe shape (nvme_smart_health_information_log). Returns a map
// keyed by the canonical attribute name.
//
// The result is never nil. Entries whose shape doesn't match (wrong
// type, missing "raw"/"value", non-numeric value) are skipped rather
// than reported as zero.
func extractSMARTAttrs(raw map[string]any) map[string]float64 {
	out := map[string]float64{}

	// ATA attributes are in ata_smart_attributes.table[] — each element
	// has {"name": "Reallocated_Sector_Ct", "raw": {"value": N}}.
	if ata, ok := raw["ata_smart_attributes"].(map[string]any); ok {
		rows, _ := ata["table"].([]any)
		for _, row := range rows {
			entry, ok := row.(map[string]any)
			if !ok {
				continue
			}
			name, _ := entry["name"].(string)
			if !smartAttributeWhitelist[name] {
				continue
			}
			// A failed assertion leaves rawField nil; indexing a nil map
			// is safe and makes the value assertion below fail too.
			rawField, _ := entry["raw"].(map[string]any)
			if v, ok := rawField["value"].(float64); ok {
				out[name] = v
			}
		}
	}

	// NVMe attributes live flat under nvme_smart_health_information_log.
	if nvme, ok := raw["nvme_smart_health_information_log"].(map[string]any); ok {
		for key, val := range nvme {
			if !smartAttributeWhitelist[key] {
				continue
			}
			if n, ok := val.(float64); ok {
				out[key] = n
			}
		}
	}
	return out
}
|
||||
|
||||
// diffSMARTAttrs subtracts start from end per (device, attribute).
|
||||
// Only attributes present in both ends produce a delta; missing
|
||||
// attributes drop out (can't attribute a zero-to-present delta safely).
|
||||
// Negative deltas are kept so a drive that resets a counter is visible.
|
||||
func diffSMARTAttrs(start, end smartAttrMap) map[string]map[string]float64 {
|
||||
out := map[string]map[string]float64{}
|
||||
for dev, endAttrs := range end {
|
||||
startAttrs, ok := start[dev]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
devOut := map[string]float64{}
|
||||
for attr, endV := range endAttrs {
|
||||
startV, ok := startAttrs[attr]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
devOut[attr] = endV - startV
|
||||
}
|
||||
if len(devOut) > 0 {
|
||||
out[dev] = devOut
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
@@ -0,0 +1,218 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestParseFioJSON_ATAReadWrite confirms we pull IOPS, BW, and p99
|
||||
// latency from both read and write sides. P99 is read from clat_ns and
|
||||
// converted ns → us (the unit we emit to the threshold evaluator).
|
||||
func TestParseFioJSON_ATAReadWrite(t *testing.T) {
|
||||
raw := `{
|
||||
"jobs": [{
|
||||
"read": {"iops": 1234.5, "bw": 5000, "clat_ns": {"percentile": {"99.000000": 250000}}},
|
||||
"write": {"iops": 432.1, "bw": 2000, "clat_ns": {"percentile": {"99.000000": 500000}}}
|
||||
}]
|
||||
}`
|
||||
r, err := parseFioJSON([]byte(raw))
|
||||
if err != nil {
|
||||
t.Fatalf("parseFioJSON: %v", err)
|
||||
}
|
||||
if r.ReadIOPS != 1234.5 {
|
||||
t.Errorf("ReadIOPS = %v, want 1234.5", r.ReadIOPS)
|
||||
}
|
||||
if r.WriteIOPS != 432.1 {
|
||||
t.Errorf("WriteIOPS = %v, want 432.1", r.WriteIOPS)
|
||||
}
|
||||
if r.ReadBWKBps != 5000 {
|
||||
t.Errorf("ReadBWKBps = %v, want 5000", r.ReadBWKBps)
|
||||
}
|
||||
// 250000 ns → 250 us
|
||||
if r.ReadP99Us != 250 {
|
||||
t.Errorf("ReadP99Us = %v, want 250", r.ReadP99Us)
|
||||
}
|
||||
// 500000 ns → 500 us
|
||||
if r.WriteP99Us != 500 {
|
||||
t.Errorf("WriteP99Us = %v, want 500", r.WriteP99Us)
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseFioJSON_ReadOnlyJob: if only one side has p99 populated the
|
||||
// other stays zero (not emitted as a sample). Mirrors a randread job.
|
||||
func TestParseFioJSON_ReadOnlyJob(t *testing.T) {
|
||||
raw := `{
|
||||
"jobs": [{
|
||||
"read": {"iops": 1000, "bw": 4000, "clat_ns": {"percentile": {"99.000000": 100000}}},
|
||||
"write": {"iops": 0, "bw": 0}
|
||||
}]
|
||||
}`
|
||||
r, err := parseFioJSON([]byte(raw))
|
||||
if err != nil {
|
||||
t.Fatalf("parseFioJSON: %v", err)
|
||||
}
|
||||
if r.WriteP99Us != 0 {
|
||||
t.Errorf("WriteP99Us = %v on read-only job, want 0", r.WriteP99Us)
|
||||
}
|
||||
if r.ReadP99Us != 100 {
|
||||
t.Errorf("ReadP99Us = %v, want 100", r.ReadP99Us)
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseFioJSON_NoJobs fails rather than reporting zeroes silently.
|
||||
// An empty jobs array means fio didn't run anything.
|
||||
func TestParseFioJSON_NoJobs(t *testing.T) {
|
||||
raw := `{"jobs": []}`
|
||||
if _, err := parseFioJSON([]byte(raw)); err == nil {
|
||||
t.Errorf("expected error on empty jobs array")
|
||||
}
|
||||
}
|
||||
|
||||
// TestExtractSMARTAttrs_ATA picks attributes out of ata_smart_attributes.table
|
||||
// when present. Attributes outside the whitelist drop out silently.
|
||||
func TestExtractSMARTAttrs_ATA(t *testing.T) {
|
||||
raw := map[string]any{}
|
||||
smartJSON := `{
|
||||
"ata_smart_attributes": {
|
||||
"table": [
|
||||
{"name": "Reallocated_Sector_Ct", "raw": {"value": 7}},
|
||||
{"name": "Current_Pending_Sector", "raw": {"value": 3}},
|
||||
{"name": "Spin_Retry_Count", "raw": {"value": 99}}
|
||||
]
|
||||
}
|
||||
}`
|
||||
if err := json.Unmarshal([]byte(smartJSON), &raw); err != nil {
|
||||
t.Fatalf("unmarshal fixture: %v", err)
|
||||
}
|
||||
out := extractSMARTAttrs(raw)
|
||||
if out["Reallocated_Sector_Ct"] != 7 {
|
||||
t.Errorf("Reallocated_Sector_Ct = %v, want 7", out["Reallocated_Sector_Ct"])
|
||||
}
|
||||
if out["Current_Pending_Sector"] != 3 {
|
||||
t.Errorf("Current_Pending_Sector = %v, want 3", out["Current_Pending_Sector"])
|
||||
}
|
||||
if _, ok := out["Spin_Retry_Count"]; ok {
|
||||
t.Errorf("Spin_Retry_Count should not appear (not in whitelist)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestExtractSMARTAttrs_NVMe picks media_errors and friends from the
|
||||
// nvme health log shape, which is a flat map at the top of the JSON.
|
||||
func TestExtractSMARTAttrs_NVMe(t *testing.T) {
|
||||
raw := map[string]any{}
|
||||
smartJSON := `{
|
||||
"nvme_smart_health_information_log": {
|
||||
"media_errors": 2,
|
||||
"num_err_log_entries": 15,
|
||||
"percentage_used": 7,
|
||||
"temperature": 42
|
||||
}
|
||||
}`
|
||||
if err := json.Unmarshal([]byte(smartJSON), &raw); err != nil {
|
||||
t.Fatalf("unmarshal fixture: %v", err)
|
||||
}
|
||||
out := extractSMARTAttrs(raw)
|
||||
if out["media_errors"] != 2 {
|
||||
t.Errorf("media_errors = %v, want 2", out["media_errors"])
|
||||
}
|
||||
if out["num_err_log_entries"] != 15 {
|
||||
t.Errorf("num_err_log_entries = %v, want 15", out["num_err_log_entries"])
|
||||
}
|
||||
if out["percentage_used"] != 7 {
|
||||
t.Errorf("percentage_used = %v, want 7", out["percentage_used"])
|
||||
}
|
||||
if _, ok := out["temperature"]; ok {
|
||||
t.Errorf("temperature should not appear (not in whitelist)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestDiffSMARTAttrs: end − start per (device, attr). Only attrs in
|
||||
// both snapshots yield a delta; any disappearing attribute just drops
|
||||
// out instead of showing a misleading negative.
|
||||
func TestDiffSMARTAttrs(t *testing.T) {
|
||||
start := smartAttrMap{
|
||||
"/dev/sda": {"Reallocated_Sector_Ct": 5, "Current_Pending_Sector": 0},
|
||||
}
|
||||
end := smartAttrMap{
|
||||
"/dev/sda": {"Reallocated_Sector_Ct": 8, "Current_Pending_Sector": 2, "UDMA_CRC_Error_Count": 1},
|
||||
}
|
||||
out := diffSMARTAttrs(start, end)
|
||||
if out["/dev/sda"]["Reallocated_Sector_Ct"] != 3 {
|
||||
t.Errorf("Reallocated_Sector_Ct delta = %v, want 3", out["/dev/sda"]["Reallocated_Sector_Ct"])
|
||||
}
|
||||
if out["/dev/sda"]["Current_Pending_Sector"] != 2 {
|
||||
t.Errorf("Current_Pending_Sector delta = %v, want 2", out["/dev/sda"]["Current_Pending_Sector"])
|
||||
}
|
||||
if _, ok := out["/dev/sda"]["UDMA_CRC_Error_Count"]; ok {
|
||||
t.Errorf("UDMA_CRC_Error_Count should not appear (missing at start)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestDiffSMARTAttrs_DeviceNewAtEnd: a device only present in the end
|
||||
// snapshot (drive hot-plugged mid-run, or SMART read succeeded only at
|
||||
// end) is dropped from the diff — no start baseline to subtract from.
|
||||
func TestDiffSMARTAttrs_DeviceNewAtEnd(t *testing.T) {
|
||||
start := smartAttrMap{}
|
||||
end := smartAttrMap{
|
||||
"/dev/sda": {"Reallocated_Sector_Ct": 10},
|
||||
}
|
||||
out := diffSMARTAttrs(start, end)
|
||||
if _, ok := out["/dev/sda"]; ok {
|
||||
t.Errorf("/dev/sda should drop from diff when absent at start")
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveFioOpts_Defaults: zero-valued knobs resolve to the quick
|
||||
// profile's fio_sample shape. Any stage that's missing per-profile
|
||||
// knobs (legacy claim response, test harness) still has coherent
|
||||
// bounded defaults — we won't accidentally fall into unbounded writes.
|
||||
func TestResolveFioOpts_Defaults(t *testing.T) {
|
||||
o := resolveFioOpts(StorageKnobs{})
|
||||
if o.Mode != "fio_sample" {
|
||||
t.Errorf("Mode = %q, want fio_sample", o.Mode)
|
||||
}
|
||||
if o.Size != "1GiB" {
|
||||
t.Errorf("Size = %q, want 1GiB", o.Size)
|
||||
}
|
||||
if o.Runtime != 3*time.Minute {
|
||||
t.Errorf("Runtime = %v, want 3m", o.Runtime)
|
||||
}
|
||||
if o.BS != "4k" {
|
||||
t.Errorf("BS = %q, want 4k", o.BS)
|
||||
}
|
||||
if o.RW != "randrw" {
|
||||
t.Errorf("RW = %q, want randrw", o.RW)
|
||||
}
|
||||
if o.Verify != "md5" {
|
||||
t.Errorf("Verify = %q, want md5", o.Verify)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveFioOpts_FullDiskOverride confirms the deep/soak shape
|
||||
// round-trips. FioTime as 2h overrides the 3-minute default.
|
||||
func TestResolveFioOpts_FullDiskOverride(t *testing.T) {
|
||||
k := StorageKnobs{
|
||||
Mode: "full_disk",
|
||||
FioTime: 2 * time.Hour,
|
||||
FioBS: "64k",
|
||||
FioRW: "write",
|
||||
}
|
||||
o := resolveFioOpts(k)
|
||||
if o.Mode != "full_disk" {
|
||||
t.Errorf("Mode = %q, want full_disk", o.Mode)
|
||||
}
|
||||
if o.Runtime != 2*time.Hour {
|
||||
t.Errorf("Runtime = %v, want 2h", o.Runtime)
|
||||
}
|
||||
if o.BS != "64k" {
|
||||
t.Errorf("BS = %q, want 64k", o.BS)
|
||||
}
|
||||
if o.RW != "write" {
|
||||
t.Errorf("RW = %q, want write", o.RW)
|
||||
}
|
||||
// Verify should fall back to md5 default since knob was empty.
|
||||
if o.Verify != "md5" {
|
||||
t.Errorf("Verify = %q, want md5 (default)", o.Verify)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user