package tests import ( "context" "encoding/json" "fmt" "os/exec" "runtime" "strconv" "strings" "sync" "time" "vetting/agent/probes" ) // BurnConfig is what the agent passes to Burn: the orchestrator's iperf3 // server address and port. Durations + concurrency knobs come from // Deps.BurnKnobs so they scale with profile. type BurnConfig struct { OrchestratorURL string IperfPort int // 0 = 5201 } // Burn is the concurrent soak stage. Unlike CPUStress (serial // CPU→memory) or Storage (serial per disk) it fans out every workload // at once: stress-ng hammers CPU + memory, fio drives the allow-listed // disks, iperf3 pushes sustained NIC traffic, and two sidecars poll // EDAC + PSU rails for the duration of the window. // // This is where PSU rails actually matter: 12V sag under simultaneous // CPU + disk + NIC load is exactly the failure a thermal/power // regression produces, and it's invisible to any stage that loads one // subsystem at a time. The PSU stage that follows Burn in the pipeline // re-samples rails post-window to confirm they settle back to nominal. // // Burn stays inside the stage framework — it doesn't spawn a parallel // stage runner. The goroutine fan-out is local; the stage converges // before returning an Outcome so every invariant the orchestrator // relies on (serial stage order, single in-flight stage per run) still // holds. func Burn(ctx context.Context, d Deps, cfg BurnConfig) Outcome { duration := d.BurnKnobs.Duration if duration <= 0 { duration = 2 * time.Minute } cpuWorkers := resolveCPUWorkers(d.BurnKnobs.CPUWorkers) memPct := clampMemPct(d.BurnKnobs.MemPct) iperfParallel := d.BurnKnobs.IperfParallel if iperfParallel <= 0 { iperfParallel = 2 } d.Info(fmt.Sprintf("Burn: window=%s cpu_workers=%d mem_pct=%d iperf_parallel=%d fio_on_spare=%v", duration, cpuWorkers, memPct, iperfParallel, d.BurnKnobs.FioOnSpare)) // Sidecars run for the lifetime of the window and are cancelled on // return so the main stage converges cleanly. EDAC catches DIMM // bit-flips that appear only under concurrent load; PSU catches // rail sag that only appears when CPU + disk + NIC pull current // simultaneously. sideCtx, sideCancel := context.WithCancel(ctx) defer sideCancel() var sideWG sync.WaitGroup sideWG.Add(2) go runEDACSidecar(sideCtx, &sideWG, d) go runPSUSidecar(sideCtx, &sideWG, d) runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second) defer cancel() results := make(chan burnSubResult, 4) var wg sync.WaitGroup wg.Add(1) go func() { defer wg.Done() results <- runBurnCPU(runCtx, d, duration, cpuWorkers) }() wg.Add(1) go func() { defer wg.Done() results <- runBurnMemory(runCtx, d, duration, memPct) }() // fio runs only when explicitly enabled *and* there are allow-listed // disks *and* the run wasn't marked non-destructive. Any of those // missing records a Skipped sub-step so the operator sees why. if d.BurnKnobs.FioOnSpare && len(d.ExpectedDisks) > 0 && !d.NonDestructive { wg.Add(1) go func() { defer wg.Done() results <- runBurnFio(runCtx, d, duration) }() } else { reason := burnFioSkipReason(d) results <- burnSubResult{Name: "Burn fio", Skipped: true, Reason: reason} } // iperf requires an orchestrator host. Lab hosts run with the // bundled iperf3 server; without a base URL we can't derive a // target so we skip rather than fail the stage. if cfg.OrchestratorURL != "" { wg.Add(1) go func() { defer wg.Done() results <- runBurnIperf(runCtx, d, duration, cfg.OrchestratorURL, cfg.IperfPort, iperfParallel) }() } else { results <- burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "no orchestrator host"} } wg.Wait() sideCancel() sideWG.Wait() close(results) subs, samples, failures := collectBurnResults(results) if d.Sensor != nil && len(samples) > 0 { _ = d.Sensor(ctx, samples) } extras := map[string]any{ "duration": duration.String(), "cpu_workers": cpuWorkers, "mem_pct": memPct, "iperf_parallel": iperfParallel, "fio_on_spare": d.BurnKnobs.FioOnSpare, } if len(failures) > 0 { msg := "Burn workloads failed: " + strings.Join(failures, ", ") d.Error(msg) return Outcome{ Passed: false, Message: msg, Summary: fmt.Sprintf("Burn failed (%d of %d workloads)", len(failures), len(subs)), Extras: extras, SubSteps: subs, } } d.Info(fmt.Sprintf("Burn: %s window passed; %d workloads converged", duration, len(subs))) return Outcome{ Passed: true, Summary: fmt.Sprintf("Burn %s passed (%d workloads)", duration, len(subs)), Extras: extras, SubSteps: subs, } } // burnSubResult is the per-workload return type used by the fan-out // goroutines. Sample slice is merged into the stage's final /sensor // batch; SubStep becomes a row on the /result sub-steps list. type burnSubResult struct { Name string Passed bool Skipped bool Reason string // why a workload was skipped Err string // why a workload failed Samples []Sample SubStep SubStepReport } func collectBurnResults(ch <-chan burnSubResult) ([]SubStepReport, []Sample, []string) { var subs []SubStepReport var samples []Sample var failures []string for r := range ch { // Non-skipped goroutines populate SubStep directly. Skipped slots // get a synthesized row here so the /result shape stays stable. if r.Skipped { stamp := time.Now().UTC() subs = append(subs, SubStepReport{ Name: r.Name, Skipped: true, StartedAt: stamp, CompletedAt: stamp, SummaryJSON: mustJSON(map[string]any{"skipped": true, "reason": r.Reason}), }) continue } subs = append(subs, r.SubStep) samples = append(samples, r.Samples...) if !r.Passed { reason := r.Err if reason == "" { reason = "unknown" } failures = append(failures, r.Name+": "+reason) } } return subs, samples, failures } func burnFioSkipReason(d Deps) string { if !d.BurnKnobs.FioOnSpare { return "fio_on_spare knob disabled" } if d.NonDestructive { return "non-destructive run" } if len(d.ExpectedDisks) == 0 { return "no allowlisted disks" } return "disabled" } // runBurnCPU hammers all CPU cores with stress-ng for the window. Same // shape as CPUStress pass 1 but with shorter label so the sub-step row // doesn't collide with the earlier stage's "CPU pass". func runBurnCPU(ctx context.Context, d Deps, duration time.Duration, workers int) burnSubResult { if _, err := exec.LookPath("stress-ng"); err != nil { return burnSubResult{Name: "Burn CPU", Err: "stress-ng missing"} } args := []string{ "--cpu", strconv.Itoa(workers), "--cpu-method", "all", "--timeout", durationSeconds(duration), "--metrics-brief", "--verify", } d.Info(fmt.Sprintf("Burn: stress-ng %s", strings.Join(args, " "))) pass := runStressPass(ctx, d, "Burn CPU", duration, args) return burnSubResult{ Name: "Burn CPU", Passed: pass.Passed, Err: pass.Err, SubStep: subStepFromPass("Burn CPU", pass), } } // runBurnMemory drives a single --vm worker sized at memPct of // MemAvailable, capped so the kernel + agent + other workloads still // have headroom. Clamping happens here rather than in resolveBurnKnobs // so the cap is computed against real live memory each run. func runBurnMemory(ctx context.Context, d Deps, duration time.Duration, memPct int) burnSubResult { if _, err := exec.LookPath("stress-ng"); err != nil { return burnSubResult{Name: "Burn memory", Err: "stress-ng missing"} } avail, err := memAvailableBytes() if err != nil { return burnSubResult{Name: "Burn memory", Err: "read MemAvailable: " + err.Error()} } // Budget = avail * memPct / 100, then subtract the standard headroom. // If the result is below the memory-pass floor we record a skipped // row instead — the window is too tight to be meaningful on this box. budget := int64(float64(avail) * float64(memPct) / 100.0) cap := budget - memHeadroomBytes if cap < memFloorBytes { return burnSubResult{ Name: "Burn memory", Skipped: true, Reason: fmt.Sprintf("budget %s below floor %s after headroom", humanBytes(budget), humanBytes(memFloorBytes)), } } args := []string{ "--vm", "1", "--vm-bytes", strconv.FormatInt(cap, 10), "--vm-keep", "--timeout", durationSeconds(duration), "--metrics-brief", "--verify", } d.Info(fmt.Sprintf("Burn: stress-ng memory cap=%s (%d%% of MemAvailable)", humanBytes(cap), memPct)) pass := runStressPass(ctx, d, "Burn memory", duration, args) return burnSubResult{ Name: "Burn memory", Passed: pass.Passed, Err: pass.Err, SubStep: subStepFromPass(fmt.Sprintf("Burn memory (cap %s)", humanBytes(cap)), pass), } } // runBurnFio runs fio_sample against the first allow-listed disk for // the window. Reuses runFioVerify + parseFioJSON so the samples line // up with what Storage emits. Using fio_sample (bounded by --size) // keeps Burn's write volume predictable regardless of profile. func runBurnFio(ctx context.Context, d Deps, duration time.Duration) burnSubResult { if _, err := exec.LookPath("fio"); err != nil { return burnSubResult{Name: "Burn fio", Err: "fio missing"} } targets := resolveTargets(d.ExpectedDisks) if len(targets) == 0 { return burnSubResult{Name: "Burn fio", Skipped: true, Reason: "no allow-listed disks present"} } t := targets[0] opts := fioOpts{ Mode: "fio_sample", Size: "512MiB", Runtime: duration, BS: "4k", RW: "randrw", Verify: "md5", } start := time.Now() d.Info(fmt.Sprintf("Burn: fio %s on %s (%s window)", opts.Mode, t.Device, duration)) fr := runFioVerify(ctx, t.Device, opts) end := time.Now() sub := SubStepReport{ Name: "Burn fio " + t.Device, Passed: fr.Error == "", StartedAt: start, CompletedAt: end, SummaryJSON: mustJSON(fr), } out := burnSubResult{Name: "Burn fio", SubStep: sub, Passed: fr.Error == "", Err: fr.Error} if fr.Error == "" { out.Samples = append(out.Samples, Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"}, Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"}, ) if fr.ReadP99Us > 0 { out.Samples = append(out.Samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"}) } if fr.WriteP99Us > 0 { out.Samples = append(out.Samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"}) } } return out } // runBurnIperf drives iperf3 -P N for the window. Reuses parseIperfJSON // so the same (mbps, retrans, bytesSent) extraction the Network stage // uses applies here too. Samples emitted as Burn-scoped keys so the // dashboard can tell at-a-glance which window they came from. func runBurnIperf(ctx context.Context, d Deps, duration time.Duration, orchestratorURL string, port, parallel int) burnSubResult { if _, err := exec.LookPath("iperf3"); err != nil { return burnSubResult{Name: "Burn iperf", Err: "iperf3 missing"} } host, err := deriveHost(orchestratorURL) if err != nil || host == "" { return burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "can't derive orchestrator host"} } if port == 0 { port = 5201 } if parallel < 1 { parallel = 1 } args := []string{ "-c", host, "-p", strconv.Itoa(port), "-t", strconv.Itoa(int(duration.Seconds())), "-P", strconv.Itoa(parallel), "-J", } runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second) defer cancel() start := time.Now() out, err := exec.CommandContext(runCtx, "iperf3", args...).Output() end := time.Now() if err != nil { return burnSubResult{ Name: "Burn iperf", Err: "iperf3 client error: " + err.Error(), SubStep: SubStepReport{ Name: "Burn iperf", StartedAt: start, CompletedAt: end, SummaryJSON: mustJSON(map[string]any{"error": err.Error(), "stderr_tail": tailLines(string(out), 20)}), }, } } mbps, retrans, bytesSent, _, perr := parseIperfJSON(out) if perr != nil { return burnSubResult{ Name: "Burn iperf", Err: "parse iperf3 json: " + perr.Error(), SubStep: SubStepReport{ Name: "Burn iperf", StartedAt: start, CompletedAt: end, SummaryJSON: mustJSON(map[string]any{"error": perr.Error()}), }, } } samples := []Sample{{Kind: "iperf", Key: "burn/throughput_mbps", Value: mbps, Unit: "Mbps"}} if bytesSent > 0 { packets := float64(bytesSent) / 1460.0 if packets > 0 { samples = append(samples, Sample{ Kind: "nic_retrans", Key: "burn/rate", Value: float64(retrans) / packets, Unit: "rate", }) } } passed := mbps > 0 errMsg := "" if !passed { errMsg = "zero throughput from iperf3" } return burnSubResult{ Name: "Burn iperf", Passed: passed, Err: errMsg, Samples: samples, SubStep: SubStepReport{ Name: fmt.Sprintf("Burn iperf (P=%d)", parallel), Passed: passed, StartedAt: start, CompletedAt: end, SummaryJSON: mustJSON(map[string]any{ "throughput_mbps": mbps, "retransmits": retrans, "bytes_sent": bytesSent, "parallel": parallel, }), }, } } // runPSUSidecar polls /sys/class/hwmon rails every 5s for the duration // of the Burn window, piping each read into the stage's sensor channel // as a psu_volt sample. The threshold evaluator then applies the same // within_pct gates used by the PSU stage — a 12V rail sagging to 10.5V // under load will fire the critical threshold mid-Burn and the run // will flip into FailedHolding without waiting for the post-Burn PSU // stage to catch it. func runPSUSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) { defer wg.Done() if d.Sensor == nil { return } t := time.NewTicker(5 * time.Second) defer t.Stop() for { select { case <-ctx.Done(): return case <-t.C: rails := scanPSURails() if len(rails) == 0 { continue } batch := make([]Sample, 0, len(rails)) for _, r := range rails { batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"}) } sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second) if err := d.Sensor(sendCtx, batch); err != nil { d.Warn("Burn: PSU sample post: " + err.Error()) } cancel() } } } func resolveCPUWorkers(raw string) int { if raw == "" || strings.EqualFold(raw, "all") { return runtime.NumCPU() } if n, err := strconv.Atoi(raw); err == nil && n > 0 { return n } return runtime.NumCPU() } // clampMemPct keeps the knob in a sane band. 0 means "use default 50%"; // above 90 would crowd the kernel + agent + fio + iperf3 workers off the // page cache. Anything outside [10, 90] is clamped. func clampMemPct(pct int) int { if pct <= 0 { return 50 } if pct < 10 { return 10 } if pct > 90 { return 90 } return pct } func mustJSON(v any) json.RawMessage { b, err := json.Marshal(v) if err != nil { return json.RawMessage([]byte(`{"marshal_error":"` + err.Error() + `"}`)) } return b } // Ensure the probes package import stays anchored — the Burn sidecars // use probes.EDAC + the PSU rail scanner defined in psu.go which // otherwise wouldn't pull probes in on its own. var _ = probes.EDAC