deep profile + threshold gating + firmware stage + Burn super-stage
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled

Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
+486
View File
@@ -0,0 +1,486 @@
package tests
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"os/exec"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"time"

	"vetting/agent/probes"
)
// BurnConfig is what the agent passes to Burn: the location of the
// orchestrator's iperf3 server. Durations and concurrency knobs are
// deliberately not carried here; Burn reads those from Deps.BurnKnobs so
// they scale with the profile.
type BurnConfig struct {
// OrchestratorURL is the base URL the iperf3 target host is derived
// from. When it is empty Burn records the iperf workload as skipped
// instead of failing the stage.
OrchestratorURL string
// IperfPort is the iperf3 server port. Zero selects the default, 5201.
IperfPort int // 0 = 5201
}
// Burn is the concurrent soak stage. Unlike CPUStress (serial
// CPU→memory) or Storage (serial per disk) it fans out every workload
// at once: stress-ng hammers CPU + memory, fio drives the allow-listed
// disks, iperf3 pushes sustained NIC traffic, and two sidecars (EDAC and
// PSU rails) poll for the duration of the window.
//
// This is where PSU rails actually matter: 12V sag under simultaneous
// CPU + disk + NIC load is exactly the failure a thermal/power
// regression produces, and it's invisible to any stage that loads one
// subsystem at a time.
//
// Burn stays inside the stage framework — it doesn't spawn a parallel
// stage runner. The goroutine fan-out is local; every workload and both
// sidecars are waited on before an Outcome is returned, so the
// orchestrator's invariants (serial stage order, single in-flight stage
// per run) still hold.
//
// Parameters:
//   - ctx: parent context; cancelling it cancels every workload and sidecar.
//   - d:   stage dependencies. d.BurnKnobs supplies Duration (default 2m
//     when <= 0), CPUWorkers, MemPct, IperfParallel (default 2 when <= 0)
//     and FioOnSpare.
//   - cfg: iperf3 target information; an empty OrchestratorURL skips iperf.
//
// The returned Outcome has Passed=false when any non-skipped workload
// reports a failure; Message then lists "<workload>: <reason>" pairs.
// Skipped workloads are reported as sub-steps but never fail the stage.
func Burn(ctx context.Context, d Deps, cfg BurnConfig) Outcome {
	duration := d.BurnKnobs.Duration
	if duration <= 0 {
		duration = 2 * time.Minute
	}
	cpuWorkers := resolveCPUWorkers(d.BurnKnobs.CPUWorkers)
	memPct := clampMemPct(d.BurnKnobs.MemPct)
	iperfParallel := d.BurnKnobs.IperfParallel
	if iperfParallel <= 0 {
		iperfParallel = 2
	}
	d.Info(fmt.Sprintf("Burn: window=%s cpu_workers=%d mem_pct=%d iperf_parallel=%d fio_on_spare=%v",
		duration, cpuWorkers, memPct, iperfParallel, d.BurnKnobs.FioOnSpare))

	// Sidecars run for the lifetime of the window and are cancelled once
	// the workloads have converged (the deferred cancel covers early
	// returns and panics).
	sideCtx, sideCancel := context.WithCancel(ctx)
	defer sideCancel()
	var sideWG sync.WaitGroup
	sideWG.Add(2)
	go runEDACSidecar(sideCtx, &sideWG, d)
	go runPSUSidecar(sideCtx, &sideWG, d)

	// 30s of slack on top of the window so tools can start up and flush
	// their reports before the context kills them.
	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
	defer cancel()

	// Exactly four results are ever sent (CPU, memory, fio-or-skip,
	// iperf-or-skip), so a buffer of four means no sender can block even
	// though nothing reads until every workload has finished.
	results := make(chan burnSubResult, 4)
	var wg sync.WaitGroup

	wg.Add(1)
	go func() {
		defer wg.Done()
		results <- runBurnCPU(runCtx, d, duration, cpuWorkers)
	}()

	wg.Add(1)
	go func() {
		defer wg.Done()
		results <- runBurnMemory(runCtx, d, duration, memPct)
	}()

	// fio runs only when explicitly enabled *and* there are allow-listed
	// disks *and* the run wasn't marked non-destructive. Any of those
	// missing records a Skipped sub-step so the operator sees why.
	if d.BurnKnobs.FioOnSpare && len(d.ExpectedDisks) > 0 && !d.NonDestructive {
		wg.Add(1)
		go func() {
			defer wg.Done()
			results <- runBurnFio(runCtx, d, duration)
		}()
	} else {
		results <- burnSubResult{Name: "Burn fio", Skipped: true, Reason: burnFioSkipReason(d)}
	}

	// iperf requires an orchestrator host; without a base URL there is no
	// target to derive, so skip rather than fail the stage.
	if cfg.OrchestratorURL != "" {
		wg.Add(1)
		go func() {
			defer wg.Done()
			results <- runBurnIperf(runCtx, d, duration, cfg.OrchestratorURL, cfg.IperfPort, iperfParallel)
		}()
	} else {
		results <- burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "no orchestrator host"}
	}

	// Converge: workloads first, then stop and drain the sidecars, and
	// only then close the channel so the collector's range terminates.
	wg.Wait()
	sideCancel()
	sideWG.Wait()
	close(results)

	subs, samples, failures := collectBurnResults(results)
	if d.Sensor != nil && len(samples) > 0 {
		// Sample delivery is best-effort: a failed post must not flip the
		// stage result, but it is surfaced as a warning (the same way the
		// PSU sidecar reports its post failures) instead of being dropped
		// silently.
		if err := d.Sensor(ctx, samples); err != nil {
			d.Warn("Burn: workload sample post: " + err.Error())
		}
	}

	// Extras echo the effective (post-default, post-clamp) knobs.
	extras := map[string]any{
		"duration":       duration.String(),
		"cpu_workers":    cpuWorkers,
		"mem_pct":        memPct,
		"iperf_parallel": iperfParallel,
		"fio_on_spare":   d.BurnKnobs.FioOnSpare,
	}

	if len(failures) > 0 {
		msg := "Burn workloads failed: " + strings.Join(failures, ", ")
		d.Error(msg)
		return Outcome{
			Passed:   false,
			Message:  msg,
			Summary:  fmt.Sprintf("Burn failed (%d of %d workloads)", len(failures), len(subs)),
			Extras:   extras,
			SubSteps: subs,
		}
	}

	d.Info(fmt.Sprintf("Burn: %s window passed; %d workloads converged", duration, len(subs)))
	return Outcome{
		Passed:   true,
		Summary:  fmt.Sprintf("Burn %s passed (%d workloads)", duration, len(subs)),
		Extras:   extras,
		SubSteps: subs,
	}
}
// burnSubResult is the per-workload return type used by the fan-out
// goroutines in Burn. Samples are merged into the stage's final sensor
// batch; SubStep becomes a row on the stage's sub-steps list.
type burnSubResult struct {
// Name identifies the workload (e.g. "Burn CPU"); it prefixes the
// failure text and names the synthesized row for skipped workloads.
Name string
// Passed reports whether the workload succeeded. Ignored when Skipped.
Passed bool
// Skipped marks a workload that did not run; it never fails the stage.
Skipped bool
Reason string // why a workload was skipped
Err string // why a workload failed
// Samples are the sensor readings produced by the workload, if any.
Samples []Sample
// SubStep is the report row built by the workload itself. Some
// early-exit failure paths leave it at its zero value.
SubStep SubStepReport
}
// collectBurnResults drains ch until it is closed and splits what it
// receives into the three things the stage needs:
//
//   - one SubStepReport per workload, in arrival order;
//   - the concatenated sensor samples of every non-skipped workload;
//   - one "<name>: <reason>" string per failed workload ("unknown" when
//     the workload gave no reason).
//
// The caller must close ch, otherwise this function never returns.
//
// Rows are synthesized in two cases so the sub-step list keeps a stable
// shape: skipped workloads (which carry only Name + Reason), and failed
// workloads that returned before building their own SubStep (for example
// when the tool binary is missing). Without the second case such a
// failure would show up as a nameless row with zero timestamps.
func collectBurnResults(ch <-chan burnSubResult) ([]SubStepReport, []Sample, []string) {
	var (
		subs     []SubStepReport
		samples  []Sample
		failures []string
	)
	for r := range ch {
		if r.Skipped {
			stamp := time.Now().UTC()
			subs = append(subs, SubStepReport{
				Name:        r.Name,
				Skipped:     true,
				StartedAt:   stamp,
				CompletedAt: stamp,
				SummaryJSON: mustJSON(map[string]any{"skipped": true, "reason": r.Reason}),
			})
			continue
		}

		reason := r.Err
		if reason == "" {
			reason = "unknown"
		}

		step := r.SubStep
		if step.Name == "" {
			// The workload bailed out before it could populate SubStep;
			// build the row from what the result does carry.
			stamp := time.Now().UTC()
			step = SubStepReport{
				Name:        r.Name,
				Passed:      r.Passed,
				StartedAt:   stamp,
				CompletedAt: stamp,
			}
			if !r.Passed {
				step.SummaryJSON = mustJSON(map[string]any{"error": reason})
			}
		}

		subs = append(subs, step)
		samples = append(samples, r.Samples...)
		if !r.Passed {
			failures = append(failures, r.Name+": "+reason)
		}
	}
	return subs, samples, failures
}
// burnFioSkipReason explains why the fio workload is not being run. The
// checks are ordered, so when several conditions hold the first match
// wins: knob disabled, then non-destructive run, then empty disk
// allow-list. "disabled" is the fallback when none of them applies.
func burnFioSkipReason(d Deps) string {
	switch {
	case !d.BurnKnobs.FioOnSpare:
		return "fio_on_spare knob disabled"
	case d.NonDestructive:
		return "non-destructive run"
	case len(d.ExpectedDisks) == 0:
		return "no allowlisted disks"
	default:
		return "disabled"
	}
}
// runBurnCPU runs stress-ng's CPU stressor with the given worker count
// for the Burn window, using every CPU method and result verification.
// The sub-step is labelled "Burn CPU" so it is distinguishable from the
// CPUStress stage's own rows.
//
// A missing stress-ng binary is reported as a failure (Err set, no
// SubStep populated); otherwise Passed/Err mirror what runStressPass
// returned.
func runBurnCPU(ctx context.Context, d Deps, duration time.Duration, workers int) burnSubResult {
	const label = "Burn CPU"

	if _, lookErr := exec.LookPath("stress-ng"); lookErr != nil {
		return burnSubResult{Name: label, Err: "stress-ng missing"}
	}

	argv := []string{"--cpu", strconv.Itoa(workers)}
	argv = append(argv, "--cpu-method", "all")
	argv = append(argv, "--timeout", durationSeconds(duration))
	argv = append(argv, "--metrics-brief", "--verify")

	d.Info(fmt.Sprintf("Burn: stress-ng %s", strings.Join(argv, " ")))

	pass := runStressPass(ctx, d, label, duration, argv)

	res := burnSubResult{Name: label}
	res.Passed = pass.Passed
	res.Err = pass.Err
	res.SubStep = subStepFromPass(label, pass)
	return res
}
// runBurnMemory drives a single stress-ng --vm worker for the Burn
// window. The worker's size is memPct percent of the MemAvailable figure
// read at call time, minus memHeadroomBytes, so the limit reflects live
// memory on every run.
//
// Outcomes:
//   - stress-ng missing or MemAvailable unreadable: failure with Err set.
//   - computed size below memFloorBytes: Skipped, with a Reason quoting
//     the pre-headroom budget and the floor.
//   - otherwise: Passed/Err mirror runStressPass, and the sub-step name
//     carries the effective cap.
func runBurnMemory(ctx context.Context, d Deps, duration time.Duration, memPct int) burnSubResult {
	const label = "Burn memory"

	if _, lookErr := exec.LookPath("stress-ng"); lookErr != nil {
		return burnSubResult{Name: label, Err: "stress-ng missing"}
	}

	avail, err := memAvailableBytes()
	if err != nil {
		return burnSubResult{Name: label, Err: "read MemAvailable: " + err.Error()}
	}

	// budget is the raw share of MemAvailable; vmBytes is what is left
	// for the worker once the standard headroom has been set aside.
	budget := int64(float64(avail) * float64(memPct) / 100.0)
	vmBytes := budget - memHeadroomBytes
	if vmBytes < memFloorBytes {
		why := fmt.Sprintf("budget %s below floor %s after headroom", humanBytes(budget), humanBytes(memFloorBytes))
		return burnSubResult{Name: label, Skipped: true, Reason: why}
	}

	argv := []string{"--vm", "1"}
	argv = append(argv, "--vm-bytes", strconv.FormatInt(vmBytes, 10), "--vm-keep")
	argv = append(argv, "--timeout", durationSeconds(duration))
	argv = append(argv, "--metrics-brief", "--verify")

	d.Info(fmt.Sprintf("Burn: stress-ng memory cap=%s (%d%% of MemAvailable)", humanBytes(vmBytes), memPct))

	pass := runStressPass(ctx, d, label, duration, argv)

	res := burnSubResult{Name: label}
	res.Passed = pass.Passed
	res.Err = pass.Err
	res.SubStep = subStepFromPass(fmt.Sprintf("Burn memory (cap %s)", humanBytes(vmBytes)), pass)
	return res
}
// runBurnFio runs a bounded fio_sample job (512MiB, 4k randrw, md5
// verify) against the first resolved allow-listed disk for the Burn
// window, via runFioVerify.
//
// Outcomes:
//   - fio missing: failure with Err set.
//   - no resolvable target: Skipped.
//   - otherwise: the workload passes exactly when fio reported no error.
//     On success, read/write IOPS samples are emitted, plus p99 latency
//     samples for whichever direction reported a positive value.
func runBurnFio(ctx context.Context, d Deps, duration time.Duration) burnSubResult {
	const label = "Burn fio"

	if _, lookErr := exec.LookPath("fio"); lookErr != nil {
		return burnSubResult{Name: label, Err: "fio missing"}
	}

	targets := resolveTargets(d.ExpectedDisks)
	if len(targets) == 0 {
		return burnSubResult{Name: label, Skipped: true, Reason: "no allow-listed disks present"}
	}
	target := targets[0]
	dev := target.Device

	opts := fioOpts{
		Mode:    "fio_sample",
		Size:    "512MiB",
		Runtime: duration,
		BS:      "4k",
		RW:      "randrw",
		Verify:  "md5",
	}

	began := time.Now()
	d.Info(fmt.Sprintf("Burn: fio %s on %s (%s window)", opts.Mode, dev, duration))
	fr := runFioVerify(ctx, dev, opts)
	finished := time.Now()

	ok := fr.Error == ""
	res := burnSubResult{
		Name:   label,
		Passed: ok,
		Err:    fr.Error,
		SubStep: SubStepReport{
			Name:        "Burn fio " + dev,
			Passed:      ok,
			StartedAt:   began,
			CompletedAt: finished,
			SummaryJSON: mustJSON(fr),
		},
	}
	if !ok {
		// Failed jobs contribute no samples.
		return res
	}

	res.Samples = []Sample{
		{Kind: "fio", Key: dev + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
		{Kind: "fio", Key: dev + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
	}
	if fr.ReadP99Us > 0 {
		res.Samples = append(res.Samples, Sample{Kind: "fio_p99_us", Key: dev + "/read", Value: fr.ReadP99Us, Unit: "us"})
	}
	if fr.WriteP99Us > 0 {
		res.Samples = append(res.Samples, Sample{Kind: "fio_p99_us", Key: dev + "/write", Value: fr.WriteP99Us, Unit: "us"})
	}
	return res
}
// runBurnIperf runs an iperf3 client with `parallel` streams against the
// orchestrator's iperf3 server for the Burn window and parses the JSON
// report with parseIperfJSON. Samples use "burn/"-prefixed keys so they
// can be told apart from the Network stage's samples.
//
// Parameters:
//   - orchestratorURL: base URL the target host is derived from.
//   - port: iperf3 server port; 0 selects 5201.
//   - parallel: stream count (-P); values below 1 are raised to 1.
//
// Outcomes:
//   - iperf3 missing: failure with Err set.
//   - host cannot be derived: Skipped.
//   - client exits with an error or its JSON can't be parsed: failure,
//     with the error (and output tails) in the sub-step summary.
//   - otherwise: passes exactly when the measured throughput is > 0.
func runBurnIperf(ctx context.Context, d Deps, duration time.Duration, orchestratorURL string, port, parallel int) burnSubResult {
	const name = "Burn iperf"

	if _, err := exec.LookPath("iperf3"); err != nil {
		return burnSubResult{Name: name, Err: "iperf3 missing"}
	}
	host, err := deriveHost(orchestratorURL)
	if err != nil || host == "" {
		return burnSubResult{Name: name, Skipped: true, Reason: "can't derive orchestrator host"}
	}
	if port == 0 {
		port = 5201
	}
	if parallel < 1 {
		parallel = 1
	}

	// Truncating a sub-second window would pass "-t 0", which iperf3 does
	// not interpret as "a very short test" (it is understood to mean an
	// unbounded run), so never go below one second.
	secs := int(duration.Seconds())
	if secs < 1 {
		secs = 1
	}

	args := []string{
		"-c", host,
		"-p", strconv.Itoa(port),
		"-t", strconv.Itoa(secs),
		"-P", strconv.Itoa(parallel),
		"-J",
	}
	d.Info(fmt.Sprintf("Burn: iperf3 %s", strings.Join(args, " ")))

	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
	defer cancel()

	start := time.Now()
	out, err := exec.CommandContext(runCtx, "iperf3", args...).Output()
	end := time.Now()

	// failed builds the result for the two post-launch failure paths.
	failed := func(msg string, summary map[string]any) burnSubResult {
		return burnSubResult{
			Name: name,
			Err:  msg,
			SubStep: SubStepReport{
				Name:        name,
				StartedAt:   start,
				CompletedAt: end,
				SummaryJSON: mustJSON(summary),
			},
		}
	}

	if err != nil {
		// Output() returns stdout only. When the command's Stderr is nil,
		// os/exec stores captured stderr on the *exec.ExitError, so that is
		// where the real stderr tail comes from. Stdout is kept as well:
		// with -J iperf3 may report its error inside the JSON document.
		var stderr string
		var exitErr *exec.ExitError
		if errors.As(err, &exitErr) {
			stderr = string(exitErr.Stderr)
		}
		return failed("iperf3 client error: "+err.Error(), map[string]any{
			"error":       err.Error(),
			"stderr_tail": tailLines(stderr, 20),
			"stdout_tail": tailLines(string(out), 20),
		})
	}

	mbps, retrans, bytesSent, _, perr := parseIperfJSON(out)
	if perr != nil {
		return failed("parse iperf3 json: "+perr.Error(), map[string]any{"error": perr.Error()})
	}

	samples := []Sample{{Kind: "iperf", Key: "burn/throughput_mbps", Value: mbps, Unit: "Mbps"}}
	if bytesSent > 0 {
		// Approximate the packet count with a 1460-byte payload per
		// segment to turn the retransmit count into a rate.
		packets := float64(bytesSent) / 1460.0
		samples = append(samples, Sample{
			Kind: "nic_retrans", Key: "burn/rate",
			Value: float64(retrans) / packets, Unit: "rate",
		})
	}

	passed := mbps > 0
	errMsg := ""
	if !passed {
		errMsg = "zero throughput from iperf3"
	}
	return burnSubResult{
		Name:    name,
		Passed:  passed,
		Err:     errMsg,
		Samples: samples,
		SubStep: SubStepReport{
			Name:        fmt.Sprintf("Burn iperf (P=%d)", parallel),
			Passed:      passed,
			StartedAt:   start,
			CompletedAt: end,
			SummaryJSON: mustJSON(map[string]any{
				"throughput_mbps": mbps,
				"retransmits":     retrans,
				"bytes_sent":      bytesSent,
				"parallel":        parallel,
			}),
		},
	}
}
// runPSUSidecar samples the PSU rails (via scanPSURails) every 5s until
// ctx is cancelled and posts each reading through d.Sensor as a
// "psu_volt" sample in volts, keyed by rail label. Posting mid-window is
// what lets rail sag under combined load be seen while Burn is still
// running. NOTE(review): how the receiver gates these samples is not
// visible from here.
//
// It always calls wg.Done on exit and returns immediately when d.Sensor
// is nil. Ticks that find no rails are skipped. Each post gets its own 5s
// timeout; a failed post is logged with d.Warn and polling continues.
func runPSUSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
	defer wg.Done()
	if d.Sensor == nil {
		return
	}

	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	stop := ctx.Done()
	for {
		select {
		case <-stop:
			return
		case <-ticker.C:
		}

		rails := scanPSURails()
		if len(rails) == 0 {
			continue
		}

		readings := make([]Sample, 0, len(rails))
		for _, rail := range rails {
			readings = append(readings, Sample{
				Kind:  "psu_volt",
				Key:   rail.Label,
				Value: rail.Volts,
				Unit:  "V",
			})
		}

		postCtx, release := context.WithTimeout(ctx, 5*time.Second)
		postErr := d.Sensor(postCtx, readings)
		if postErr != nil {
			d.Warn("Burn: PSU sample post: " + postErr.Error())
		}
		release()
	}
}
// resolveCPUWorkers turns the CPUWorkers knob into a worker count. An
// empty string or "all" (case-insensitive) selects runtime.NumCPU(); a
// positive decimal integer is used as given; anything else (zero,
// negative, non-numeric) also falls back to runtime.NumCPU().
func resolveCPUWorkers(raw string) int {
	everyCore := runtime.NumCPU()

	switch {
	case raw == "", strings.EqualFold(raw, "all"):
		return everyCore
	}

	count, convErr := strconv.Atoi(raw)
	if convErr != nil || count <= 0 {
		return everyCore
	}
	return count
}
// clampMemPct normalizes the memory-percentage knob. Zero or a negative
// value means "unset" and yields the default of 50; any other value is
// clamped into the band [10, 90] so the memory workload can neither be
// trivially small nor crowd out everything else running on the host.
func clampMemPct(pct int) int {
	const (
		fallback = 50
		lowest   = 10
		highest  = 90
	)
	switch {
	case pct <= 0:
		return fallback
	case pct < lowest:
		return lowest
	case pct > highest:
		return highest
	}
	return pct
}
// mustJSON marshals v for use as a sub-step summary and never fails: if
// v cannot be marshalled, it returns a small JSON object of the form
// {"marshal_error": "<message>"} instead.
//
// The fallback is itself produced by json.Marshal so the error text is
// escaped properly; splicing err.Error() into a literal would yield
// invalid JSON whenever the message contains a quote or a backslash.
func mustJSON(v any) json.RawMessage {
	b, err := json.Marshal(v)
	if err == nil {
		return b
	}
	fallback, ferr := json.Marshal(map[string]string{"marshal_error": err.Error()})
	if ferr != nil {
		// Not expected for a map of plain strings; keep the result valid
		// JSON regardless.
		return json.RawMessage(`{"marshal_error":"unencodable error"}`)
	}
	return fallback
}
// Compile-time anchor for the probes import: this blank assignment keeps
// "vetting/agent/probes" referenced from this file, which Go requires for
// the import to compile. Per the original note, the Burn sidecars use
// probes.EDAC and the PSU rail scanner defined in psu.go.
// NOTE(review): if the sidecar code that uses probes lives in another
// file with its own import, confirm whether this file still needs the
// import (and this anchor) at all.
var _ = probes.EDAC