deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,486 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"vetting/agent/probes"
|
||||
)
|
||||
|
||||
// BurnConfig is what the agent passes to Burn: the orchestrator's base
// URL (the iperf3 target host is derived from it) and the iperf3 port.
// Durations + concurrency knobs come from Deps.BurnKnobs so they scale
// with profile.
type BurnConfig struct {
	// OrchestratorURL is handed to deriveHost to obtain the iperf3 target.
	// When empty, Burn records the iperf workload as skipped.
	OrchestratorURL string
	IperfPort       int // 0 = 5201
}
|
||||
|
||||
// Burn is the concurrent soak stage. Unlike CPUStress (serial
|
||||
// CPU→memory) or Storage (serial per disk) it fans out every workload
|
||||
// at once: stress-ng hammers CPU + memory, fio drives the allow-listed
|
||||
// disks, iperf3 pushes sustained NIC traffic, and two sidecars poll
|
||||
// EDAC + PSU rails for the duration of the window.
|
||||
//
|
||||
// This is where PSU rails actually matter: 12V sag under simultaneous
|
||||
// CPU + disk + NIC load is exactly the failure a thermal/power
|
||||
// regression produces, and it's invisible to any stage that loads one
|
||||
// subsystem at a time. The PSU stage that follows Burn in the pipeline
|
||||
// re-samples rails post-window to confirm they settle back to nominal.
|
||||
//
|
||||
// Burn stays inside the stage framework — it doesn't spawn a parallel
|
||||
// stage runner. The goroutine fan-out is local; the stage converges
|
||||
// before returning an Outcome so every invariant the orchestrator
|
||||
// relies on (serial stage order, single in-flight stage per run) still
|
||||
// holds.
|
||||
func Burn(ctx context.Context, d Deps, cfg BurnConfig) Outcome {
|
||||
duration := d.BurnKnobs.Duration
|
||||
if duration <= 0 {
|
||||
duration = 2 * time.Minute
|
||||
}
|
||||
cpuWorkers := resolveCPUWorkers(d.BurnKnobs.CPUWorkers)
|
||||
memPct := clampMemPct(d.BurnKnobs.MemPct)
|
||||
iperfParallel := d.BurnKnobs.IperfParallel
|
||||
if iperfParallel <= 0 {
|
||||
iperfParallel = 2
|
||||
}
|
||||
d.Info(fmt.Sprintf("Burn: window=%s cpu_workers=%d mem_pct=%d iperf_parallel=%d fio_on_spare=%v",
|
||||
duration, cpuWorkers, memPct, iperfParallel, d.BurnKnobs.FioOnSpare))
|
||||
|
||||
// Sidecars run for the lifetime of the window and are cancelled on
|
||||
// return so the main stage converges cleanly. EDAC catches DIMM
|
||||
// bit-flips that appear only under concurrent load; PSU catches
|
||||
// rail sag that only appears when CPU + disk + NIC pull current
|
||||
// simultaneously.
|
||||
sideCtx, sideCancel := context.WithCancel(ctx)
|
||||
defer sideCancel()
|
||||
var sideWG sync.WaitGroup
|
||||
sideWG.Add(2)
|
||||
go runEDACSidecar(sideCtx, &sideWG, d)
|
||||
go runPSUSidecar(sideCtx, &sideWG, d)
|
||||
|
||||
runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
results := make(chan burnSubResult, 4)
|
||||
var wg sync.WaitGroup
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
results <- runBurnCPU(runCtx, d, duration, cpuWorkers)
|
||||
}()
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
results <- runBurnMemory(runCtx, d, duration, memPct)
|
||||
}()
|
||||
|
||||
// fio runs only when explicitly enabled *and* there are allow-listed
|
||||
// disks *and* the run wasn't marked non-destructive. Any of those
|
||||
// missing records a Skipped sub-step so the operator sees why.
|
||||
if d.BurnKnobs.FioOnSpare && len(d.ExpectedDisks) > 0 && !d.NonDestructive {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
results <- runBurnFio(runCtx, d, duration)
|
||||
}()
|
||||
} else {
|
||||
reason := burnFioSkipReason(d)
|
||||
results <- burnSubResult{Name: "Burn fio", Skipped: true, Reason: reason}
|
||||
}
|
||||
|
||||
// iperf requires an orchestrator host. Lab hosts run with the
|
||||
// bundled iperf3 server; without a base URL we can't derive a
|
||||
// target so we skip rather than fail the stage.
|
||||
if cfg.OrchestratorURL != "" {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
results <- runBurnIperf(runCtx, d, duration, cfg.OrchestratorURL, cfg.IperfPort, iperfParallel)
|
||||
}()
|
||||
} else {
|
||||
results <- burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "no orchestrator host"}
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
sideCancel()
|
||||
sideWG.Wait()
|
||||
close(results)
|
||||
|
||||
subs, samples, failures := collectBurnResults(results)
|
||||
if d.Sensor != nil && len(samples) > 0 {
|
||||
_ = d.Sensor(ctx, samples)
|
||||
}
|
||||
|
||||
extras := map[string]any{
|
||||
"duration": duration.String(),
|
||||
"cpu_workers": cpuWorkers,
|
||||
"mem_pct": memPct,
|
||||
"iperf_parallel": iperfParallel,
|
||||
"fio_on_spare": d.BurnKnobs.FioOnSpare,
|
||||
}
|
||||
if len(failures) > 0 {
|
||||
msg := "Burn workloads failed: " + strings.Join(failures, ", ")
|
||||
d.Error(msg)
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: msg,
|
||||
Summary: fmt.Sprintf("Burn failed (%d of %d workloads)", len(failures), len(subs)),
|
||||
Extras: extras,
|
||||
SubSteps: subs,
|
||||
}
|
||||
}
|
||||
d.Info(fmt.Sprintf("Burn: %s window passed; %d workloads converged", duration, len(subs)))
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: fmt.Sprintf("Burn %s passed (%d workloads)", duration, len(subs)),
|
||||
Extras: extras,
|
||||
SubSteps: subs,
|
||||
}
|
||||
}
|
||||
|
||||
// burnSubResult is the per-workload return type used by the fan-out
// goroutines. Sample slice is merged into the stage's final /sensor
// batch; SubStep becomes a row on the /result sub-steps list.
type burnSubResult struct {
	// Name labels the workload; it prefixes the failure text and names
	// the synthesized row when the workload was skipped.
	Name string
	// Passed is only consulted for non-skipped results; false adds the
	// workload to the stage's failure list.
	Passed bool
	// Skipped makes the collector synthesize a sub-step row from Name and
	// Reason; Samples and SubStep are not read for skipped results.
	Skipped bool
	Reason  string // why a workload was skipped
	Err     string // why a workload failed
	Samples []Sample
	SubStep SubStepReport
}
|
||||
|
||||
func collectBurnResults(ch <-chan burnSubResult) ([]SubStepReport, []Sample, []string) {
|
||||
var subs []SubStepReport
|
||||
var samples []Sample
|
||||
var failures []string
|
||||
for r := range ch {
|
||||
// Non-skipped goroutines populate SubStep directly. Skipped slots
|
||||
// get a synthesized row here so the /result shape stays stable.
|
||||
if r.Skipped {
|
||||
stamp := time.Now().UTC()
|
||||
subs = append(subs, SubStepReport{
|
||||
Name: r.Name,
|
||||
Skipped: true,
|
||||
StartedAt: stamp,
|
||||
CompletedAt: stamp,
|
||||
SummaryJSON: mustJSON(map[string]any{"skipped": true, "reason": r.Reason}),
|
||||
})
|
||||
continue
|
||||
}
|
||||
subs = append(subs, r.SubStep)
|
||||
samples = append(samples, r.Samples...)
|
||||
if !r.Passed {
|
||||
reason := r.Err
|
||||
if reason == "" {
|
||||
reason = "unknown"
|
||||
}
|
||||
failures = append(failures, r.Name+": "+reason)
|
||||
}
|
||||
}
|
||||
return subs, samples, failures
|
||||
}
|
||||
|
||||
func burnFioSkipReason(d Deps) string {
|
||||
if !d.BurnKnobs.FioOnSpare {
|
||||
return "fio_on_spare knob disabled"
|
||||
}
|
||||
if d.NonDestructive {
|
||||
return "non-destructive run"
|
||||
}
|
||||
if len(d.ExpectedDisks) == 0 {
|
||||
return "no allowlisted disks"
|
||||
}
|
||||
return "disabled"
|
||||
}
|
||||
|
||||
// runBurnCPU hammers all CPU cores with stress-ng for the window. Same
|
||||
// shape as CPUStress pass 1 but with shorter label so the sub-step row
|
||||
// doesn't collide with the earlier stage's "CPU pass".
|
||||
func runBurnCPU(ctx context.Context, d Deps, duration time.Duration, workers int) burnSubResult {
|
||||
if _, err := exec.LookPath("stress-ng"); err != nil {
|
||||
return burnSubResult{Name: "Burn CPU", Err: "stress-ng missing"}
|
||||
}
|
||||
args := []string{
|
||||
"--cpu", strconv.Itoa(workers),
|
||||
"--cpu-method", "all",
|
||||
"--timeout", durationSeconds(duration),
|
||||
"--metrics-brief",
|
||||
"--verify",
|
||||
}
|
||||
d.Info(fmt.Sprintf("Burn: stress-ng %s", strings.Join(args, " ")))
|
||||
pass := runStressPass(ctx, d, "Burn CPU", duration, args)
|
||||
return burnSubResult{
|
||||
Name: "Burn CPU",
|
||||
Passed: pass.Passed,
|
||||
Err: pass.Err,
|
||||
SubStep: subStepFromPass("Burn CPU", pass),
|
||||
}
|
||||
}
|
||||
|
||||
// runBurnMemory drives a single --vm worker sized at memPct of
|
||||
// MemAvailable, capped so the kernel + agent + other workloads still
|
||||
// have headroom. Clamping happens here rather than in resolveBurnKnobs
|
||||
// so the cap is computed against real live memory each run.
|
||||
func runBurnMemory(ctx context.Context, d Deps, duration time.Duration, memPct int) burnSubResult {
|
||||
if _, err := exec.LookPath("stress-ng"); err != nil {
|
||||
return burnSubResult{Name: "Burn memory", Err: "stress-ng missing"}
|
||||
}
|
||||
avail, err := memAvailableBytes()
|
||||
if err != nil {
|
||||
return burnSubResult{Name: "Burn memory", Err: "read MemAvailable: " + err.Error()}
|
||||
}
|
||||
// Budget = avail * memPct / 100, then subtract the standard headroom.
|
||||
// If the result is below the memory-pass floor we record a skipped
|
||||
// row instead — the window is too tight to be meaningful on this box.
|
||||
budget := int64(float64(avail) * float64(memPct) / 100.0)
|
||||
cap := budget - memHeadroomBytes
|
||||
if cap < memFloorBytes {
|
||||
return burnSubResult{
|
||||
Name: "Burn memory",
|
||||
Skipped: true,
|
||||
Reason: fmt.Sprintf("budget %s below floor %s after headroom", humanBytes(budget), humanBytes(memFloorBytes)),
|
||||
}
|
||||
}
|
||||
args := []string{
|
||||
"--vm", "1",
|
||||
"--vm-bytes", strconv.FormatInt(cap, 10),
|
||||
"--vm-keep",
|
||||
"--timeout", durationSeconds(duration),
|
||||
"--metrics-brief",
|
||||
"--verify",
|
||||
}
|
||||
d.Info(fmt.Sprintf("Burn: stress-ng memory cap=%s (%d%% of MemAvailable)", humanBytes(cap), memPct))
|
||||
pass := runStressPass(ctx, d, "Burn memory", duration, args)
|
||||
return burnSubResult{
|
||||
Name: "Burn memory",
|
||||
Passed: pass.Passed,
|
||||
Err: pass.Err,
|
||||
SubStep: subStepFromPass(fmt.Sprintf("Burn memory (cap %s)", humanBytes(cap)), pass),
|
||||
}
|
||||
}
|
||||
|
||||
// runBurnFio runs fio_sample against the first allow-listed disk for
|
||||
// the window. Reuses runFioVerify + parseFioJSON so the samples line
|
||||
// up with what Storage emits. Using fio_sample (bounded by --size)
|
||||
// keeps Burn's write volume predictable regardless of profile.
|
||||
func runBurnFio(ctx context.Context, d Deps, duration time.Duration) burnSubResult {
|
||||
if _, err := exec.LookPath("fio"); err != nil {
|
||||
return burnSubResult{Name: "Burn fio", Err: "fio missing"}
|
||||
}
|
||||
targets := resolveTargets(d.ExpectedDisks)
|
||||
if len(targets) == 0 {
|
||||
return burnSubResult{Name: "Burn fio", Skipped: true, Reason: "no allow-listed disks present"}
|
||||
}
|
||||
t := targets[0]
|
||||
opts := fioOpts{
|
||||
Mode: "fio_sample",
|
||||
Size: "512MiB",
|
||||
Runtime: duration,
|
||||
BS: "4k",
|
||||
RW: "randrw",
|
||||
Verify: "md5",
|
||||
}
|
||||
start := time.Now()
|
||||
d.Info(fmt.Sprintf("Burn: fio %s on %s (%s window)", opts.Mode, t.Device, duration))
|
||||
fr := runFioVerify(ctx, t.Device, opts)
|
||||
end := time.Now()
|
||||
|
||||
sub := SubStepReport{
|
||||
Name: "Burn fio " + t.Device,
|
||||
Passed: fr.Error == "",
|
||||
StartedAt: start,
|
||||
CompletedAt: end,
|
||||
SummaryJSON: mustJSON(fr),
|
||||
}
|
||||
out := burnSubResult{Name: "Burn fio", SubStep: sub, Passed: fr.Error == "", Err: fr.Error}
|
||||
if fr.Error == "" {
|
||||
out.Samples = append(out.Samples,
|
||||
Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
|
||||
Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
|
||||
)
|
||||
if fr.ReadP99Us > 0 {
|
||||
out.Samples = append(out.Samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
|
||||
}
|
||||
if fr.WriteP99Us > 0 {
|
||||
out.Samples = append(out.Samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// runBurnIperf drives iperf3 -P N for the window. Reuses parseIperfJSON
|
||||
// so the same (mbps, retrans, bytesSent) extraction the Network stage
|
||||
// uses applies here too. Samples emitted as Burn-scoped keys so the
|
||||
// dashboard can tell at-a-glance which window they came from.
|
||||
func runBurnIperf(ctx context.Context, d Deps, duration time.Duration, orchestratorURL string, port, parallel int) burnSubResult {
|
||||
if _, err := exec.LookPath("iperf3"); err != nil {
|
||||
return burnSubResult{Name: "Burn iperf", Err: "iperf3 missing"}
|
||||
}
|
||||
host, err := deriveHost(orchestratorURL)
|
||||
if err != nil || host == "" {
|
||||
return burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "can't derive orchestrator host"}
|
||||
}
|
||||
if port == 0 {
|
||||
port = 5201
|
||||
}
|
||||
if parallel < 1 {
|
||||
parallel = 1
|
||||
}
|
||||
args := []string{
|
||||
"-c", host,
|
||||
"-p", strconv.Itoa(port),
|
||||
"-t", strconv.Itoa(int(duration.Seconds())),
|
||||
"-P", strconv.Itoa(parallel),
|
||||
"-J",
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
|
||||
defer cancel()
|
||||
start := time.Now()
|
||||
out, err := exec.CommandContext(runCtx, "iperf3", args...).Output()
|
||||
end := time.Now()
|
||||
if err != nil {
|
||||
return burnSubResult{
|
||||
Name: "Burn iperf",
|
||||
Err: "iperf3 client error: " + err.Error(),
|
||||
SubStep: SubStepReport{
|
||||
Name: "Burn iperf",
|
||||
StartedAt: start,
|
||||
CompletedAt: end,
|
||||
SummaryJSON: mustJSON(map[string]any{"error": err.Error(), "stderr_tail": tailLines(string(out), 20)}),
|
||||
},
|
||||
}
|
||||
}
|
||||
mbps, retrans, bytesSent, _, perr := parseIperfJSON(out)
|
||||
if perr != nil {
|
||||
return burnSubResult{
|
||||
Name: "Burn iperf",
|
||||
Err: "parse iperf3 json: " + perr.Error(),
|
||||
SubStep: SubStepReport{
|
||||
Name: "Burn iperf",
|
||||
StartedAt: start,
|
||||
CompletedAt: end,
|
||||
SummaryJSON: mustJSON(map[string]any{"error": perr.Error()}),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
samples := []Sample{{Kind: "iperf", Key: "burn/throughput_mbps", Value: mbps, Unit: "Mbps"}}
|
||||
if bytesSent > 0 {
|
||||
packets := float64(bytesSent) / 1460.0
|
||||
if packets > 0 {
|
||||
samples = append(samples, Sample{
|
||||
Kind: "nic_retrans", Key: "burn/rate",
|
||||
Value: float64(retrans) / packets, Unit: "rate",
|
||||
})
|
||||
}
|
||||
}
|
||||
passed := mbps > 0
|
||||
errMsg := ""
|
||||
if !passed {
|
||||
errMsg = "zero throughput from iperf3"
|
||||
}
|
||||
return burnSubResult{
|
||||
Name: "Burn iperf",
|
||||
Passed: passed,
|
||||
Err: errMsg,
|
||||
Samples: samples,
|
||||
SubStep: SubStepReport{
|
||||
Name: fmt.Sprintf("Burn iperf (P=%d)", parallel),
|
||||
Passed: passed,
|
||||
StartedAt: start,
|
||||
CompletedAt: end,
|
||||
SummaryJSON: mustJSON(map[string]any{
|
||||
"throughput_mbps": mbps,
|
||||
"retransmits": retrans,
|
||||
"bytes_sent": bytesSent,
|
||||
"parallel": parallel,
|
||||
}),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// runPSUSidecar polls /sys/class/hwmon rails every 5s for the duration
|
||||
// of the Burn window, piping each read into the stage's sensor channel
|
||||
// as a psu_volt sample. The threshold evaluator then applies the same
|
||||
// within_pct gates used by the PSU stage — a 12V rail sagging to 10.5V
|
||||
// under load will fire the critical threshold mid-Burn and the run
|
||||
// will flip into FailedHolding without waiting for the post-Burn PSU
|
||||
// stage to catch it.
|
||||
func runPSUSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
|
||||
defer wg.Done()
|
||||
if d.Sensor == nil {
|
||||
return
|
||||
}
|
||||
t := time.NewTicker(5 * time.Second)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-t.C:
|
||||
rails := scanPSURails()
|
||||
if len(rails) == 0 {
|
||||
continue
|
||||
}
|
||||
batch := make([]Sample, 0, len(rails))
|
||||
for _, r := range rails {
|
||||
batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
|
||||
}
|
||||
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
if err := d.Sensor(sendCtx, batch); err != nil {
|
||||
d.Warn("Burn: PSU sample post: " + err.Error())
|
||||
}
|
||||
cancel()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// resolveCPUWorkers turns the cpu-workers knob into a worker count.
// An empty value, "all" in any letter case, a non-numeric string, or a
// number that is zero or negative all resolve to runtime.NumCPU(); any
// other positive integer is returned as-is.
func resolveCPUWorkers(raw string) int {
	allCores := runtime.NumCPU()
	if raw == "" || strings.EqualFold(raw, "all") {
		return allCores
	}
	count, convErr := strconv.Atoi(raw)
	if convErr != nil || count <= 0 {
		return allCores
	}
	return count
}
|
||||
|
||||
// clampMemPct normalizes the memory-percentage knob. Zero or a negative
// value selects the 50% default; anything else is pinned into the
// [10, 90] band so the kernel, the agent and the fio + iperf3 workers
// keep some memory to themselves.
func clampMemPct(pct int) int {
	switch {
	case pct <= 0:
		return 50
	case pct < 10:
		return 10
	case pct > 90:
		return 90
	default:
		return pct
	}
}
|
||||
|
||||
// mustJSON marshals v for a sub-step summary and never fails: when v
// cannot be encoded it returns a {"marshal_error": "..."} object
// carrying the encoder's message instead.
//
// The fallback is itself produced by the encoder rather than by string
// concatenation, so quotes, backslashes or control characters in the
// error text cannot yield invalid JSON.
func mustJSON(v any) json.RawMessage {
	b, err := json.Marshal(v)
	if err == nil {
		return b
	}
	// Encoding a map of plain strings cannot fail, so this error is
	// safe to discard.
	fallback, _ := json.Marshal(map[string]string{"marshal_error": err.Error()})
	return fallback
}
|
||||
|
||||
// Keep the probes import anchored: in the part of this file visible
// here, this blank assignment is the only reference to the package, so
// removing it would leave the import unused.
// NOTE(review): the EDAC sidecar and the PSU rail scanner (psu.go, per
// the original note) are defined outside this file — confirm whether
// they still rely on probes.EDAC before dropping this line.
var _ = probes.EDAC
|
||||
Reference in New Issue
Block a user