deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -0,0 +1,486 @@
+package tests
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os/exec"
+	"runtime"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	"vetting/agent/probes"
+)
+
+// BurnConfig is what the agent passes to Burn: the orchestrator's base
+// URL (the iperf3 target host is derived from it) and the iperf3
+// server port. Durations + concurrency knobs come from Deps.BurnKnobs
+// so they scale with profile.
+type BurnConfig struct {
+	// OrchestratorURL is handed to deriveHost to obtain the iperf3
+	// target. When empty, Burn records the iperf workload as skipped.
+	OrchestratorURL string
+	// IperfPort is the iperf3 server port; 0 selects the default 5201.
+	IperfPort       int // 0 = 5201
+}
+
+// Burn is the concurrent soak stage. Unlike CPUStress (serial
+// CPU→memory) or Storage (serial per disk) it fans out every workload
+// at once: stress-ng hammers CPU + memory, fio drives the allow-listed
+// disks, iperf3 pushes sustained NIC traffic, and two sidecars poll
+// EDAC + PSU rails for the duration of the window.
+//
+// This is where PSU rails actually matter: 12V sag under simultaneous
+// CPU + disk + NIC load is exactly the failure a thermal/power
+// regression produces, and it's invisible to any stage that loads one
+// subsystem at a time. The PSU stage later in the pipeline re-samples
+// rails post-window to confirm they settle back to nominal.
+//
+// Burn stays inside the stage framework — it doesn't spawn a parallel
+// stage runner. The goroutine fan-out is local; the stage converges
+// before returning an Outcome so every invariant the orchestrator
+// relies on (serial stage order, single in-flight stage per run) still
+// holds.
+//
+// Knobs are read from d.BurnKnobs with defaults applied here: a
+// non-positive Duration becomes 2 minutes and a non-positive
+// IperfParallel becomes 2. The returned Outcome passes only when no
+// non-skipped workload reported a failure; skipped workloads still get
+// a sub-step row.
+func Burn(ctx context.Context, d Deps, cfg BurnConfig) Outcome {
+	duration := d.BurnKnobs.Duration
+	if duration <= 0 {
+		duration = 2 * time.Minute
+	}
+	cpuWorkers := resolveCPUWorkers(d.BurnKnobs.CPUWorkers)
+	memPct := clampMemPct(d.BurnKnobs.MemPct)
+	iperfParallel := d.BurnKnobs.IperfParallel
+	if iperfParallel <= 0 {
+		iperfParallel = 2
+	}
+	d.Info(fmt.Sprintf("Burn: window=%s cpu_workers=%d mem_pct=%d iperf_parallel=%d fio_on_spare=%v",
+		duration, cpuWorkers, memPct, iperfParallel, d.BurnKnobs.FioOnSpare))
+
+	// Sidecars run for the lifetime of the window and are cancelled on
+	// return so the main stage converges cleanly. EDAC catches DIMM
+	// bit-flips that appear only under concurrent load; PSU catches
+	// rail sag that only appears when CPU + disk + NIC pull current
+	// simultaneously.
+	sideCtx, sideCancel := context.WithCancel(ctx)
+	defer sideCancel()
+	var sideWG sync.WaitGroup
+	sideWG.Add(2)
+	go runEDACSidecar(sideCtx, &sideWG, d)
+	go runPSUSidecar(sideCtx, &sideWG, d)
+
+	// Workloads get the window plus 30s of slack before they are cut off.
+	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
+	defer cancel()
+
+	// Buffer of 4 = one slot per workload (CPU, memory, fio, iperf), so
+	// neither the goroutines nor the inline "skipped" sends below can
+	// block before the channel is drained.
+	results := make(chan burnSubResult, 4)
+	var wg sync.WaitGroup
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		results <- runBurnCPU(runCtx, d, duration, cpuWorkers)
+	}()
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		results <- runBurnMemory(runCtx, d, duration, memPct)
+	}()
+
+	// fio runs only when explicitly enabled *and* there are allow-listed
+	// disks *and* the run wasn't marked non-destructive. Any of those
+	// missing records a Skipped sub-step so the operator sees why.
+	if d.BurnKnobs.FioOnSpare && len(d.ExpectedDisks) > 0 && !d.NonDestructive {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			results <- runBurnFio(runCtx, d, duration)
+		}()
+	} else {
+		reason := burnFioSkipReason(d)
+		results <- burnSubResult{Name: "Burn fio", Skipped: true, Reason: reason}
+	}
+
+	// iperf requires an orchestrator host. Lab hosts run with the
+	// bundled iperf3 server; without a base URL we can't derive a
+	// target so we skip rather than fail the stage.
+	if cfg.OrchestratorURL != "" {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			results <- runBurnIperf(runCtx, d, duration, cfg.OrchestratorURL, cfg.IperfPort, iperfParallel)
+		}()
+	} else {
+		results <- burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "no orchestrator host"}
+	}
+
+	// Converge: workloads first, then stop and join the sidecars, and
+	// only then close the channel so the collector's range terminates.
+	wg.Wait()
+	sideCancel()
+	sideWG.Wait()
+	close(results)
+
+	subs, samples, failures := collectBurnResults(results)
+	if d.Sensor != nil && len(samples) > 0 {
+		// Best-effort: a lost sample batch must not flip the stage
+		// result, but the operator should be able to see it was lost.
+		if err := d.Sensor(ctx, samples); err != nil {
+			d.Warn("Burn: sample post: " + err.Error())
+		}
+	}
+
+	extras := map[string]any{
+		"duration":       duration.String(),
+		"cpu_workers":    cpuWorkers,
+		"mem_pct":        memPct,
+		"iperf_parallel": iperfParallel,
+		"fio_on_spare":   d.BurnKnobs.FioOnSpare,
+	}
+	if len(failures) > 0 {
+		msg := "Burn workloads failed: " + strings.Join(failures, ", ")
+		d.Error(msg)
+		return Outcome{
+			Passed:   false,
+			Message:  msg,
+			Summary:  fmt.Sprintf("Burn failed (%d of %d workloads)", len(failures), len(subs)),
+			Extras:   extras,
+			SubSteps: subs,
+		}
+	}
+	d.Info(fmt.Sprintf("Burn: %s window passed; %d workloads converged", duration, len(subs)))
+	return Outcome{
+		Passed:   true,
+		Summary:  fmt.Sprintf("Burn %s passed (%d workloads)", duration, len(subs)),
+		Extras:   extras,
+		SubSteps: subs,
+	}
+}
+
+// burnSubResult is the per-workload return type used by the fan-out
+// goroutines. Sample slice is merged into the stage's final /sensor
+// batch; SubStep becomes a row on the /result sub-steps list.
+type burnSubResult struct {
+	Name    string // workload label; prefixes the failure text and names synthesized skipped rows
+	Passed  bool   // false on a non-skipped result counts as a stage failure
+	Skipped bool   // true = workload did not run; the collector synthesizes its row
+	Reason  string // why a workload was skipped
+	Err     string // why a workload failed
+	Samples []Sample // sensor samples from this workload; ignored when Skipped
+	SubStep SubStepReport // pre-built sub-step row; ignored when Skipped
+}
+
+// collectBurnResults drains ch (which must already be closed, or be
+// closed by the sender) and folds the per-workload results into the
+// three things the stage needs: the sub-step rows for /result, the
+// merged sample batch for /sensor, and one "name: reason" string per
+// failed workload.
+//
+// Skipped results contribute a synthesized row and nothing else. A
+// non-skipped result that carries no SubStep (the early-exit failures
+// such as a missing binary only fill Name + Err) also gets a
+// synthesized row, so the failure doesn't show up as a blank,
+// nameless entry.
+func collectBurnResults(ch <-chan burnSubResult) ([]SubStepReport, []Sample, []string) {
+	var subs []SubStepReport
+	var samples []Sample
+	var failures []string
+	for r := range ch {
+		// Skipped slots get a synthesized row here so the /result shape
+		// stays stable.
+		if r.Skipped {
+			stamp := time.Now().UTC()
+			subs = append(subs, SubStepReport{
+				Name:        r.Name,
+				Skipped:     true,
+				StartedAt:   stamp,
+				CompletedAt: stamp,
+				SummaryJSON: mustJSON(map[string]any{"skipped": true, "reason": r.Reason}),
+			})
+			continue
+		}
+
+		reason := r.Err
+		if reason == "" {
+			reason = "unknown"
+		}
+
+		sub := r.SubStep
+		if sub.Name == "" {
+			// The workload bailed out before building its own row.
+			stamp := time.Now().UTC()
+			sub = SubStepReport{
+				Name:        r.Name,
+				Passed:      r.Passed,
+				StartedAt:   stamp,
+				CompletedAt: stamp,
+			}
+			if !r.Passed {
+				sub.SummaryJSON = mustJSON(map[string]any{"error": reason})
+			}
+		}
+		subs = append(subs, sub)
+		samples = append(samples, r.Samples...)
+		if !r.Passed {
+			failures = append(failures, r.Name+": "+reason)
+		}
+	}
+	return subs, samples, failures
+}
+
+// burnFioSkipReason names the first unmet precondition that kept the
+// fio workload out of the Burn window. Checks run in a fixed order —
+// knob, non-destructive flag, disk allow-list — and "disabled" is the
+// catch-all when none of them explains the skip.
+func burnFioSkipReason(d Deps) string {
+	switch {
+	case !d.BurnKnobs.FioOnSpare:
+		return "fio_on_spare knob disabled"
+	case d.NonDestructive:
+		return "non-destructive run"
+	case len(d.ExpectedDisks) == 0:
+		return "no allowlisted disks"
+	default:
+		return "disabled"
+	}
+}
+
+// runBurnCPU hammers all CPU cores with stress-ng for the window. Same
+// shape as CPUStress pass 1 but with shorter label so the sub-step row
+// doesn't collide with the earlier stage's "CPU pass".
+func runBurnCPU(ctx context.Context, d Deps, duration time.Duration, workers int) burnSubResult {
+	if _, err := exec.LookPath("stress-ng"); err != nil {
+		return burnSubResult{Name: "Burn CPU", Err: "stress-ng missing"}
+	}
+	args := []string{
+		"--cpu", strconv.Itoa(workers),
+		"--cpu-method", "all",
+		"--timeout", durationSeconds(duration),
+		"--metrics-brief",
+		"--verify",
+	}
+	d.Info(fmt.Sprintf("Burn: stress-ng %s", strings.Join(args, " ")))
+	pass := runStressPass(ctx, d, "Burn CPU", duration, args)
+	return burnSubResult{
+		Name:    "Burn CPU",
+		Passed:  pass.Passed,
+		Err:     pass.Err,
+		SubStep: subStepFromPass("Burn CPU", pass),
+	}
+}
+
+// runBurnMemory drives a single --vm worker sized at memPct of
+// MemAvailable, capped so the kernel + agent + other workloads still
+// have headroom. Clamping happens here rather than in resolveBurnKnobs
+// so the cap is computed against real live memory each run.
+//
+// Outcomes: a missing stress-ng binary or an unreadable MemAvailable
+// is reported as a failure; a cap that lands below memFloorBytes is
+// reported as skipped; otherwise the result mirrors the stress pass.
+func runBurnMemory(ctx context.Context, d Deps, duration time.Duration, memPct int) burnSubResult {
+	if _, err := exec.LookPath("stress-ng"); err != nil {
+		return burnSubResult{Name: "Burn memory", Err: "stress-ng missing"}
+	}
+	avail, err := memAvailableBytes()
+	if err != nil {
+		return burnSubResult{Name: "Burn memory", Err: "read MemAvailable: " + err.Error()}
+	}
+	// Budget = avail * memPct / 100, then subtract the standard headroom.
+	// If the result is below the memory-pass floor we record a skipped
+	// row instead — the window is too tight to be meaningful on this box.
+	budget := int64(float64(avail) * float64(memPct) / 100.0)
+	// Named vmBytes rather than cap so the builtin isn't shadowed.
+	vmBytes := budget - memHeadroomBytes
+	if vmBytes < memFloorBytes {
+		return burnSubResult{
+			Name:    "Burn memory",
+			Skipped: true,
+			// Spell out all three figures: the budget alone can sit
+			// above the floor and still fail once headroom comes off.
+			Reason: fmt.Sprintf("budget %s minus headroom %s is below floor %s",
+				humanBytes(budget), humanBytes(memHeadroomBytes), humanBytes(memFloorBytes)),
+		}
+	}
+	args := []string{
+		"--vm", "1",
+		"--vm-bytes", strconv.FormatInt(vmBytes, 10),
+		"--vm-keep",
+		"--timeout", durationSeconds(duration),
+		"--metrics-brief",
+		"--verify",
+	}
+	d.Info(fmt.Sprintf("Burn: stress-ng memory cap=%s (%d%% of MemAvailable)", humanBytes(vmBytes), memPct))
+	pass := runStressPass(ctx, d, "Burn memory", duration, args)
+	return burnSubResult{
+		Name:    "Burn memory",
+		Passed:  pass.Passed,
+		Err:     pass.Err,
+		SubStep: subStepFromPass(fmt.Sprintf("Burn memory (cap %s)", humanBytes(vmBytes)), pass),
+	}
+}
+
+// runBurnFio runs fio_sample against the first allow-listed disk for
+// the window. Reuses runFioVerify + parseFioJSON so the samples line
+// up with what Storage emits. Using fio_sample (bounded by --size)
+// keeps Burn's write volume predictable regardless of profile.
+func runBurnFio(ctx context.Context, d Deps, duration time.Duration) burnSubResult {
+	if _, err := exec.LookPath("fio"); err != nil {
+		return burnSubResult{Name: "Burn fio", Err: "fio missing"}
+	}
+	targets := resolveTargets(d.ExpectedDisks)
+	if len(targets) == 0 {
+		return burnSubResult{Name: "Burn fio", Skipped: true, Reason: "no allow-listed disks present"}
+	}
+	t := targets[0]
+	opts := fioOpts{
+		Mode:    "fio_sample",
+		Size:    "512MiB",
+		Runtime: duration,
+		BS:      "4k",
+		RW:      "randrw",
+		Verify:  "md5",
+	}
+	start := time.Now()
+	d.Info(fmt.Sprintf("Burn: fio %s on %s (%s window)", opts.Mode, t.Device, duration))
+	fr := runFioVerify(ctx, t.Device, opts)
+	end := time.Now()
+
+	sub := SubStepReport{
+		Name:        "Burn fio " + t.Device,
+		Passed:      fr.Error == "",
+		StartedAt:   start,
+		CompletedAt: end,
+		SummaryJSON: mustJSON(fr),
+	}
+	out := burnSubResult{Name: "Burn fio", SubStep: sub, Passed: fr.Error == "", Err: fr.Error}
+	if fr.Error == "" {
+		out.Samples = append(out.Samples,
+			Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
+			Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
+		)
+		if fr.ReadP99Us > 0 {
+			out.Samples = append(out.Samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
+		}
+		if fr.WriteP99Us > 0 {
+			out.Samples = append(out.Samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
+		}
+	}
+	return out
+}
+
+// runBurnIperf drives iperf3 -P N for the window. Reuses parseIperfJSON
+// so the same (mbps, retrans, bytesSent) extraction the Network stage
+// uses applies here too. Samples emitted as Burn-scoped keys so the
+// dashboard can tell at-a-glance which window they came from.
+func runBurnIperf(ctx context.Context, d Deps, duration time.Duration, orchestratorURL string, port, parallel int) burnSubResult {
+	if _, err := exec.LookPath("iperf3"); err != nil {
+		return burnSubResult{Name: "Burn iperf", Err: "iperf3 missing"}
+	}
+	host, err := deriveHost(orchestratorURL)
+	if err != nil || host == "" {
+		return burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "can't derive orchestrator host"}
+	}
+	if port == 0 {
+		port = 5201
+	}
+	if parallel < 1 {
+		parallel = 1
+	}
+	args := []string{
+		"-c", host,
+		"-p", strconv.Itoa(port),
+		"-t", strconv.Itoa(int(duration.Seconds())),
+		"-P", strconv.Itoa(parallel),
+		"-J",
+	}
+	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
+	defer cancel()
+	start := time.Now()
+	out, err := exec.CommandContext(runCtx, "iperf3", args...).Output()
+	end := time.Now()
+	if err != nil {
+		return burnSubResult{
+			Name:    "Burn iperf",
+			Err:     "iperf3 client error: " + err.Error(),
+			SubStep: SubStepReport{
+				Name:        "Burn iperf",
+				StartedAt:   start,
+				CompletedAt: end,
+				SummaryJSON: mustJSON(map[string]any{"error": err.Error(), "stderr_tail": tailLines(string(out), 20)}),
+			},
+		}
+	}
+	mbps, retrans, bytesSent, _, perr := parseIperfJSON(out)
+	if perr != nil {
+		return burnSubResult{
+			Name:    "Burn iperf",
+			Err:     "parse iperf3 json: " + perr.Error(),
+			SubStep: SubStepReport{
+				Name:        "Burn iperf",
+				StartedAt:   start,
+				CompletedAt: end,
+				SummaryJSON: mustJSON(map[string]any{"error": perr.Error()}),
+			},
+		}
+	}
+
+	samples := []Sample{{Kind: "iperf", Key: "burn/throughput_mbps", Value: mbps, Unit: "Mbps"}}
+	if bytesSent > 0 {
+		packets := float64(bytesSent) / 1460.0
+		if packets > 0 {
+			samples = append(samples, Sample{
+				Kind: "nic_retrans", Key: "burn/rate",
+				Value: float64(retrans) / packets, Unit: "rate",
+			})
+		}
+	}
+	passed := mbps > 0
+	errMsg := ""
+	if !passed {
+		errMsg = "zero throughput from iperf3"
+	}
+	return burnSubResult{
+		Name:    "Burn iperf",
+		Passed:  passed,
+		Err:     errMsg,
+		Samples: samples,
+		SubStep: SubStepReport{
+			Name:        fmt.Sprintf("Burn iperf (P=%d)", parallel),
+			Passed:      passed,
+			StartedAt:   start,
+			CompletedAt: end,
+			SummaryJSON: mustJSON(map[string]any{
+				"throughput_mbps": mbps,
+				"retransmits":     retrans,
+				"bytes_sent":      bytesSent,
+				"parallel":        parallel,
+			}),
+		},
+	}
+}
+
+// runPSUSidecar samples the PSU rails on a 5s ticker until ctx is
+// cancelled, posting each scan through d.Sensor as a batch of psu_volt
+// samples (one per rail, keyed by the rail's label, in volts). Feeding
+// the readings into the sensor path mid-window is what lets the
+// threshold evaluator apply the same within_pct gates the PSU stage
+// uses — a 12V rail sagging to 10.5V under load can fire the critical
+// threshold and park the run in FailedHolding without waiting for the
+// post-Burn PSU stage.
+//
+// The sidecar always signals wg on exit. With no Sensor configured it
+// exits immediately; empty scans are dropped; a failed post is logged
+// as a warning and the loop keeps going.
+func runPSUSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
+	defer wg.Done()
+	if d.Sensor == nil {
+		return
+	}
+
+	ticker := time.NewTicker(5 * time.Second)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+		}
+
+		rails := scanPSURails()
+		if len(rails) == 0 {
+			continue
+		}
+		readings := make([]Sample, 0, len(rails))
+		for _, rail := range rails {
+			readings = append(readings, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"})
+		}
+
+		// Each post gets its own 5s budget so one slow send can't
+		// stall the sidecar indefinitely.
+		postCtx, release := context.WithTimeout(ctx, 5*time.Second)
+		postErr := d.Sensor(postCtx, readings)
+		if postErr != nil {
+			d.Warn("Burn: PSU sample post: " + postErr.Error())
+		}
+		release()
+	}
+}
+
+// resolveCPUWorkers turns the CPUWorkers knob into a worker count. A
+// positive decimal integer is used as-is; everything else — empty,
+// "all" in any letter case, zero, negatives, or unparseable text —
+// falls back to runtime.NumCPU().
+func resolveCPUWorkers(raw string) int {
+	if raw != "" && !strings.EqualFold(raw, "all") {
+		if parsed, err := strconv.Atoi(raw); err == nil && parsed > 0 {
+			return parsed
+		}
+	}
+	return runtime.NumCPU()
+}
+
+// clampMemPct normalises the MemPct knob. Zero or negative means
+// "unset" and yields the 50% default; otherwise the value is pinned to
+// the [10, 90] band — above 90 would crowd the kernel + agent + fio +
+// iperf3 workers off the page cache.
+func clampMemPct(pct int) int {
+	switch {
+	case pct <= 0:
+		return 50
+	case pct < 10:
+		return 10
+	case pct > 90:
+		return 90
+	default:
+		return pct
+	}
+}
+
+// mustJSON marshals v for a SummaryJSON field and never fails: when v
+// can't be encoded it returns a {"marshal_error": "..."} object
+// instead. The fallback is itself produced by json.Marshal so an error
+// message containing quotes, backslashes or control characters is
+// escaped and the result is always valid JSON.
+func mustJSON(v any) json.RawMessage {
+	b, err := json.Marshal(v)
+	if err == nil {
+		return b
+	}
+	fallback, ferr := json.Marshal(map[string]string{"marshal_error": err.Error()})
+	if ferr != nil {
+		// Not expected for a string-only map; keep the never-fail contract.
+		return json.RawMessage(`{"marshal_error":"unknown"}`)
+	}
+	return fallback
+}
+
+// Keep the probes import referenced from this file: this blank
+// assignment is the only use of the package here, so without it the
+// import above would be unused and the file would not compile.
+// NOTE(review): the original note ties this to the Burn sidecars,
+// which use probes.EDAC and the PSU rail scanner defined in psu.go —
+// both live outside this file; confirm the anchor is still needed.
+var _ = probes.EDAC