deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -7,12 +7,20 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
+	"time"
 )

 // PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find
-// PSU rails. In home-lab hosts the kernel surfaces a handful of named
-// rails (12V, 5V, 3V3). No rails → auto-skip. Any rail outside a ±10%
-// window of its nominal value → fail.
+// PSU rails, then samples each rail every psuSampleInterval for a
+// window sized by the stage timeout. During Burn a separate sidecar
+// (see burn.go) runs the same probe concurrently with the workload — the
+// PSU stage itself catches slow post-load sag that only surfaces once
+// the 12V rail starts recovering from a brownout under concurrent CPU
+// + fio + iperf load.
+//
+// Any rail outside ±10% of its nominal value at any tick fires the
+// critical threshold (server-side) and fails the stage. A host with no
+// PSU rails wired to hwmon auto-skips.
 func PSU(ctx context.Context, d Deps) Outcome {
 	rails := scanPSURails()
 	if len(rails) == 0 {
@@ -24,39 +32,150 @@ func PSU(ctx context.Context, d Deps) Outcome {
 		}
 	}

-	var samples []Sample
-	problems := []string{}
-	for _, rail := range rails {
-		samples = append(samples, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"})
-		if ok, why := voltageInRange(rail); !ok {
-			problems = append(problems, fmt.Sprintf("%s=%.2fV (%s)", rail.Label, rail.Volts, why))
+	window := resolvePSUWindow(d.StageTimeout)
+	deadline := time.Now().Add(window)
+	interval := psuSampleInterval
+	if window < interval*2 {
+		// Tiny window (tests, pathological stage_timeout) — at least two ticks so
+		// aggregate stats are meaningful. NOTE(review): resolvePSUWindow never returns under 30 s, so confirm this branch is reachable.
+		interval = window / 2
+		if interval < time.Second {
+			interval = time.Second
 		}
 	}
-	if d.Sensor != nil {
-		_ = d.Sensor(ctx, samples)
+
+	// Per-label tracking: min/max across the window, count of out-of-range
+	// hits, last-observed value (reported as last_v in the per-rail rollup).
+	type railStats struct {
+		label    string
+		minV     float64
+		maxV     float64
+		lastV    float64
+		ticks    int
+		breaches int
+		reason   string
+	}
+	stats := map[string]*railStats{}
+
+	tick := time.NewTicker(interval)
+	defer tick.Stop()
+	// Start with an immediate sample so even a window shorter than one
+	// tick interval still produces at least one reading.
+	sampleOnce := func() {
+		cur := scanPSURails()
+		if len(cur) == 0 {
+			return
+		}
+		batch := make([]Sample, 0, len(cur))
+		for _, r := range cur {
+			s, ok := stats[r.Label]
+			if !ok {
+				s = &railStats{label: r.Label, minV: r.Volts, maxV: r.Volts}
+				stats[r.Label] = s
+			}
+			s.ticks++
+			s.lastV = r.Volts
+			if r.Volts < s.minV {
+				s.minV = r.Volts
+			}
+			if r.Volts > s.maxV {
+				s.maxV = r.Volts
+			}
+			if ok, why := voltageInRange(r); !ok {
+				s.breaches++
+				if s.reason == "" {
+					s.reason = why
+				}
+			}
+			batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
+		}
+		if d.Sensor != nil && len(batch) > 0 {
+			sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+			_ = d.Sensor(sendCtx, batch)
+			cancel()
+		}
+	}
+	sampleOnce()
+sampling:
+	for time.Now().Before(deadline) {
+		select {
+		case <-ctx.Done():
+			break sampling
+		case <-tick.C:
+			sampleOnce()
+		}
+	}
+
+	// Build the outcome. Extras carry per-rail rollup so the report can
+	// show "12V min=11.1 max=12.05 (3/120 ticks out of range)".
+	type railRollup struct {
+		Label    string  `json:"label"`
+		MinV     float64 `json:"min_v"`
+		MaxV     float64 `json:"max_v"`
+		LastV    float64 `json:"last_v"`
+		Ticks    int     `json:"ticks"`
+		Breaches int     `json:"breaches"`
+		Reason   string  `json:"reason,omitempty"`
+	}
+	rollups := make([]railRollup, 0, len(stats))
+	problems := []string{}
+	for _, s := range stats {
+		rollups = append(rollups, railRollup{
+			Label: s.label, MinV: s.minV, MaxV: s.maxV, LastV: s.lastV,
+			Ticks: s.ticks, Breaches: s.breaches, Reason: s.reason,
+		})
+		if s.breaches > 0 {
+			problems = append(problems, fmt.Sprintf("%s min=%.2fV max=%.2fV (%s)", s.label, s.minV, s.maxV, s.reason))
+		}
 	}

 	extras := map[string]any{
-		"rails":    rails,
-		"problems": problems,
+		"rails":       rollups,
+		"problems":    problems,
+		"window":      window.String(),
+		"interval":    interval.String(),
 	}
 	if len(problems) > 0 {
-		d.Error("PSU: out-of-range rails: " + strings.Join(problems, ", "))
+		d.Error("PSU: out-of-range rails: " + strings.Join(problems, "; "))
 		return Outcome{
 			Passed:  false,
-			Message: "PSU rails out of range: " + strings.Join(problems, ", "),
-			Summary: fmt.Sprintf("%d rails, %d failing", len(rails), len(problems)),
+			Message: "PSU rails out of range: " + strings.Join(problems, "; "),
+			Summary: fmt.Sprintf("%d rails, %d failing", len(rollups), len(problems)),
 			Extras:  extras,
 		}
 	}
-	d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal", len(rails)))
+	d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal across %s window", len(rollups), window))
 	return Outcome{
 		Passed:  true,
-		Summary: fmt.Sprintf("%d rails nominal", len(rails)),
+		Summary: fmt.Sprintf("%d rails nominal (%s)", len(rollups), window),
 		Extras:  extras,
 	}
 }

+// psuSampleInterval is the default tick for post-Burn rail sampling; PSU
+// falls back to window/2 (min 1 s) when the window is under two ticks.
+// Five seconds is slow enough to stay under the HTTP budget and fast enough to catch rail recovery transients.
+const psuSampleInterval = 5 * time.Second
+
+// resolvePSUWindow maps the stage timeout to the sampling window.
+// With no timeout (tests / pre-Phase-2 orchestrator), stay snapshot-
+// like at 30 s. Otherwise take stage_timeout - 5 s to leave headroom
+// for sensor flush + result post, floored at 30 s and capped at 10 min
+// so a 24 h soak doesn't spend all day in PSU.
+func resolvePSUWindow(stageTimeout time.Duration) time.Duration {
+	if stageTimeout <= 0 { // unset (or negative) timeout: fixed 30 s window
+		return 30 * time.Second
+	}
+	w := stageTimeout - 5*time.Second // reserve 5 s of the stage budget
+	if w < 30*time.Second { // floor; NOTE(review): for timeouts under 35 s this overrides the 5 s headroom
+		w = 30 * time.Second
+	}
+	if w > 10*time.Minute { // cap
+		w = 10 * time.Minute
+	}
+	return w
+}
+
 type psuRail struct {
 	Label string  `json:"label"`
 	Volts float64 `json:"volts"`