deep profile + threshold gating + firmware stage + Burn super-stage
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled

Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
+137 -18
View File
@@ -7,12 +7,20 @@ import (
"path/filepath"
"strconv"
"strings"
"time"
)
// PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find
// PSU rails. In home-lab hosts the kernel surfaces a handful of named
// rails (12V, 5V, 3V3). No rails → auto-skip. Any rail outside a ±10%
// window of its nominal value → fail.
// PSU rails, then samples each rail every psuSampleInterval for a
// window sized by the stage timeout. During Burn a separate sidecar
// (see burn.go) runs the same probe concurrently with workload — the
// PSU stage itself catches slow post-load sag that only surfaces once
// the 12V rail starts recovering from a brownout under concurrent CPU
// + fio + iperf load.
//
// Any rail outside ±10% of its nominal value at any tick fires the
// critical threshold (server-side) and fails the stage. A host with no
// PSU rails wired to hwmon auto-skips.
func PSU(ctx context.Context, d Deps) Outcome {
rails := scanPSURails()
if len(rails) == 0 {
@@ -24,39 +32,150 @@ func PSU(ctx context.Context, d Deps) Outcome {
}
}
var samples []Sample
problems := []string{}
for _, rail := range rails {
samples = append(samples, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"})
if ok, why := voltageInRange(rail); !ok {
problems = append(problems, fmt.Sprintf("%s=%.2fV (%s)", rail.Label, rail.Volts, why))
window := resolvePSUWindow(d.StageTimeout)
deadline := time.Now().Add(window)
interval := psuSampleInterval
if window < interval*2 {
// Tiny window (tests, pathological stage_timeout) — at least two
// ticks so aggregate stats are meaningful.
interval = window / 2
if interval < time.Second {
interval = time.Second
}
}
if d.Sensor != nil {
_ = d.Sensor(ctx, samples)
// Per-label tracking: min/max across the window, count of out-of-range
// hits, last-observed value (shown in the summary).
type railStats struct {
label string
minV float64
maxV float64
lastV float64
ticks int
breaches int
reason string
}
stats := map[string]*railStats{}
tick := time.NewTicker(interval)
defer tick.Stop()
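// sampleOnce takes one reading of every rail, folds it into stats and
// forwards the batch to d.Sensor when one is configured. It is invoked
// once before the ticker loop so that even a window shorter than one
// tick interval still produces at least one reading.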
sampleOnce := func() {
cur := scanPSURails()
if len(cur) == 0 {
return
}
batch := make([]Sample, 0, len(cur))
for _, r := range cur {
s, ok := stats[r.Label]
if !ok {
s = &railStats{label: r.Label, minV: r.Volts, maxV: r.Volts}
stats[r.Label] = s
}
s.ticks++
s.lastV = r.Volts
if r.Volts < s.minV {
s.minV = r.Volts
}
if r.Volts > s.maxV {
s.maxV = r.Volts
}
if ok, why := voltageInRange(r); !ok {
s.breaches++
if s.reason == "" {
s.reason = why
}
}
batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
}
if d.Sensor != nil && len(batch) > 0 {
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
_ = d.Sensor(sendCtx, batch)
cancel()
}
}
sampleOnce()
sampling:
for time.Now().Before(deadline) {
select {
case <-ctx.Done():
break sampling
case <-tick.C:
sampleOnce()
}
}
// Build the outcome. Extras carry per-rail rollup so the report can
// show "12V min=11.1 max=12.05 (3/120 ticks out of range)".
type railRollup struct {
Label string `json:"label"`
MinV float64 `json:"min_v"`
MaxV float64 `json:"max_v"`
LastV float64 `json:"last_v"`
Ticks int `json:"ticks"`
Breaches int `json:"breaches"`
Reason string `json:"reason,omitempty"`
}
rollups := make([]railRollup, 0, len(stats))
problems := []string{}
for _, s := range stats {
rollups = append(rollups, railRollup{
Label: s.label, MinV: s.minV, MaxV: s.maxV, LastV: s.lastV,
Ticks: s.ticks, Breaches: s.breaches, Reason: s.reason,
})
if s.breaches > 0 {
problems = append(problems, fmt.Sprintf("%s min=%.2fV max=%.2fV (%s)", s.label, s.minV, s.maxV, s.reason))
}
}
extras := map[string]any{
"rails": rails,
"problems": problems,
"rails": rollups,
"problems": problems,
"window": window.String(),
"interval": interval.String(),
}
if len(problems) > 0 {
d.Error("PSU: out-of-range rails: " + strings.Join(problems, ", "))
d.Error("PSU: out-of-range rails: " + strings.Join(problems, "; "))
return Outcome{
Passed: false,
Message: "PSU rails out of range: " + strings.Join(problems, ", "),
Summary: fmt.Sprintf("%d rails, %d failing", len(rails), len(problems)),
Message: "PSU rails out of range: " + strings.Join(problems, "; "),
Summary: fmt.Sprintf("%d rails, %d failing", len(rollups), len(problems)),
Extras: extras,
}
}
d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal", len(rails)))
d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal across %s window", len(rollups), window))
return Outcome{
Passed: true,
Summary: fmt.Sprintf("%d rails nominal", len(rails)),
Summary: fmt.Sprintf("%d rails nominal (%s)", len(rollups), window),
Extras: extras,
}
}
// psuSampleInterval is the default tick for post-Burn rail sampling.
// Five seconds is slow enough to stay under the HTTP budget and fast
// enough to catch rail recovery transients.
//
// PSU uses this as its starting interval and shortens it (to half the
// sampling window, but never below one second) when the window is
// smaller than two intervals.
const psuSampleInterval = 5 * time.Second
// resolvePSUWindow derives the PSU sampling window from the stage
// timeout.
//
// A non-positive timeout (tests / pre-Phase-2 orchestrator) yields a
// fixed, snapshot-like 30 s window. Any other timeout is reduced by 5 s
// of headroom for sensor flush + result post, and the result is clamped
// to the range [30 s, 10 min]; the upper bound keeps a 24 h soak from
// spending all day in PSU.
//
// The 30 s floor takes precedence over the headroom: a stage timeout of
// 35 s or less still returns 30 s, which for timeouts under 30 s is
// longer than the timeout itself.
func resolvePSUWindow(stageTimeout time.Duration) time.Duration {
	const (
		floor    = 30 * time.Second
		ceiling  = 10 * time.Minute
		headroom = 5 * time.Second
	)

	if stageTimeout <= 0 {
		return floor
	}

	window := stageTimeout - headroom
	switch {
	case window < floor:
		return floor
	case window > ceiling:
		return ceiling
	default:
		return window
	}
}
type psuRail struct {
Label string `json:"label"`
Volts float64 `json:"volts"`