deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+137
-18
@@ -7,12 +7,20 @@ import (
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find
|
||||
// PSU rails. In home-lab hosts the kernel surfaces a handful of named
|
||||
// rails (12V, 5V, 3V3). No rails → auto-skip. Any rail outside a ±10%
|
||||
// window of its nominal value → fail.
|
||||
// PSU rails, then samples each rail every psuSampleInterval for a
|
||||
// window sized by the stage timeout. During Burn a separate sidecar
|
||||
// (see burn.go) runs the same probe concurrently with workload — the
|
||||
// PSU stage itself catches slow post-load sag that only surfaces once
|
||||
// the 12V rail starts recovering from a brownout under concurrent CPU
|
||||
// + fio + iperf load.
|
||||
//
|
||||
// Any rail outside ±10% of its nominal value at any tick fires the
|
||||
// critical threshold (server-side) and fails the stage. A host with no
|
||||
// PSU rails wired to hwmon auto-skips.
|
||||
func PSU(ctx context.Context, d Deps) Outcome {
|
||||
rails := scanPSURails()
|
||||
if len(rails) == 0 {
|
||||
@@ -24,39 +32,150 @@ func PSU(ctx context.Context, d Deps) Outcome {
|
||||
}
|
||||
}
|
||||
|
||||
var samples []Sample
|
||||
problems := []string{}
|
||||
for _, rail := range rails {
|
||||
samples = append(samples, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"})
|
||||
if ok, why := voltageInRange(rail); !ok {
|
||||
problems = append(problems, fmt.Sprintf("%s=%.2fV (%s)", rail.Label, rail.Volts, why))
|
||||
window := resolvePSUWindow(d.StageTimeout)
|
||||
deadline := time.Now().Add(window)
|
||||
interval := psuSampleInterval
|
||||
if window < interval*2 {
|
||||
// Tiny window (tests, pathological stage_timeout) — at least two
|
||||
// ticks so aggregate stats are meaningful.
|
||||
interval = window / 2
|
||||
if interval < time.Second {
|
||||
interval = time.Second
|
||||
}
|
||||
}
|
||||
if d.Sensor != nil {
|
||||
_ = d.Sensor(ctx, samples)
|
||||
|
||||
// Per-label tracking: min/max across the window, count of out-of-range
|
||||
// hits, last-observed value (shown in the summary).
|
||||
type railStats struct {
|
||||
label string
|
||||
minV float64
|
||||
maxV float64
|
||||
lastV float64
|
||||
ticks int
|
||||
breaches int
|
||||
reason string
|
||||
}
|
||||
stats := map[string]*railStats{}
|
||||
|
||||
tick := time.NewTicker(interval)
|
||||
defer tick.Stop()
|
||||
// Start with an immediate sample so a window shorter than one tick still produces
|
||||
// at least one reading.
|
||||
sampleOnce := func() {
|
||||
cur := scanPSURails()
|
||||
if len(cur) == 0 {
|
||||
return
|
||||
}
|
||||
batch := make([]Sample, 0, len(cur))
|
||||
for _, r := range cur {
|
||||
s, ok := stats[r.Label]
|
||||
if !ok {
|
||||
s = &railStats{label: r.Label, minV: r.Volts, maxV: r.Volts}
|
||||
stats[r.Label] = s
|
||||
}
|
||||
s.ticks++
|
||||
s.lastV = r.Volts
|
||||
if r.Volts < s.minV {
|
||||
s.minV = r.Volts
|
||||
}
|
||||
if r.Volts > s.maxV {
|
||||
s.maxV = r.Volts
|
||||
}
|
||||
if ok, why := voltageInRange(r); !ok {
|
||||
s.breaches++
|
||||
if s.reason == "" {
|
||||
s.reason = why
|
||||
}
|
||||
}
|
||||
batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
|
||||
}
|
||||
if d.Sensor != nil && len(batch) > 0 {
|
||||
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
_ = d.Sensor(sendCtx, batch)
|
||||
cancel()
|
||||
}
|
||||
}
|
||||
sampleOnce()
|
||||
sampling:
|
||||
for time.Now().Before(deadline) {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
break sampling
|
||||
case <-tick.C:
|
||||
sampleOnce()
|
||||
}
|
||||
}
|
||||
|
||||
// Build the outcome. Extras carry per-rail rollup so the report can
|
||||
// show "12V min=11.1 max=12.05 (3/120 ticks out of range)".
|
||||
type railRollup struct {
|
||||
Label string `json:"label"`
|
||||
MinV float64 `json:"min_v"`
|
||||
MaxV float64 `json:"max_v"`
|
||||
LastV float64 `json:"last_v"`
|
||||
Ticks int `json:"ticks"`
|
||||
Breaches int `json:"breaches"`
|
||||
Reason string `json:"reason,omitempty"`
|
||||
}
|
||||
rollups := make([]railRollup, 0, len(stats))
|
||||
problems := []string{}
|
||||
for _, s := range stats {
|
||||
rollups = append(rollups, railRollup{
|
||||
Label: s.label, MinV: s.minV, MaxV: s.maxV, LastV: s.lastV,
|
||||
Ticks: s.ticks, Breaches: s.breaches, Reason: s.reason,
|
||||
})
|
||||
if s.breaches > 0 {
|
||||
problems = append(problems, fmt.Sprintf("%s min=%.2fV max=%.2fV (%s)", s.label, s.minV, s.maxV, s.reason))
|
||||
}
|
||||
}
|
||||
|
||||
extras := map[string]any{
|
||||
"rails": rails,
|
||||
"problems": problems,
|
||||
"rails": rollups,
|
||||
"problems": problems,
|
||||
"window": window.String(),
|
||||
"interval": interval.String(),
|
||||
}
|
||||
if len(problems) > 0 {
|
||||
d.Error("PSU: out-of-range rails: " + strings.Join(problems, ", "))
|
||||
d.Error("PSU: out-of-range rails: " + strings.Join(problems, "; "))
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: "PSU rails out of range: " + strings.Join(problems, ", "),
|
||||
Summary: fmt.Sprintf("%d rails, %d failing", len(rails), len(problems)),
|
||||
Message: "PSU rails out of range: " + strings.Join(problems, "; "),
|
||||
Summary: fmt.Sprintf("%d rails, %d failing", len(rollups), len(problems)),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal", len(rails)))
|
||||
d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal across %s window", len(rollups), window))
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: fmt.Sprintf("%d rails nominal", len(rails)),
|
||||
Summary: fmt.Sprintf("%d rails nominal (%s)", len(rollups), window),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
|
||||
// psuSampleInterval is the default tick for post-Burn rail sampling.
|
||||
// Five seconds is slow enough to stay under the HTTP budget and fast
|
||||
// enough to catch rail recovery transients.
// PSU uses half the sampling window (floored at one second) as the tick
// instead when the window is shorter than two of these intervals.
|
||||
const psuSampleInterval = 5 * time.Second
|
||||
|
||||
// resolvePSUWindow maps the stage timeout to the PSU sampling window.
//
// With no timeout (stageTimeout <= 0: tests / pre-Phase-2 orchestrator)
// it stays snapshot-like at 30 s. Otherwise it returns stage_timeout - 5 s
// to leave headroom for sensor flush + result post, capped at 10 min so a
// 24 h soak doesn't spend all day in PSU.
//
// The window never exceeds a positive stage timeout: when the timeout is
// too short to reserve the headroom at all, half of the timeout is used
// instead. Callers must therefore cope with windows well below 30 s.
func resolvePSUWindow(stageTimeout time.Duration) time.Duration {
	const (
		defaultWindow = 30 * time.Second // used when no stage timeout is configured
		headroom      = 5 * time.Second  // reserved for sensor flush + result post
		maxWindow     = 10 * time.Minute // upper bound regardless of stage timeout
	)

	if stageTimeout <= 0 {
		return defaultWindow
	}

	w := stageTimeout - headroom
	if w <= 0 {
		// The timeout cannot accommodate the full headroom; split the
		// budget evenly between sampling and reporting rather than
		// returning a window longer than the stage is allowed to run.
		w = stageTimeout / 2
	}
	if w > maxWindow {
		w = maxWindow
	}
	return w
}
|
||||
|
||||
type psuRail struct {
|
||||
Label string `json:"label"`
|
||||
Volts float64 `json:"volts"`
|
||||
|
||||
Reference in New Issue
Block a user