23c689aa5b
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
273 lines
7.3 KiB
Go
273 lines
7.3 KiB
Go
package tests

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"
)

// PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find
|
|
// PSU rails, then samples each rail every psuSampleInterval for a
|
|
// window sized by the stage timeout. During Burn a separate sidecar
|
|
// (see burn.go) runs the same probe concurrently with workload — the
|
|
// PSU stage itself catches slow post-load sag that only surfaces once
|
|
// the 12V rail starts recovering from a brownout under concurrent CPU
|
|
// + fio + iperf load.
|
|
//
|
|
// Any rail outside ±10% of its nominal value at any tick fires the
|
|
// critical threshold (server-side) and fails the stage. A host with no
|
|
// PSU rails wired to hwmon auto-skips.
|
|
func PSU(ctx context.Context, d Deps) Outcome {
|
|
rails := scanPSURails()
|
|
if len(rails) == 0 {
|
|
d.Info("PSU: no voltage rails found under /sys/class/hwmon — skipping stage")
|
|
return Outcome{
|
|
Passed: true,
|
|
Summary: "skipped (no PSU sensors)",
|
|
Extras: map[string]any{"skipped": true, "reason": "no_hwmon_voltages"},
|
|
}
|
|
}
|
|
|
|
window := resolvePSUWindow(d.StageTimeout)
|
|
deadline := time.Now().Add(window)
|
|
interval := psuSampleInterval
|
|
if window < interval*2 {
|
|
// Tiny window (tests, pathological stage_timeout) — at least two
|
|
// ticks so aggregate stats are meaningful.
|
|
interval = window / 2
|
|
if interval < time.Second {
|
|
interval = time.Second
|
|
}
|
|
}
|
|
|
|
// Per-label tracking: min/max across the window, count of out-of-range
|
|
// hits, last-observed value (shown in the summary).
|
|
type railStats struct {
|
|
label string
|
|
minV float64
|
|
maxV float64
|
|
lastV float64
|
|
ticks int
|
|
breaches int
|
|
reason string
|
|
}
|
|
stats := map[string]*railStats{}
|
|
|
|
tick := time.NewTicker(interval)
|
|
defer tick.Stop()
|
|
// Start with an immediate sample so a sub-45s window still produces
|
|
// at least one reading.
|
|
sampleOnce := func() {
|
|
cur := scanPSURails()
|
|
if len(cur) == 0 {
|
|
return
|
|
}
|
|
batch := make([]Sample, 0, len(cur))
|
|
for _, r := range cur {
|
|
s, ok := stats[r.Label]
|
|
if !ok {
|
|
s = &railStats{label: r.Label, minV: r.Volts, maxV: r.Volts}
|
|
stats[r.Label] = s
|
|
}
|
|
s.ticks++
|
|
s.lastV = r.Volts
|
|
if r.Volts < s.minV {
|
|
s.minV = r.Volts
|
|
}
|
|
if r.Volts > s.maxV {
|
|
s.maxV = r.Volts
|
|
}
|
|
if ok, why := voltageInRange(r); !ok {
|
|
s.breaches++
|
|
if s.reason == "" {
|
|
s.reason = why
|
|
}
|
|
}
|
|
batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
|
|
}
|
|
if d.Sensor != nil && len(batch) > 0 {
|
|
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
|
_ = d.Sensor(sendCtx, batch)
|
|
cancel()
|
|
}
|
|
}
|
|
sampleOnce()
|
|
sampling:
|
|
for time.Now().Before(deadline) {
|
|
select {
|
|
case <-ctx.Done():
|
|
break sampling
|
|
case <-tick.C:
|
|
sampleOnce()
|
|
}
|
|
}
|
|
|
|
// Build the outcome. Extras carry per-rail rollup so the report can
|
|
// show "12V min=11.1 max=12.05 (3/120 ticks out of range)".
|
|
type railRollup struct {
|
|
Label string `json:"label"`
|
|
MinV float64 `json:"min_v"`
|
|
MaxV float64 `json:"max_v"`
|
|
LastV float64 `json:"last_v"`
|
|
Ticks int `json:"ticks"`
|
|
Breaches int `json:"breaches"`
|
|
Reason string `json:"reason,omitempty"`
|
|
}
|
|
rollups := make([]railRollup, 0, len(stats))
|
|
problems := []string{}
|
|
for _, s := range stats {
|
|
rollups = append(rollups, railRollup{
|
|
Label: s.label, MinV: s.minV, MaxV: s.maxV, LastV: s.lastV,
|
|
Ticks: s.ticks, Breaches: s.breaches, Reason: s.reason,
|
|
})
|
|
if s.breaches > 0 {
|
|
problems = append(problems, fmt.Sprintf("%s min=%.2fV max=%.2fV (%s)", s.label, s.minV, s.maxV, s.reason))
|
|
}
|
|
}
|
|
|
|
extras := map[string]any{
|
|
"rails": rollups,
|
|
"problems": problems,
|
|
"window": window.String(),
|
|
"interval": interval.String(),
|
|
}
|
|
if len(problems) > 0 {
|
|
d.Error("PSU: out-of-range rails: " + strings.Join(problems, "; "))
|
|
return Outcome{
|
|
Passed: false,
|
|
Message: "PSU rails out of range: " + strings.Join(problems, "; "),
|
|
Summary: fmt.Sprintf("%d rails, %d failing", len(rollups), len(problems)),
|
|
Extras: extras,
|
|
}
|
|
}
|
|
d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal across %s window", len(rollups), window))
|
|
return Outcome{
|
|
Passed: true,
|
|
Summary: fmt.Sprintf("%d rails nominal (%s)", len(rollups), window),
|
|
Extras: extras,
|
|
}
|
|
}
// psuSampleInterval is the default tick for post-Burn rail sampling.
// Five seconds is slow enough to stay under the HTTP budget and fast
// enough to catch rail recovery transients.
const psuSampleInterval = 5 * time.Second
// resolvePSUWindow maps the stage timeout to the sampling window.
//
// With no timeout (stageTimeout <= 0: tests / pre-Phase-2 orchestrator)
// the window stays snapshot-like at 30 s. Otherwise it is
// stage_timeout - 5 s, leaving headroom for sensor flush + result post,
// clamped to the range [30 s, 10 min]: the cap keeps a 24 h soak from
// spending all day in PSU, and the floor means a stage timeout shorter
// than 35 s still yields a 30 s window.
func resolvePSUWindow(stageTimeout time.Duration) time.Duration {
	const (
		minWindow = 30 * time.Second
		maxWindow = 10 * time.Minute
		headroom  = 5 * time.Second
	)
	if stageTimeout <= 0 {
		return minWindow
	}
	w := stageTimeout - headroom
	switch {
	case w < minWindow:
		return minWindow
	case w > maxWindow:
		return maxWindow
	}
	return w
}
// psuRail is one voltage reading taken from hwmon: the rail's label as
// reported by the chip and its value converted from millivolts to volts.
type psuRail struct {
	Label string  `json:"label"`
	Volts float64 `json:"volts"`
}
// scanPSURails walks every hwmon chip looking for in*_input files with
|
|
// an accompanying in*_label that mentions a known rail name. Unknown
|
|
// labels are skipped rather than flagged — motherboard VRMs report many
|
|
// rails that aren't PSU outputs.
|
|
func scanPSURails() []psuRail {
|
|
root := "/sys/class/hwmon"
|
|
chips, err := os.ReadDir(root)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
var out []psuRail
|
|
for _, c := range chips {
|
|
base := filepath.Join(root, c.Name())
|
|
files, err := os.ReadDir(base)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
for _, f := range files {
|
|
name := f.Name()
|
|
if !strings.HasPrefix(name, "in") || !strings.HasSuffix(name, "_input") {
|
|
continue
|
|
}
|
|
n := strings.TrimSuffix(strings.TrimPrefix(name, "in"), "_input")
|
|
labelPath := filepath.Join(base, "in"+n+"_label")
|
|
label := strings.TrimSpace(readFileStr(labelPath))
|
|
if !isPSULabel(label) {
|
|
continue
|
|
}
|
|
raw := strings.TrimSpace(readFileStr(filepath.Join(base, name)))
|
|
mv, err := strconv.Atoi(raw)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
out = append(out, psuRail{Label: label, Volts: float64(mv) / 1000})
|
|
}
|
|
}
|
|
return out
|
|
}
// isPSULabel filters labels that look like PSU rails. Keeps a small
// allowlist (12V, 5V, 3.3V / 3V3, VCCIN; case-insensitive) to avoid
// flagging CPU VRM rails as PSU failures.
//
// A voltage token only counts when it is not the tail of a larger
// number: "+1.05V", "1.5V" or "2.5V" contain "5v" but are not 5 V rails,
// so a token directly preceded by a digit or '.' is ignored.
func isPSULabel(label string) bool {
	l := strings.ToLower(label)
	if strings.Contains(l, "vccin") {
		return true
	}
	// hasRail reports whether tok occurs in l at a position where it is
	// not preceded by a digit or a decimal point. All occurrences are
	// checked so "1.5V/5V" still matches on its second "5v".
	hasRail := func(tok string) bool {
		for from := 0; from <= len(l)-len(tok); {
			i := strings.Index(l[from:], tok)
			if i < 0 {
				return false
			}
			at := from + i
			if at == 0 {
				return true
			}
			if prev := l[at-1]; prev != '.' && (prev < '0' || prev > '9') {
				return true
			}
			from = at + 1
		}
		return false
	}
	return hasRail("12v") || hasRail("5v") || hasRail("3.3v") || hasRail("3v3")
}
// voltageInRange returns (ok, reason). A label like "12V" has a 12.0V
|
|
// nominal; we accept ±10%. Unknown labels pass.
|
|
func voltageInRange(r psuRail) (bool, string) {
|
|
nom := nominalFor(r.Label)
|
|
if nom == 0 {
|
|
return true, ""
|
|
}
|
|
delta := r.Volts - nom
|
|
if delta < 0 {
|
|
delta = -delta
|
|
}
|
|
if delta/nom > 0.10 {
|
|
return false, fmt.Sprintf("expected ~%.1fV", nom)
|
|
}
|
|
return true, ""
|
|
}
// nominalFor returns the nominal voltage implied by a rail label
// (12.0 for "12V", 5.0 for "5V", 3.3 for "3.3V" / "3V3"; matching is
// case-insensitive) or 0 when the label names no known rail. When a
// label mentions several rails the first in that order wins.
//
// A token directly preceded by a digit or '.' is the tail of a different
// voltage ("+1.05V", "1.5V", "2.5V" all contain "5v") and is ignored;
// treating those as 5 V rails would guarantee a false out-of-range hit.
func nominalFor(label string) float64 {
	l := strings.ToLower(label)
	known := []struct {
		token string
		volts float64
	}{
		{"12v", 12.0},
		{"5v", 5.0},
		{"3.3v", 3.3},
		{"3v3", 3.3},
	}
	for _, k := range known {
		// Check every occurrence: "1.5V / 5V" matches on the second "5v".
		for from := 0; from <= len(l)-len(k.token); {
			i := strings.Index(l[from:], k.token)
			if i < 0 {
				break
			}
			at := from + i
			if at == 0 {
				return k.volts
			}
			if prev := l[at-1]; prev != '.' && (prev < '0' || prev > '9') {
				return k.volts
			}
			from = at + 1
		}
	}
	return 0
}
// readFileStr returns the contents of the file at p as a string, or ""
// when the file cannot be read. The error is dropped on purpose:
// callers treat a missing or unreadable file the same as an empty one.
func readFileStr(p string) string {
	b, err := os.ReadFile(p)
	if err != nil {
		return ""
	}
	return string(b)
}