package tests import ( "context" "fmt" "os" "path/filepath" "strconv" "strings" "time" ) // PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find // PSU rails, then samples each rail every psuSampleInterval for a // window sized by the stage timeout. During Burn a separate sidecar // (see burn.go) runs the same probe concurrently with workload — the // PSU stage itself catches slow post-load sag that only surfaces once // the 12V rail starts recovering from a brownout under concurrent CPU // + fio + iperf load. // // Any rail outside ±10% of its nominal value at any tick fires the // critical threshold (server-side) and fails the stage. A host with no // PSU rails wired to hwmon auto-skips. func PSU(ctx context.Context, d Deps) Outcome { rails := scanPSURails() if len(rails) == 0 { d.Info("PSU: no voltage rails found under /sys/class/hwmon — skipping stage") return Outcome{ Passed: true, Summary: "skipped (no PSU sensors)", Extras: map[string]any{"skipped": true, "reason": "no_hwmon_voltages"}, } } window := resolvePSUWindow(d.StageTimeout) deadline := time.Now().Add(window) interval := psuSampleInterval if window < interval*2 { // Tiny window (tests, pathological stage_timeout) — at least two // ticks so aggregate stats are meaningful. interval = window / 2 if interval < time.Second { interval = time.Second } } // Per-label tracking: min/max across the window, count of out-of-range // hits, last-observed value (shown in the summary). type railStats struct { label string minV float64 maxV float64 lastV float64 ticks int breaches int reason string } stats := map[string]*railStats{} tick := time.NewTicker(interval) defer tick.Stop() // Start with an immediate sample so a sub-45s window still produces // at least one reading. sampleOnce := func() { cur := scanPSURails() if len(cur) == 0 { return } batch := make([]Sample, 0, len(cur)) for _, r := range cur { s, ok := stats[r.Label] if !ok { s = &railStats{label: r.Label, minV: r.Volts, maxV: r.Volts} stats[r.Label] = s } s.ticks++ s.lastV = r.Volts if r.Volts < s.minV { s.minV = r.Volts } if r.Volts > s.maxV { s.maxV = r.Volts } if ok, why := voltageInRange(r); !ok { s.breaches++ if s.reason == "" { s.reason = why } } batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"}) } if d.Sensor != nil && len(batch) > 0 { sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second) _ = d.Sensor(sendCtx, batch) cancel() } } sampleOnce() sampling: for time.Now().Before(deadline) { select { case <-ctx.Done(): break sampling case <-tick.C: sampleOnce() } } // Build the outcome. Extras carry per-rail rollup so the report can // show "12V min=11.1 max=12.05 (3/120 ticks out of range)". type railRollup struct { Label string `json:"label"` MinV float64 `json:"min_v"` MaxV float64 `json:"max_v"` LastV float64 `json:"last_v"` Ticks int `json:"ticks"` Breaches int `json:"breaches"` Reason string `json:"reason,omitempty"` } rollups := make([]railRollup, 0, len(stats)) problems := []string{} for _, s := range stats { rollups = append(rollups, railRollup{ Label: s.label, MinV: s.minV, MaxV: s.maxV, LastV: s.lastV, Ticks: s.ticks, Breaches: s.breaches, Reason: s.reason, }) if s.breaches > 0 { problems = append(problems, fmt.Sprintf("%s min=%.2fV max=%.2fV (%s)", s.label, s.minV, s.maxV, s.reason)) } } extras := map[string]any{ "rails": rollups, "problems": problems, "window": window.String(), "interval": interval.String(), } if len(problems) > 0 { d.Error("PSU: out-of-range rails: " + strings.Join(problems, "; ")) return Outcome{ Passed: false, Message: "PSU rails out of range: " + strings.Join(problems, "; "), Summary: fmt.Sprintf("%d rails, %d failing", len(rollups), len(problems)), Extras: extras, } } d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal across %s window", len(rollups), window)) return Outcome{ Passed: true, Summary: fmt.Sprintf("%d rails nominal (%s)", len(rollups), window), Extras: extras, } } // psuSampleInterval is the default tick for post-Burn rail sampling. // Five seconds is slow enough to stay under the HTTP budget and fast // enough to catch rail recovery transients. const psuSampleInterval = 5 * time.Second // resolvePSUWindow maps the stage timeout to the sampling window. // With no timeout (tests / pre-Phase-2 orchestrator), stay snapshot- // like at 30 s. Otherwise take stage_timeout - 5 s to leave headroom // for sensor flush + result post, capped at 10 min so a 24 h soak // doesn't spend all day in PSU. func resolvePSUWindow(stageTimeout time.Duration) time.Duration { if stageTimeout <= 0 { return 30 * time.Second } w := stageTimeout - 5*time.Second if w < 30*time.Second { w = 30 * time.Second } if w > 10*time.Minute { w = 10 * time.Minute } return w } type psuRail struct { Label string `json:"label"` Volts float64 `json:"volts"` } // scanPSURails walks every hwmon chip looking for in*_input files with // an accompanying in*_label that mentions a known rail name. Unknown // labels are skipped rather than flagged — motherboard VRMs report many // rails that aren't PSU outputs. func scanPSURails() []psuRail { root := "/sys/class/hwmon" chips, err := os.ReadDir(root) if err != nil { return nil } var out []psuRail for _, c := range chips { base := filepath.Join(root, c.Name()) files, err := os.ReadDir(base) if err != nil { continue } for _, f := range files { name := f.Name() if !strings.HasPrefix(name, "in") || !strings.HasSuffix(name, "_input") { continue } n := strings.TrimSuffix(strings.TrimPrefix(name, "in"), "_input") labelPath := filepath.Join(base, "in"+n+"_label") label := strings.TrimSpace(readFileStr(labelPath)) if !isPSULabel(label) { continue } raw := strings.TrimSpace(readFileStr(filepath.Join(base, name))) mv, err := strconv.Atoi(raw) if err != nil { continue } out = append(out, psuRail{Label: label, Volts: float64(mv) / 1000}) } } return out } // isPSULabel filters labels that look like PSU rails. Keeps a small // allowlist to avoid flagging CPU VRM rails as PSU failures. func isPSULabel(label string) bool { l := strings.ToLower(label) switch { case strings.Contains(l, "12v"), strings.Contains(l, "5v"), strings.Contains(l, "3.3v"), strings.Contains(l, "3v3"), strings.Contains(l, "vccin"): return true } return false } // voltageInRange returns (ok, reason). A label like "12V" has a 12.0V // nominal; we accept ±10%. Unknown labels pass. func voltageInRange(r psuRail) (bool, string) { nom := nominalFor(r.Label) if nom == 0 { return true, "" } delta := r.Volts - nom if delta < 0 { delta = -delta } if delta/nom > 0.10 { return false, fmt.Sprintf("expected ~%.1fV", nom) } return true, "" } func nominalFor(label string) float64 { l := strings.ToLower(label) switch { case strings.Contains(l, "12v"): return 12.0 case strings.Contains(l, "5v"): return 5.0 case strings.Contains(l, "3.3v"), strings.Contains(l, "3v3"): return 3.3 } return 0 } func readFileStr(p string) string { b, err := os.ReadFile(p) if err != nil { return "" } return string(b) }