// Vetting/agent/tests/psu.go

package tests

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"
)

// PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find
// PSU rails, then samples each rail every psuSampleInterval for a
// window sized by the stage timeout. During Burn a separate sidecar
// (see burn.go) runs the same probe concurrently with the workload; the
// PSU stage itself is meant to catch slow post-load sag that only
// surfaces while the rails recover after concurrent CPU + fio + iperf
// load.
//
// Any rail outside ±10% of its nominal value at any tick fails the
// stage (the same band is the server-side critical threshold). A host
// with no PSU rails wired to hwmon auto-skips with Passed set.
//
// Every sample batch is forwarded to d.Sensor when it is non-nil. The
// returned Outcome carries a per-rail rollup ("rails"), the list of
// failing rails ("problems") and the effective "window" / "interval" in
// Extras. Rails are reported in the order they were first observed, so
// the summary and failure message are stable between runs.
func PSU(ctx context.Context, d Deps) Outcome {
	rails := scanPSURails()
	if len(rails) == 0 {
		d.Info("PSU: no voltage rails found under /sys/class/hwmon — skipping stage")
		return Outcome{
			Passed:  true,
			Summary: "skipped (no PSU sensors)",
			Extras:  map[string]any{"skipped": true, "reason": "no_hwmon_voltages"},
		}
	}

	window := resolvePSUWindow(d.StageTimeout)
	deadline := time.Now().Add(window)
	interval := psuSampleInterval
	if window < interval*2 {
		// Tiny window (tests, pathological stage_timeout) — at least two
		// ticks so aggregate stats are meaningful. Kept as a guard even
		// though resolvePSUWindow's floor normally keeps the window well
		// above two intervals.
		interval = window / 2
		if interval < time.Second {
			interval = time.Second
		}
	}

	// Per-label rollup: min/max across the window, count of out-of-range
	// hits, last-observed value. It doubles as the JSON shape reported
	// in Extras, e.g. "12V min=11.1 max=12.05 (3/120 ticks out of range)".
	type railRollup struct {
		Label    string  `json:"label"`
		MinV     float64 `json:"min_v"`
		MaxV     float64 `json:"max_v"`
		LastV    float64 `json:"last_v"`
		Ticks    int     `json:"ticks"`
		Breaches int     `json:"breaches"`
		Reason   string  `json:"reason,omitempty"`
	}
	stats := map[string]*railRollup{}
	// order remembers labels in first-seen order; ranging over the map
	// directly would shuffle the report on every run.
	var order []string

	sampleOnce := func() {
		cur := scanPSURails()
		if len(cur) == 0 {
			return
		}
		batch := make([]Sample, 0, len(cur))
		for _, r := range cur {
			s, seen := stats[r.Label]
			if !seen {
				s = &railRollup{Label: r.Label, MinV: r.Volts, MaxV: r.Volts}
				stats[r.Label] = s
				order = append(order, r.Label)
			}
			s.Ticks++
			s.LastV = r.Volts
			if r.Volts < s.MinV {
				s.MinV = r.Volts
			}
			if r.Volts > s.MaxV {
				s.MaxV = r.Volts
			}
			if inRange, why := voltageInRange(r); !inRange {
				s.Breaches++
				// Keep the first reason only; later breaches of the same
				// rail would repeat it.
				if s.Reason == "" {
					s.Reason = why
				}
			}
			batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
		}
		if d.Sensor != nil && len(batch) > 0 {
			sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
			// Telemetry is best-effort: a failed upload must not fail the
			// stage, the verdict is derived from the local stats.
			_ = d.Sensor(sendCtx, batch)
			cancel()
		}
	}

	tick := time.NewTicker(interval)
	defer tick.Stop()
	// stop wakes the loop at the deadline itself. Without it the select
	// would block until the next tick and overrun the window, eating
	// into the headroom resolvePSUWindow leaves before the stage timeout.
	stop := time.NewTimer(time.Until(deadline))
	defer stop.Stop()

	// Start with an immediate sample so the window always produces a
	// reading, even if it ends before the first tick.
	sampleOnce()
sampling:
	for time.Now().Before(deadline) {
		select {
		case <-ctx.Done():
			break sampling
		case <-stop.C:
			break sampling
		case <-tick.C:
			sampleOnce()
		}
	}

	// Build the outcome in first-seen rail order.
	rollups := make([]railRollup, 0, len(order))
	// problems stays non-nil so it would encode as [] rather than null
	// if Extras is serialised as JSON (the rollup's json tags suggest it is).
	problems := []string{}
	for _, label := range order {
		s := stats[label]
		rollups = append(rollups, *s)
		if s.Breaches > 0 {
			problems = append(problems, fmt.Sprintf("%s min=%.2fV max=%.2fV (%s)", s.Label, s.MinV, s.MaxV, s.Reason))
		}
	}

	extras := map[string]any{
		"rails":    rollups,
		"problems": problems,
		"window":   window.String(),
		"interval": interval.String(),
	}
	if len(problems) > 0 {
		d.Error("PSU: out-of-range rails: " + strings.Join(problems, "; "))
		return Outcome{
			Passed:  false,
			Message: "PSU rails out of range: " + strings.Join(problems, "; "),
			Summary: fmt.Sprintf("%d rails, %d failing", len(rollups), len(problems)),
			Extras:  extras,
		}
	}
	d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal across %s window", len(rollups), window))
	return Outcome{
		Passed:  true,
		Summary: fmt.Sprintf("%d rails nominal (%s)", len(rollups), window),
		Extras:  extras,
	}
}

// psuSampleInterval is the default period between rail samples in the
// PSU stage (post-Burn rail sampling). Five seconds is slow enough to
// stay under the HTTP budget and fast enough to catch rail recovery
// transients. PSU shortens it, never below one second, when the
// sampling window is smaller than two intervals.
const psuSampleInterval = 5 * time.Second

// resolvePSUWindow derives the PSU sampling window from the stage
// timeout.
//
// A zero or negative timeout (tests / pre-Phase-2 orchestrator) yields
// a snapshot-like 30 s window. Otherwise the window is the timeout
// minus 5 s of headroom for sensor flush + result post, clamped to the
// range [30 s, 10 min] so a 24 h soak doesn't spend all day in PSU.
// Note that the 30 s floor applies even when it is longer than the
// timeout itself.
func resolvePSUWindow(stageTimeout time.Duration) time.Duration {
	const (
		floor    = 30 * time.Second
		ceiling  = 10 * time.Minute
		headroom = 5 * time.Second
	)
	if stageTimeout <= 0 {
		return floor
	}
	switch budget := stageTimeout - headroom; {
	case budget < floor:
		return floor
	case budget > ceiling:
		return ceiling
	default:
		return budget
	}
}

// psuRail is a single voltage reading taken from one hwmon sensor.
type psuRail struct {
	Label string  `json:"label"` // whitespace-trimmed contents of the sensor's in*_label file
	Volts float64 `json:"volts"` // in*_input reading converted from millivolts to volts
}

// scanPSURails takes one reading of every PSU rail exposed through
// hwmon. For each chip directory under /sys/class/hwmon it considers
// the in*_input files, reads the matching in*_label, and keeps the
// sensor only when isPSULabel accepts that label. Sensors with unknown
// or missing labels are skipped rather than flagged — motherboard VRMs
// report many rails that aren't PSU outputs. Unreadable directories and
// values that do not parse as integer millivolts are skipped silently.
// The result is nil when nothing qualifies.
func scanPSURails() []psuRail {
	const (
		hwmonRoot   = "/sys/class/hwmon"
		inputPrefix = "in"
		inputSuffix = "_input"
	)

	chipDirs, err := os.ReadDir(hwmonRoot)
	if err != nil {
		return nil
	}

	var found []psuRail
	for _, chip := range chipDirs {
		chipPath := filepath.Join(hwmonRoot, chip.Name())
		entries, err := os.ReadDir(chipPath)
		if err != nil {
			continue
		}
		for _, entry := range entries {
			file := entry.Name()
			if !(strings.HasPrefix(file, inputPrefix) && strings.HasSuffix(file, inputSuffix)) {
				continue
			}
			// The sensor index sits between the "in" prefix and the
			// "_input" suffix and is shared with the label file.
			index := strings.TrimSuffix(strings.TrimPrefix(file, inputPrefix), inputSuffix)
			label := strings.TrimSpace(readFileStr(filepath.Join(chipPath, inputPrefix+index+"_label")))
			if !isPSULabel(label) {
				continue
			}
			text := strings.TrimSpace(readFileStr(filepath.Join(chipPath, file)))
			millivolts, err := strconv.Atoi(text)
			if err == nil {
				found = append(found, psuRail{Label: label, Volts: float64(millivolts) / 1000})
			}
		}
	}
	return found
}

// isPSULabel reports whether a hwmon voltage label looks like a PSU
// rail. It keeps a small allowlist — the rails nominalFor recognises
// (12 V, 5 V, 3.3 V) plus VCCIN — to avoid flagging CPU VRM rails as
// PSU failures. Matching is case-insensitive.
func isPSULabel(label string) bool {
	// Delegate to nominalFor so the allowlist and the nominal table
	// cannot drift apart.
	if nominalFor(label) != 0 {
		return true
	}
	// nominalFor has no entry for VCCIN, so it is sampled and reported
	// but never range-checked.
	return strings.Contains(strings.ToLower(label), "vccin")
}

// voltageInRange returns (ok, reason). A label like "12V" has a 12.0V
// nominal; we accept ±10%. Labels without a known nominal always pass.
// The comparison uses the magnitude of the reading, so a negative rail
// (a "-12V" label reading about -12.0) is held to the same band instead
// of failing on every sample. When the rail is out of range, reason
// names the expected nominal.
func voltageInRange(r psuRail) (bool, string) {
	nom := nominalFor(r.Label)
	if nom == 0 {
		return true, ""
	}
	volts := r.Volts
	if volts < 0 {
		volts = -volts
	}
	delta := volts - nom
	if delta < 0 {
		delta = -delta
	}
	if delta/nom > 0.10 {
		return false, fmt.Sprintf("expected ~%.1fV", nom)
	}
	return true, ""
}

// nominalFor maps a rail label to its nominal voltage magnitude: 12.0
// for "12v", 5.0 for "5v", 3.3 for "3.3v" / "3v3" (case-insensitive,
// checked in that order). It returns 0 when the label names no known
// rail. A figure only counts when it is not the tail of a larger
// number, so "SB 1.05V" or "+1.5V" is not mistaken for the 5 V rail.
func nominalFor(label string) float64 {
	l := strings.ToLower(label)
	switch {
	case hasRailToken(l, "12v"):
		return 12.0
	case hasRailToken(l, "5v"):
		return 5.0
	case hasRailToken(l, "3.3v"), hasRailToken(l, "3v3"):
		return 3.3
	}
	return 0
}

// hasRailToken reports whether lower contains token as a standalone
// voltage figure, i.e. at least one occurrence is not directly preceded
// by a digit or a decimal point.
func hasRailToken(lower, token string) bool {
	for start := 0; start < len(lower); {
		i := strings.Index(lower[start:], token)
		if i < 0 {
			return false
		}
		i += start
		if i == 0 {
			return true
		}
		if prev := lower[i-1]; prev != '.' && (prev < '0' || prev > '9') {
			return true
		}
		// This occurrence is part of a larger number; keep looking.
		start = i + 1
	}
	return false
}

// readFileStr returns the full contents of the file at p as a string.
// Any read error (missing file, permission denied, p is a directory)
// yields the empty string, so callers cannot tell an empty file from
// an unreadable one.
func readFileStr(p string) string {
	if data, err := os.ReadFile(p); err == nil {
		return string(data)
	}
	return ""
}