Files
Vetting/internal/config/profiles.go
T
josh 23c689aa5b
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled
deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00

442 lines
14 KiB
Go

package config
import (
	"fmt"
	"math"
	"sort"
	"strconv"
	"strings"
	"time"
)
// Profile name constants: the legal values for a Run's profile column.
// Exposed as constants so callers (UI handler, tests, agent) don't
// sprinkle literal strings. They are plain untyped string constants,
// so they can be compared directly against any string value.
const (
// ProfileQuick is also the name Lookup substitutes for an empty profile name.
ProfileQuick = "quick"
ProfileDeep = "deep"
// ProfileSoak's built-in definition (defaultRawProfiles) inherits from ProfileDeep.
ProfileSoak = "soak"
)
// AllProfiles is the canonical ordering shown in the picker. Leftmost
// is the default; rightmost is the longest-running. IsValidProfile
// scans this slice and ProfileRegistry.Names uses it for ordering, so
// it should be treated as read-only: mutating it changes both.
var AllProfiles = []string{ProfileQuick, ProfileDeep, ProfileSoak}
// IsValidProfile reports whether name is exactly equal (case-sensitive)
// to one of the entries in AllProfiles. The UI boundary uses it to
// reject malformed POSTs, and store code uses it as a fallback guard.
// The empty string is not a valid profile name here, even though Lookup
// maps it to the default profile.
func IsValidProfile(name string) bool {
	for i := 0; i < len(AllProfiles); i++ {
		if AllProfiles[i] == name {
			return true
		}
	}
	return false
}
// Vetting holds the stage order + threshold defaults that are shared
// across all profiles. Only the per-stage durations/concurrency differ
// between quick/deep/soak; gates like "CPU > 92C fails the run" apply
// to a 2-minute quick run and a 12-hour soak alike.
type Vetting struct {
// Stages is the ordered list of stage names (YAML key `stages`). When
// it is empty, buildProfileRegistry and ProfileRegistry.Stages
// substitute DefaultStages().
Stages []string `yaml:"stages"`
// Thresholds is the list of threshold declarations (YAML key `thresholds`).
Thresholds []ThresholdDefaults `yaml:"thresholds"`
}
// ThresholdDefaults is the YAML shape of a threshold declaration. One
// stanza can declare a per-stage rule ("stage: Network") or a global
// rule ("stage: *") — the threshold evaluator applies both to samples
// with matching (stage, kind, key).
//
// NOTE: in a YAML document a bare * introduces an alias, so the global
// wildcard has to be written quoted (stage: "*").
//
// This type only carries the declaration; the evaluator that interprets
// Op/Value/Nominal/Severity lives outside this file.
type ThresholdDefaults struct {
// Stage is a stage name, or "*" for a rule that applies to every stage.
Stage string `yaml:"stage"`
// Kind and Key identify which samples the rule is matched against.
Kind string `yaml:"kind"`
Key string `yaml:"key"`
// Op is the comparison operator name.
Op string `yaml:"op"` // lt|lte|gt|gte|within_pct
// Value is the operand for Op.
// NOTE(review): presumably a percentage when Op is within_pct — confirm against the evaluator.
Value float64 `yaml:"value"`
Nominal float64 `yaml:"nominal"` // only used by within_pct (e.g. 12.0 for +12V rail)
// Unit is the unit label for Value.
Unit string `yaml:"unit"`
Severity string `yaml:"severity"` // critical|warning
}
// ProfileRegistry is the in-memory view of the `profiles:` block in
// vetting.yaml. The orchestrator queries it at run creation time to
// seed thresholds and (in Phase 3+) to scale per-stage durations.
//
// Instances are produced by buildProfileRegistry. Of the methods in
// this file only ResolveStageConfig tolerates a nil receiver; Lookup,
// Names and Stages dereference it.
type ProfileRegistry struct {
// Shared stage ordering + threshold defaults. Every profile walks
// the same list; only durations/concurrency differ.
Vetting Vetting
// Profiles is keyed by name ("quick"/"deep"/"soak"). Inherit is
// already resolved at load time — a caller sees a flattened view.
Profiles map[string]Profile
}
// Profile is a loaded profile. StageTimeouts is keyed by stage name.
// Defaults carries the free-form knobs each probe reads.
type Profile struct {
// Name is the key this profile was declared under.
Name string
// Inherit is the parent profile name exactly as declared ("" for none).
// It is informational: resolveProfile has already merged the parent's
// timeouts and defaults into the maps below.
Inherit string
// StageTimeouts maps stage name to its parsed timeout. Always non-nil
// for profiles built by resolveProfile, possibly empty.
StageTimeouts map[string]time.Duration
// Defaults maps a knob group ("cpustress", "storage", "network",
// "burn" are the groups ResolveStageConfig reads) to that group's
// key/value settings, with values typed however the YAML decoder or
// the built-in table produced them.
Defaults map[string]map[string]any
}
// StageConfig is the flat view of a profile's knobs, shipped on the
// claim response so the agent can size CPUStress/Storage/Network/Burn
// work without parsing YAML. Empty values mean "fall back to the
// agent's compile-time default" — an older orchestrator that doesn't
// set these fields keeps working unchanged.
type StageConfig struct {
// Profile is the resolved profile name, or the requested name verbatim
// when ResolveStageConfig could not find the profile.
Profile string `json:"profile"`
// StageTimeouts maps stage name to a duration rendered with
// time.Duration.String (e.g. "2h0m0s"); omitted from JSON when empty.
StageTimeouts map[string]string `json:"stage_timeouts,omitempty"`
// The four knob groups below are always emitted as JSON objects; it is
// their individual fields that are dropped when empty.
CPUStress CPUStressKnobs `json:"cpustress"`
Storage StorageKnobs `json:"storage"`
Network NetworkKnobs `json:"network"`
Burn BurnKnobs `json:"burn"`
}
// CPUStressKnobs parallels the `cpustress:` block under `profiles.<name>.defaults`.
// Durations are YAML duration strings ("2m", "60m", "12h").
// ResolveStageConfig copies them through as text without parsing or
// validating them; every field is omitted from JSON when empty.
type CPUStressKnobs struct {
// CPUPass is the value of the `cpu_pass` knob.
CPUPass string `json:"cpu_pass,omitempty"`
// MemPass is the value of the `mem_pass` knob.
MemPass string `json:"mem_pass,omitempty"`
// EDACPoll is the value of the `edac_poll` knob.
EDACPoll string `json:"edac_poll,omitempty"`
}
// StorageKnobs parallels `storage:` defaults. Mode is "fio_sample" (quick)
// or "full_disk" (deep/soak). Verify names the integrity mode ("md5" or "").
// All fields are passed through as text and omitted from JSON when empty.
type StorageKnobs struct {
Mode string `json:"mode,omitempty"`
// FioSize is the `fio_size` knob; the built-in deep and soak profiles
// leave it unset, so it is omitted for them.
FioSize string `json:"fio_size,omitempty"`
// FioTime is the `fio_time` knob (a duration string in the built-in profiles).
FioTime string `json:"fio_time,omitempty"`
// FioBS is the `fio_bs` knob ("4k" in the built-in profiles).
FioBS string `json:"fio_bs,omitempty"`
// FioRW is the `fio_rw` knob ("randrw" in the built-in profiles).
FioRW string `json:"fio_rw,omitempty"`
Verify string `json:"verify,omitempty"`
}
// NetworkKnobs parallels `network:` defaults. Duration is a YAML string
// (e.g. "60s", "30m"), passed through unparsed and omitted from JSON
// when empty.
type NetworkKnobs struct {
Duration string `json:"duration,omitempty"`
}
// BurnKnobs parallels `burn:` defaults. Duration is the total Burn window.
// CPUWorkers is "all" (agent picks runtime.NumCPU) or a numeric string.
// MemPct is a percentage of MemAvailable to stress. FioOnSpare gates
// whether fio runs inside Burn (set false if operator lacks a spare
// partition). IperfParallel is the parallel stream count fed to iperf3 -P.
//
// NOTE(review): every field is tagged omitempty, so an explicit
// `fio_on_spare: false`, `mem_pct: 0` or `iperf_parallel: 0` is dropped
// from the JSON and is indistinguishable from "unset" on the wire.
// Confirm the agent's own defaults for these are the safe ones (in
// particular that its fio_on_spare default is false).
type BurnKnobs struct {
Duration string `json:"duration,omitempty"`
CPUWorkers string `json:"cpu_workers,omitempty"`
MemPct int `json:"mem_pct,omitempty"`
FioOnSpare bool `json:"fio_on_spare,omitempty"`
IperfParallel int `json:"iperf_parallel,omitempty"`
}
// ResolveStageConfig turns the named profile into the flat wire shape
// the claim handler ships to the agent.
//
// It never fails: with a nil registry, or when Lookup rejects the name,
// the result carries only the requested name and the agent falls back
// to its own defaults for everything. Otherwise timeouts are rendered
// with time.Duration.String and each knob is read from the profile's
// "cpustress", "storage", "network" and "burn" default groups; knobs
// that are absent come out as "", 0 or false.
func (pr *ProfileRegistry) ResolveStageConfig(name string) StageConfig {
	bare := StageConfig{Profile: name}
	if pr == nil {
		return bare
	}
	prof, err := pr.Lookup(name)
	if err != nil {
		return bare
	}

	// Leave the map nil when there is nothing to report so the JSON
	// field is omitted entirely.
	var timeouts map[string]string
	if n := len(prof.StageTimeouts); n > 0 {
		timeouts = make(map[string]string, n)
		for stage, limit := range prof.StageTimeouts {
			timeouts[stage] = limit.String()
		}
	}

	// A missing group yields a nil map, which the yaml* helpers read
	// as "every key absent".
	cpuGroup := prof.Defaults["cpustress"]
	storageGroup := prof.Defaults["storage"]
	networkGroup := prof.Defaults["network"]
	burnGroup := prof.Defaults["burn"]

	return StageConfig{
		Profile:       prof.Name,
		StageTimeouts: timeouts,
		CPUStress: CPUStressKnobs{
			CPUPass:  yamlString(cpuGroup, "cpu_pass"),
			MemPass:  yamlString(cpuGroup, "mem_pass"),
			EDACPoll: yamlString(cpuGroup, "edac_poll"),
		},
		Storage: StorageKnobs{
			Mode:    yamlString(storageGroup, "mode"),
			FioSize: yamlString(storageGroup, "fio_size"),
			FioTime: yamlString(storageGroup, "fio_time"),
			FioBS:   yamlString(storageGroup, "fio_bs"),
			FioRW:   yamlString(storageGroup, "fio_rw"),
			Verify:  yamlString(storageGroup, "verify"),
		},
		Network: NetworkKnobs{
			Duration: yamlString(networkGroup, "duration"),
		},
		Burn: BurnKnobs{
			Duration:      yamlString(burnGroup, "duration"),
			CPUWorkers:    yamlString(burnGroup, "cpu_workers"),
			MemPct:        yamlInt(burnGroup, "mem_pct"),
			FioOnSpare:    yamlBool(burnGroup, "fio_on_spare"),
			IperfParallel: yamlInt(burnGroup, "iperf_parallel"),
		},
	}
}
// yamlInt coerces a map[string]any entry to int. Accepts native int,
// int64, float64 (JSON numbers round-trip as float), or a numeric
// string. Missing / malformed values return 0 so the agent falls back
// to its default.
//
// "Malformed" is applied strictly:
//   - a string must be a complete base-10 integer once surrounding
//     whitespace is trimmed; "12abc", "1.5" or "50%" yield 0 rather
//     than silently using the numeric prefix;
//   - a float64 that is NaN, infinite, or outside the int range yields
//     0, because converting such a value to int has no defined result;
//     in-range floats are truncated toward zero.
//
// A nil map is treated as having no keys.
func yamlInt(m map[string]any, key string) int {
	// Indexing a nil map or a missing key yields a nil interface, which
	// matches none of the cases below.
	switch x := m[key].(type) {
	case int:
		return x
	case int64:
		return int(x)
	case float64:
		// NaN fails the IsNaN check; ±Inf and huge magnitudes fail the
		// range checks.
		if math.IsNaN(x) || x < math.MinInt || x >= math.MaxInt+1 {
			return 0
		}
		return int(x)
	case string:
		// Atoi rejects trailing garbage, unlike Sscanf("%d"), which
		// stops at the first non-digit and reports success.
		if n, err := strconv.Atoi(strings.TrimSpace(x)); err == nil {
			return n
		}
	}
	return 0
}
// yamlBool reads a boolean knob from m. A native bool is returned as
// is; a string counts as true only when it equals "true" ignoring case.
// Everything else — missing key, nil map, nil value, numbers, any other
// string — is false, which is the safer default for a destructive knob
// like fio_on_spare.
func yamlBool(m map[string]any, key string) bool {
	raw := m[key]
	if flag, isBool := raw.(bool); isBool {
		return flag
	}
	if text, isString := raw.(string); isString {
		return strings.EqualFold(text, "true")
	}
	return false
}
// yamlString returns the entry at key rendered as text. Strings (which
// is how YAML durations like "2m" arrive) are returned untouched; other
// scalars, such as a bare numeric literal, are formatted with fmt.Sprint
// so the agent can still interpret them. A missing key, nil map or nil
// value produces "".
func yamlString(m map[string]any, key string) string {
	switch val := m[key].(type) {
	case nil:
		return ""
	case string:
		return val
	default:
		return fmt.Sprint(val)
	}
}
// Lookup fetches the profile registered under name. An empty name is
// treated as a request for the default profile (ProfileQuick). When no
// profile is registered under the effective name, it returns a zero
// Profile and an "unknown profile" error quoting that name so the
// caller can surface it.
func (pr *ProfileRegistry) Lookup(name string) (Profile, error) {
	key := name
	if key == "" {
		key = ProfileQuick
	}
	if prof, found := pr.Profiles[key]; found {
		return prof, nil
	}
	return Profile{}, fmt.Errorf("unknown profile %q", key)
}
// Names returns the registry's profile names in the canonical
// picker order (quick/deep/soak). Profiles present in the config but
// unknown to AllProfiles are appended after, alphabetically.
//
// The result is a fresh slice and is deterministic for a given
// registry: the tail of non-canonical names is explicitly sorted,
// because Go randomizes map iteration order.
func (pr *ProfileRegistry) Names() []string {
	out := make([]string, 0, len(pr.Profiles))
	seen := make(map[string]bool, len(AllProfiles))
	for _, n := range AllProfiles {
		if _, ok := pr.Profiles[n]; ok {
			out = append(out, n)
			seen[n] = true
		}
	}
	// Everything appended from here on is a non-canonical name; remember
	// where that tail starts so only it gets sorted.
	canonical := len(out)
	for n := range pr.Profiles {
		if !seen[n] {
			out = append(out, n)
		}
	}
	sort.Strings(out[canonical:])
	return out
}
// Stages returns the shared stage order as a copy the caller may
// modify freely. When the registry has no stages configured it returns
// DefaultStages() instead, so tests that build a partial
// ProfileRegistry never see an empty list.
func (pr *ProfileRegistry) Stages() []string {
	configured := pr.Vetting.Stages
	if len(configured) > 0 {
		dup := make([]string, len(configured))
		copy(dup, configured)
		return dup
	}
	return DefaultStages()
}
// DefaultStages returns the canonical stage order the orchestrator
// walks when the configuration declares none. The vetting.yaml shipped
// with the repo mirrors this list, so an edit to one must be made in
// the other as well. Each call returns a new slice, so callers may
// modify the result.
func DefaultStages() []string {
	order := [...]string{
		"Inventory",
		"Firmware",
		"SpecValidate",
		"SMART",
		"CPUStress",
		"Storage",
		"Network",
		"Burn",
		"GPU",
		"PSU",
		"Reporting",
	}
	return order[:]
}
// rawProfile is the YAML shape before inherit resolution. Durations
// arrive as strings (e.g. "2h") so we can parse them with
// time.ParseDuration instead of rolling our own.
type rawProfile struct {
// Inherit names the parent profile to merge from; "" means none.
Inherit string `yaml:"inherit"`
// StageTimeouts maps stage name to an unparsed duration string.
StageTimeouts map[string]string `yaml:"stage_timeouts"`
// Defaults maps knob group to that group's key/value settings; keys
// set here override the same keys inherited from the parent.
Defaults map[string]map[string]any `yaml:"defaults"`
}
// rawProfilesBlock is the decoded form of the two top-level YAML keys
// consumed by buildProfileRegistry: `vetting` (shared stage order and
// thresholds) and `profiles` (unresolved profiles keyed by name).
type rawProfilesBlock struct {
Vetting Vetting `yaml:"vetting"`
Profiles map[string]rawProfile `yaml:"profiles"`
}
// buildProfileRegistry converts the decoded YAML block into a
// ProfileRegistry with every profile fully flattened.
//
// An empty `profiles` section is replaced by defaultRawProfiles(), and
// an empty stage list by DefaultStages(). Each profile is then resolved
// independently through resolveProfile, which merges `inherit:` chains
// (child keys win) and parses stage_timeouts into time.Durations. The
// first resolution failure — inherit cycle, unknown parent, or bad
// duration — aborts the build and is returned with a nil registry.
// Profiles are visited in map order, so when several are broken, which
// error is reported is not fixed.
func buildProfileRegistry(raw rawProfilesBlock) (*ProfileRegistry, error) {
	source := raw.Profiles
	if len(source) == 0 {
		source = defaultRawProfiles()
	}

	reg := &ProfileRegistry{
		Vetting:  raw.Vetting,
		Profiles: make(map[string]Profile, len(source)),
	}
	if len(reg.Vetting.Stages) == 0 {
		reg.Vetting.Stages = DefaultStages()
	}

	for name := range source {
		flat, err := resolveProfile(source, name, nil)
		if err != nil {
			return nil, err
		}
		reg.Profiles[name] = flat
	}
	return reg, nil
}
// resolveProfile produces the flattened Profile for name, following
// its inherit chain depth-first.
//
// visited lists the profiles already on the current chain and acts as
// the cycle guard: meeting name in it again is reported as an inherit
// cycle, with the whole chain in the message. A name absent from all is
// an "unknown profile" error; this covers a dangling `inherit:` target
// too. The parent's timeouts and defaults are copied in first (the
// defaults one map per group, so the child never writes into the
// parent's maps), then this profile's own stage_timeouts are parsed
// with time.ParseDuration and its defaults layered on top, key by key.
func resolveProfile(all map[string]rawProfile, name string, visited []string) (Profile, error) {
	for i := range visited {
		if visited[i] != name {
			continue
		}
		return Profile{}, fmt.Errorf("profile inherit cycle: %s -> %s", strings.Join(visited, " -> "), name)
	}

	src, found := all[name]
	if !found {
		return Profile{}, fmt.Errorf("unknown profile %q", name)
	}

	timeouts := map[string]time.Duration{}
	knobs := map[string]map[string]any{}

	// Start from the fully resolved parent, if there is one.
	if src.Inherit != "" {
		parent, err := resolveProfile(all, src.Inherit, append(visited, name))
		if err != nil {
			return Profile{}, err
		}
		for stage, limit := range parent.StageTimeouts {
			timeouts[stage] = limit
		}
		for group, inherited := range parent.Defaults {
			clone := make(map[string]any, len(inherited))
			for k, v := range inherited {
				clone[k] = v
			}
			knobs[group] = clone
		}
	}

	// This profile's own timeouts replace inherited ones per stage.
	for stage, text := range src.StageTimeouts {
		limit, err := time.ParseDuration(text)
		if err != nil {
			return Profile{}, fmt.Errorf("profile %s stage_timeouts[%s]: %w", name, stage, err)
		}
		timeouts[stage] = limit
	}

	// Its own knobs override inherited ones per key, not per group.
	for group, overrides := range src.Defaults {
		target, exists := knobs[group]
		if !exists {
			target = map[string]any{}
			knobs[group] = target
		}
		for k, v := range overrides {
			target[k] = v
		}
	}

	return Profile{
		Name:          name,
		Inherit:       src.Inherit,
		StageTimeouts: timeouts,
		Defaults:      knobs,
	}, nil
}
// defaultRawProfiles returns sane per-profile durations + probe knobs
// used when vetting.yaml omits the `profiles:` block entirely. Matches
// the plan's per-stage budget table so the agent still gets coherent
// CPUStress/Storage/Network knobs without any operator-visible config.
//
// The result is unresolved: buildProfileRegistry runs it through
// resolveProfile like operator-supplied profiles. A new map is built on
// every call. Only five stages get explicit timeouts in each profile.
func defaultRawProfiles() map[string]rawProfile {
return map[string]rawProfile{
// quick: stands alone (no inherit); the only built-in profile that
// uses the sampled fio mode and sets fio_size.
ProfileQuick: {
StageTimeouts: map[string]string{
"CPUStress": "5m",
"Storage": "5m",
"Network": "2m",
"Burn": "3m",
"PSU": "1m",
},
Defaults: map[string]map[string]any{
"cpustress": {"cpu_pass": "2m", "mem_pass": "2m", "edac_poll": "10s"},
"storage": {"mode": "fio_sample", "fio_size": "1GiB", "fio_time": "3m", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"},
"network": {"duration": "60s"},
"burn": {"duration": "2m", "cpu_workers": "all", "mem_pct": 50, "fio_on_spare": true, "iperf_parallel": 2},
},
},
// deep: stands alone; full-disk storage mode, so fio_size is left
// unset and resolves to "" in the StageConfig.
ProfileDeep: {
StageTimeouts: map[string]string{
"CPUStress": "2h",
"Storage": "4h",
"Network": "35m",
"Burn": "3h",
"PSU": "10m",
},
Defaults: map[string]map[string]any{
"cpustress": {"cpu_pass": "60m", "mem_pass": "60m", "edac_poll": "10s"},
"storage": {"mode": "full_disk", "fio_time": "2h", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"},
"network": {"duration": "30m"},
"burn": {"duration": "2h", "cpu_workers": "all", "mem_pct": 70, "fio_on_spare": true, "iperf_parallel": 4},
},
},
// soak: inherits deep and overrides only what differs. Keys not
// listed here (mem_pass, edac_poll, fio_bs, fio_rw, verify,
// cpu_workers, mem_pct, fio_on_spare) keep deep's values after
// resolution.
ProfileSoak: {
Inherit: ProfileDeep,
StageTimeouts: map[string]string{
"CPUStress": "14h",
"Storage": "8h",
"Network": "2h30m",
"Burn": "20h",
"PSU": "15m",
},
Defaults: map[string]map[string]any{
"cpustress": {"cpu_pass": "12h"},
"storage": {"mode": "full_disk", "fio_time": "6h"},
"network": {"duration": "2h"},
"burn": {"duration": "18h", "iperf_parallel": 8},
},
},
}
}