23c689aa5b
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
442 lines
14 KiB
Go
442 lines
14 KiB
Go
package config
|
|
|
|
import (
	"fmt"
	"sort"
	"strings"
	"time"
)
|
|
|
|
// Legal values for a Run's profile column. They are exported as named
// constants so callers (UI handler, tests, agent) share one spelling
// instead of sprinkling literal strings.
const (
	ProfileQuick = "quick"
	ProfileDeep  = "deep"
	ProfileSoak  = "soak"
)
|
|
|
|
// AllProfiles is the canonical ordering shown in the picker. Leftmost
|
|
// is the default; rightmost is the longest-running.
|
|
var AllProfiles = []string{ProfileQuick, ProfileDeep, ProfileSoak}
|
|
|
|
// IsValidProfile returns true when name is one of the known profile
|
|
// identifiers. Used at the UI boundary to reject malformed POSTs and in
|
|
// store code as a fallback guard.
|
|
func IsValidProfile(name string) bool {
|
|
for _, p := range AllProfiles {
|
|
if p == name {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// Vetting holds the stage order + threshold defaults that are shared
|
|
// across all profiles. Only the per-stage durations/concurrency differ
|
|
// between quick/deep/soak; gates like "CPU > 92C fails the run" apply
|
|
// to a 2-minute quick run and a 12-hour soak alike.
|
|
type Vetting struct {
|
|
Stages []string `yaml:"stages"`
|
|
Thresholds []ThresholdDefaults `yaml:"thresholds"`
|
|
}
|
|
|
|
// ThresholdDefaults mirrors one threshold stanza in the YAML config. A
// stanza is either scoped to a single stage ("stage: Network") or global
// ("stage: *"); the threshold evaluator applies both kinds to samples
// whose (stage, kind, key) match.
type ThresholdDefaults struct {
	// Stage is a stage name, or "*" for a global rule.
	Stage string `yaml:"stage"`
	// Kind and Key, together with Stage, select the samples the rule covers.
	Kind string `yaml:"kind"`
	Key  string `yaml:"key"`
	// Op is the comparison: lt|lte|gt|gte|within_pct.
	Op string `yaml:"op"`
	// Value is the operand of Op.
	Value float64 `yaml:"value"`
	// Nominal is only used by within_pct (e.g. 12.0 for the +12V rail).
	Nominal float64 `yaml:"nominal"`
	// Unit is the unit label declared alongside the rule.
	Unit string `yaml:"unit"`
	// Severity is critical|warning.
	Severity string `yaml:"severity"`
}
|
|
|
|
// ProfileRegistry is the in-memory view of the `profiles:` block in
|
|
// vetting.yaml. The orchestrator queries it at run creation time to
|
|
// seed thresholds and (in Phase 3+) to scale per-stage durations.
|
|
type ProfileRegistry struct {
|
|
// Shared stage ordering + threshold defaults. Every profile walks
|
|
// the same list; only durations/concurrency differ.
|
|
Vetting Vetting
|
|
|
|
// Profiles is keyed by name ("quick"/"deep"/"soak"). Inherit is
|
|
// already resolved at load time — a caller sees a flattened view.
|
|
Profiles map[string]Profile
|
|
}
|
|
|
|
// Profile is one loaded profile.
type Profile struct {
	// Name is the profile's key in the registry.
	Name string
	// Inherit is the parent profile named in the config, or "" for none.
	Inherit string
	// StageTimeouts is keyed by stage name.
	StageTimeouts map[string]time.Duration
	// Defaults carries the free-form knobs each probe reads, grouped by
	// probe name and then by knob name.
	Defaults map[string]map[string]any
}
|
|
|
|
// StageConfig is the flat view of a profile's knobs, shipped on the
|
|
// claim response so the agent can size CPUStress/Storage/Network/Burn
|
|
// work without parsing YAML. Empty values mean "fall back to the
|
|
// agent's compile-time default" — an older orchestrator that doesn't
|
|
// set these fields keeps working unchanged.
|
|
type StageConfig struct {
|
|
Profile string `json:"profile"`
|
|
StageTimeouts map[string]string `json:"stage_timeouts,omitempty"`
|
|
CPUStress CPUStressKnobs `json:"cpustress"`
|
|
Storage StorageKnobs `json:"storage"`
|
|
Network NetworkKnobs `json:"network"`
|
|
Burn BurnKnobs `json:"burn"`
|
|
}
|
|
|
|
// CPUStressKnobs mirrors the `cpustress:` block under
// `profiles.<name>.defaults`. Every field is a YAML duration string
// ("2m", "60m", "12h"); empty fields are omitted from the JSON.
type CPUStressKnobs struct {
	CPUPass  string `json:"cpu_pass,omitempty"`
	MemPass  string `json:"mem_pass,omitempty"`
	EDACPoll string `json:"edac_poll,omitempty"`
}
|
|
|
|
// StorageKnobs mirrors the `storage:` defaults. Empty fields are omitted
// from the JSON.
type StorageKnobs struct {
	// Mode is "fio_sample" (quick) or "full_disk" (deep/soak).
	Mode string `json:"mode,omitempty"`
	// fio sizing and workload knobs, passed through as strings.
	FioSize string `json:"fio_size,omitempty"`
	FioTime string `json:"fio_time,omitempty"`
	FioBS   string `json:"fio_bs,omitempty"`
	FioRW   string `json:"fio_rw,omitempty"`
	// Verify names the integrity mode ("md5" or "").
	Verify string `json:"verify,omitempty"`
}
|
|
|
|
// NetworkKnobs mirrors the `network:` defaults.
type NetworkKnobs struct {
	// Duration is a YAML duration string; omitted from the JSON when empty.
	Duration string `json:"duration,omitempty"`
}
|
|
|
|
// BurnKnobs mirrors the `burn:` defaults. Zero-valued fields are omitted
// from the JSON.
type BurnKnobs struct {
	// Duration is the total Burn window.
	Duration string `json:"duration,omitempty"`
	// CPUWorkers is "all" (agent picks runtime.NumCPU) or a numeric string.
	CPUWorkers string `json:"cpu_workers,omitempty"`
	// MemPct is a percentage of MemAvailable to stress.
	MemPct int `json:"mem_pct,omitempty"`
	// FioOnSpare gates whether fio runs inside Burn (set false if the
	// operator lacks a spare partition).
	FioOnSpare bool `json:"fio_on_spare,omitempty"`
	// IperfParallel is the parallel stream count fed to iperf3 -P.
	IperfParallel int `json:"iperf_parallel,omitempty"`
}
|
|
|
|
// ResolveStageConfig flattens the named profile into the wire shape the
|
|
// claim handler ships. Missing keys render as empty strings so the agent
|
|
// falls back to its own defaults.
|
|
func (pr *ProfileRegistry) ResolveStageConfig(name string) StageConfig {
|
|
if pr == nil {
|
|
return StageConfig{Profile: name}
|
|
}
|
|
p, err := pr.Lookup(name)
|
|
if err != nil {
|
|
return StageConfig{Profile: name}
|
|
}
|
|
out := StageConfig{Profile: p.Name}
|
|
if len(p.StageTimeouts) > 0 {
|
|
out.StageTimeouts = make(map[string]string, len(p.StageTimeouts))
|
|
for k, v := range p.StageTimeouts {
|
|
out.StageTimeouts[k] = v.String()
|
|
}
|
|
}
|
|
cpu := p.Defaults["cpustress"]
|
|
out.CPUStress.CPUPass = yamlString(cpu, "cpu_pass")
|
|
out.CPUStress.MemPass = yamlString(cpu, "mem_pass")
|
|
out.CPUStress.EDACPoll = yamlString(cpu, "edac_poll")
|
|
st := p.Defaults["storage"]
|
|
out.Storage.Mode = yamlString(st, "mode")
|
|
out.Storage.FioSize = yamlString(st, "fio_size")
|
|
out.Storage.FioTime = yamlString(st, "fio_time")
|
|
out.Storage.FioBS = yamlString(st, "fio_bs")
|
|
out.Storage.FioRW = yamlString(st, "fio_rw")
|
|
out.Storage.Verify = yamlString(st, "verify")
|
|
net := p.Defaults["network"]
|
|
out.Network.Duration = yamlString(net, "duration")
|
|
burn := p.Defaults["burn"]
|
|
out.Burn.Duration = yamlString(burn, "duration")
|
|
out.Burn.CPUWorkers = yamlString(burn, "cpu_workers")
|
|
out.Burn.MemPct = yamlInt(burn, "mem_pct")
|
|
out.Burn.FioOnSpare = yamlBool(burn, "fio_on_spare")
|
|
out.Burn.IperfParallel = yamlInt(burn, "iperf_parallel")
|
|
return out
|
|
}
|
|
|
|
// yamlInt reads m[key] as an int. It accepts a native int, an int64, a
// float64 (JSON numbers round-trip as float; the fraction is truncated),
// or a string that starts with a decimal integer. A missing key, a nil
// value, any other type, or an unparsable string yields 0 so the agent
// falls back to its default.
func yamlInt(m map[string]any, key string) int {
	raw, present := m[key]
	if !present || raw == nil {
		return 0
	}
	if n, ok := raw.(int); ok {
		return n
	}
	if n, ok := raw.(int64); ok {
		return int(n)
	}
	if f, ok := raw.(float64); ok {
		return int(f)
	}
	text, ok := raw.(string)
	if !ok {
		return 0
	}
	// Best-effort string → int: Sscanf stops at the first character that
	// is not part of the number, so "12abc" reads as 12 while "" and
	// "abc" fail and fall back to zero.
	var parsed int
	if _, err := fmt.Sscanf(text, "%d", &parsed); err != nil {
		return 0
	}
	return parsed
}
|
|
|
|
// yamlBool reads m[key] as a bool. A native bool is returned as-is and a
// string counts as true only when it equals "true" ignoring case.
// Everything else (missing key, nil, numeric, typo) is false — a safer
// default than "true" for a destructive knob like fio_on_spare.
func yamlBool(m map[string]any, key string) bool {
	raw, present := m[key]
	if !present || raw == nil {
		return false
	}
	if flag, isBool := raw.(bool); isBool {
		return flag
	}
	if text, isString := raw.(string); isString {
		return strings.EqualFold(text, "true")
	}
	return false
}
|
|
|
|
// yamlString reads m[key] in string form. YAML durations like "2m"
// already arrive as strings and are returned untouched; other scalars
// (a numeric literal like 5 parses as int) are rendered with fmt.Sprint
// so the agent can still interpret them. A missing key or nil value
// yields "".
func yamlString(m map[string]any, key string) string {
	// Indexing a missing key yields a nil interface, so the nil case
	// covers both "absent" and "present but null".
	switch val := m[key].(type) {
	case nil:
		return ""
	case string:
		return val
	default:
		return fmt.Sprint(val)
	}
}
|
|
|
|
// Lookup returns the profile with the given name. Falls back to the
|
|
// default profile (quick) if the name is empty. Returns an error when
|
|
// the name is non-empty but unknown so the caller can surface it.
|
|
func (pr *ProfileRegistry) Lookup(name string) (Profile, error) {
|
|
if name == "" {
|
|
name = ProfileQuick
|
|
}
|
|
p, ok := pr.Profiles[name]
|
|
if !ok {
|
|
return Profile{}, fmt.Errorf("unknown profile %q", name)
|
|
}
|
|
return p, nil
|
|
}
|
|
|
|
// Names returns the registry's profile names in the canonical
|
|
// picker order (quick/deep/soak). Profiles present in the config but
|
|
// unknown to AllProfiles are appended after, alphabetically.
|
|
func (pr *ProfileRegistry) Names() []string {
|
|
out := make([]string, 0, len(pr.Profiles))
|
|
seen := map[string]bool{}
|
|
for _, n := range AllProfiles {
|
|
if _, ok := pr.Profiles[n]; ok {
|
|
out = append(out, n)
|
|
seen[n] = true
|
|
}
|
|
}
|
|
for n := range pr.Profiles {
|
|
if !seen[n] {
|
|
out = append(out, n)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// Stages returns the shared stage order, or a safe default when the
|
|
// config didn't declare one — keeps tests that don't build a full
|
|
// ProfileRegistry from tripping over a nil slice.
|
|
func (pr *ProfileRegistry) Stages() []string {
|
|
if len(pr.Vetting.Stages) == 0 {
|
|
return DefaultStages()
|
|
}
|
|
out := make([]string, len(pr.Vetting.Stages))
|
|
copy(out, pr.Vetting.Stages)
|
|
return out
|
|
}
|
|
|
|
// DefaultStages returns the canonical stage list the orchestrator walks
// when no config is loaded. Each call builds a fresh slice. The
// vetting.yaml shipped with the repo mirrors this list; keep the two in
// sync when editing either.
func DefaultStages() []string {
	stages := []string{
		"Inventory", "Firmware", "SpecValidate", "SMART",
		"CPUStress", "Storage", "Network", "Burn",
		"GPU", "PSU", "Reporting",
	}
	return stages
}
|
|
|
|
// rawProfile is a profile as it appears in YAML, before inherit
// resolution. Stage timeouts stay as strings (e.g. "2h") at this point so
// they can be parsed with time.ParseDuration rather than a hand-rolled
// parser.
type rawProfile struct {
	// Inherit names the parent profile, or "" for none.
	Inherit string `yaml:"inherit"`
	// StageTimeouts maps stage name to an unparsed duration string.
	StageTimeouts map[string]string `yaml:"stage_timeouts"`
	// Defaults holds the probe knobs, grouped by probe name.
	Defaults map[string]map[string]any `yaml:"defaults"`
}
|
|
|
|
type rawProfilesBlock struct {
|
|
Vetting Vetting `yaml:"vetting"`
|
|
Profiles map[string]rawProfile `yaml:"profiles"`
|
|
}
|
|
|
|
// buildProfileRegistry flattens a rawProfilesBlock into a ProfileRegistry.
|
|
// Resolves `inherit:` by recursive merge (child keys win), parses
|
|
// stage_timeouts strings into time.Durations, and returns an error if
|
|
// the inherit chain loops or references an unknown profile.
|
|
func buildProfileRegistry(raw rawProfilesBlock) (*ProfileRegistry, error) {
|
|
if len(raw.Profiles) == 0 {
|
|
raw.Profiles = defaultRawProfiles()
|
|
}
|
|
out := &ProfileRegistry{
|
|
Vetting: raw.Vetting,
|
|
Profiles: make(map[string]Profile, len(raw.Profiles)),
|
|
}
|
|
if len(out.Vetting.Stages) == 0 {
|
|
out.Vetting.Stages = DefaultStages()
|
|
}
|
|
for name := range raw.Profiles {
|
|
resolved, err := resolveProfile(raw.Profiles, name, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
out.Profiles[name] = resolved
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// resolveProfile recursively walks inherit chains, depth-first. The
|
|
// visited slice is a cycle guard — we add the current name before
|
|
// recursing and bail if we ever see it again.
|
|
func resolveProfile(all map[string]rawProfile, name string, visited []string) (Profile, error) {
|
|
for _, v := range visited {
|
|
if v == name {
|
|
return Profile{}, fmt.Errorf("profile inherit cycle: %s -> %s", strings.Join(visited, " -> "), name)
|
|
}
|
|
}
|
|
raw, ok := all[name]
|
|
if !ok {
|
|
return Profile{}, fmt.Errorf("unknown profile %q", name)
|
|
}
|
|
base := Profile{
|
|
Name: name,
|
|
Inherit: raw.Inherit,
|
|
StageTimeouts: map[string]time.Duration{},
|
|
Defaults: map[string]map[string]any{},
|
|
}
|
|
if raw.Inherit != "" {
|
|
parent, err := resolveProfile(all, raw.Inherit, append(visited, name))
|
|
if err != nil {
|
|
return Profile{}, err
|
|
}
|
|
for k, v := range parent.StageTimeouts {
|
|
base.StageTimeouts[k] = v
|
|
}
|
|
for k, v := range parent.Defaults {
|
|
copyMap := make(map[string]any, len(v))
|
|
for kk, vv := range v {
|
|
copyMap[kk] = vv
|
|
}
|
|
base.Defaults[k] = copyMap
|
|
}
|
|
}
|
|
for stage, s := range raw.StageTimeouts {
|
|
d, err := time.ParseDuration(s)
|
|
if err != nil {
|
|
return Profile{}, fmt.Errorf("profile %s stage_timeouts[%s]: %w", name, stage, err)
|
|
}
|
|
base.StageTimeouts[stage] = d
|
|
}
|
|
for group, kv := range raw.Defaults {
|
|
dest, ok := base.Defaults[group]
|
|
if !ok {
|
|
dest = map[string]any{}
|
|
base.Defaults[group] = dest
|
|
}
|
|
for k, v := range kv {
|
|
dest[k] = v
|
|
}
|
|
}
|
|
return base, nil
|
|
}
|
|
|
|
// defaultRawProfiles returns sane per-profile durations + probe knobs
|
|
// used when vetting.yaml omits the `profiles:` block entirely. Matches
|
|
// the plan's per-stage budget table so the agent still gets coherent
|
|
// CPUStress/Storage/Network knobs without any operator-visible config.
|
|
func defaultRawProfiles() map[string]rawProfile {
|
|
return map[string]rawProfile{
|
|
ProfileQuick: {
|
|
StageTimeouts: map[string]string{
|
|
"CPUStress": "5m",
|
|
"Storage": "5m",
|
|
"Network": "2m",
|
|
"Burn": "3m",
|
|
"PSU": "1m",
|
|
},
|
|
Defaults: map[string]map[string]any{
|
|
"cpustress": {"cpu_pass": "2m", "mem_pass": "2m", "edac_poll": "10s"},
|
|
"storage": {"mode": "fio_sample", "fio_size": "1GiB", "fio_time": "3m", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"},
|
|
"network": {"duration": "60s"},
|
|
"burn": {"duration": "2m", "cpu_workers": "all", "mem_pct": 50, "fio_on_spare": true, "iperf_parallel": 2},
|
|
},
|
|
},
|
|
ProfileDeep: {
|
|
StageTimeouts: map[string]string{
|
|
"CPUStress": "2h",
|
|
"Storage": "4h",
|
|
"Network": "35m",
|
|
"Burn": "3h",
|
|
"PSU": "10m",
|
|
},
|
|
Defaults: map[string]map[string]any{
|
|
"cpustress": {"cpu_pass": "60m", "mem_pass": "60m", "edac_poll": "10s"},
|
|
"storage": {"mode": "full_disk", "fio_time": "2h", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"},
|
|
"network": {"duration": "30m"},
|
|
"burn": {"duration": "2h", "cpu_workers": "all", "mem_pct": 70, "fio_on_spare": true, "iperf_parallel": 4},
|
|
},
|
|
},
|
|
ProfileSoak: {
|
|
Inherit: ProfileDeep,
|
|
StageTimeouts: map[string]string{
|
|
"CPUStress": "14h",
|
|
"Storage": "8h",
|
|
"Network": "2h30m",
|
|
"Burn": "20h",
|
|
"PSU": "15m",
|
|
},
|
|
Defaults: map[string]map[string]any{
|
|
"cpustress": {"cpu_pass": "12h"},
|
|
"storage": {"mode": "full_disk", "fio_time": "6h"},
|
|
"network": {"duration": "2h"},
|
|
"burn": {"duration": "18h", "iperf_parallel": 8},
|
|
},
|
|
},
|
|
}
|
|
}
|