deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -20,6 +20,13 @@ type Config struct {
|
||||
Agent Agent `yaml:"agent"`
|
||||
Notifiers []Notifier `yaml:"notifiers"`
|
||||
Routes []Route `yaml:"routes"`
|
||||
|
||||
// Profiles holds the Phase-1 quick/deep/soak registry (stage order,
|
||||
// threshold defaults, per-profile stage timeouts + probe knobs).
|
||||
// Populated from the `vetting:` and `profiles:` top-level blocks
|
||||
// during Load. Nil is never returned — Load installs a default
|
||||
// registry when those blocks are absent.
|
||||
Profiles *ProfileRegistry `yaml:"-"`
|
||||
}
|
||||
|
||||
type Server struct {
|
||||
@@ -111,6 +118,20 @@ func Load(path string) (*Config, error) {
|
||||
if err := yaml.Unmarshal(b, &c); err != nil {
|
||||
return nil, fmt.Errorf("parse config: %w", err)
|
||||
}
|
||||
// The `vetting:` + `profiles:` blocks live alongside the existing
|
||||
// fields but we decode them into the raw shape because YAML
|
||||
// durations arrive as strings. Reusing the same byte buffer is
|
||||
// safe: yaml.Unmarshal is happy to ignore keys the target doesn't
|
||||
// know about.
|
||||
var rawProfiles rawProfilesBlock
|
||||
if err := yaml.Unmarshal(b, &rawProfiles); err != nil {
|
||||
return nil, fmt.Errorf("parse profiles: %w", err)
|
||||
}
|
||||
reg, err := buildProfileRegistry(rawProfiles)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("profiles: %w", err)
|
||||
}
|
||||
c.Profiles = reg
|
||||
if c.Server.Bind == "" {
|
||||
c.Server.Bind = "127.0.0.1:8080"
|
||||
}
|
||||
|
||||
@@ -0,0 +1,441 @@
|
||||
package config
|
||||
|
||||
import (
	"fmt"
	"sort"
	"strconv"
	"strings"
	"time"
)
|
||||
|
||||
// Profile name constants: the set of legal values for a Run's profile
// column. They are plain untyped string constants (there is no dedicated
// ProfileName type), exposed so callers (UI handler, tests, agent) don't
// sprinkle literal strings.
const (
	ProfileQuick = "quick"
	ProfileDeep  = "deep"
	ProfileSoak  = "soak"
)
|
||||
|
||||
// AllProfiles is the canonical ordering shown in the picker. Leftmost
// is the default (Lookup maps an empty name to ProfileQuick); rightmost
// is the longest-running. IsValidProfile and ProfileRegistry.Names both
// read this slice, so it should be treated as read-only.
var AllProfiles = []string{ProfileQuick, ProfileDeep, ProfileSoak}
|
||||
|
||||
// IsValidProfile returns true when name is one of the known profile
|
||||
// identifiers. Used at the UI boundary to reject malformed POSTs and in
|
||||
// store code as a fallback guard.
|
||||
func IsValidProfile(name string) bool {
|
||||
for _, p := range AllProfiles {
|
||||
if p == name {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Vetting holds the stage order + threshold defaults that are shared
// across all profiles. Only the per-stage durations/concurrency differ
// between quick/deep/soak; gates like "CPU > 92C fails the run" apply
// to a 2-minute quick run and a 12-hour soak alike.
type Vetting struct {
	// Stages is the ordered list of stage names. When empty,
	// buildProfileRegistry and ProfileRegistry.Stages substitute
	// DefaultStages().
	Stages []string `yaml:"stages"`
	// Thresholds are the declared threshold rules, decoded as-is from YAML.
	Thresholds []ThresholdDefaults `yaml:"thresholds"`
}
|
||||
|
||||
// ThresholdDefaults is the YAML shape of a threshold declaration. One
// stanza can declare a per-stage rule ("stage: Network") or a global
// rule ("stage: *") — the threshold evaluator applies both to samples
// with matching (stage, kind, key).
//
// No validation of Op or Severity happens in this file; the values are
// carried through verbatim for the evaluator to interpret.
type ThresholdDefaults struct {
	Stage    string  `yaml:"stage"`
	Kind     string  `yaml:"kind"`
	Key      string  `yaml:"key"`
	Op       string  `yaml:"op"` // lt|lte|gt|gte|within_pct
	Value    float64 `yaml:"value"`
	Nominal  float64 `yaml:"nominal"` // only used by within_pct (e.g. 12.0 for +12V rail)
	Unit     string  `yaml:"unit"`
	Severity string  `yaml:"severity"` // critical|warning
}
|
||||
|
||||
// ProfileRegistry is the in-memory view of the `vetting:` and `profiles:`
// blocks in vetting.yaml, as assembled by buildProfileRegistry. The
// orchestrator queries it at run creation time to seed thresholds and
// (in Phase 3+) to scale per-stage durations.
type ProfileRegistry struct {
	// Shared stage ordering + threshold defaults. Every profile walks
	// the same list; only durations/concurrency differ.
	Vetting Vetting

	// Profiles is keyed by name ("quick"/"deep"/"soak"). Inherit is
	// already resolved at load time — a caller sees a flattened view.
	Profiles map[string]Profile
}
|
||||
|
||||
// Profile is a loaded profile with its inherit chain already merged in
// (see resolveProfile: parent entries are copied first, then the
// profile's own entries overwrite them).
type Profile struct {
	// Name is the key this profile was declared under.
	Name string
	// Inherit is the parent profile name as written in the config, kept
	// for reference; it is empty when the profile has no parent.
	Inherit string
	// StageTimeouts is keyed by stage name.
	StageTimeouts map[string]time.Duration
	// Defaults carries the free-form knobs each probe reads, keyed by
	// knob group (e.g. "cpustress", "storage", "network", "burn") and
	// then by knob name.
	Defaults map[string]map[string]any
}
|
||||
|
||||
// StageConfig is the flat view of a profile's knobs, shipped on the
// claim response so the agent can size CPUStress/Storage/Network/Burn
// work without parsing YAML. Empty values mean "fall back to the
// agent's compile-time default" — an older orchestrator that doesn't
// set these fields keeps working unchanged.
//
// Built by ProfileRegistry.ResolveStageConfig.
type StageConfig struct {
	// Profile is the resolved profile name, or the requested name
	// verbatim when the registry is nil or the lookup fails.
	Profile string `json:"profile"`
	// StageTimeouts maps stage name to a time.Duration.String() value;
	// left nil (and omitted from JSON) when the profile declares none.
	StageTimeouts map[string]string `json:"stage_timeouts,omitempty"`
	CPUStress     CPUStressKnobs    `json:"cpustress"`
	Storage       StorageKnobs      `json:"storage"`
	Network       NetworkKnobs      `json:"network"`
	Burn          BurnKnobs         `json:"burn"`
}
|
||||
|
||||
// CPUStressKnobs parallels the `cpustress:` block under `profiles.<name>.defaults`.
// Durations are YAML duration strings ("2m", "60m", "12h"); they are
// forwarded as text and not parsed in this file. Every field is dropped
// from the JSON encoding when empty.
type CPUStressKnobs struct {
	CPUPass  string `json:"cpu_pass,omitempty"`
	MemPass  string `json:"mem_pass,omitempty"`
	EDACPoll string `json:"edac_poll,omitempty"`
}
|
||||
|
||||
// StorageKnobs parallels `storage:` defaults. Mode is "fio_sample" (quick)
// or "full_disk" (deep/soak). Verify names the integrity mode ("md5" or "").
// All values are forwarded as text; every field is dropped from the JSON
// encoding when empty.
type StorageKnobs struct {
	Mode    string `json:"mode,omitempty"`
	FioSize string `json:"fio_size,omitempty"`
	FioTime string `json:"fio_time,omitempty"`
	FioBS   string `json:"fio_bs,omitempty"`
	FioRW   string `json:"fio_rw,omitempty"`
	Verify  string `json:"verify,omitempty"`
}
|
||||
|
||||
// NetworkKnobs parallels `network:` defaults. Duration is a YAML string
// (e.g. "60s", "30m") forwarded as text; it is dropped from the JSON
// encoding when empty.
type NetworkKnobs struct {
	Duration string `json:"duration,omitempty"`
}
|
||||
|
||||
// BurnKnobs parallels `burn:` defaults. Duration is the total Burn window.
// CPUWorkers is "all" (agent picks runtime.NumCPU) or a numeric string.
// MemPct is a percentage of MemAvailable to stress. FioOnSpare gates
// whether fio runs inside Burn (set false if operator lacks a spare
// partition). IperfParallel is the parallel stream count fed to iperf3 -P.
//
// Every field is tagged omitempty, so a zero MemPct/IperfParallel and a
// false FioOnSpare are omitted from the JSON encoding rather than sent
// explicitly.
type BurnKnobs struct {
	Duration      string `json:"duration,omitempty"`
	CPUWorkers    string `json:"cpu_workers,omitempty"`
	MemPct        int    `json:"mem_pct,omitempty"`
	FioOnSpare    bool   `json:"fio_on_spare,omitempty"`
	IperfParallel int    `json:"iperf_parallel,omitempty"`
}
|
||||
|
||||
// ResolveStageConfig flattens the named profile into the wire shape the
|
||||
// claim handler ships. Missing keys render as empty strings so the agent
|
||||
// falls back to its own defaults.
|
||||
func (pr *ProfileRegistry) ResolveStageConfig(name string) StageConfig {
|
||||
if pr == nil {
|
||||
return StageConfig{Profile: name}
|
||||
}
|
||||
p, err := pr.Lookup(name)
|
||||
if err != nil {
|
||||
return StageConfig{Profile: name}
|
||||
}
|
||||
out := StageConfig{Profile: p.Name}
|
||||
if len(p.StageTimeouts) > 0 {
|
||||
out.StageTimeouts = make(map[string]string, len(p.StageTimeouts))
|
||||
for k, v := range p.StageTimeouts {
|
||||
out.StageTimeouts[k] = v.String()
|
||||
}
|
||||
}
|
||||
cpu := p.Defaults["cpustress"]
|
||||
out.CPUStress.CPUPass = yamlString(cpu, "cpu_pass")
|
||||
out.CPUStress.MemPass = yamlString(cpu, "mem_pass")
|
||||
out.CPUStress.EDACPoll = yamlString(cpu, "edac_poll")
|
||||
st := p.Defaults["storage"]
|
||||
out.Storage.Mode = yamlString(st, "mode")
|
||||
out.Storage.FioSize = yamlString(st, "fio_size")
|
||||
out.Storage.FioTime = yamlString(st, "fio_time")
|
||||
out.Storage.FioBS = yamlString(st, "fio_bs")
|
||||
out.Storage.FioRW = yamlString(st, "fio_rw")
|
||||
out.Storage.Verify = yamlString(st, "verify")
|
||||
net := p.Defaults["network"]
|
||||
out.Network.Duration = yamlString(net, "duration")
|
||||
burn := p.Defaults["burn"]
|
||||
out.Burn.Duration = yamlString(burn, "duration")
|
||||
out.Burn.CPUWorkers = yamlString(burn, "cpu_workers")
|
||||
out.Burn.MemPct = yamlInt(burn, "mem_pct")
|
||||
out.Burn.FioOnSpare = yamlBool(burn, "fio_on_spare")
|
||||
out.Burn.IperfParallel = yamlInt(burn, "iperf_parallel")
|
||||
return out
|
||||
}
|
||||
|
||||
// yamlInt coerces a map[string]any entry to int.
//
// Accepted shapes:
//   - int and int64 are returned as-is (int64 is converted with int()).
//   - float64 is truncated toward zero (JSON numbers round-trip as float).
//   - string is trimmed of surrounding whitespace and must then be a
//     complete base-10 integer ("12", "-4", "+7").
//
// A missing key, a nil value, any other type, or a string that is not
// entirely numeric ("12abc", "50%", "3.9", "") returns 0 so the agent
// falls back to its default. A nil map is safe to pass.
func yamlInt(m map[string]any, key string) int {
	v, ok := m[key]
	if !ok || v == nil {
		return 0
	}
	switch x := v.(type) {
	case int:
		return x
	case int64:
		return int(x)
	case float64:
		return int(x)
	case string:
		// strconv.Atoi rejects trailing garbage; a prefix scan such as
		// fmt.Sscanf("%d") would silently accept "12abc" as 12.
		n, err := strconv.Atoi(strings.TrimSpace(x))
		if err != nil {
			return 0
		}
		return n
	}
	return 0
}
|
||||
|
||||
// yamlBool reads key from m as a boolean. A native bool is returned
// unchanged; a string counts as true only when it equals "true" ignoring
// case. Everything else — a missing key, nil, numbers, any other string —
// is false, which is the safer default for a destructive knob like
// fio_on_spare. A nil map is safe to pass.
func yamlBool(m map[string]any, key string) bool {
	// A missing key and an explicit nil both surface as a nil interface
	// here, which matches no case below.
	switch val := m[key].(type) {
	case bool:
		return val
	case string:
		return strings.EqualFold(val, "true")
	default:
		return false
	}
}
|
||||
|
||||
// yamlString reads key from m and renders it as text. Strings pass
// through untouched (YAML durations like "2m" already arrive as strings);
// any other non-nil scalar — e.g. a numeric literal such as 5 — is
// formatted with fmt.Sprint so the agent can still interpret it. A missing
// key or a nil value yields "". A nil map is safe to pass.
func yamlString(m map[string]any, key string) string {
	switch val := m[key].(type) {
	case nil:
		// Covers both "key absent" and "key present with nil value".
		return ""
	case string:
		return val
	default:
		return fmt.Sprint(val)
	}
}
|
||||
|
||||
// Lookup returns the profile with the given name. Falls back to the
|
||||
// default profile (quick) if the name is empty. Returns an error when
|
||||
// the name is non-empty but unknown so the caller can surface it.
|
||||
func (pr *ProfileRegistry) Lookup(name string) (Profile, error) {
|
||||
if name == "" {
|
||||
name = ProfileQuick
|
||||
}
|
||||
p, ok := pr.Profiles[name]
|
||||
if !ok {
|
||||
return Profile{}, fmt.Errorf("unknown profile %q", name)
|
||||
}
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// Names returns the registry's profile names in the canonical
|
||||
// picker order (quick/deep/soak). Profiles present in the config but
|
||||
// unknown to AllProfiles are appended after, alphabetically.
|
||||
func (pr *ProfileRegistry) Names() []string {
|
||||
out := make([]string, 0, len(pr.Profiles))
|
||||
seen := map[string]bool{}
|
||||
for _, n := range AllProfiles {
|
||||
if _, ok := pr.Profiles[n]; ok {
|
||||
out = append(out, n)
|
||||
seen[n] = true
|
||||
}
|
||||
}
|
||||
for n := range pr.Profiles {
|
||||
if !seen[n] {
|
||||
out = append(out, n)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// Stages returns the shared stage order, or a safe default when the
|
||||
// config didn't declare one — keeps tests that don't build a full
|
||||
// ProfileRegistry from tripping over a nil slice.
|
||||
func (pr *ProfileRegistry) Stages() []string {
|
||||
if len(pr.Vetting.Stages) == 0 {
|
||||
return DefaultStages()
|
||||
}
|
||||
out := make([]string, len(pr.Vetting.Stages))
|
||||
copy(out, pr.Vetting.Stages)
|
||||
return out
|
||||
}
|
||||
|
||||
// DefaultStages returns the canonical 11-stage order used when the config
// declares no stage list. Each call returns a freshly allocated slice, so
// callers may modify the result. The vetting.yaml shipped with the repo
// mirrors this list; keep the two in sync when editing either.
func DefaultStages() []string {
	order := [...]string{
		"Inventory",
		"Firmware",
		"SpecValidate",
		"SMART",
		"CPUStress",
		"Storage",
		"Network",
		"Burn",
		"GPU",
		"PSU",
		"Reporting",
	}
	return order[:]
}
|
||||
|
||||
// rawProfile is the YAML shape before inherit resolution. Durations
// arrive as strings (e.g. "2h") so we can parse them with
// time.ParseDuration instead of rolling our own.
type rawProfile struct {
	// Inherit names the parent profile to merge from; empty means none.
	Inherit string `yaml:"inherit"`
	// StageTimeouts maps stage name to an unparsed duration string.
	StageTimeouts map[string]string `yaml:"stage_timeouts"`
	// Defaults maps knob group to its free-form key/value pairs.
	Defaults map[string]map[string]any `yaml:"defaults"`
}
|
||||
|
||||
// rawProfilesBlock is the decode target for the top-level `vetting:` and
// `profiles:` keys of the config file. It is the input to
// buildProfileRegistry, which flattens it into a ProfileRegistry.
type rawProfilesBlock struct {
	Vetting  Vetting               `yaml:"vetting"`
	Profiles map[string]rawProfile `yaml:"profiles"`
}
|
||||
|
||||
// buildProfileRegistry flattens a rawProfilesBlock into a ProfileRegistry.
|
||||
// Resolves `inherit:` by recursive merge (child keys win), parses
|
||||
// stage_timeouts strings into time.Durations, and returns an error if
|
||||
// the inherit chain loops or references an unknown profile.
|
||||
func buildProfileRegistry(raw rawProfilesBlock) (*ProfileRegistry, error) {
|
||||
if len(raw.Profiles) == 0 {
|
||||
raw.Profiles = defaultRawProfiles()
|
||||
}
|
||||
out := &ProfileRegistry{
|
||||
Vetting: raw.Vetting,
|
||||
Profiles: make(map[string]Profile, len(raw.Profiles)),
|
||||
}
|
||||
if len(out.Vetting.Stages) == 0 {
|
||||
out.Vetting.Stages = DefaultStages()
|
||||
}
|
||||
for name := range raw.Profiles {
|
||||
resolved, err := resolveProfile(raw.Profiles, name, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out.Profiles[name] = resolved
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// resolveProfile recursively walks inherit chains, depth-first. The
|
||||
// visited slice is a cycle guard — we add the current name before
|
||||
// recursing and bail if we ever see it again.
|
||||
func resolveProfile(all map[string]rawProfile, name string, visited []string) (Profile, error) {
|
||||
for _, v := range visited {
|
||||
if v == name {
|
||||
return Profile{}, fmt.Errorf("profile inherit cycle: %s -> %s", strings.Join(visited, " -> "), name)
|
||||
}
|
||||
}
|
||||
raw, ok := all[name]
|
||||
if !ok {
|
||||
return Profile{}, fmt.Errorf("unknown profile %q", name)
|
||||
}
|
||||
base := Profile{
|
||||
Name: name,
|
||||
Inherit: raw.Inherit,
|
||||
StageTimeouts: map[string]time.Duration{},
|
||||
Defaults: map[string]map[string]any{},
|
||||
}
|
||||
if raw.Inherit != "" {
|
||||
parent, err := resolveProfile(all, raw.Inherit, append(visited, name))
|
||||
if err != nil {
|
||||
return Profile{}, err
|
||||
}
|
||||
for k, v := range parent.StageTimeouts {
|
||||
base.StageTimeouts[k] = v
|
||||
}
|
||||
for k, v := range parent.Defaults {
|
||||
copyMap := make(map[string]any, len(v))
|
||||
for kk, vv := range v {
|
||||
copyMap[kk] = vv
|
||||
}
|
||||
base.Defaults[k] = copyMap
|
||||
}
|
||||
}
|
||||
for stage, s := range raw.StageTimeouts {
|
||||
d, err := time.ParseDuration(s)
|
||||
if err != nil {
|
||||
return Profile{}, fmt.Errorf("profile %s stage_timeouts[%s]: %w", name, stage, err)
|
||||
}
|
||||
base.StageTimeouts[stage] = d
|
||||
}
|
||||
for group, kv := range raw.Defaults {
|
||||
dest, ok := base.Defaults[group]
|
||||
if !ok {
|
||||
dest = map[string]any{}
|
||||
base.Defaults[group] = dest
|
||||
}
|
||||
for k, v := range kv {
|
||||
dest[k] = v
|
||||
}
|
||||
}
|
||||
return base, nil
|
||||
}
|
||||
|
||||
// defaultRawProfiles returns sane per-profile durations + probe knobs
|
||||
// used when vetting.yaml omits the `profiles:` block entirely. Matches
|
||||
// the plan's per-stage budget table so the agent still gets coherent
|
||||
// CPUStress/Storage/Network knobs without any operator-visible config.
|
||||
func defaultRawProfiles() map[string]rawProfile {
|
||||
return map[string]rawProfile{
|
||||
ProfileQuick: {
|
||||
StageTimeouts: map[string]string{
|
||||
"CPUStress": "5m",
|
||||
"Storage": "5m",
|
||||
"Network": "2m",
|
||||
"Burn": "3m",
|
||||
"PSU": "1m",
|
||||
},
|
||||
Defaults: map[string]map[string]any{
|
||||
"cpustress": {"cpu_pass": "2m", "mem_pass": "2m", "edac_poll": "10s"},
|
||||
"storage": {"mode": "fio_sample", "fio_size": "1GiB", "fio_time": "3m", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"},
|
||||
"network": {"duration": "60s"},
|
||||
"burn": {"duration": "2m", "cpu_workers": "all", "mem_pct": 50, "fio_on_spare": true, "iperf_parallel": 2},
|
||||
},
|
||||
},
|
||||
ProfileDeep: {
|
||||
StageTimeouts: map[string]string{
|
||||
"CPUStress": "2h",
|
||||
"Storage": "4h",
|
||||
"Network": "35m",
|
||||
"Burn": "3h",
|
||||
"PSU": "10m",
|
||||
},
|
||||
Defaults: map[string]map[string]any{
|
||||
"cpustress": {"cpu_pass": "60m", "mem_pass": "60m", "edac_poll": "10s"},
|
||||
"storage": {"mode": "full_disk", "fio_time": "2h", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"},
|
||||
"network": {"duration": "30m"},
|
||||
"burn": {"duration": "2h", "cpu_workers": "all", "mem_pct": 70, "fio_on_spare": true, "iperf_parallel": 4},
|
||||
},
|
||||
},
|
||||
ProfileSoak: {
|
||||
Inherit: ProfileDeep,
|
||||
StageTimeouts: map[string]string{
|
||||
"CPUStress": "14h",
|
||||
"Storage": "8h",
|
||||
"Network": "2h30m",
|
||||
"Burn": "20h",
|
||||
"PSU": "15m",
|
||||
},
|
||||
Defaults: map[string]map[string]any{
|
||||
"cpustress": {"cpu_pass": "12h"},
|
||||
"storage": {"mode": "full_disk", "fio_time": "6h"},
|
||||
"network": {"duration": "2h"},
|
||||
"burn": {"duration": "18h", "iperf_parallel": 8},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user