package config import ( "fmt" "strings" "time" ) // ProfileName is the set of legal values for a Run's profile column. // Exposed as constants so callers (UI handler, tests, agent) don't // sprinkle literal strings. const ( ProfileQuick = "quick" ProfileDeep = "deep" ProfileSoak = "soak" ) // AllProfiles is the canonical ordering shown in the picker. Leftmost // is the default; rightmost is the longest-running. var AllProfiles = []string{ProfileQuick, ProfileDeep, ProfileSoak} // IsValidProfile returns true when name is one of the known profile // identifiers. Used at the UI boundary to reject malformed POSTs and in // store code as a fallback guard. func IsValidProfile(name string) bool { for _, p := range AllProfiles { if p == name { return true } } return false } // Vetting holds the stage order + threshold defaults that are shared // across all profiles. Only the per-stage durations/concurrency differ // between quick/deep/soak; gates like "CPU > 92C fails the run" apply // to a 2-minute quick run and a 12-hour soak alike. type Vetting struct { Stages []string `yaml:"stages"` Thresholds []ThresholdDefaults `yaml:"thresholds"` } // ThresholdDefaults is the YAML shape of a threshold declaration. One // stanza can declare a per-stage rule ("stage: Network") or a global // rule ("stage: *") — the threshold evaluator applies both to samples // with matching (stage, kind, key). type ThresholdDefaults struct { Stage string `yaml:"stage"` Kind string `yaml:"kind"` Key string `yaml:"key"` Op string `yaml:"op"` // lt|lte|gt|gte|within_pct Value float64 `yaml:"value"` Nominal float64 `yaml:"nominal"` // only used by within_pct (e.g. 12.0 for +12V rail) Unit string `yaml:"unit"` Severity string `yaml:"severity"` // critical|warning } // ProfileRegistry is the in-memory view of the `profiles:` block in // vetting.yaml. The orchestrator queries it at run creation time to // seed thresholds and (in Phase 3+) to scale per-stage durations. 
type ProfileRegistry struct { // Shared stage ordering + threshold defaults. Every profile walks // the same list; only durations/concurrency differ. Vetting Vetting // Profiles is keyed by name ("quick"/"deep"/"soak"). Inherit is // already resolved at load time — a caller sees a flattened view. Profiles map[string]Profile } // Profile is a loaded profile. StageTimeouts is keyed by stage name. // Defaults carries the free-form knobs each probe reads. type Profile struct { Name string Inherit string StageTimeouts map[string]time.Duration Defaults map[string]map[string]any } // StageConfig is the flat view of a profile's knobs, shipped on the // claim response so the agent can size CPUStress/Storage/Network/Burn // work without parsing YAML. Empty values mean "fall back to the // agent's compile-time default" — an older orchestrator that doesn't // set these fields keeps working unchanged. type StageConfig struct { Profile string `json:"profile"` StageTimeouts map[string]string `json:"stage_timeouts,omitempty"` CPUStress CPUStressKnobs `json:"cpustress"` Storage StorageKnobs `json:"storage"` Network NetworkKnobs `json:"network"` Burn BurnKnobs `json:"burn"` } // CPUStressKnobs parallels the `cpustress:` block under `profiles..defaults`. // Durations are YAML duration strings ("2m", "60m", "12h"). type CPUStressKnobs struct { CPUPass string `json:"cpu_pass,omitempty"` MemPass string `json:"mem_pass,omitempty"` EDACPoll string `json:"edac_poll,omitempty"` } // StorageKnobs parallels `storage:` defaults. Mode is "fio_sample" (quick) // or "full_disk" (deep/soak). Verify names the integrity mode ("md5" or ""). type StorageKnobs struct { Mode string `json:"mode,omitempty"` FioSize string `json:"fio_size,omitempty"` FioTime string `json:"fio_time,omitempty"` FioBS string `json:"fio_bs,omitempty"` FioRW string `json:"fio_rw,omitempty"` Verify string `json:"verify,omitempty"` } // NetworkKnobs parallels `network:` defaults. Duration is a YAML string. 
type NetworkKnobs struct { Duration string `json:"duration,omitempty"` } // BurnKnobs parallels `burn:` defaults. Duration is the total Burn window. // CPUWorkers is "all" (agent picks runtime.NumCPU) or a numeric string. // MemPct is a percentage of MemAvailable to stress. FioOnSpare gates // whether fio runs inside Burn (set false if operator lacks a spare // partition). IperfParallel is the parallel stream count fed to iperf3 -P. type BurnKnobs struct { Duration string `json:"duration,omitempty"` CPUWorkers string `json:"cpu_workers,omitempty"` MemPct int `json:"mem_pct,omitempty"` FioOnSpare bool `json:"fio_on_spare,omitempty"` IperfParallel int `json:"iperf_parallel,omitempty"` } // ResolveStageConfig flattens the named profile into the wire shape the // claim handler ships. Missing keys render as empty strings so the agent // falls back to its own defaults. func (pr *ProfileRegistry) ResolveStageConfig(name string) StageConfig { if pr == nil { return StageConfig{Profile: name} } p, err := pr.Lookup(name) if err != nil { return StageConfig{Profile: name} } out := StageConfig{Profile: p.Name} if len(p.StageTimeouts) > 0 { out.StageTimeouts = make(map[string]string, len(p.StageTimeouts)) for k, v := range p.StageTimeouts { out.StageTimeouts[k] = v.String() } } cpu := p.Defaults["cpustress"] out.CPUStress.CPUPass = yamlString(cpu, "cpu_pass") out.CPUStress.MemPass = yamlString(cpu, "mem_pass") out.CPUStress.EDACPoll = yamlString(cpu, "edac_poll") st := p.Defaults["storage"] out.Storage.Mode = yamlString(st, "mode") out.Storage.FioSize = yamlString(st, "fio_size") out.Storage.FioTime = yamlString(st, "fio_time") out.Storage.FioBS = yamlString(st, "fio_bs") out.Storage.FioRW = yamlString(st, "fio_rw") out.Storage.Verify = yamlString(st, "verify") net := p.Defaults["network"] out.Network.Duration = yamlString(net, "duration") burn := p.Defaults["burn"] out.Burn.Duration = yamlString(burn, "duration") out.Burn.CPUWorkers = yamlString(burn, "cpu_workers") 
out.Burn.MemPct = yamlInt(burn, "mem_pct") out.Burn.FioOnSpare = yamlBool(burn, "fio_on_spare") out.Burn.IperfParallel = yamlInt(burn, "iperf_parallel") return out } // yamlInt coerces a map[string]any entry to int. Accepts native int, // float64 (JSON numbers round-trip as float), or numeric string. Missing // / malformed values return 0 so the agent falls back to its default. func yamlInt(m map[string]any, key string) int { v, ok := m[key] if !ok || v == nil { return 0 } switch x := v.(type) { case int: return x case int64: return int(x) case float64: return int(x) case string: // Best-effort string → int. Empty and non-numeric fall through // to zero. var n int if _, err := fmt.Sscanf(x, "%d", &n); err == nil { return n } } return 0 } // yamlBool accepts native bool or "true"/"false" strings. Anything else // (missing key, numeric, typo) returns false — a safer default than // "true" for a destructive knob like fio_on_spare. func yamlBool(m map[string]any, key string) bool { v, ok := m[key] if !ok || v == nil { return false } switch x := v.(type) { case bool: return x case string: return strings.EqualFold(x, "true") } return false } // yamlString coerces a map[string]any entry to its string form. YAML // durations like "2m" parse as strings; numeric literals like 5 parse as // int. We format non-string scalars with fmt.Sprint so the agent can // still interpret them. func yamlString(m map[string]any, key string) string { v, ok := m[key] if !ok || v == nil { return "" } if s, ok := v.(string); ok { return s } return fmt.Sprint(v) } // Lookup returns the profile with the given name. Falls back to the // default profile (quick) if the name is empty. Returns an error when // the name is non-empty but unknown so the caller can surface it. 
func (pr *ProfileRegistry) Lookup(name string) (Profile, error) { if name == "" { name = ProfileQuick } p, ok := pr.Profiles[name] if !ok { return Profile{}, fmt.Errorf("unknown profile %q", name) } return p, nil } // Names returns the registry's profile names in the canonical // picker order (quick/deep/soak). Profiles present in the config but // unknown to AllProfiles are appended after, alphabetically. func (pr *ProfileRegistry) Names() []string { out := make([]string, 0, len(pr.Profiles)) seen := map[string]bool{} for _, n := range AllProfiles { if _, ok := pr.Profiles[n]; ok { out = append(out, n) seen[n] = true } } for n := range pr.Profiles { if !seen[n] { out = append(out, n) } } return out } // Stages returns the shared stage order, or a safe default when the // config didn't declare one — keeps tests that don't build a full // ProfileRegistry from tripping over a nil slice. func (pr *ProfileRegistry) Stages() []string { if len(pr.Vetting.Stages) == 0 { return DefaultStages() } out := make([]string, len(pr.Vetting.Stages)) copy(out, pr.Vetting.Stages) return out } // DefaultStages is the canonical stage list the orchestrator walks // when no config is loaded. Mirrored in the vetting.yaml shipped with // the repo so edits to the slice and the file stay in sync. func DefaultStages() []string { return []string{ "Inventory", "Firmware", "SpecValidate", "SMART", "CPUStress", "Storage", "Network", "Burn", "GPU", "PSU", "Reporting", } } // rawProfile is the YAML shape before inherit resolution. Durations // arrive as strings (e.g. "2h") so we can parse them with // time.ParseDuration instead of rolling our own. 
type rawProfile struct { Inherit string `yaml:"inherit"` StageTimeouts map[string]string `yaml:"stage_timeouts"` Defaults map[string]map[string]any `yaml:"defaults"` } type rawProfilesBlock struct { Vetting Vetting `yaml:"vetting"` Profiles map[string]rawProfile `yaml:"profiles"` } // buildProfileRegistry flattens a rawProfilesBlock into a ProfileRegistry. // Resolves `inherit:` by recursive merge (child keys win), parses // stage_timeouts strings into time.Durations, and returns an error if // the inherit chain loops or references an unknown profile. func buildProfileRegistry(raw rawProfilesBlock) (*ProfileRegistry, error) { if len(raw.Profiles) == 0 { raw.Profiles = defaultRawProfiles() } out := &ProfileRegistry{ Vetting: raw.Vetting, Profiles: make(map[string]Profile, len(raw.Profiles)), } if len(out.Vetting.Stages) == 0 { out.Vetting.Stages = DefaultStages() } for name := range raw.Profiles { resolved, err := resolveProfile(raw.Profiles, name, nil) if err != nil { return nil, err } out.Profiles[name] = resolved } return out, nil } // resolveProfile recursively walks inherit chains, depth-first. The // visited slice is a cycle guard — we add the current name before // recursing and bail if we ever see it again. 
func resolveProfile(all map[string]rawProfile, name string, visited []string) (Profile, error) { for _, v := range visited { if v == name { return Profile{}, fmt.Errorf("profile inherit cycle: %s -> %s", strings.Join(visited, " -> "), name) } } raw, ok := all[name] if !ok { return Profile{}, fmt.Errorf("unknown profile %q", name) } base := Profile{ Name: name, Inherit: raw.Inherit, StageTimeouts: map[string]time.Duration{}, Defaults: map[string]map[string]any{}, } if raw.Inherit != "" { parent, err := resolveProfile(all, raw.Inherit, append(visited, name)) if err != nil { return Profile{}, err } for k, v := range parent.StageTimeouts { base.StageTimeouts[k] = v } for k, v := range parent.Defaults { copyMap := make(map[string]any, len(v)) for kk, vv := range v { copyMap[kk] = vv } base.Defaults[k] = copyMap } } for stage, s := range raw.StageTimeouts { d, err := time.ParseDuration(s) if err != nil { return Profile{}, fmt.Errorf("profile %s stage_timeouts[%s]: %w", name, stage, err) } base.StageTimeouts[stage] = d } for group, kv := range raw.Defaults { dest, ok := base.Defaults[group] if !ok { dest = map[string]any{} base.Defaults[group] = dest } for k, v := range kv { dest[k] = v } } return base, nil } // defaultRawProfiles returns sane per-profile durations + probe knobs // used when vetting.yaml omits the `profiles:` block entirely. Matches // the plan's per-stage budget table so the agent still gets coherent // CPUStress/Storage/Network knobs without any operator-visible config. 
func defaultRawProfiles() map[string]rawProfile {
	// quick: minutes-scale timeouts and a sampled fio run.
	quick := rawProfile{
		StageTimeouts: map[string]string{
			"CPUStress": "5m",
			"Storage":   "5m",
			"Network":   "2m",
			"Burn":      "3m",
			"PSU":       "1m",
		},
		Defaults: map[string]map[string]any{
			"cpustress": {"cpu_pass": "2m", "mem_pass": "2m", "edac_poll": "10s"},
			"storage":   {"mode": "fio_sample", "fio_size": "1GiB", "fio_time": "3m", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"},
			"network":   {"duration": "60s"},
			"burn":      {"duration": "2m", "cpu_workers": "all", "mem_pct": 50, "fio_on_spare": true, "iperf_parallel": 2},
		},
	}

	// deep: hours-scale timeouts and full-disk storage mode.
	deep := rawProfile{
		StageTimeouts: map[string]string{
			"CPUStress": "2h",
			"Storage":   "4h",
			"Network":   "35m",
			"Burn":      "3h",
			"PSU":       "10m",
		},
		Defaults: map[string]map[string]any{
			"cpustress": {"cpu_pass": "60m", "mem_pass": "60m", "edac_poll": "10s"},
			"storage":   {"mode": "full_disk", "fio_time": "2h", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"},
			"network":   {"duration": "30m"},
			"burn":      {"duration": "2h", "cpu_workers": "all", "mem_pct": 70, "fio_on_spare": true, "iperf_parallel": 4},
		},
	}

	// soak: inherits deep and overrides only the knobs listed here; the
	// remaining keys come from deep when the inherit chain is resolved.
	soak := rawProfile{
		Inherit: ProfileDeep,
		StageTimeouts: map[string]string{
			"CPUStress": "14h",
			"Storage":   "8h",
			"Network":   "2h30m",
			"Burn":      "20h",
			"PSU":       "15m",
		},
		Defaults: map[string]map[string]any{
			"cpustress": {"cpu_pass": "12h"},
			"storage":   {"mode": "full_disk", "fio_time": "6h"},
			"network":   {"duration": "2h"},
			"burn":      {"duration": "18h", "iperf_parallel": 8},
		},
	}

	return map[string]rawProfile{
		ProfileQuick: quick,
		ProfileDeep:  deep,
		ProfileSoak:  soak,
	}
}