deep profile + threshold gating + firmware stage + Burn super-stage
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled

Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
+97 -5
View File
@@ -21,11 +21,36 @@ import (
)
// Spec is the expected-hardware description decoded from the spec YAML.
// Every section is optional (omitempty). Each field is declared exactly
// once: Go rejects a struct with duplicate field names.
type Spec struct {
	CPU    *CPUSpec    `yaml:"cpu,omitempty"`
	Memory *MemorySpec `yaml:"memory,omitempty"`
	Disks  []DiskSpec  `yaml:"disks,omitempty"`
	NICs   []NICSpec   `yaml:"nics,omitempty"`
	GPUs   []GPUSpec   `yaml:"gpus,omitempty"`
	// Firmware lists the expected firmware versions, one rule per row of
	// the YAML `firmware:` block.
	Firmware []FirmwareSpec `yaml:"firmware,omitempty"`
}
// FirmwareSpec describes a single expectation from the `firmware:` block
// of the expected-spec YAML: "this component (optionally this specific
// instance of it) must report this firmware version".
type FirmwareSpec struct {
	// Component names the firmware class. It is one of
	// bios|bmc|nic|hba|microcode|nvme_fw, matching the on-wire value of
	// agent/probes.FirmwareSnapshot.Component.
	Component string `yaml:"component"`

	// Identifier optionally pins the rule to one instance, such as a
	// specific NIC port, NVMe controller or PCI address. When it is
	// empty the rule applies to every observed snapshot of Component,
	// which suits single-instance components like BIOS or microcode.
	Identifier string `yaml:"identifier,omitempty"`

	// Version is the literal version string expected. It is compared
	// exactly after surrounding whitespace is trimmed.
	Version string `yaml:"version"`
}
// FirmwareObserved carries one firmware reading reported by the agent.
// It is deliberately a spec-package-local type so that callers of the
// diff do not have to thread store types through it; the server converts
// each store.FirmwareSnapshot into a FirmwareObserved before calling
// DiffFirmware.
type FirmwareObserved struct {
	// Component is the firmware class the reading belongs to.
	Component string
	// Identifier distinguishes instances of the same component; it may
	// be empty.
	Identifier string
	// Version is the version string that was reported.
	Version string
}
type CPUSpec struct {
@@ -175,6 +200,73 @@ func diffNICs(expected, actual []NICSpec) []model.SpecDiff {
return out
}
// DiffFirmware returns a SpecDiff per firmware expectation that is not
// satisfied by the observed snapshots. It returns nil when expected is
// empty. Matching rules:
//   - A rule whose Component or Version is empty after trimming
//     whitespace is skipped.
//   - Components are matched after trimming whitespace and ignoring
//     case, for pinned and fan-out rules alike.
//   - An expected rule with Identifier set matches by (component, id);
//     a missing observed snapshot yields a "present" true→false diff.
//   - An expected rule with Identifier empty applies to every observed
//     snapshot of that component — useful for "all NICs must run fw
//     8.30" without listing each port. Zero observed snapshots of the
//     component yields a single "present" diff, not N; otherwise each
//     mismatching snapshot yields its own diff.
//   - Versions are compared exactly after trimming whitespace. Case is
//     significant (firmware versions are case-sensitive in practice).
//
// When several observed snapshots share the same (component, id) key,
// the last one in actual is the one a pinned rule is checked against.
func DiffFirmware(expected []FirmwareSpec, actual []FirmwareObserved) []model.SpecDiff {
	if len(expected) == 0 {
		return nil
	}

	// Index the observations twice: by (component, identifier) for
	// pinned rules and by component alone for fan-out rules. Both
	// indexes use the same normalized component (trimmed, lowercased)
	// so the two rule kinds agree on what counts as a match.
	byCompIdent := make(map[string]FirmwareObserved, len(actual))
	byComp := make(map[string][]FirmwareObserved)
	for _, o := range actual {
		comp := strings.TrimSpace(o.Component)
		byCompIdent[fwKey(comp, o.Identifier)] = o
		compKey := strings.ToLower(comp)
		byComp[compKey] = append(byComp[compKey], o)
	}

	var out []model.SpecDiff
	for _, exp := range expected {
		comp := strings.TrimSpace(exp.Component)
		want := strings.TrimSpace(exp.Version)
		if comp == "" || want == "" {
			continue
		}

		if exp.Identifier != "" {
			// Pinned rule: exactly one observation is relevant.
			label := "firmware[" + comp + "/" + exp.Identifier + "]"
			got, ok := byCompIdent[fwKey(comp, exp.Identifier)]
			switch {
			case !ok:
				out = append(out, diff(label+".present", "true", "false"))
			case strings.TrimSpace(got.Version) != want:
				// Exact comparison on purpose: see the doc comment.
				out = append(out, diff(label+".version", exp.Version, got.Version))
			}
			continue
		}

		// No identifier: fan out across every observed snapshot of this
		// component. Missing is one diff; a mismatching port/controller
		// emits one diff per mismatch.
		observed := byComp[strings.ToLower(comp)]
		if len(observed) == 0 {
			out = append(out, diff("firmware["+comp+"].present", "true", "false"))
			continue
		}
		for _, got := range observed {
			if strings.TrimSpace(got.Version) == want {
				continue
			}
			// Label the diff with the offending instance; "*" stands in
			// for an observation that carries no identifier.
			slot := got.Identifier
			if slot == "" {
				slot = "*"
			}
			out = append(out, diff("firmware["+comp+"/"+slot+"].version", exp.Version, got.Version))
		}
	}
	return out
}
// fwKey builds the map key used to look up a firmware snapshot by
// component and identifier. Both parts are lowercased so lookups ignore
// case, and they are joined with "|".
func fwKey(component, identifier string) string {
	parts := []string{
		strings.ToLower(component),
		strings.ToLower(identifier),
	}
	return strings.Join(parts, "|")
}
func diffGPUs(expected, actual []GPUSpec) []model.SpecDiff {
if len(expected) == 0 {
return nil