deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+97
-5
@@ -21,11 +21,36 @@ import (
|
||||
)
|
||||
|
||||
type Spec struct {
|
||||
CPU *CPUSpec `yaml:"cpu,omitempty"`
|
||||
Memory *MemorySpec `yaml:"memory,omitempty"`
|
||||
Disks []DiskSpec `yaml:"disks,omitempty"`
|
||||
NICs []NICSpec `yaml:"nics,omitempty"`
|
||||
GPUs []GPUSpec `yaml:"gpus,omitempty"`
|
||||
CPU *CPUSpec `yaml:"cpu,omitempty"`
|
||||
Memory *MemorySpec `yaml:"memory,omitempty"`
|
||||
Disks []DiskSpec `yaml:"disks,omitempty"`
|
||||
NICs []NICSpec `yaml:"nics,omitempty"`
|
||||
GPUs []GPUSpec `yaml:"gpus,omitempty"`
|
||||
Firmware []FirmwareSpec `yaml:"firmware,omitempty"`
|
||||
}
|
||||
|
||||
// FirmwareSpec is one row in the expected-spec YAML's `firmware:` block.
// Component is one of bios|bmc|nic|hba|microcode|nvme_fw (matches the
// on-wire value from agent/probes.FirmwareSnapshot.Component). Identifier
// is optional — when empty the rule applies to every observed snapshot
// of that component (use for single-instance things like BIOS/microcode);
// when set it pins the check to a specific NIC port / NVMe controller /
// PCI address. Version is the literal string expected; DiffFirmware
// compares it case-insensitively after trimming whitespace.
type FirmwareSpec struct {
	Component  string `yaml:"component"`
	Identifier string `yaml:"identifier,omitempty"`
	Version    string `yaml:"version"`
}
|
||||
|
||||
// FirmwareObserved is what the agent reported, in a spec-package-local
// shape so callers don't need to thread store types through the diff.
// The server converts store.FirmwareSnapshot → FirmwareObserved before
// calling DiffFirmware.
type FirmwareObserved struct {
	Component  string // firmware class; matched against FirmwareSpec.Component
	Identifier string // which instance was probed; may be empty
	Version    string // version string as reported
}
|
||||
|
||||
type CPUSpec struct {
|
||||
@@ -175,6 +200,73 @@ func diffNICs(expected, actual []NICSpec) []model.SpecDiff {
|
||||
return out
|
||||
}
|
||||
|
||||
// DiffFirmware returns a SpecDiff per firmware expectation that doesn't
|
||||
// find a matching observed snapshot. Matching rules:
|
||||
// - An expected rule with Identifier set matches by (component, id);
|
||||
// a missing observed snapshot yields a "present=false" diff.
|
||||
// - An expected rule with Identifier empty applies to every observed
|
||||
// snapshot of that component — useful for "all NICs must run fw
|
||||
// 8.30" without listing each port. Zero observed snapshots of the
|
||||
// component yields a single "present=false" diff, not N.
|
||||
// - Version mismatch emits an exact-string expected→actual diff.
|
||||
// Case is preserved (firmware versions are case-sensitive in practice).
|
||||
func DiffFirmware(expected []FirmwareSpec, actual []FirmwareObserved) []model.SpecDiff {
|
||||
if len(expected) == 0 {
|
||||
return nil
|
||||
}
|
||||
byCompIdent := map[string]FirmwareObserved{}
|
||||
byComp := map[string][]FirmwareObserved{}
|
||||
for _, o := range actual {
|
||||
byCompIdent[fwKey(o.Component, o.Identifier)] = o
|
||||
byComp[o.Component] = append(byComp[o.Component], o)
|
||||
}
|
||||
var out []model.SpecDiff
|
||||
for _, exp := range expected {
|
||||
comp := strings.TrimSpace(exp.Component)
|
||||
if comp == "" || strings.TrimSpace(exp.Version) == "" {
|
||||
continue
|
||||
}
|
||||
label := "firmware[" + comp
|
||||
if exp.Identifier != "" {
|
||||
label += "/" + exp.Identifier
|
||||
}
|
||||
label += "]"
|
||||
if exp.Identifier != "" {
|
||||
got, ok := byCompIdent[fwKey(comp, exp.Identifier)]
|
||||
if !ok {
|
||||
out = append(out, diff(label+".present", "true", "false"))
|
||||
continue
|
||||
}
|
||||
if !strings.EqualFold(strings.TrimSpace(got.Version), strings.TrimSpace(exp.Version)) {
|
||||
out = append(out, diff(label+".version", exp.Version, got.Version))
|
||||
}
|
||||
continue
|
||||
}
|
||||
// No identifier: fan out across every observed snapshot of this
|
||||
// component. Missing is one diff; a mismatching port/controller
|
||||
// emits one diff per mismatch.
|
||||
observed := byComp[comp]
|
||||
if len(observed) == 0 {
|
||||
out = append(out, diff(label+".present", "true", "false"))
|
||||
continue
|
||||
}
|
||||
for _, got := range observed {
|
||||
if !strings.EqualFold(strings.TrimSpace(got.Version), strings.TrimSpace(exp.Version)) {
|
||||
slot := got.Identifier
|
||||
if slot == "" {
|
||||
slot = "*"
|
||||
}
|
||||
out = append(out, diff("firmware["+comp+"/"+slot+"].version", exp.Version, got.Version))
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// fwKey builds the map key for a (component, identifier) pair. Both parts
// are trimmed and lower-cased so stray whitespace or case differences
// between the two sides of a lookup don't defeat the match.
func fwKey(component, identifier string) string {
	return strings.ToLower(strings.TrimSpace(component)) + "|" + strings.ToLower(strings.TrimSpace(identifier))
}
|
||||
|
||||
func diffGPUs(expected, actual []GPUSpec) []model.SpecDiff {
|
||||
if len(expected) == 0 {
|
||||
return nil
|
||||
|
||||
@@ -119,3 +119,96 @@ func TestDiffSeverityAlwaysCritical(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiffFirmwareIdentifierMatch(t *testing.T) {
|
||||
exp := []FirmwareSpec{{Component: "bios", Version: "3.2"}}
|
||||
obs := []FirmwareObserved{{Component: "bios", Identifier: "system", Version: "3.2"}}
|
||||
if d := DiffFirmware(exp, obs); len(d) != 0 {
|
||||
t.Fatalf("matching bios version should produce no diff, got %+v", d)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiffFirmwareVersionMismatch(t *testing.T) {
|
||||
exp := []FirmwareSpec{{Component: "bios", Version: "3.3"}}
|
||||
obs := []FirmwareObserved{{Component: "bios", Identifier: "system", Version: "3.2"}}
|
||||
d := DiffFirmware(exp, obs)
|
||||
if len(d) != 1 {
|
||||
t.Fatalf("want 1 diff, got %d: %+v", len(d), d)
|
||||
}
|
||||
if d[0].Expected != "3.3" || d[0].Actual != "3.2" {
|
||||
t.Fatalf("diff expected/actual = %q/%q, want 3.3/3.2", d[0].Expected, d[0].Actual)
|
||||
}
|
||||
if d[0].Severity != "critical" {
|
||||
t.Errorf("severity = %q, want critical", d[0].Severity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiffFirmwareMissingComponentPresent(t *testing.T) {
|
||||
// Expected rule with no identifier + zero observed snapshots →
|
||||
// single "present=false" diff, not N.
|
||||
exp := []FirmwareSpec{{Component: "bmc", Version: "1.74"}}
|
||||
d := DiffFirmware(exp, nil)
|
||||
if len(d) != 1 {
|
||||
t.Fatalf("want 1 diff for missing BMC, got %d: %+v", len(d), d)
|
||||
}
|
||||
if d[0].Field != "firmware[bmc].present" || d[0].Expected != "true" || d[0].Actual != "false" {
|
||||
t.Fatalf("missing-BMC diff = %+v", d[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiffFirmwareWildcardFanOut(t *testing.T) {
|
||||
// Expected rule with empty identifier fans across every observed
|
||||
// snapshot of the component — one port matches, one doesn't → one diff.
|
||||
exp := []FirmwareSpec{{Component: "nic", Version: "16.32.1010"}}
|
||||
obs := []FirmwareObserved{
|
||||
{Component: "nic", Identifier: "eth0", Version: "16.32.1010"},
|
||||
{Component: "nic", Identifier: "eth1", Version: "14.28.0000"},
|
||||
}
|
||||
d := DiffFirmware(exp, obs)
|
||||
if len(d) != 1 {
|
||||
t.Fatalf("want 1 diff (mismatched eth1 only), got %d: %+v", len(d), d)
|
||||
}
|
||||
if d[0].Field != "firmware[nic/eth1].version" {
|
||||
t.Errorf("field = %q, want firmware[nic/eth1].version", d[0].Field)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiffFirmwareIdentifierPin(t *testing.T) {
|
||||
// Identifier set: pins the rule to a specific port. Other ports
|
||||
// with mismatched firmware are not evaluated by this rule.
|
||||
exp := []FirmwareSpec{{Component: "nic", Identifier: "eth0", Version: "1.0"}}
|
||||
obs := []FirmwareObserved{
|
||||
{Component: "nic", Identifier: "eth0", Version: "1.0"},
|
||||
{Component: "nic", Identifier: "eth1", Version: "9.9"},
|
||||
}
|
||||
if d := DiffFirmware(exp, obs); len(d) != 0 {
|
||||
t.Fatalf("pinned rule should ignore other ports, got %+v", d)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiffFirmwareIdentifierPinMissing(t *testing.T) {
|
||||
// Pinned rule with no matching observed snapshot → present=false diff.
|
||||
exp := []FirmwareSpec{{Component: "nic", Identifier: "eth0", Version: "1.0"}}
|
||||
if d := DiffFirmware(exp, nil); len(d) != 1 || d[0].Field != "firmware[nic/eth0].present" {
|
||||
t.Fatalf("want present=false for pinned rule, got %+v", d)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiffFirmwareEmptyRuleSkipped(t *testing.T) {
|
||||
// Empty component or empty version silently skip rather than panic.
|
||||
exp := []FirmwareSpec{{Component: "", Version: "x"}, {Component: "bios", Version: ""}}
|
||||
obs := []FirmwareObserved{{Component: "bios", Identifier: "system", Version: "3.2"}}
|
||||
if d := DiffFirmware(exp, obs); len(d) != 0 {
|
||||
t.Fatalf("empty rules should skip, got %+v", d)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiffFirmwareCaseInsensitive(t *testing.T) {
|
||||
// Version match is case-insensitive after trim; avoids spurious diff
|
||||
// from ethtool's "FW1234" vs expected YAML's "fw1234".
|
||||
exp := []FirmwareSpec{{Component: "nvme_fw", Identifier: "nvme0", Version: "fw1234"}}
|
||||
obs := []FirmwareObserved{{Component: "nvme_fw", Identifier: "nvme0", Version: "FW1234"}}
|
||||
if d := DiffFirmware(exp, obs); len(d) != 0 {
|
||||
t.Fatalf("case-insensitive match expected, got %+v", d)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user