23c689aa5b
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
325 lines
10 KiB
Go
325 lines
10 KiB
Go
// Package spec owns the expected-vs-actual hardware diff for Vetting.
|
|
//
|
|
// The operator writes an expected spec YAML per host when registering.
|
|
// The agent submits an Inventory artifact after boot. Diff() compares
|
|
// them and emits per-field SpecDiff rows; the orchestrator fails the
|
|
// SpecValidate stage if any row is classified critical.
|
|
//
|
|
// Phase 3 rule (operator decision): every mismatch is critical. Missing
|
|
// expected fields skip that check entirely so partial specs stay useful
|
|
// instead of exploding.
|
|
package spec
|
|
|
|
import (
|
|
"fmt"
|
|
"sort"
|
|
"strings"
|
|
|
|
"gopkg.in/yaml.v3"
|
|
|
|
"vetting/internal/model"
|
|
)
|
|
|
|
type Spec struct {
|
|
CPU *CPUSpec `yaml:"cpu,omitempty"`
|
|
Memory *MemorySpec `yaml:"memory,omitempty"`
|
|
Disks []DiskSpec `yaml:"disks,omitempty"`
|
|
NICs []NICSpec `yaml:"nics,omitempty"`
|
|
GPUs []GPUSpec `yaml:"gpus,omitempty"`
|
|
Firmware []FirmwareSpec `yaml:"firmware,omitempty"`
|
|
}
|
|
|
|
// FirmwareSpec is a single entry of the `firmware:` list in the
// expected-spec YAML.
//
// Component names the kind of firmware and is one of
// bios|bmc|nic|hba|microcode|nvme_fw, mirroring the on-wire value of
// agent/probes.FirmwareSnapshot.Component.
//
// Identifier is optional. Left empty, the rule covers every observed
// snapshot of the component (handy for single-instance parts such as BIOS
// or microcode). When set, it pins the rule to one specific NIC port,
// NVMe controller or PCI address.
//
// Version is the literal version string the operator expects.
// DiffFirmware defines how it is compared against the observed value.
type FirmwareSpec struct {
	// Component selects the firmware class the rule applies to.
	Component string `yaml:"component"`
	// Identifier optionally narrows the rule to one device instance.
	Identifier string `yaml:"identifier,omitempty"`
	// Version is the expected version string.
	Version string `yaml:"version"`
}
|
|
|
|
// FirmwareObserved is one firmware reading as reported by the agent. It
// is deliberately a spec-package-local type so that callers of
// DiffFirmware do not have to pass store types into the diff; the server
// is expected to convert store.FirmwareSnapshot into FirmwareObserved
// before calling DiffFirmware.
type FirmwareObserved struct {
	// Component, Identifier and Version carry the same meaning as the
	// equally named fields of FirmwareSpec, but hold the observed values.
	Component, Identifier, Version string
}
|
|
|
|
// CPUSpec describes a processor. It is used both for the operator's
// expectation (Spec.CPU) and for the measured value (Inventory.CPU).
type CPUSpec struct {
	// Model is the CPU model string. As an expectation it may be a
	// substring of the reported model; an empty value disables the check.
	Model string `json:"model,omitempty" yaml:"model,omitempty"`
	// LogicalCores is the logical core count. As an expectation it is
	// only compared when greater than zero.
	LogicalCores int `json:"logical_cores,omitempty" yaml:"logical_cores,omitempty"`
}
|
|
|
|
// MemorySpec describes installed memory, both as an expectation
// (Spec.Memory) and as a measurement (Inventory.Memory).
type MemorySpec struct {
	// TotalGiB is the total memory in GiB. As an expectation it is only
	// checked when greater than zero, and Diff allows a ±2 GiB tolerance.
	TotalGiB int `json:"total_gib,omitempty" yaml:"total_gib,omitempty"`
}
|
|
|
|
// DiskSpec describes one drive, either expected (Spec.Disks) or measured
// (Inventory.Disks).
type DiskSpec struct {
	// Serial is the drive serial number and the key disks are matched on
	// (case-insensitively). Entries without a serial are ignored by the
	// disk diff.
	Serial string `json:"serial,omitempty" yaml:"serial,omitempty"`
	// SizeGB is the capacity in GB. As an expectation it is only checked
	// when greater than zero, with a ±1 GB tolerance.
	SizeGB int `json:"size_gb,omitempty" yaml:"size_gb,omitempty"`
}
|
|
|
|
// NICSpec describes one network interface, either expected (Spec.NICs)
// or measured (Inventory.NICs).
type NICSpec struct {
	// MAC is the hardware address and the key NICs are matched on
	// (case-insensitively). Entries without a MAC are ignored by the NIC
	// diff.
	MAC string `json:"mac,omitempty" yaml:"mac,omitempty"`
	// SpeedGbps is the link speed in Gbit/s. It is only compared when
	// both the expected and the measured value are greater than zero.
	SpeedGbps int `json:"speed_gbps,omitempty" yaml:"speed_gbps,omitempty"`
}
|
|
|
|
// GPUSpec describes one GPU, either expected (Spec.GPUs) or measured
// (Inventory.GPUs).
type GPUSpec struct {
	// Model is the GPU model string. GPUs are matched on it
	// case-insensitively and compared by count per model.
	Model string `json:"model,omitempty" yaml:"model,omitempty"`
}
|
|
|
|
// Inventory is the actual measured hardware. Field names deliberately
|
|
// match Spec so the diff reads cleanly.
|
|
type Inventory struct {
|
|
CPU CPUSpec `json:"cpu" yaml:"cpu"`
|
|
Memory MemorySpec `json:"memory" yaml:"memory"`
|
|
Disks []DiskSpec `json:"disks" yaml:"disks"`
|
|
NICs []NICSpec `json:"nics" yaml:"nics"`
|
|
GPUs []GPUSpec `json:"gpus" yaml:"gpus"`
|
|
}
|
|
|
|
// Parse reads expected-spec YAML. Empty YAML parses to a zero Spec and
|
|
// yields an empty diff — i.e. "no expectations" is a legal stance.
|
|
func Parse(src string) (*Spec, error) {
|
|
var s Spec
|
|
if err := yaml.Unmarshal([]byte(src), &s); err != nil {
|
|
return nil, fmt.Errorf("parse spec yaml: %w", err)
|
|
}
|
|
return &s, nil
|
|
}
|
|
|
|
// Diff returns the per-field differences with severity. Phase 3 rule:
|
|
// every present-expected-field-that-mismatches is critical. Missing
|
|
// expected fields are skipped (not info-logged) so the diff list stays
|
|
// focused on real problems.
|
|
func Diff(expected *Spec, actual *Inventory) []model.SpecDiff {
|
|
if expected == nil {
|
|
return nil
|
|
}
|
|
out := []model.SpecDiff{}
|
|
|
|
if expected.CPU != nil {
|
|
if expected.CPU.Model != "" {
|
|
if !cpuModelMatches(expected.CPU.Model, actual.CPU.Model) {
|
|
out = append(out, diff("cpu.model", expected.CPU.Model, actual.CPU.Model))
|
|
}
|
|
}
|
|
if expected.CPU.LogicalCores > 0 && expected.CPU.LogicalCores != actual.CPU.LogicalCores {
|
|
out = append(out, diff("cpu.logical_cores", itoa(expected.CPU.LogicalCores), itoa(actual.CPU.LogicalCores)))
|
|
}
|
|
}
|
|
|
|
if expected.Memory != nil && expected.Memory.TotalGiB > 0 {
|
|
// Allow ±2 GiB tolerance: BIOS-reserved, kernel, reporting
|
|
// quantization. A dead 16 GiB stick will still surface.
|
|
if absInt(expected.Memory.TotalGiB-actual.Memory.TotalGiB) > 2 {
|
|
out = append(out, diff("memory.total_gib", itoa(expected.Memory.TotalGiB), itoa(actual.Memory.TotalGiB)))
|
|
}
|
|
}
|
|
|
|
out = append(out, diffDisks(expected.Disks, actual.Disks)...)
|
|
out = append(out, diffNICs(expected.NICs, actual.NICs)...)
|
|
out = append(out, diffGPUs(expected.GPUs, actual.GPUs)...)
|
|
|
|
return out
|
|
}
|
|
|
|
func diffDisks(expected, actual []DiskSpec) []model.SpecDiff {
|
|
if len(expected) == 0 {
|
|
return nil
|
|
}
|
|
actualBySerial := map[string]DiskSpec{}
|
|
for _, d := range actual {
|
|
if d.Serial != "" {
|
|
actualBySerial[strings.ToLower(d.Serial)] = d
|
|
}
|
|
}
|
|
var out []model.SpecDiff
|
|
seen := map[string]bool{}
|
|
for _, exp := range expected {
|
|
if exp.Serial == "" {
|
|
continue
|
|
}
|
|
key := strings.ToLower(exp.Serial)
|
|
seen[key] = true
|
|
got, ok := actualBySerial[key]
|
|
if !ok {
|
|
out = append(out, diff("disks["+exp.Serial+"].present", "true", "false"))
|
|
continue
|
|
}
|
|
if exp.SizeGB > 0 && absInt(exp.SizeGB-got.SizeGB) > 1 {
|
|
out = append(out, diff("disks["+exp.Serial+"].size_gb", itoa(exp.SizeGB), itoa(got.SizeGB)))
|
|
}
|
|
}
|
|
// Extra disks on the host that operator didn't declare are flagged:
|
|
// a leftover USB stick could be a destructive-test target we'd
|
|
// rather the operator know about.
|
|
for _, got := range actual {
|
|
if got.Serial == "" {
|
|
continue
|
|
}
|
|
if !seen[strings.ToLower(got.Serial)] {
|
|
out = append(out, diff("disks[unexpected "+got.Serial+"]", "", "present"))
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func diffNICs(expected, actual []NICSpec) []model.SpecDiff {
|
|
if len(expected) == 0 {
|
|
return nil
|
|
}
|
|
actualByMAC := map[string]NICSpec{}
|
|
for _, n := range actual {
|
|
if n.MAC != "" {
|
|
actualByMAC[strings.ToLower(n.MAC)] = n
|
|
}
|
|
}
|
|
var out []model.SpecDiff
|
|
for _, exp := range expected {
|
|
if exp.MAC == "" {
|
|
continue
|
|
}
|
|
got, ok := actualByMAC[strings.ToLower(exp.MAC)]
|
|
if !ok {
|
|
out = append(out, diff("nics["+exp.MAC+"].present", "true", "false"))
|
|
continue
|
|
}
|
|
if exp.SpeedGbps > 0 && got.SpeedGbps > 0 && exp.SpeedGbps != got.SpeedGbps {
|
|
out = append(out, diff("nics["+exp.MAC+"].speed_gbps", itoa(exp.SpeedGbps), itoa(got.SpeedGbps)))
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// DiffFirmware returns a SpecDiff per firmware expectation that doesn't
|
|
// find a matching observed snapshot. Matching rules:
|
|
// - An expected rule with Identifier set matches by (component, id);
|
|
// a missing observed snapshot yields a "present=false" diff.
|
|
// - An expected rule with Identifier empty applies to every observed
|
|
// snapshot of that component — useful for "all NICs must run fw
|
|
// 8.30" without listing each port. Zero observed snapshots of the
|
|
// component yields a single "present=false" diff, not N.
|
|
// - Version mismatch emits an exact-string expected→actual diff.
|
|
// Case is preserved (firmware versions are case-sensitive in practice).
|
|
func DiffFirmware(expected []FirmwareSpec, actual []FirmwareObserved) []model.SpecDiff {
|
|
if len(expected) == 0 {
|
|
return nil
|
|
}
|
|
byCompIdent := map[string]FirmwareObserved{}
|
|
byComp := map[string][]FirmwareObserved{}
|
|
for _, o := range actual {
|
|
byCompIdent[fwKey(o.Component, o.Identifier)] = o
|
|
byComp[o.Component] = append(byComp[o.Component], o)
|
|
}
|
|
var out []model.SpecDiff
|
|
for _, exp := range expected {
|
|
comp := strings.TrimSpace(exp.Component)
|
|
if comp == "" || strings.TrimSpace(exp.Version) == "" {
|
|
continue
|
|
}
|
|
label := "firmware[" + comp
|
|
if exp.Identifier != "" {
|
|
label += "/" + exp.Identifier
|
|
}
|
|
label += "]"
|
|
if exp.Identifier != "" {
|
|
got, ok := byCompIdent[fwKey(comp, exp.Identifier)]
|
|
if !ok {
|
|
out = append(out, diff(label+".present", "true", "false"))
|
|
continue
|
|
}
|
|
if !strings.EqualFold(strings.TrimSpace(got.Version), strings.TrimSpace(exp.Version)) {
|
|
out = append(out, diff(label+".version", exp.Version, got.Version))
|
|
}
|
|
continue
|
|
}
|
|
// No identifier: fan out across every observed snapshot of this
|
|
// component. Missing is one diff; a mismatching port/controller
|
|
// emits one diff per mismatch.
|
|
observed := byComp[comp]
|
|
if len(observed) == 0 {
|
|
out = append(out, diff(label+".present", "true", "false"))
|
|
continue
|
|
}
|
|
for _, got := range observed {
|
|
if !strings.EqualFold(strings.TrimSpace(got.Version), strings.TrimSpace(exp.Version)) {
|
|
slot := got.Identifier
|
|
if slot == "" {
|
|
slot = "*"
|
|
}
|
|
out = append(out, diff("firmware["+comp+"/"+slot+"].version", exp.Version, got.Version))
|
|
}
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// fwKey builds the lookup key for a (component, identifier) pair. Both
// parts are lower-cased so lookups are case-insensitive, and they are
// joined with "|".
func fwKey(component, identifier string) string {
	parts := []string{
		strings.ToLower(component),
		strings.ToLower(identifier),
	}
	return strings.Join(parts, "|")
}
|
|
|
|
func diffGPUs(expected, actual []GPUSpec) []model.SpecDiff {
|
|
if len(expected) == 0 {
|
|
return nil
|
|
}
|
|
// GPU matching is by model string. Multiple identical cards match
|
|
// by count, not identity, since PCI-slot order isn't meaningful.
|
|
want := map[string]int{}
|
|
for _, g := range expected {
|
|
want[strings.ToLower(g.Model)]++
|
|
}
|
|
got := map[string]int{}
|
|
for _, g := range actual {
|
|
got[strings.ToLower(g.Model)]++
|
|
}
|
|
var keys []string
|
|
for k := range want {
|
|
keys = append(keys, k)
|
|
}
|
|
sort.Strings(keys)
|
|
var out []model.SpecDiff
|
|
for _, k := range keys {
|
|
if got[k] < want[k] {
|
|
out = append(out, diff("gpus["+k+"].count", itoa(want[k]), itoa(got[k])))
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// cpuModelMatches reports whether the expected CPU model matches the
// measured one. Both strings are trimmed and compared case-insensitively,
// and the expected value only needs to be a substring of the actual one,
// so an operator can write "E5-2680 v4" and still match the verbose
// kernel string "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz".
func cpuModelMatches(expected, actual string) bool {
	needle := strings.ToLower(strings.TrimSpace(expected))
	haystack := strings.ToLower(strings.TrimSpace(actual))
	// A substring test also covers the exact-equality case.
	return strings.Contains(haystack, needle)
}
|
|
|
|
// In Phase 3 all diffs are critical. Later phases may tier them.
|
|
func diff(field, expected, actual string) model.SpecDiff {
|
|
return model.SpecDiff{
|
|
Field: field,
|
|
Expected: expected,
|
|
Actual: actual,
|
|
Severity: "critical",
|
|
}
|
|
}
|
|
|
|
// absInt returns the absolute value of n.
func absInt(n int) int {
	if n >= 0 {
		return n
	}
	return -n
}
|
|
|
|
// itoa formats n as a base-10 string for use in SpecDiff values.
func itoa(n int) string {
	return fmt.Sprint(n)
}
|