Vetting/internal/spec/spec.go

// Package spec owns the expected-vs-actual hardware diff for Vetting.
//
// The operator writes an expected spec YAML per host when registering.
// The agent submits an Inventory artifact after boot. Diff() compares
// them and emits per-field SpecDiff rows; the orchestrator fails the
// SpecValidate stage if any row is classified critical.
//
// Phase 3 rule (operator decision): every mismatch is critical. Missing
// expected fields skip that check entirely so partial specs stay useful
// instead of exploding.
package spec

import (
	"fmt"
	"sort"
	"strings"

	"gopkg.in/yaml.v3"

	"vetting/internal/model"
)

// Spec is the operator-declared expected hardware, decoded from YAML by
// Parse. Every section is optional: CPU and Memory are pointers so an
// omitted section stays nil, and the list sections are empty when
// omitted. Diff skips any section that is nil or empty. Diff does not
// read Firmware; firmware rows are compared by DiffFirmware instead.
type Spec struct {
	CPU      *CPUSpec       `yaml:"cpu,omitempty"`
	Memory   *MemorySpec    `yaml:"memory,omitempty"`
	Disks    []DiskSpec     `yaml:"disks,omitempty"`
	NICs     []NICSpec      `yaml:"nics,omitempty"`
	GPUs     []GPUSpec      `yaml:"gpus,omitempty"`
	Firmware []FirmwareSpec `yaml:"firmware,omitempty"`
}

// FirmwareSpec is one row in the expected-spec YAML's `firmware:` block.
// Component is one of bios|bmc|nic|hba|microcode|nvme_fw (matches the
// on-wire value from agent/probes.FirmwareSnapshot.Component). Identifier
// is optional — when empty the rule applies to every observed snapshot
// of that component (use for single-instance things like BIOS/microcode);
// when set it pins the check to a specific NIC port / NVMe controller /
// PCI address. Version is the literal string expected; comparison is
// exact after trimming whitespace.
//
// DiffFirmware ignores a row whose Component or Version is blank after
// trimming, so an incomplete row produces no diff rather than a failure.
type FirmwareSpec struct {
	Component  string `yaml:"component"`
	Identifier string `yaml:"identifier,omitempty"`
	Version    string `yaml:"version"`
}

// FirmwareObserved is what the agent reported, in a spec-package-local
// shape so callers don't need to thread store types through the diff.
// The server converts store.FirmwareSnapshot → FirmwareObserved before
// calling DiffFirmware.
//
// The three fields mirror FirmwareSpec: Component and Identifier select
// which expectation a snapshot is compared against, and Version is the
// reported value shown in the Actual column of a version diff.
type FirmwareObserved struct {
	Component  string
	Identifier string
	Version    string
}

// CPUSpec holds a CPU model string and a logical core count. It is used
// both as an expectation (Spec.CPU) and as a measurement (Inventory.CPU).
// On the expected side, Diff skips the model check when Model is empty
// and skips the core check when LogicalCores is not positive.
type CPUSpec struct {
	Model        string `json:"model,omitempty" yaml:"model,omitempty"`
	LogicalCores int    `json:"logical_cores,omitempty" yaml:"logical_cores,omitempty"`
}

// MemorySpec holds the total memory figure, used both as an expectation
// (Spec.Memory) and as a measurement (Inventory.Memory). Diff skips the
// check when the expected TotalGiB is not positive, and otherwise
// tolerates a difference of up to 2 between expected and actual.
type MemorySpec struct {
	TotalGiB int `json:"total_gib,omitempty" yaml:"total_gib,omitempty"`
}

// DiskSpec describes one disk, used both for expected and reported
// disks. Serial is the matching key: diffDisks compares serials
// case-insensitively and ignores entries whose Serial is empty. SizeGB
// is only checked when the expected value is positive, and a difference
// of up to 1 is tolerated.
type DiskSpec struct {
	Serial string `json:"serial,omitempty" yaml:"serial,omitempty"`
	SizeGB int    `json:"size_gb,omitempty" yaml:"size_gb,omitempty"`
}

// NICSpec describes one network interface, used both for expected and
// reported NICs. MAC is the matching key: diffNICs compares MACs
// case-insensitively and ignores entries whose MAC is empty. SpeedGbps
// is only compared when both the expected and the reported value are
// positive.
type NICSpec struct {
	MAC       string `json:"mac,omitempty" yaml:"mac,omitempty"`
	SpeedGbps int    `json:"speed_gbps,omitempty" yaml:"speed_gbps,omitempty"`
}

// GPUSpec describes one GPU by its model string, used both for expected
// and reported GPUs. diffGPUs groups entries by lower-cased Model and
// compares counts per model, so listing the same model twice means two
// cards of that model are expected.
type GPUSpec struct {
	Model string `json:"model,omitempty" yaml:"model,omitempty"`
}

// Inventory is the actual measured hardware. Field names deliberately
// match Spec so the diff reads cleanly.
//
// Unlike Spec, CPU and Memory are plain values rather than pointers, so
// an unreported section reads as its zero value. There is no firmware
// section here: DiffFirmware receives observed firmware separately as a
// []FirmwareObserved.
type Inventory struct {
	CPU    CPUSpec     `json:"cpu" yaml:"cpu"`
	Memory MemorySpec  `json:"memory" yaml:"memory"`
	Disks  []DiskSpec  `json:"disks" yaml:"disks"`
	NICs   []NICSpec   `json:"nics" yaml:"nics"`
	GPUs   []GPUSpec   `json:"gpus" yaml:"gpus"`
}

// Parse decodes an expected-spec YAML document into a Spec. A document
// that sets nothing decodes to the zero Spec, which Diff turns into an
// empty diff: declaring no expectations is allowed. A decoding failure
// is returned wrapped with a "parse spec yaml" prefix and a nil Spec.
func Parse(src string) (*Spec, error) {
	parsed := new(Spec)
	err := yaml.Unmarshal([]byte(src), parsed)
	if err != nil {
		return nil, fmt.Errorf("parse spec yaml: %w", err)
	}
	return parsed, nil
}

// Diff returns the per-field differences with severity. Phase 3 rule:
// every present-expected-field-that-mismatches is critical. Missing
// expected fields are skipped (not info-logged) so the diff list stays
// focused on real problems.
//
// A nil expected spec means "no expectations" and returns nil. A nil
// actual inventory is treated as an empty inventory, so every declared
// expectation is reported as a mismatch instead of panicking on a nil
// dereference. With a non-nil expected spec the result is always a
// non-nil slice, empty when nothing mismatches.
//
// Firmware expectations are not evaluated here; see DiffFirmware.
func Diff(expected *Spec, actual *Inventory) []model.SpecDiff {
	if expected == nil {
		return nil
	}
	if actual == nil {
		// No inventory was submitted: compare against the zero value so
		// the declared expectations surface as diffs.
		actual = &Inventory{}
	}
	out := []model.SpecDiff{}

	if cpu := expected.CPU; cpu != nil {
		if cpu.Model != "" && !cpuModelMatches(cpu.Model, actual.CPU.Model) {
			out = append(out, diff("cpu.model", cpu.Model, actual.CPU.Model))
		}
		if cpu.LogicalCores > 0 && cpu.LogicalCores != actual.CPU.LogicalCores {
			out = append(out, diff("cpu.logical_cores", itoa(cpu.LogicalCores), itoa(actual.CPU.LogicalCores)))
		}
	}

	if mem := expected.Memory; mem != nil && mem.TotalGiB > 0 {
		// Allow ±2 GiB tolerance: BIOS-reserved, kernel, reporting
		// quantization. A dead 16 GiB stick will still surface.
		if absInt(mem.TotalGiB-actual.Memory.TotalGiB) > 2 {
			out = append(out, diff("memory.total_gib", itoa(mem.TotalGiB), itoa(actual.Memory.TotalGiB)))
		}
	}

	out = append(out, diffDisks(expected.Disks, actual.Disks)...)
	out = append(out, diffNICs(expected.NICs, actual.NICs)...)
	out = append(out, diffGPUs(expected.GPUs, actual.GPUs)...)

	return out
}

// diffDisks compares expected disks against reported disks, keyed by
// serial number (case-insensitive). With no expected disks it returns
// nil and performs no checks at all.
//
// For each expected disk that declares a serial it emits a
// "present" diff when no reported disk has that serial, or a "size_gb"
// diff when a positive expected size differs from the reported size by
// more than 1. Expected or reported entries without a serial are
// ignored. Finally, every reported disk whose serial was not declared
// is flagged as unexpected.
//
// NOTE(review): if no expected entry carries a serial, every reported
// disk with a serial is flagged as unexpected — confirm this is intended
// for size-only specs.
func diffDisks(expected, actual []DiskSpec) []model.SpecDiff {
	if len(expected) == 0 {
		return nil
	}

	// Index what the host reported; a later duplicate serial wins.
	reported := make(map[string]DiskSpec, len(actual))
	for _, disk := range actual {
		if disk.Serial == "" {
			continue
		}
		reported[strings.ToLower(disk.Serial)] = disk
	}

	var rows []model.SpecDiff
	declared := make(map[string]bool, len(expected))
	for _, want := range expected {
		if want.Serial == "" {
			continue
		}
		serial := strings.ToLower(want.Serial)
		declared[serial] = true

		have, found := reported[serial]
		switch {
		case !found:
			rows = append(rows, diff("disks["+want.Serial+"].present", "true", "false"))
		case want.SizeGB > 0 && absInt(want.SizeGB-have.SizeGB) > 1:
			rows = append(rows, diff("disks["+want.Serial+"].size_gb", itoa(want.SizeGB), itoa(have.SizeGB)))
		}
	}

	// Undeclared disks are reported too: a leftover USB stick could end
	// up as a destructive-test target, and the operator should know.
	for _, disk := range actual {
		if disk.Serial != "" && !declared[strings.ToLower(disk.Serial)] {
			rows = append(rows, diff("disks[unexpected "+disk.Serial+"]", "", "present"))
		}
	}
	return rows
}

// diffNICs compares expected NICs against reported NICs, keyed by MAC
// address (case-insensitive). With no expected NICs it returns nil.
//
// For each expected NIC that declares a MAC it emits a "present" diff
// when no reported NIC has that MAC, or a "speed_gbps" diff when both
// the expected and the reported speed are positive and differ. A
// reported speed of zero therefore never produces a speed diff.
// Reported NICs that were not declared are not flagged.
func diffNICs(expected, actual []NICSpec) []model.SpecDiff {
	if len(expected) == 0 {
		return nil
	}

	// Index what the host reported; a later duplicate MAC wins.
	reported := make(map[string]NICSpec, len(actual))
	for _, nic := range actual {
		if nic.MAC == "" {
			continue
		}
		reported[strings.ToLower(nic.MAC)] = nic
	}

	var rows []model.SpecDiff
	for _, want := range expected {
		if want.MAC == "" {
			continue
		}
		have, found := reported[strings.ToLower(want.MAC)]
		if !found {
			rows = append(rows, diff("nics["+want.MAC+"].present", "true", "false"))
			continue
		}
		comparable := want.SpeedGbps > 0 && have.SpeedGbps > 0
		if comparable && want.SpeedGbps != have.SpeedGbps {
			rows = append(rows, diff("nics["+want.MAC+"].speed_gbps", itoa(want.SpeedGbps), itoa(have.SpeedGbps)))
		}
	}
	return rows
}

// DiffFirmware returns a SpecDiff per firmware expectation that doesn't
// find a matching observed snapshot. Matching rules:
//   - An expected rule with Identifier set matches by (component, id);
//     a missing observed snapshot yields a "present=false" diff.
//   - An expected rule with Identifier empty applies to every observed
//     snapshot of that component — useful for "all NICs must run fw
//     8.30" without listing each port. Zero observed snapshots of the
//     component yields a single "present=false" diff, not N.
//   - Version mismatch emits an exact-string expected→actual diff.
//
// Versions are compared exactly after trimming surrounding whitespace;
// case is preserved (firmware versions are case-sensitive in practice).
// Component and identifier lookups are case-insensitive on both the
// pinned and the fan-out path. Rules whose Component or Version is blank
// after trimming are skipped. With no expectations the result is nil.
func DiffFirmware(expected []FirmwareSpec, actual []FirmwareObserved) []model.SpecDiff {
	if len(expected) == 0 {
		return nil
	}

	// Two indexes over the observed snapshots: one for pinned rules
	// (component + identifier) and one for fan-out rules (component
	// only). Both use lower-cased keys so the two paths agree on what
	// counts as the same component.
	byCompIdent := make(map[string]FirmwareObserved, len(actual))
	byComp := make(map[string][]FirmwareObserved)
	for _, o := range actual {
		byCompIdent[fwKey(o.Component, o.Identifier)] = o
		comp := strings.ToLower(o.Component)
		byComp[comp] = append(byComp[comp], o)
	}

	var out []model.SpecDiff
	for _, exp := range expected {
		comp := strings.TrimSpace(exp.Component)
		want := strings.TrimSpace(exp.Version)
		if comp == "" || want == "" {
			continue
		}

		if exp.Identifier != "" {
			label := "firmware[" + comp + "/" + exp.Identifier + "]"
			got, ok := byCompIdent[fwKey(comp, exp.Identifier)]
			switch {
			case !ok:
				out = append(out, diff(label+".present", "true", "false"))
			case strings.TrimSpace(got.Version) != want:
				out = append(out, diff(label+".version", exp.Version, got.Version))
			}
			continue
		}

		// No identifier: fan out across every observed snapshot of this
		// component. Missing is one diff; a mismatching port/controller
		// emits one diff per mismatch.
		observed := byComp[strings.ToLower(comp)]
		if len(observed) == 0 {
			out = append(out, diff("firmware["+comp+"].present", "true", "false"))
			continue
		}
		for _, got := range observed {
			if strings.TrimSpace(got.Version) == want {
				continue
			}
			slot := got.Identifier
			if slot == "" {
				// Snapshot carries no identifier; label it generically.
				slot = "*"
			}
			out = append(out, diff("firmware["+comp+"/"+slot+"].version", exp.Version, got.Version))
		}
	}
	return out
}

// fwKey builds the lookup key for a firmware snapshot: the lower-cased
// component and the lower-cased identifier joined by "|". An empty
// identifier yields a key that ends in "|".
func fwKey(component, identifier string) string {
	parts := []string{
		strings.ToLower(component),
		strings.ToLower(identifier),
	}
	return strings.Join(parts, "|")
}

// diffGPUs compares expected and reported GPUs by model string
// (case-insensitive). Identical cards are matched by count rather than
// identity, since PCI-slot order isn't meaningful. With no expected GPUs
// it returns nil.
//
// One "count" diff is emitted for each expected model that the host
// reports fewer of than declared; a surplus of a model, or a reported
// model that was never declared, is not flagged. Rows are ordered by
// lower-cased model name so the output is deterministic.
func diffGPUs(expected, actual []GPUSpec) []model.SpecDiff {
	if len(expected) == 0 {
		return nil
	}

	tally := func(cards []GPUSpec) map[string]int {
		counts := make(map[string]int, len(cards))
		for _, card := range cards {
			counts[strings.ToLower(card.Model)]++
		}
		return counts
	}
	wanted, found := tally(expected), tally(actual)

	// Map iteration order is random; sort the expected models first.
	names := make([]string, 0, len(wanted))
	for name := range wanted {
		names = append(names, name)
	}
	sort.Strings(names)

	var rows []model.SpecDiff
	for _, name := range names {
		need, have := wanted[name], found[name]
		if have >= need {
			continue
		}
		rows = append(rows, diff("gpus["+name+"].count", itoa(need), itoa(have)))
	}
	return rows
}

// cpuModelMatches reports whether the expected CPU model appears
// anywhere inside the actual one, ignoring case and surrounding
// whitespace. This lets the operator declare a short fragment such as
// "E5-2680 v4" that matches the verbose kernel-reported string
// ("Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz"). An exact match is just
// the special case where the fragment is the whole string, and an
// expected value that trims to empty matches anything.
func cpuModelMatches(expected, actual string) bool {
	needle := strings.TrimSpace(expected)
	haystack := strings.TrimSpace(actual)
	return strings.Contains(strings.ToLower(haystack), strings.ToLower(needle))
}

// diff builds one SpecDiff row for the given field with its expected
// and actual values. Severity is always "critical": in Phase 3 every
// mismatch fails the stage. Later phases may tier severities.
func diff(field, expected, actual string) model.SpecDiff {
	var row model.SpecDiff
	row.Field = field
	row.Expected = expected
	row.Actual = actual
	row.Severity = "critical"
	return row
}

// absInt returns the magnitude of n; non-negative values are returned
// unchanged.
func absInt(n int) int {
	if n >= 0 {
		return n
	}
	return -n
}

// itoa renders n in base-10 for use in the Expected/Actual columns of a
// diff row.
func itoa(n int) string {
	return fmt.Sprint(n)
}