Initial commit: full Phases 1-6 implementation

Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
@@ -0,0 +1,232 @@
+// Package spec owns the expected-vs-actual hardware diff for Vetting.
+//
+// The operator writes an expected spec YAML per host when registering.
+// The agent submits an Inventory artifact after boot. Diff() compares
+// them and emits per-field SpecDiff rows; the orchestrator fails the
+// SpecValidate stage if any row is classified critical.
+//
+// Phase 3 rule (operator decision): every mismatch is critical. Missing
+// expected fields skip that check entirely so partial specs stay useful
+// instead of exploding.
+package spec
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+
+	"gopkg.in/yaml.v3"
+
+	"vetting/internal/model"
+)
+
+// Spec is the operator-declared expected hardware for one host, as
+// decoded from YAML by Parse. Every section is optional: Diff skips the
+// CPU and Memory checks when the pointer is nil and skips a list check
+// when the list has no entries.
+type Spec struct {
+	CPU    *CPUSpec    `yaml:"cpu,omitempty"`
+	Memory *MemorySpec `yaml:"memory,omitempty"`
+	Disks  []DiskSpec  `yaml:"disks,omitempty"`
+	NICs   []NICSpec   `yaml:"nics,omitempty"`
+	GPUs   []GPUSpec   `yaml:"gpus,omitempty"`
+}
+
+// CPUSpec describes a CPU. It is used both for the expected value inside
+// Spec and for the measured value inside Inventory.
+type CPUSpec struct {
+	// Model is compared case-insensitively; an expected value that is a
+	// substring of the measured model counts as a match. Empty skips
+	// the check.
+	Model        string `json:"model,omitempty" yaml:"model,omitempty"`
+	// LogicalCores must equal the measured count exactly. Zero or
+	// negative skips the check.
+	LogicalCores int    `json:"logical_cores,omitempty" yaml:"logical_cores,omitempty"`
+}
+
+// MemorySpec describes installed memory, expected or measured.
+type MemorySpec struct {
+	// TotalGiB is compared with a tolerance of 2 in either direction.
+	// Zero or negative skips the check.
+	TotalGiB int `json:"total_gib,omitempty" yaml:"total_gib,omitempty"`
+}
+
+// DiskSpec describes one disk, expected or measured.
+type DiskSpec struct {
+	// Serial is the key used to pair expected and measured disks; it is
+	// compared case-insensitively. Entries with an empty serial are
+	// ignored by the diff.
+	Serial string `json:"serial,omitempty" yaml:"serial,omitempty"`
+	// SizeGB is compared with a tolerance of 1 in either direction.
+	// Zero or negative on the expected side skips the size check.
+	SizeGB int    `json:"size_gb,omitempty" yaml:"size_gb,omitempty"`
+}
+
+// NICSpec describes one network interface, expected or measured.
+type NICSpec struct {
+	// MAC is the key used to pair expected and measured NICs; it is
+	// compared case-insensitively. Entries with an empty MAC are
+	// ignored by the diff.
+	MAC       string `json:"mac,omitempty" yaml:"mac,omitempty"`
+	// SpeedGbps is compared for equality only when both the expected
+	// and the measured value are positive.
+	SpeedGbps int    `json:"speed_gbps,omitempty" yaml:"speed_gbps,omitempty"`
+}
+
+// GPUSpec describes one GPU, expected or measured.
+type GPUSpec struct {
+	// Model is compared case-insensitively. The diff counts cards per
+	// model rather than pairing individual cards.
+	Model string `json:"model,omitempty" yaml:"model,omitempty"`
+}
+
+// Inventory is the actual measured hardware. Field names deliberately
+// match Spec so the diff reads cleanly. Unlike Spec, CPU and Memory are
+// held by value, so an unreported section shows up as zero values rather
+// than as a nil pointer.
+type Inventory struct {
+	CPU    CPUSpec    `json:"cpu" yaml:"cpu"`
+	Memory MemorySpec `json:"memory" yaml:"memory"`
+	Disks  []DiskSpec `json:"disks" yaml:"disks"`
+	NICs   []NICSpec  `json:"nics" yaml:"nics"`
+	GPUs   []GPUSpec  `json:"gpus" yaml:"gpus"`
+}
+
+// Parse reads expected-spec YAML. Empty YAML parses to a zero Spec and
+// yields an empty diff — i.e. "no expectations" is a legal stance.
+func Parse(src string) (*Spec, error) {
+	var s Spec
+	if err := yaml.Unmarshal([]byte(src), &s); err != nil {
+		return nil, fmt.Errorf("parse spec yaml: %w", err)
+	}
+	return &s, nil
+}
+
+// Diff compares the operator's expected spec with the measured inventory
+// and returns one SpecDiff row per mismatching field.
+//
+// A nil expected spec returns nil. A nil actual inventory is treated as
+// an empty one, so every declared expectation is reported as a mismatch
+// instead of the call panicking on a nil dereference.
+//
+// Phase 3 rule: every present-expected-field-that-mismatches is
+// critical. Missing expected fields are skipped (not info-logged) so the
+// diff list stays focused on real problems.
+func Diff(expected *Spec, actual *Inventory) []model.SpecDiff {
+	if expected == nil {
+		return nil
+	}
+	if actual == nil {
+		// No inventory at all: compare against zero values so the
+		// declared expectations still surface as diffs.
+		actual = &Inventory{}
+	}
+	// Non-nil even when empty, unlike the nil returned for a nil spec.
+	out := []model.SpecDiff{}
+
+	if cpu := expected.CPU; cpu != nil {
+		if cpu.Model != "" && !cpuModelMatches(cpu.Model, actual.CPU.Model) {
+			out = append(out, diff("cpu.model", cpu.Model, actual.CPU.Model))
+		}
+		if cpu.LogicalCores > 0 && cpu.LogicalCores != actual.CPU.LogicalCores {
+			out = append(out, diff("cpu.logical_cores", itoa(cpu.LogicalCores), itoa(actual.CPU.LogicalCores)))
+		}
+	}
+
+	if mem := expected.Memory; mem != nil && mem.TotalGiB > 0 {
+		// Allow ±2 GiB tolerance: BIOS-reserved, kernel, reporting
+		// quantization. A dead 16 GiB stick will still surface.
+		if absInt(mem.TotalGiB-actual.Memory.TotalGiB) > 2 {
+			out = append(out, diff("memory.total_gib", itoa(mem.TotalGiB), itoa(actual.Memory.TotalGiB)))
+		}
+	}
+
+	// Each list helper returns nil when its expected list is empty.
+	out = append(out, diffDisks(expected.Disks, actual.Disks)...)
+	out = append(out, diffNICs(expected.NICs, actual.NICs)...)
+	out = append(out, diffGPUs(expected.GPUs, actual.GPUs)...)
+
+	return out
+}
+
+// diffDisks pairs expected and measured disks by serial number
+// (case-insensitive) and reports, in this order:
+//   - expected disks that are absent, or whose size differs by more
+//     than 1 GB when an expected size is declared;
+//   - measured disks whose serial the operator did not declare.
+//
+// Entries with an empty serial are ignored on both sides. With no
+// expected disks at all the check is skipped and nil is returned.
+func diffDisks(expected, actual []DiskSpec) []model.SpecDiff {
+	if len(expected) == 0 {
+		return nil
+	}
+
+	measured := map[string]DiskSpec{}
+	for _, disk := range actual {
+		if disk.Serial == "" {
+			continue
+		}
+		measured[strings.ToLower(disk.Serial)] = disk
+	}
+
+	var rows []model.SpecDiff
+	declared := map[string]bool{}
+	for _, want := range expected {
+		if want.Serial == "" {
+			continue
+		}
+		serial := strings.ToLower(want.Serial)
+		declared[serial] = true
+
+		have, found := measured[serial]
+		switch {
+		case !found:
+			rows = append(rows, diff("disks["+want.Serial+"].present", "true", "false"))
+		case want.SizeGB > 0 && absInt(want.SizeGB-have.SizeGB) > 1:
+			rows = append(rows, diff("disks["+want.Serial+"].size_gb", itoa(want.SizeGB), itoa(have.SizeGB)))
+		}
+	}
+
+	// Extra disks on the host that operator didn't declare are flagged:
+	// a leftover USB stick could be a destructive-test target we'd
+	// rather the operator know about.
+	for _, disk := range actual {
+		if disk.Serial != "" && !declared[strings.ToLower(disk.Serial)] {
+			rows = append(rows, diff("disks[unexpected "+disk.Serial+"]", "", "present"))
+		}
+	}
+	return rows
+}
+
+// diffNICs pairs expected and measured NICs by MAC address
+// (case-insensitive). It reports expected NICs that are absent, and
+// speed mismatches when both the expected and the measured speed are
+// positive. Entries with an empty MAC are ignored, undeclared NICs on
+// the host are not reported, and an empty expected list skips the
+// check by returning nil.
+func diffNICs(expected, actual []NICSpec) []model.SpecDiff {
+	if len(expected) == 0 {
+		return nil
+	}
+
+	measured := map[string]NICSpec{}
+	for _, nic := range actual {
+		if nic.MAC == "" {
+			continue
+		}
+		measured[strings.ToLower(nic.MAC)] = nic
+	}
+
+	var rows []model.SpecDiff
+	for _, want := range expected {
+		if want.MAC == "" {
+			continue
+		}
+		have, found := measured[strings.ToLower(want.MAC)]
+		switch {
+		case !found:
+			rows = append(rows, diff("nics["+want.MAC+"].present", "true", "false"))
+		case want.SpeedGbps <= 0 || have.SpeedGbps <= 0:
+			// One side did not state a speed; nothing to compare.
+		case want.SpeedGbps != have.SpeedGbps:
+			rows = append(rows, diff("nics["+want.MAC+"].speed_gbps", itoa(want.SpeedGbps), itoa(have.SpeedGbps)))
+		}
+	}
+	return rows
+}
+
+// diffGPUs compares GPU counts per model and reports every expected
+// model for which the host has fewer cards than declared. Rows are
+// emitted in sorted model order so the output is deterministic.
+//
+// GPU matching is by model string (case-insensitive). Multiple
+// identical cards match by count, not identity, since PCI-slot order
+// isn't meaningful. Extra or undeclared GPUs are not reported.
+//
+// Expected entries with an empty model are skipped, in line with the
+// package rule that missing expected fields disable their check and
+// with how the disk and NIC helpers treat empty keys. An empty expected
+// list returns nil.
+func diffGPUs(expected, actual []GPUSpec) []model.SpecDiff {
+	if len(expected) == 0 {
+		return nil
+	}
+	want := map[string]int{}
+	for _, g := range expected {
+		if g.Model == "" {
+			// Nothing declared for this entry, so nothing to verify.
+			continue
+		}
+		want[strings.ToLower(g.Model)]++
+	}
+	got := map[string]int{}
+	for _, g := range actual {
+		got[strings.ToLower(g.Model)]++
+	}
+	keys := make([]string, 0, len(want))
+	for k := range want {
+		keys = append(keys, k)
+	}
+	// Map iteration order is random; sort for stable output.
+	sort.Strings(keys)
+	var out []model.SpecDiff
+	for _, k := range keys {
+		if got[k] < want[k] {
+			out = append(out, diff("gpus["+k+"].count", itoa(want[k]), itoa(got[k])))
+		}
+	}
+	return out
+}
+
+// cpuModelMatches reports whether the expected CPU model is satisfied by
+// the measured one. Both strings are trimmed and lower-cased first; the
+// expected value then matches if it equals the measured string or occurs
+// anywhere inside it. This lets the operator write a short form such as
+// "E5-2680 v4" against the verbose kernel-reported string
+// ("Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz").
+func cpuModelMatches(expected, actual string) bool {
+	want := strings.ToLower(strings.TrimSpace(expected))
+	have := strings.ToLower(strings.TrimSpace(actual))
+	if want == have {
+		return true
+	}
+	return strings.Contains(have, want)
+}
+
+// In Phase 3 all diffs are critical. Later phases may tier them.
+func diff(field, expected, actual string) model.SpecDiff {
+	return model.SpecDiff{
+		Field:    field,
+		Expected: expected,
+		Actual:   actual,
+		Severity: "critical",
+	}
+}
+
+// absInt returns n unchanged when it is zero or positive and -n
+// otherwise.
+func absInt(n int) int {
+	if n >= 0 {
+		return n
+	}
+	return -n
+}
+
+// itoa formats n as a base-10 string for use in diff rows.
+func itoa(n int) string {
+	return fmt.Sprint(n)
+}
@@ -0,0 +1,121 @@
+package spec
+
+import (
+	"testing"
+
+	"vetting/internal/model"
+)
+
+// TestDiffEmptySpec checks that a spec with no expectations produces no
+// diff rows against an empty inventory.
+func TestDiffEmptySpec(t *testing.T) {
+	rows := Diff(&Spec{}, &Inventory{})
+	if len(rows) != 0 {
+		t.Fatalf("empty spec → empty diff, got %v", rows)
+	}
+}
+
+// TestDiffCPUMismatch checks that a wrong logical-core count is the one
+// and only row reported, and that it is critical, when the model itself
+// matches by substring.
+func TestDiffCPUMismatch(t *testing.T) {
+	want := &Spec{CPU: &CPUSpec{Model: "E5-2680 v4", LogicalCores: 28}}
+	have := &Inventory{CPU: CPUSpec{Model: "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz", LogicalCores: 16}}
+
+	rows := Diff(want, have)
+	if len(rows) == 1 && rows[0].Field == "cpu.logical_cores" && rows[0].Severity == "critical" {
+		return
+	}
+	t.Fatalf("expected logical_cores critical, got %+v", rows)
+}
+
+// TestDiffCPUModelSubstringMatch checks that a short expected model that
+// occurs inside the verbose measured model produces no diff.
+func TestDiffCPUModelSubstringMatch(t *testing.T) {
+	want := &Spec{CPU: &CPUSpec{Model: "E5-2680 v4"}}
+	have := &Inventory{CPU: CPUSpec{Model: "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz"}}
+
+	rows := Diff(want, have)
+	if len(rows) != 0 {
+		t.Fatalf("substring should match, got %+v", rows)
+	}
+}
+
+// TestDiffMemoryTolerance checks both sides of the memory tolerance: a
+// 1 GiB shortfall is accepted, while a 16 GiB shortfall is reported as
+// the single memory.total_gib row.
+func TestDiffMemoryTolerance(t *testing.T) {
+	want := &Spec{Memory: &MemorySpec{TotalGiB: 128}}
+
+	withinTolerance := &Inventory{Memory: MemorySpec{TotalGiB: 127}}
+	rows := Diff(want, withinTolerance)
+	if len(rows) != 0 {
+		t.Fatalf("1 GiB variance should be tolerated, got %+v", rows)
+	}
+
+	missingStick := &Inventory{Memory: MemorySpec{TotalGiB: 112}}
+	rows = Diff(want, missingStick)
+	if len(rows) == 1 && rows[0].Field == "memory.total_gib" {
+		return
+	}
+	t.Fatalf("16 GiB drop should be critical, got %+v", rows)
+}
+
+// TestDiffDisksMissingAndUnexpected checks that a declared disk absent
+// from the host and an undeclared disk present on the host are both
+// reported.
+func TestDiffDisksMissingAndUnexpected(t *testing.T) {
+	want := &Spec{Disks: []DiskSpec{
+		{Serial: "A", SizeGB: 1000},
+		{Serial: "B", SizeGB: 500},
+	}}
+	have := &Inventory{Disks: []DiskSpec{
+		{Serial: "A", SizeGB: 1000},
+		{Serial: "C", SizeGB: 32},
+	}}
+
+	rows := Diff(want, have)
+
+	// Collect the reported field names: disk B should be missing and
+	// disk C unexpected.
+	reported := map[string]struct{}{}
+	for i := range rows {
+		reported[rows[i].Field] = struct{}{}
+	}
+	if _, ok := reported["disks[B].present"]; !ok {
+		t.Fatalf("expected disks[B].present critical; got %+v", rows)
+	}
+	if _, ok := reported["disks[unexpected C]"]; !ok {
+		t.Fatalf("expected disks[unexpected C] critical; got %+v", rows)
+	}
+}
+
+// TestDiffDisksSerialCaseInsensitive checks that serials differing only
+// in letter case are treated as the same disk.
+func TestDiffDisksSerialCaseInsensitive(t *testing.T) {
+	want := &Spec{Disks: []DiskSpec{{Serial: "wd-abc123", SizeGB: 1000}}}
+	have := &Inventory{Disks: []DiskSpec{{Serial: "WD-ABC123", SizeGB: 1000}}}
+
+	rows := Diff(want, have)
+	if len(rows) != 0 {
+		t.Fatalf("serial compare must be case-insensitive, got %+v", rows)
+	}
+}
+
+// TestDiffNICMAC checks that a NIC found by MAC but reporting a
+// different speed yields exactly one speed_gbps row.
+func TestDiffNICMAC(t *testing.T) {
+	want := &Spec{NICs: []NICSpec{{MAC: "aa:bb:cc:dd:ee:ff", SpeedGbps: 10}}}
+	have := &Inventory{NICs: []NICSpec{{MAC: "aa:bb:cc:dd:ee:ff", SpeedGbps: 1}}}
+
+	rows := Diff(want, have)
+	if len(rows) == 1 && rows[0].Field == "nics[aa:bb:cc:dd:ee:ff].speed_gbps" {
+		return
+	}
+	t.Fatalf("expected speed mismatch, got %+v", rows)
+}
+
+// TestDiffGPUCount checks that two expected cards of one model against
+// a single measured card (spelled in a different case) yields exactly
+// one count row keyed by the lower-cased model.
+func TestDiffGPUCount(t *testing.T) {
+	want := &Spec{GPUs: []GPUSpec{
+		{Model: "NVIDIA RTX 3090"},
+		{Model: "NVIDIA RTX 3090"},
+	}}
+	have := &Inventory{GPUs: []GPUSpec{{Model: "nvidia rtx 3090"}}}
+
+	rows := Diff(want, have)
+	if len(rows) == 1 && rows[0].Field == "gpus[nvidia rtx 3090].count" {
+		return
+	}
+	t.Fatalf("expected GPU count critical, got %+v", rows)
+}
+
+func TestParseValidYAML(t *testing.T) {
+	src := `
+cpu:
+  model: "E5-2680 v4"
+  logical_cores: 28
+memory:
+  total_gib: 128
+disks:
+  - serial: A
+    size_gb: 1000
+`
+	s, err := Parse(src)
+	if err != nil {
+		t.Fatalf("Parse: %v", err)
+	}
+	if s.CPU == nil || s.CPU.LogicalCores != 28 {
+		t.Fatalf("cpu not parsed: %+v", s)
+	}
+	if len(s.Disks) != 1 || s.Disks[0].Serial != "A" {
+		t.Fatalf("disks not parsed: %+v", s)
+	}
+}
+
+// TestDiffSeverityAlwaysCritical pins the Phase 3 rule that every
+// reported diff row carries severity "critical". It first requires that
+// the mismatching input produces at least one row, so the severity loop
+// cannot pass vacuously on an empty result.
+func TestDiffSeverityAlwaysCritical(t *testing.T) {
+	exp := &Spec{CPU: &CPUSpec{LogicalCores: 8}}
+	act := &Inventory{CPU: CPUSpec{LogicalCores: 4}}
+
+	// The explicit type is the only reference to the model package in
+	// this file; it keeps that import in use.
+	var rows []model.SpecDiff = Diff(exp, act)
+	if len(rows) == 0 {
+		t.Fatal("core-count mismatch produced no diff rows; severity check would be vacuous")
+	}
+	for _, row := range rows {
+		if row.Severity != "critical" {
+			t.Fatalf("phase-3 rule: every diff is critical; got %q for %s", row.Severity, row.Field)
+		}
+	}
+}