Initial commit: full Phases 1-6 implementation
CI / Lint + build + test (push) Has been cancelled

Post-repair hardware validation pipeline for Proxmox cluster hosts.
Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq
PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
+264
View File
@@ -0,0 +1,264 @@
// Package probes collects hardware facts from a booted Linux system.
// Phase 3 only needs enough to feed the spec diff: CPU model/cores,
// total RAM, per-disk serial+size, per-NIC MAC+speed, per-GPU model.
//
// Every probe is tolerant of missing files or tools — if /sys isn't
// available the field is just left empty. The orchestrator's diff
// engine will surface missing expected fields as failures; missing
// fields that weren't expected stay silent.
package probes
import (
"bufio"
"fmt"
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"vetting/internal/spec"
)
// Collect runs the CPU, memory, disk, NIC and GPU probes, in that
// order, and returns their results merged into a single inventory.
//
// Each probe deals with its own failures by leaving the corresponding
// field at its zero value, so the error result is currently always nil;
// it is part of the signature for callers that check it.
func Collect() (*spec.Inventory, error) {
	return &spec.Inventory{
		CPU:    probeCPU(),
		Memory: probeMemory(),
		Disks:  probeDisks(),
		NICs:   probeNICs(),
		GPUs:   probeGPUs(),
	}, nil
}
// ----- CPU --------------------------------------------------------------
// probeCPU reports the logical core count and the CPU model string.
//
// LogicalCores is whatever runtime.NumCPU returns. Model is the value
// of the first "model name" line in /proc/cpuinfo, with surrounding
// whitespace removed; it stays empty when the file cannot be opened or
// contains no such line.
func probeCPU() spec.CPUSpec {
	cpu := spec.CPUSpec{LogicalCores: runtime.NumCPU()}

	file, err := os.Open("/proc/cpuinfo")
	if err != nil {
		return cpu
	}
	// Read-only handle: a close error is not actionable here.
	defer func() { _ = file.Close() }()

	for lines := bufio.NewScanner(file); lines.Scan(); {
		text := lines.Text()
		if !strings.HasPrefix(text, "model name") {
			continue
		}
		_, value, found := strings.Cut(text, ":")
		if !found {
			continue
		}
		// Only the first entry is used.
		cpu.Model = strings.TrimSpace(value)
		break
	}
	return cpu
}
// ----- Memory -----------------------------------------------------------
// probeMemory returns total system memory in whole GiB.
//
// The figure comes from the MemTotal line of /proc/meminfo, which is
// expressed in kB, and is rounded down to a whole GiB. A zero
// MemorySpec is returned when the file cannot be opened or no parsable
// MemTotal line is found.
func probeMemory() spec.MemorySpec {
	var mem spec.MemorySpec

	file, err := os.Open("/proc/meminfo")
	if err != nil {
		return mem
	}
	// Read-only handle: a close error is not actionable here.
	defer func() { _ = file.Close() }()

	lines := bufio.NewScanner(file)
	for lines.Scan() {
		cols := strings.Fields(lines.Text())
		if len(cols) < 2 || cols[0] != "MemTotal:" {
			continue
		}
		totalKB, parseErr := strconv.ParseInt(cols[1], 10, 64)
		if parseErr != nil {
			continue
		}
		// kB -> MiB -> GiB, truncating at each step.
		mem.TotalGiB = int(totalKB / 1024 / 1024)
		return mem
	}
	return mem
}
// ----- Disks ------------------------------------------------------------
// probeDisks lists /sys/class/block, keeps the entries isRealDisk
// accepts, and records a size (decimal GB) and a serial for each one.
//
// A device is dropped only when both pieces of information are
// missing, i.e. the size came back as 0 and no serial was found. A
// device with just one of the two is still reported. Virtio disks in
// QEMU expose a serial only when started with `-drive serial=...`, so
// an empty Serial is expected for them otherwise.
//
// Returns nil when /sys/class/block cannot be listed.
func probeDisks() []spec.DiskSpec {
	const blockRoot = "/sys/class/block"

	devices, err := os.ReadDir(blockRoot)
	if err != nil {
		return nil
	}

	var disks []spec.DiskSpec
	for _, dev := range devices {
		devName := dev.Name()
		if !isRealDisk(devName) {
			continue
		}
		sizeGB := diskSizeGB(filepath.Join(blockRoot, devName))
		serial := diskSerial(devName)
		// Keep the entry as long as at least one fact was collected.
		if sizeGB != 0 || serial != "" {
			disks = append(disks, spec.DiskSpec{Serial: serial, SizeGB: sizeGB})
		}
	}
	return disks
}
// isRealDisk reports whether the /sys/class/block entry called name
// should be treated as a whole disk.
//
// Entries whose name starts with "loop", "ram", "zram" or "dm-" are
// rejected outright. Anything else is rejected only if it has a
// "partition" attribute file in sysfs, which is how partitions (sda1,
// nvme0n1p1, ...) are told apart from whole devices (sda, nvme0n1).
func isRealDisk(name string) bool {
	for _, virtual := range []string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, virtual) {
			return false
		}
	}
	// Any stat failure is taken to mean "no partition attribute".
	_, err := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return err != nil
}
// diskSizeGB returns the capacity of the block device whose sysfs
// directory is base, in decimal gigabytes (10^9 bytes), rounded down.
//
// The value is derived from the "size" attribute, which sysfs reports
// in 512-byte sectors regardless of the physical sector size. 0 is
// returned when the attribute cannot be read or is not an integer.
func diskSizeGB(base string) int {
	const (
		sectorBytes = 512
		bytesPerGB  = 1_000_000_000
	)

	raw, err := os.ReadFile(filepath.Join(base, "size"))
	if err != nil {
		return 0
	}
	count, err := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
	if err != nil {
		return 0
	}
	return int(count * sectorBytes / bytesPerGB)
}
// diskSerial returns the serial number of the block device called name
// (for example "sda" or "nvme0n1"), or "" when none can be determined.
//
// Sources are tried in this order and the first non-empty result wins:
//
//  1. /sys/block/<name>/device/serial   (plain text)
//  2. /sys/block/<name>/device/vpd_pg80 (raw SCSI VPD page 0x80)
//  3. /sys/block/<name>/serial          (plain text)
//  4. the ID_SERIAL_SHORT property printed by `udevadm info`
//
// vpd_pg80 is a binary attribute: it carries a 4-byte page header in
// front of the serial, so it is decoded with parseVPDPage80 instead of
// being treated as text. The udevadm step is best-effort; any failure
// to run it yields "".
func diskSerial(name string) string {
	sysfs := filepath.Join("/sys/block", name)
	sources := []struct {
		path    string
		vpdPage bool // true when the file is a raw VPD page, not text
	}{
		{filepath.Join(sysfs, "device", "serial"), false},
		{filepath.Join(sysfs, "device", "vpd_pg80"), true},
		{filepath.Join(sysfs, "serial"), false},
	}
	for _, src := range sources {
		data, err := os.ReadFile(src.path)
		if err != nil {
			continue
		}
		var serial string
		if src.vpdPage {
			serial = parseVPDPage80(data)
		} else {
			serial = strings.TrimSpace(string(data))
		}
		if serial != "" {
			return serial
		}
	}

	// Fallback: udevadm often knows the serial when sysfs does not.
	out, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output()
	if err != nil {
		return ""
	}
	for _, line := range strings.Split(string(out), "\n") {
		if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok {
			return strings.TrimSpace(v)
		}
	}
	return ""
}

// parseVPDPage80 extracts the serial from a raw SCSI "Unit Serial
// Number" VPD page. The page layout is: byte 0 device type, byte 1
// page code (0x80), bytes 2-3 big-endian payload length, then the
// serial itself. Padding spaces and NULs around the serial are
// removed. "" is returned for anything that is not a page 0x80.
func parseVPDPage80(page []byte) string {
	const headerLen = 4
	if len(page) < headerLen || page[1] != 0x80 {
		return ""
	}
	payload := page[headerLen:]
	// Trust the declared length only when it fits in what was read.
	if n := int(page[2])<<8 | int(page[3]); n < len(payload) {
		payload = payload[:n]
	}
	return strings.Trim(string(payload), "\x00 \t\r\n")
}
// ----- NICs -------------------------------------------------------------
// probeNICs lists /sys/class/net and returns the MAC address and link
// speed of every interface other than "lo".
//
// Interfaces whose "address" attribute is empty, unreadable or the
// all-zero MAC are skipped. MACs are lower-cased. The "speed"
// attribute is in Mbps (or -1 when the link is down); it is converted
// to Gbps with integer division, so links slower than 1000 Mbps, as
// well as unreadable or non-positive values, come out as 0.
//
// Returns nil when /sys/class/net cannot be listed.
func probeNICs() []spec.NICSpec {
	const netRoot = "/sys/class/net"

	ifaces, err := os.ReadDir(netRoot)
	if err != nil {
		return nil
	}

	var nics []spec.NICSpec
	for _, iface := range ifaces {
		ifName := iface.Name()
		if ifName == "lo" {
			continue
		}
		dir := filepath.Join(netRoot, ifName)

		mac := readLine(filepath.Join(dir, "address"))
		switch mac {
		case "", "00:00:00:00:00:00":
			continue
		}

		gbps := 0
		raw, readErr := os.ReadFile(filepath.Join(dir, "speed"))
		if readErr == nil {
			mbps, convErr := strconv.Atoi(strings.TrimSpace(string(raw)))
			if convErr == nil && mbps > 0 {
				gbps = mbps / 1000
			}
		}

		nics = append(nics, spec.NICSpec{
			MAC:       strings.ToLower(mac),
			SpeedGbps: gbps,
		})
	}
	return nics
}
// ----- GPUs -------------------------------------------------------------
// probeGPUs runs `lspci -mm -nnk` and returns one GPUSpec for every
// output line that mentions "VGA compatible controller" or
// "3D controller" (matched case-insensitively).
//
// Each matching line is tokenised with splitQuoted and Model is set to
// the third and fourth tokens joined by a space; lines yielding fewer
// than four tokens are ignored. When lspci cannot be run the result is
// nil, so no GPU expectation can match. VRAM and firmware details via
// nvidia-smi are planned for Phase 4.
func probeGPUs() []spec.GPUSpec {
	raw, err := exec.Command("lspci", "-mm", "-nnk").Output()
	if err != nil {
		return nil
	}

	var found []spec.GPUSpec
	for _, row := range strings.Split(string(raw), "\n") {
		lower := strings.ToLower(row)
		isGPU := strings.Contains(lower, "vga compatible controller") ||
			strings.Contains(lower, "3d controller")
		if !isGPU {
			continue
		}
		cols := splitQuoted(row)
		if len(cols) < 4 {
			continue
		}
		model := fmt.Sprintf("%s %s", cols[2], cols[3])
		found = append(found, spec.GPUSpec{Model: model})
	}
	return found
}
// splitQuoted tokenises one line of `lspci -mm` output, which is
// formatted like shell arguments: fields are separated by whitespace,
// and fields that may contain spaces are wrapped in double quotes with
// `"` and `\` escaped by a backslash.
//
// Every field becomes one element of the result, in order:
//   - an unquoted token (the slot, "-r06", "-p00") ends at the next
//     space, tab or opening quote;
//   - a quoted field ends at its closing quote and is kept even when
//     empty (lspci prints "" for a missing subsystem);
//   - inside quotes a backslash makes the following character literal.
//
// For a device line the result is therefore slot, class, vendor,
// device, followed by any option-like tokens and subsystem fields. An
// unterminated quoted field is emitted if it has any content. An empty
// line yields nil.
func splitQuoted(line string) []string {
	var (
		fields  []string
		cur     strings.Builder
		inQuote bool // currently between an opening and closing quote
		escaped bool // previous rune was a backslash inside quotes
	)
	for _, r := range line {
		switch {
		case escaped:
			cur.WriteRune(r)
			escaped = false
		case inQuote && r == '\\':
			escaped = true
		case r == '"':
			// A closing quote always ends a field, even an empty one.
			// An opening quote ends an unquoted token glued to it.
			if inQuote || cur.Len() > 0 {
				fields = append(fields, cur.String())
				cur.Reset()
			}
			inQuote = !inQuote
		case !inQuote && (r == ' ' || r == '\t'):
			// Whitespace outside quotes terminates an unquoted token.
			if cur.Len() > 0 {
				fields = append(fields, cur.String())
				cur.Reset()
			}
		default:
			cur.WriteRune(r)
		}
	}
	// Emit a trailing token that was not followed by a separator.
	if cur.Len() > 0 {
		fields = append(fields, cur.String())
	}
	return fields
}
// ----- shared helpers ---------------------------------------------------
// readLine returns the full contents of the file at path with leading
// and trailing whitespace removed, or "" when the file cannot be read.
func readLine(path string) string {
	if data, err := os.ReadFile(path); err == nil {
		return strings.TrimSpace(string(data))
	}
	return ""
}
+67
View File
@@ -0,0 +1,67 @@
package probes
import (
"os"
"path/filepath"
"strconv"
"strings"
)
// ThermalSample is a single sensor reading taken from /sys/class/hwmon.
type ThermalSample struct {
	// Kind is the category of reading; Thermals sets it to "temp".
	Kind string
	// Key identifies the sensor: its label when one exists, otherwise
	// a chip-relative name.
	Key string
	// Value is the reading expressed in Unit.
	Value float64
	// Unit is the unit of Value; Thermals sets it to "C".
	Unit string
}
// Thermals returns one sample for every temp*_input file found under
// the chip directories of /sys/class/hwmon.
//
// The raw file value is in millidegrees C and is divided by 1000. A
// sample's Key is the content of the matching temp*_label file when
// that is non-empty, otherwise "<chip name>/temp<index>" built from the
// chip's "name" attribute. Chips whose directory cannot be listed and
// inputs whose value is not an integer are skipped.
//
// Nothing is kept between calls: every call lists the directories
// again, so a polling caller such as the thermal sidecar also sees
// sensors that show up late. Returns nil when /sys/class/hwmon cannot
// be listed.
func Thermals() []ThermalSample {
	const hwmonRoot = "/sys/class/hwmon"

	chipDirs, err := os.ReadDir(hwmonRoot)
	if err != nil {
		return nil
	}

	var samples []ThermalSample
	for _, chip := range chipDirs {
		chipDir := filepath.Join(hwmonRoot, chip.Name())
		chipName := strings.TrimSpace(readFileStr(filepath.Join(chipDir, "name")))

		attrs, err := os.ReadDir(chipDir)
		if err != nil {
			continue
		}
		for _, attr := range attrs {
			file := attr.Name()
			// Keep only temp<idx>_input and pull out <idx>.
			rest, hasPrefix := strings.CutPrefix(file, "temp")
			if !hasPrefix {
				continue
			}
			idx, hasSuffix := strings.CutSuffix(rest, "_input")
			if !hasSuffix {
				continue
			}

			key := strings.TrimSpace(readFileStr(filepath.Join(chipDir, "temp"+idx+"_label")))
			if key == "" {
				key = chipName + "/temp" + idx
			}

			milli, err := strconv.Atoi(strings.TrimSpace(readFileStr(filepath.Join(chipDir, file))))
			if err != nil {
				continue
			}
			samples = append(samples, ThermalSample{
				Kind:  "temp",
				Key:   key,
				Value: float64(milli) / 1000,
				Unit:  "C",
			})
		}
	}
	return samples
}
// readFileStr returns the contents of the file at p as a string,
// exactly as read (no trimming), or "" when the file cannot be read.
func readFileStr(p string) string {
	if data, err := os.ReadFile(p); err == nil {
		return string(data)
	}
	return ""
}