Files
Vetting/agent/probes/inventory.go
T
josh 5e9ad7f569
CI / Lint + build + test (push) Successful in 1m25s
Release / release (push) Successful in 5m38s
probes: sanitize disk serials and normalize GPU model for stable spec keys
Two related bugs were producing different map keys for identical
hardware depending on whether the inventory probe ran in the reporter
on the Proxmox host or in the live-image agent after PXE boot.

1. diskSerial read /sys/block/<dev>/device/{serial,vpd_pg80} and only
   TrimSpace'd the result. vpd_pg80 is a binary SCSI VPD page with a
   4-byte header, and some SSDs leak NUL/control bytes into the text
   serial file. Those bytes survive into the Go string, lowercase
   unchanged, and become a garbage map key that the reporter's cleaner
   read can't match. Sanitize to ASCII-printable range at ingest.

2. probeGPUs built the model slug from fields[2] + " " + fields[3] of
   `lspci -mm -nnk` output. fields[3] is subsystem vendor/device info,
   which varies between otherwise-identical cards and carries the
   `-rXX` revision marker — stable-enough for display but not for
   identity. Use fields[2] alone, strip the trailing `[NNNN]` PCI
   device-ID that lspci -nn appends, and sanitize for consistency.

After deploying the new orchestrator + re-running the configure step
on each registered host, SpecValidate will match cleanly. Disk diffs
self-resolve because the reporter already stored clean serials; GPU
diffs need one reporter re-run because the old expected slug still
carries subsystem noise.
2026-04-18 16:06:18 -04:00

315 lines
9.3 KiB
Go

// Package probes collects hardware facts from a booted Linux system.
// Phase 3 only needs enough to feed the spec diff: CPU model/cores,
// total RAM, per-disk serial+size, per-NIC MAC+speed, per-GPU model.
//
// Every probe is tolerant of missing files or tools — if /sys isn't
// available the field is just left empty. The orchestrator's diff
// engine will surface missing expected fields as failures; missing
// fields that weren't expected stay silent.
package probes
import (
"bufio"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"vetting/internal/spec"
)
// Collect gathers the CPU, memory, disk, NIC and GPU facts by invoking
// each probe once, in that order, and returns them as one Inventory.
//
// Every probe is best-effort and simply yields an empty value when its
// data source is unavailable, so the returned error is currently always
// nil; the error result is kept so callers do not need to change if a
// fatal condition is ever introduced.
func Collect() (*spec.Inventory, error) {
	// Composite-literal fields are evaluated in source order, so the
	// probes still run CPU, memory, disks, NICs, GPUs.
	result := &spec.Inventory{
		CPU:    probeCPU(),
		Memory: probeMemory(),
		Disks:  probeDisks(),
		NICs:   probeNICs(),
		GPUs:   probeGPUs(),
	}
	return result, nil
}
// ----- CPU --------------------------------------------------------------
// probeCPU reports the CPU model string and the logical core count.
//
// LogicalCores comes from runtime.NumCPU, i.e. the number of logical
// CPUs usable by this process. Model is the value of the first
// "model name" entry in /proc/cpuinfo; it stays empty when the file is
// unreadable or has no such entry.
func probeCPU() spec.CPUSpec {
	cpu := spec.CPUSpec{LogicalCores: runtime.NumCPU()}

	fh, err := os.Open("/proc/cpuinfo")
	if err != nil {
		return cpu
	}
	defer func() { _ = fh.Close() }()

	for sc := bufio.NewScanner(fh); sc.Scan(); {
		key, val, found := strings.Cut(sc.Text(), ":")
		if !found || !strings.HasPrefix(key, "model name") {
			continue
		}
		// Every core repeats the same entry; the first one is enough.
		cpu.Model = strings.TrimSpace(val)
		break
	}
	return cpu
}
// ----- Memory -----------------------------------------------------------
// probeMemory reads the MemTotal line of /proc/meminfo (a value in kB)
// and returns it truncated to whole GiB. A zero MemorySpec is returned
// when the file cannot be opened or no parsable MemTotal line exists.
func probeMemory() spec.MemorySpec {
	var mem spec.MemorySpec

	fh, err := os.Open("/proc/meminfo")
	if err != nil {
		return mem
	}
	defer func() { _ = fh.Close() }()

	for sc := bufio.NewScanner(fh); sc.Scan(); {
		cols := strings.Fields(sc.Text())
		if len(cols) < 2 || cols[0] != "MemTotal:" {
			continue
		}
		kb, err := strconv.ParseInt(cols[1], 10, 64)
		if err != nil {
			// Unparsable value: keep scanning in case a later line works.
			continue
		}
		// kB -> MiB -> GiB, truncating at each step.
		mem.TotalGiB = int(kb / 1024 / 1024)
		return mem
	}
	return mem
}
// ----- Disks ------------------------------------------------------------
// probeDisks lists the entries of /sys/class/block, keeps the ones
// isRealDisk accepts (no partitions, loop, ram, zram or dm- devices),
// and records each one's size in GB and its serial number.
//
// A serial may legitimately be empty — for example a QEMU virtio disk
// started without `-drive serial=...` — and such a disk is still
// reported as long as its size could be read. Returns nil when the
// directory cannot be listed.
func probeDisks() []spec.DiskSpec {
	const blockRoot = "/sys/class/block"

	devs, err := os.ReadDir(blockRoot)
	if err != nil {
		return nil
	}

	var disks []spec.DiskSpec
	for _, dev := range devs {
		devName := dev.Name()
		if !isRealDisk(devName) {
			continue
		}
		sizeGB := diskSizeGB(filepath.Join(blockRoot, devName))
		serial := diskSerial(devName)
		// Drop the device only when we learned nothing at all about it:
		// no readable (non-zero) size and no serial.
		if sizeGB != 0 || serial != "" {
			disks = append(disks, spec.DiskSpec{Serial: serial, SizeGB: sizeGB})
		}
	}
	return disks
}
// isRealDisk reports whether the /sys/class/block entry called name
// should be treated as a whole disk.
//
// Names starting with "loop", "ram", "zram" or "dm-" are rejected
// outright. Anything else is rejected only if it has a "partition"
// attribute in sysfs, which is how the kernel marks partitions such as
// sda1 or nvme0n1p1.
func isRealDisk(name string) bool {
	for _, virtual := range []string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, virtual) {
			return false
		}
	}
	// A successful stat means the attribute exists, i.e. a partition.
	_, err := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return err != nil
}
// diskSizeGB reads the "size" attribute under base (a block device's
// sysfs directory) and converts it to whole decimal gigabytes,
// truncating. It returns 0 when the attribute is missing or is not a
// base-10 integer.
func diskSizeGB(base string) int {
	const (
		// sysfs counts in 512-byte sectors regardless of the device's
		// physical sector size.
		sectorBytes = 512
		bytesPerGB  = 1_000_000_000
	)

	raw, err := os.ReadFile(filepath.Join(base, "size"))
	if err != nil {
		return 0
	}
	count, err := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
	if err != nil {
		return 0
	}
	return int(count * sectorBytes / bytesPerGB)
}
// diskSerial returns a printable serial number for the block device
// name, or "" when none can be determined.
//
// Sources are tried in order and the first non-empty result wins:
//
//  1. /sys/block/<name>/device/serial
//  2. /sys/block/<name>/device/vpd_pg80 (binary SCSI VPD page 0x80)
//  3. /sys/block/<name>/serial
//  4. the ID_SERIAL_SHORT property printed by `udevadm info`
//
// Everything is passed through sanitizeASCII because sysfs reads return
// raw bytes: some SSDs put control/NUL bytes into the text serial, and
// TrimSpace alone would let them through into the spec map key.
//
// vpd_pg80 additionally has its 4-byte header removed before
// sanitizing. Filtering the header by byte value alone is not enough:
// the page-length byte is printable whenever the serial is 0x21 or more
// bytes long, and would then be prepended to the serial.
func diskSerial(name string) string {
	sysDev := filepath.Join("/sys/block", name)
	sources := []struct {
		path  string
		isVPD bool
	}{
		{filepath.Join(sysDev, "device", "serial"), false},
		{filepath.Join(sysDev, "device", "vpd_pg80"), true},
		{filepath.Join(sysDev, "serial"), false},
	}
	for _, src := range sources {
		raw, err := os.ReadFile(src.path)
		if err != nil {
			continue
		}
		if src.isVPD {
			raw = vpdPg80Serial(raw)
		}
		if s := sanitizeASCII(string(raw)); s != "" {
			return s
		}
	}

	// Fallback: udevadm often knows the serial. Best-effort — a missing
	// binary or a non-zero exit simply means "no serial".
	out, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output()
	if err != nil {
		return ""
	}
	for _, line := range strings.Split(string(out), "\n") {
		if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok {
			return sanitizeASCII(v)
		}
	}
	return ""
}

// vpdPg80Serial extracts the serial-number payload from a SCSI Unit
// Serial Number VPD page (page code 0x80). The page starts with a
// 4-byte header: device type, page code, and a big-endian page length
// in bytes 2-3. The payload is cut to that length when the buffer is
// longer. Input that does not look like page 0x80 is returned unchanged
// so the caller falls back to plain sanitizing.
func vpdPg80Serial(page []byte) []byte {
	const (
		headerLen = 4
		pageCode  = 0x80
	)
	if len(page) < headerLen || page[1] != pageCode {
		return page
	}
	body := page[headerLen:]
	if n := int(page[2])<<8 | int(page[3]); n < len(body) {
		body = body[:n]
	}
	return body
}
// ----- NICs -------------------------------------------------------------
// probeNICs lists /sys/class/net and returns one NICSpec per interface
// that has a usable MAC address. The loopback interface "lo" and
// interfaces whose address is empty or all zeroes are skipped.
//
// SpeedGbps is the sysfs "speed" value (Mbps) divided by 1000 with
// integer truncation; it is 0 when the attribute cannot be read, cannot
// be parsed, or is not positive (sysfs reports -1 for a link that is
// down). Returns nil when the directory cannot be listed.
func probeNICs() []spec.NICSpec {
	const netRoot = "/sys/class/net"

	ifaces, err := os.ReadDir(netRoot)
	if err != nil {
		return nil
	}

	var nics []spec.NICSpec
	for _, iface := range ifaces {
		ifName := iface.Name()
		if ifName == "lo" {
			continue
		}
		dir := filepath.Join(netRoot, ifName)

		mac := readLine(filepath.Join(dir, "address"))
		switch mac {
		case "", "00:00:00:00:00:00":
			continue
		}

		gbps := 0
		// readLine yields "" on a read error, which Atoi then rejects,
		// leaving the speed at 0.
		if mbps, err := strconv.Atoi(readLine(filepath.Join(dir, "speed"))); err == nil && mbps > 0 {
			gbps = mbps / 1000
		}

		nics = append(nics, spec.NICSpec{MAC: strings.ToLower(mac), SpeedGbps: gbps})
	}
	return nics
}
// ----- GPUs -------------------------------------------------------------
// probeGPUs runs `lspci -mm -nn` and returns one GPUSpec per output
// line that mentions "VGA compatible controller" or "3D controller"
// (case-insensitive). If lspci is missing or fails, nil is returned and
// the diff engine simply has no GPUs to match. Phase 4 will add
// nvidia-smi for VRAM and firmware.
//
// The model is taken from index 2 of splitQuoted's result, which for
// lspci's positional layout (class, vendor, device, subsystem...) is
// the device name, e.g. "Alder Lake-N [UHD Graphics] [46d1]". The
// subsystem strings that follow are deliberately ignored: they vary
// between boards carrying the same chip, so including them made
// reporter and live-image runs disagree on the key for identical
// silicon. The trailing "[NNNN]" device ID that -nn appends is removed
// and the result is sanitized to printable ASCII.
func probeGPUs() []spec.GPUSpec {
	raw, err := exec.Command("lspci", "-mm", "-nn").Output()
	if err != nil {
		return nil
	}

	isDisplayLine := func(l string) bool {
		l = strings.ToLower(l)
		return strings.Contains(l, "vga compatible controller") ||
			strings.Contains(l, "3d controller")
	}

	var found []spec.GPUSpec
	for _, row := range strings.Split(string(raw), "\n") {
		if !isDisplayLine(row) {
			continue
		}
		cols := splitQuoted(row)
		if len(cols) < 3 {
			continue
		}
		if name := sanitizeASCII(stripPCIID(cols[2])); name != "" {
			found = append(found, spec.GPUSpec{Model: name})
		}
	}
	return found
}
// splitQuoted returns the contents of every double-quoted field in
// line, in order, as produced by `lspci -mm`.
//
// Text outside quotes — the leading slot address and option-like tokens
// such as -r00 or -p00 — is discarded rather than glued onto the next
// quoted field. lspci does not guarantee where those options appear
// relative to the positional fields, so letting them leak in would make
// field contents depend on the pciutils version. Dropping them also
// keeps the indices purely positional: class, vendor, device,
// subsystem vendor, subsystem device.
//
// Inside quotes a backslash escapes the next character, which is how
// lspci -mm encodes literal `"` and `\`. An empty pair of quotes yields
// an empty field. A field whose closing quote is missing is dropped.
func splitQuoted(line string) []string {
	var (
		fields  []string
		cur     strings.Builder
		inQuote bool
		escaped bool
	)
	for _, r := range line {
		switch {
		case !inQuote:
			// Only an opening quote matters out here.
			if r == '"' {
				inQuote = true
				cur.Reset()
			}
		case escaped:
			cur.WriteRune(r)
			escaped = false
		case r == '\\':
			escaped = true
		case r == '"':
			fields = append(fields, cur.String())
			cur.Reset()
			inQuote = false
		default:
			cur.WriteRune(r)
		}
	}
	return fields
}
// ----- shared helpers ---------------------------------------------------
// readLine returns the entire contents of the file at path with
// leading and trailing whitespace removed, or "" if the file cannot be
// read. It is meant for single-value sysfs attributes.
func readLine(path string) string {
	data, err := os.ReadFile(path)
	if err == nil {
		return strings.TrimSpace(string(data))
	}
	return ""
}
// sanitizeASCII keeps only printable ASCII (0x20 through 0x7E) from s
// and trims surrounding whitespace from what is left. Control
// characters, NULs, high-bit bytes and every non-ASCII rune are
// removed; interior spaces are preserved.
//
// Disk serials and GPU model names are expected to be printable ASCII,
// so filtering at ingest gives the same map key no matter which
// environment read the underlying (possibly binary) sysfs file.
func sanitizeASCII(s string) string {
	// Dropping rune-wise matches dropping byte-wise here: every byte of
	// a multi-byte or invalid UTF-8 sequence is >= 0x80, and each such
	// rune (or U+FFFD for an invalid byte) falls outside the kept range.
	printable := strings.Map(func(r rune) rune {
		if r < 0x20 || r > 0x7E {
			return -1
		}
		return r
	}, s)
	return strings.TrimSpace(printable)
}
// pciIDTail matches a four-hex-digit bracket at the very end of a
// string, together with any spaces directly in front of it.
var pciIDTail = regexp.MustCompile(` *\[[0-9a-fA-F]{4}\]$`)

// stripPCIID removes the trailing " [NNNN]" PCI ID that `lspci -nn`
// appends to vendor/device strings. The ID is useful context for a
// human but should not be part of a model identifier. Brackets
// elsewhere in the string are left alone, e.g. "[UHD Graphics]" is part
// of the real name. Strings without such a tail come back unchanged.
func stripPCIID(s string) string {
	// The pattern is anchored at the end, so a match always runs to the
	// end of s and cutting at its start is all that is needed.
	if loc := pciIDTail.FindStringIndex(s); loc != nil {
		return s[:loc[0]]
	}
	return s
}