5e9ad7f569
Two related bugs were producing different map keys for identical
hardware depending on whether the inventory probe ran in the reporter
on the Proxmox host or in the live-image agent after PXE boot.
1. diskSerial read /sys/block/<dev>/device/{serial,vpd_pg80} and only
TrimSpace'd the result. vpd_pg80 is a binary SCSI VPD page with a
4-byte header, and some SSDs leak NUL/control bytes into the text
serial file. Those bytes survive into the Go string, lowercase
unchanged, and become a garbage map key that the reporter's cleaner
read can't match. Sanitize to ASCII-printable range at ingest.
2. probeGPUs built the model slug from fields[2] + " " + fields[3] of
`lspci -mm -nnk` output. fields[3] is subsystem vendor/device info,
which varies between otherwise-identical cards and carries the
`-rXX` revision marker — stable enough for display but not for
identity. Use fields[2] alone, strip the trailing `[NNNN]` PCI
device-ID that lspci -nn appends, and sanitize for consistency.
After deploying the new orchestrator + re-running the configure step
on each registered host, SpecValidate will match cleanly. Disk diffs
self-resolve because the reporter already stored clean serials; GPU
diffs need one reporter re-run because the old expected slug still
carries subsystem noise.
315 lines
9.3 KiB
Go
315 lines
9.3 KiB
Go
// Package probes collects hardware facts from a booted Linux system.
|
|
// Phase 3 only needs enough to feed the spec diff: CPU model/cores,
|
|
// total RAM, per-disk serial+size, per-NIC MAC+speed, per-GPU model.
|
|
//
|
|
// Every probe is tolerant of missing files or tools — if /sys isn't
|
|
// available the field is just left empty. The orchestrator's diff
|
|
// engine will surface missing expected fields as failures; missing
|
|
// fields that weren't expected stay silent.
|
|
package probes
|
|
|
|
import (
|
|
"bufio"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"regexp"
|
|
"runtime"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"vetting/internal/spec"
|
|
)
|
|
|
|
// Collect runs every probe and returns the merged inventory. The only
|
|
// errors it surfaces are fatal ones that prevent progress — individual
|
|
// probe failures are logged to the returned Inventory's raw field and
|
|
// do not fail the whole call.
|
|
func Collect() (*spec.Inventory, error) {
|
|
inv := &spec.Inventory{}
|
|
|
|
inv.CPU = probeCPU()
|
|
inv.Memory = probeMemory()
|
|
inv.Disks = probeDisks()
|
|
inv.NICs = probeNICs()
|
|
inv.GPUs = probeGPUs()
|
|
|
|
return inv, nil
|
|
}
|
|
|
|
// ----- CPU --------------------------------------------------------------
|
|
|
|
func probeCPU() spec.CPUSpec {
|
|
// model: first "model name" in /proc/cpuinfo.
|
|
// logical_cores: runtime.NumCPU (Linux respects cpu cgroup; agent
|
|
// runs on bare metal so it will report every HT thread).
|
|
c := spec.CPUSpec{LogicalCores: runtime.NumCPU()}
|
|
f, err := os.Open("/proc/cpuinfo")
|
|
if err != nil {
|
|
return c
|
|
}
|
|
defer func() { _ = f.Close() }()
|
|
scan := bufio.NewScanner(f)
|
|
for scan.Scan() {
|
|
line := scan.Text()
|
|
if strings.HasPrefix(line, "model name") {
|
|
if _, v, ok := strings.Cut(line, ":"); ok {
|
|
c.Model = strings.TrimSpace(v)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
return c
|
|
}
|
|
|
|
// ----- Memory -----------------------------------------------------------
|
|
|
|
func probeMemory() spec.MemorySpec {
|
|
// /proc/meminfo reports MemTotal in kB. Round down to the nearest
|
|
// GiB so the diff's ±2 GiB tolerance is meaningful.
|
|
f, err := os.Open("/proc/meminfo")
|
|
if err != nil {
|
|
return spec.MemorySpec{}
|
|
}
|
|
defer func() { _ = f.Close() }()
|
|
scan := bufio.NewScanner(f)
|
|
for scan.Scan() {
|
|
fields := strings.Fields(scan.Text())
|
|
if len(fields) >= 2 && fields[0] == "MemTotal:" {
|
|
kb, err := strconv.ParseInt(fields[1], 10, 64)
|
|
if err == nil {
|
|
return spec.MemorySpec{TotalGiB: int(kb / 1024 / 1024)}
|
|
}
|
|
}
|
|
}
|
|
return spec.MemorySpec{}
|
|
}
|
|
|
|
// ----- Disks ------------------------------------------------------------
|
|
|
|
// probeDisks walks /sys/class/block and picks out real block devices
|
|
// (no partitions, no loop/ram). For each it reads size (512B sectors)
|
|
// and serial. Virtio disks in QEMU report a serial only when launched
|
|
// with `-drive serial=...`; without that the field is empty, which is
|
|
// fine — the diff skips disks with empty serials anyway.
|
|
func probeDisks() []spec.DiskSpec {
|
|
entries, err := os.ReadDir("/sys/class/block")
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
var out []spec.DiskSpec
|
|
for _, e := range entries {
|
|
name := e.Name()
|
|
if !isRealDisk(name) {
|
|
continue
|
|
}
|
|
base := filepath.Join("/sys/class/block", name)
|
|
size := diskSizeGB(base)
|
|
serial := diskSerial(name)
|
|
// size == 0 means we couldn't read /size; skip rather than
|
|
// emit garbage.
|
|
if size == 0 && serial == "" {
|
|
continue
|
|
}
|
|
out = append(out, spec.DiskSpec{Serial: serial, SizeGB: size})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// isRealDisk reports whether name, an entry of /sys/class/block, should be
// inventoried as a whole disk.
//
// Names beginning with "loop", "ram", "zram" or "dm-" are rejected outright.
// Any other name is rejected only when /sys/class/block/<name>/partition
// exists: that attribute marks partitions (sda1, nvme0n1p1) as opposed to
// their parent devices (sda, nvme0n1). A failed stat of that attribute, for
// any reason, counts as "not a partition".
func isRealDisk(name string) bool {
	for _, virtual := range [...]string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, virtual) {
			return false
		}
	}
	_, statErr := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return statErr != nil
}
|
|
|
|
// diskSizeGB reads the "size" attribute inside the sysfs directory base and
// converts it to decimal gigabytes (10^9 bytes), truncating toward zero.
//
// sysfs expresses size in 512-byte sectors regardless of the device's
// physical sector size. Returns 0 when the attribute is missing or is not a
// base-10 integer.
func diskSizeGB(base string) int {
	const (
		sectorBytes = 512
		bytesPerGB  = 1_000_000_000
	)

	raw, readErr := os.ReadFile(filepath.Join(base, "size"))
	if readErr != nil {
		return 0
	}
	count, parseErr := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
	if parseErr != nil {
		return 0
	}
	return int(count * sectorBytes / bytesPerGB)
}
|
|
|
|
func diskSerial(name string) string {
|
|
// Try a few known paths; the kernel exposes serials differently for
|
|
// ATA/SCSI vs NVMe.
|
|
//
|
|
// sysfs reads return raw bytes: vpd_pg80 is a binary SCSI VPD page
|
|
// with a 4-byte header, and some SSDs put control/NUL bytes at the
|
|
// head of /device/serial. TrimSpace won't strip either, so the
|
|
// string survives into the spec map as a garbage key that doesn't
|
|
// match the reporter's cleaner read from the same file on a
|
|
// different kernel. sanitizeASCII drops everything below 0x20 and
|
|
// above 0x7E, which leaves a stable printable serial on both sides.
|
|
for _, rel := range []string{
|
|
filepath.Join("/sys/block", name, "device", "serial"),
|
|
filepath.Join("/sys/block", name, "device", "vpd_pg80"),
|
|
filepath.Join("/sys/block", name, "serial"),
|
|
} {
|
|
if b, err := os.ReadFile(rel); err == nil {
|
|
s := sanitizeASCII(string(b))
|
|
if s != "" {
|
|
return s
|
|
}
|
|
}
|
|
}
|
|
// Fallback: udevadm often knows the wwid / serial. Best-effort.
|
|
cmd := exec.Command("udevadm", "info", "--query=property", "--name="+name)
|
|
out, err := cmd.Output()
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
for _, line := range strings.Split(string(out), "\n") {
|
|
if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok {
|
|
return sanitizeASCII(v)
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// ----- NICs -------------------------------------------------------------
|
|
|
|
func probeNICs() []spec.NICSpec {
|
|
root := "/sys/class/net"
|
|
entries, err := os.ReadDir(root)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
var out []spec.NICSpec
|
|
for _, e := range entries {
|
|
name := e.Name()
|
|
if name == "lo" {
|
|
continue
|
|
}
|
|
base := filepath.Join(root, name)
|
|
mac := readLine(filepath.Join(base, "address"))
|
|
if mac == "" || mac == "00:00:00:00:00:00" {
|
|
continue
|
|
}
|
|
// /sys/class/net/*/speed reports Mbps or -1 if link down.
|
|
speed := 0
|
|
if b, err := os.ReadFile(filepath.Join(base, "speed")); err == nil {
|
|
if mbps, err := strconv.Atoi(strings.TrimSpace(string(b))); err == nil && mbps > 0 {
|
|
speed = mbps / 1000
|
|
}
|
|
}
|
|
out = append(out, spec.NICSpec{MAC: strings.ToLower(mac), SpeedGbps: speed})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// ----- GPUs -------------------------------------------------------------
|
|
|
|
// probeGPUs leans on lspci; if lspci is missing, returns nothing and
|
|
// the diff engine just won't match any GPU expectations. Phase 4 will
|
|
// add nvidia-smi for VRAM and firmware.
|
|
func probeGPUs() []spec.GPUSpec {
|
|
cmd := exec.Command("lspci", "-mm", "-nn")
|
|
out, err := cmd.Output()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
var gpus []spec.GPUSpec
|
|
for _, line := range strings.Split(string(out), "\n") {
|
|
low := strings.ToLower(line)
|
|
if !strings.Contains(low, "vga compatible controller") &&
|
|
!strings.Contains(low, "3d controller") {
|
|
continue
|
|
}
|
|
// lspci -mm quotes fields. splitQuoted indexes:
|
|
// [0] = class (e.g. "VGA compatible controller [0300]")
|
|
// [1] = vendor (e.g. "Intel Corporation [8086]")
|
|
// [2] = device (e.g. "Alder Lake-N [UHD Graphics] [46d1]")
|
|
// [3] = subsys (if present — varies between boards even
|
|
// for identical chips; NOT a model identifier)
|
|
// We used to concatenate [2] + " " + [3], which made the "model"
|
|
// key include subsystem noise and the occasional -rXX revision
|
|
// marker, so reporter and live-image runs produced different
|
|
// slugs for the same silicon. Use only [2], stripped of the
|
|
// trailing PCI device-id "[NNNN]" bracket that lspci -nn adds.
|
|
fields := splitQuoted(line)
|
|
if len(fields) >= 3 {
|
|
model := stripPCIID(fields[2])
|
|
model = sanitizeASCII(model)
|
|
if model != "" {
|
|
gpus = append(gpus, spec.GPUSpec{Model: model})
|
|
}
|
|
}
|
|
}
|
|
return gpus
|
|
}
|
|
|
|
// splitQuoted breaks a line of double-quoted fields, as printed by
// `lspci -mm`, into one string per closing quote.
//
// Spaces outside quotes are discarded. Any other text outside quotes is not
// a field of its own: it accumulates and becomes a prefix of the next quoted
// field (so `00:02.0 "VGA"` yields "00:02.0VGA"). Text after the last
// closing quote, and the contents of an unterminated quote, are dropped.
// Backslash escapes are not interpreted. Returns nil when the line contains
// no complete quoted field.
func splitQuoted(line string) []string {
	var (
		fields []string
		buf    strings.Builder
		quoted bool
	)
	for _, ch := range line {
		if ch == '"' {
			if quoted {
				// Closing quote: everything gathered so far is one field.
				fields = append(fields, buf.String())
				buf.Reset()
			}
			quoted = !quoted
			continue
		}
		if ch == ' ' && !quoted {
			continue
		}
		buf.WriteRune(ch)
	}
	return fields
}
|
|
|
|
// ----- shared helpers ---------------------------------------------------
|
|
|
|
// readLine returns the entire contents of the file at path with leading and
// trailing whitespace removed, or "" if the file cannot be read. Despite the
// name it does not stop at the first newline; it is meant for single-value
// sysfs attributes.
func readLine(path string) string {
	if data, err := os.ReadFile(path); err == nil {
		return strings.TrimSpace(string(data))
	}
	return ""
}
|
|
|
|
// sanitizeASCII keeps only the printable-ASCII bytes of s (0x20 through
// 0x7E) and then trims surrounding whitespace.
//
// It works byte by byte, not rune by rune, so control characters, DEL, and
// every byte of a multi-byte UTF-8 sequence or of binary data are removed.
// Spaces inside the remaining text are preserved. The callers' values —
// disk serials and GPU model names — are printable ASCII, so filtering this
// way gives the same key for the same hardware however noisy the raw read
// was.
func sanitizeASCII(s string) string {
	kept := make([]byte, 0, len(s))
	for _, c := range []byte(s) {
		if c < 0x20 || c > 0x7E {
			continue
		}
		kept = append(kept, c)
	}
	return strings.TrimSpace(string(kept))
}
|
|
|
|
// pciIDTail matches a bracketed group of exactly four hex digits at the very
// end of a string, together with any spaces directly in front of it.
var pciIDTail = regexp.MustCompile(` *\[[0-9a-fA-F]{4}\]$`)

// stripPCIID removes the trailing " [NNNN]" PCI ID that `lspci -nn` appends
// to vendor and device strings. The ID is handy for a human reader but is
// not part of the model name.
//
// Only a bracket at the end of s is removed, so brackets inside the name
// (e.g. "[UHD Graphics]") are kept. Strings without such a tail are returned
// unchanged.
func stripPCIID(s string) string {
	loc := pciIDTail.FindStringIndex(s)
	if loc == nil {
		return s
	}
	// The pattern is anchored at the end, so the match always runs to
	// len(s); cutting at its start removes exactly the matched tail.
	return s[:loc[0]]
}
|
|
|