probes: sanitize disk serials and normalize GPU model for stable spec keys
CI / Lint + build + test (push) Successful in 1m25s
Release / release (push) Successful in 5m38s

Two related bugs were producing different map keys for identical
hardware depending on whether the inventory probe ran in the reporter
on the Proxmox host or in the live-image agent after PXE boot.

1. diskSerial read /sys/block/<dev>/device/{serial,vpd_pg80} and only
   TrimSpace'd the result. vpd_pg80 is a binary SCSI VPD page with a
   4-byte header, and some SSDs leak NUL/control bytes into the text
   serial file. Those bytes survive into the Go string, pass through
   lowercasing unchanged, and become a garbage map key that never
   matches the reporter's cleaner read. Sanitize to the ASCII-printable
   range at ingest.

2. probeGPUs built the model slug from fields[2] + " " + fields[3] of
   `lspci -mm -nnk` output. fields[3] is subsystem vendor/device info,
   which varies between otherwise-identical cards and carries the
   `-rXX` revision marker — stable enough for display but not for
   identity. Use fields[2] alone, strip the trailing `[NNNN]` PCI
   device-ID that lspci -nn appends, and sanitize for consistency.

After deploying the new orchestrator + re-running the configure step
on each registered host, SpecValidate will match cleanly. Disk diffs
self-resolve because the reporter already stored clean serials; GPU
diffs need one reporter re-run because the old expected slug still
carries subsystem noise.
This commit is contained in:
2026-04-18 16:06:18 -04:00
parent d48cf146f4
commit 5e9ad7f569
+57 -7
View File
@@ -10,10 +10,10 @@ package probes
import (
"bufio"
"fmt"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
@@ -147,13 +147,21 @@ func diskSizeGB(base string) int {
func diskSerial(name string) string {
// Try a few known paths; the kernel exposes serials differently for
// ATA/SCSI vs NVMe.
//
// sysfs reads return raw bytes: vpd_pg80 is a binary SCSI VPD page
// with a 4-byte header, and some SSDs put control/NUL bytes at the
// head of /device/serial. TrimSpace won't strip either, so the
// string survives into the spec map as a garbage key that doesn't
// match the reporter's cleaner read from the same file on a
// different kernel. sanitizeASCII drops everything below 0x20 and
// above 0x7E, which leaves a stable printable serial on both sides.
for _, rel := range []string{
filepath.Join("/sys/block", name, "device", "serial"),
filepath.Join("/sys/block", name, "device", "vpd_pg80"),
filepath.Join("/sys/block", name, "serial"),
} {
if b, err := os.ReadFile(rel); err == nil {
s := strings.TrimSpace(string(b))
s := sanitizeASCII(string(b))
if s != "" {
return s
}
@@ -167,7 +175,7 @@ func diskSerial(name string) string {
}
for _, line := range strings.Split(string(out), "\n") {
if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok {
return strings.TrimSpace(v)
return sanitizeASCII(v)
}
}
return ""
@@ -210,7 +218,7 @@ func probeNICs() []spec.NICSpec {
// the diff engine just won't match any GPU expectations. Phase 4 will
// add nvidia-smi for VRAM and firmware.
func probeGPUs() []spec.GPUSpec {
cmd := exec.Command("lspci", "-mm", "-nnk")
cmd := exec.Command("lspci", "-mm", "-nn")
out, err := cmd.Output()
if err != nil {
return nil
@@ -222,10 +230,24 @@ func probeGPUs() []spec.GPUSpec {
!strings.Contains(low, "3d controller") {
continue
}
// `lspci -mm` quotes fields; device name is usually field 3.
// lspci -mm quotes fields. splitQuoted indexes:
// [0] = class (e.g. "VGA compatible controller [0300]")
// [1] = vendor (e.g. "Intel Corporation [8086]")
// [2] = device (e.g. "Alder Lake-N [UHD Graphics] [46d1]")
// [3] = subsys (if present — varies between boards even
// for identical chips; NOT a model identifier)
// We used to concatenate [2] + " " + [3], which made the "model"
// key include subsystem noise and the occasional -rXX revision
// marker, so reporter and live-image runs produced different
// slugs for the same silicon. Use only [2], stripped of the
// trailing PCI device-id "[NNNN]" bracket that lspci -nn adds.
fields := splitQuoted(line)
if len(fields) >= 4 {
gpus = append(gpus, spec.GPUSpec{Model: fmt.Sprintf("%s %s", fields[2], fields[3])})
if len(fields) >= 3 {
model := stripPCIID(fields[2])
model = sanitizeASCII(model)
if model != "" {
gpus = append(gpus, spec.GPUSpec{Model: model})
}
}
}
return gpus
@@ -262,3 +284,31 @@ func readLine(path string) string {
return strings.TrimSpace(string(b))
}
// sanitizeASCII reduces s to its printable-ASCII content. Every character
// outside 0x20..0x7E is discarded — control characters and NUL below
// 0x20, DEL (0x7F), and anything non-ASCII — and the surviving text is
// then trimmed of leading and trailing spaces.
//
// Raw sysfs reads (the binary vpd_pg80 page in particular) arrive as Go
// strings holding arbitrary bytes. The values callers feed through here,
// disk serials and GPU model names, are expected to be printable ASCII,
// so filtering gives the reporter and the live image the same map key
// for the same hardware.
func sanitizeASCII(s string) string {
	printable := strings.Map(func(r rune) rune {
		// Bytes that are not valid UTF-8 decode as U+FFFD and multi-byte
		// runes are all above 0x7E, so both land in the drop branch, the
		// same as a byte-wise filter would treat them.
		if r < 0x20 || r > 0x7E {
			return -1
		}
		return r
	}, s)
	return strings.TrimSpace(printable)
}
// pciIDTail matches a four-hex-digit bracket group, such as " [46d1]",
// anchored at the very end of the string, together with any spaces
// directly in front of it.
var pciIDTail = regexp.MustCompile(` *\[[0-9a-fA-F]{4}\]$`)

// stripPCIID cuts the trailing " [NNNN]" PCI ID marker that `lspci -nn`
// appends to vendor/device strings. The marker helps a human reader but
// is not a stable identifier across pciutils versions. Only a bracket
// group at the end of s is removed; brackets elsewhere in the name
// (e.g. "[UHD Graphics]") are left alone. When s has no such tail it is
// returned unchanged.
func stripPCIID(s string) string {
	span := pciIDTail.FindStringIndex(s)
	if span == nil {
		return s
	}
	// The pattern is end-anchored, so the match always runs to the end of
	// s and everything before its start offset is the cleaned name.
	return s[:span[0]]
}