Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
@@ -0,0 +1,264 @@
|
||||
// Package probes collects hardware facts from a booted Linux system.
|
||||
// Phase 3 only needs enough to feed the spec diff: CPU model/cores,
|
||||
// total RAM, per-disk serial+size, per-NIC MAC+speed, per-GPU model.
|
||||
//
|
||||
// Every probe is tolerant of missing files or tools — if /sys isn't
|
||||
// available the field is just left empty. The orchestrator's diff
|
||||
// engine will surface missing expected fields as failures; missing
|
||||
// fields that weren't expected stay silent.
|
||||
package probes
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"vetting/internal/spec"
|
||||
)
|
||||
|
||||
// Collect runs every probe and returns the merged inventory. The only
|
||||
// errors it surfaces are fatal ones that prevent progress — individual
|
||||
// probe failures are logged to the returned Inventory's raw field and
|
||||
// do not fail the whole call.
|
||||
func Collect() (*spec.Inventory, error) {
|
||||
inv := &spec.Inventory{}
|
||||
|
||||
inv.CPU = probeCPU()
|
||||
inv.Memory = probeMemory()
|
||||
inv.Disks = probeDisks()
|
||||
inv.NICs = probeNICs()
|
||||
inv.GPUs = probeGPUs()
|
||||
|
||||
return inv, nil
|
||||
}
|
||||
|
||||
// ----- CPU --------------------------------------------------------------
|
||||
|
||||
func probeCPU() spec.CPUSpec {
|
||||
// model: first "model name" in /proc/cpuinfo.
|
||||
// logical_cores: runtime.NumCPU (Linux respects cpu cgroup; agent
|
||||
// runs on bare metal so it will report every HT thread).
|
||||
c := spec.CPUSpec{LogicalCores: runtime.NumCPU()}
|
||||
f, err := os.Open("/proc/cpuinfo")
|
||||
if err != nil {
|
||||
return c
|
||||
}
|
||||
defer func() { _ = f.Close() }()
|
||||
scan := bufio.NewScanner(f)
|
||||
for scan.Scan() {
|
||||
line := scan.Text()
|
||||
if strings.HasPrefix(line, "model name") {
|
||||
if _, v, ok := strings.Cut(line, ":"); ok {
|
||||
c.Model = strings.TrimSpace(v)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// ----- Memory -----------------------------------------------------------
|
||||
|
||||
func probeMemory() spec.MemorySpec {
|
||||
// /proc/meminfo reports MemTotal in kB. Round down to the nearest
|
||||
// GiB so the diff's ±2 GiB tolerance is meaningful.
|
||||
f, err := os.Open("/proc/meminfo")
|
||||
if err != nil {
|
||||
return spec.MemorySpec{}
|
||||
}
|
||||
defer func() { _ = f.Close() }()
|
||||
scan := bufio.NewScanner(f)
|
||||
for scan.Scan() {
|
||||
fields := strings.Fields(scan.Text())
|
||||
if len(fields) >= 2 && fields[0] == "MemTotal:" {
|
||||
kb, err := strconv.ParseInt(fields[1], 10, 64)
|
||||
if err == nil {
|
||||
return spec.MemorySpec{TotalGiB: int(kb / 1024 / 1024)}
|
||||
}
|
||||
}
|
||||
}
|
||||
return spec.MemorySpec{}
|
||||
}
|
||||
|
||||
// ----- Disks ------------------------------------------------------------
|
||||
|
||||
// probeDisks walks /sys/class/block and picks out real block devices
|
||||
// (no partitions, no loop/ram). For each it reads size (512B sectors)
|
||||
// and serial. Virtio disks in QEMU report a serial only when launched
|
||||
// with `-drive serial=...`; without that the field is empty, which is
|
||||
// fine — the diff skips disks with empty serials anyway.
|
||||
func probeDisks() []spec.DiskSpec {
|
||||
entries, err := os.ReadDir("/sys/class/block")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var out []spec.DiskSpec
|
||||
for _, e := range entries {
|
||||
name := e.Name()
|
||||
if !isRealDisk(name) {
|
||||
continue
|
||||
}
|
||||
base := filepath.Join("/sys/class/block", name)
|
||||
size := diskSizeGB(base)
|
||||
serial := diskSerial(name)
|
||||
// size == 0 means we couldn't read /size; skip rather than
|
||||
// emit garbage.
|
||||
if size == 0 && serial == "" {
|
||||
continue
|
||||
}
|
||||
out = append(out, spec.DiskSpec{Serial: serial, SizeGB: size})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// isRealDisk reports whether the /sys/class/block entry called name
// should be treated as a whole disk.
//
// Two kinds of entries are rejected:
//   - names starting with "loop", "ram", "zram" or "dm-";
//   - partitions, recognised by the presence of a "partition" attribute
//     under /sys/class/block/<name>/ (e.g. sda1, nvme0n1p1).
//
// Everything else — including names that do not exist in sysfs at all —
// is reported as a real disk.
func isRealDisk(name string) bool {
	for _, prefix := range []string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, prefix) {
			return false
		}
	}

	// Only partitions carry this attribute; a failed stat means "not a
	// partition" (or not present), which we treat as a whole disk.
	_, err := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return err != nil
}
|
||||
|
||||
// diskSizeGB returns the capacity, in decimal gigabytes (10^9 bytes,
// truncated), of the block device whose sysfs directory is base. It reads
// the sector count from <base>/size and multiplies by 512, the unit the
// file is expressed in.
//
// The result is 0 when the file is missing or does not hold an integer,
// and also for devices smaller than 1 GB.
func diskSizeGB(base string) int {
	raw, readErr := os.ReadFile(filepath.Join(base, "size"))
	if readErr == nil {
		sectors, parseErr := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
		if parseErr == nil {
			// 512-byte sectors -> bytes -> GB.
			return int(sectors * 512 / 1_000_000_000)
		}
	}
	return 0
}
|
||||
|
||||
// diskSerial returns the serial number of the block device called name
// (as listed under /sys/block), or "" when none can be determined.
//
// Sources are tried in this order and the first non-empty answer wins:
//
//  1. /sys/block/<name>/device/serial   (plain text)
//  2. /sys/block/<name>/device/vpd_pg80 (binary SCSI VPD page 0x80,
//     decoded with parseVPDPage80)
//  3. /sys/block/<name>/serial          (plain text)
//  4. the ID_SERIAL_SHORT property printed by
//     `udevadm info --query=property --name=<name>`
//
// Every step is best-effort: unreadable files and a missing or failing
// udevadm are silently skipped.
func diskSerial(name string) string {
	dev := filepath.Join("/sys/block", name)

	// readText returns the trimmed content of a text attribute, or "".
	readText := func(path string) string {
		b, err := os.ReadFile(path)
		if err != nil {
			return ""
		}
		return strings.TrimSpace(string(b))
	}

	if s := readText(filepath.Join(dev, "device", "serial")); s != "" {
		return s
	}
	// vpd_pg80 is a raw VPD page, not text: returning it verbatim would
	// prepend the 4 header bytes (and any NUL padding) to the serial.
	if page, err := os.ReadFile(filepath.Join(dev, "device", "vpd_pg80")); err == nil {
		if s := parseVPDPage80(page); s != "" {
			return s
		}
	}
	if s := readText(filepath.Join(dev, "serial")); s != "" {
		return s
	}

	// Fallback: ask udev, which often knows the serial when sysfs doesn't.
	out, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output()
	if err != nil {
		return ""
	}
	for _, line := range strings.Split(string(out), "\n") {
		if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok {
			return strings.TrimSpace(v)
		}
	}
	return ""
}

// parseVPDPage80 extracts the serial number from a raw SCSI "Unit Serial
// Number" VPD page (page code 0x80). The page starts with a 4-byte header
// — device type, page code, 2-byte big-endian payload length — followed by
// the ASCII serial, which devices commonly pad with spaces or NULs.
//
// It returns "" if the buffer is shorter than the header or does not carry
// page code 0x80. A declared length larger than the buffer is clamped to
// the bytes actually present.
func parseVPDPage80(page []byte) string {
	const headerLen = 4
	if len(page) < headerLen || page[1] != 0x80 {
		return ""
	}
	payload := page[headerLen:]
	if n := int(page[2])<<8 | int(page[3]); n < len(payload) {
		payload = payload[:n]
	}
	return strings.Trim(string(payload), "\x00 \t\r\n")
}
|
||||
|
||||
// ----- NICs -------------------------------------------------------------
|
||||
|
||||
func probeNICs() []spec.NICSpec {
|
||||
root := "/sys/class/net"
|
||||
entries, err := os.ReadDir(root)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var out []spec.NICSpec
|
||||
for _, e := range entries {
|
||||
name := e.Name()
|
||||
if name == "lo" {
|
||||
continue
|
||||
}
|
||||
base := filepath.Join(root, name)
|
||||
mac := readLine(filepath.Join(base, "address"))
|
||||
if mac == "" || mac == "00:00:00:00:00:00" {
|
||||
continue
|
||||
}
|
||||
// /sys/class/net/*/speed reports Mbps or -1 if link down.
|
||||
speed := 0
|
||||
if b, err := os.ReadFile(filepath.Join(base, "speed")); err == nil {
|
||||
if mbps, err := strconv.Atoi(strings.TrimSpace(string(b))); err == nil && mbps > 0 {
|
||||
speed = mbps / 1000
|
||||
}
|
||||
}
|
||||
out = append(out, spec.NICSpec{MAC: strings.ToLower(mac), SpeedGbps: speed})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// ----- GPUs -------------------------------------------------------------
|
||||
|
||||
// probeGPUs leans on lspci; if lspci is missing, returns nothing and
|
||||
// the diff engine just won't match any GPU expectations. Phase 4 will
|
||||
// add nvidia-smi for VRAM and firmware.
|
||||
func probeGPUs() []spec.GPUSpec {
|
||||
cmd := exec.Command("lspci", "-mm", "-nnk")
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var gpus []spec.GPUSpec
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
low := strings.ToLower(line)
|
||||
if !strings.Contains(low, "vga compatible controller") &&
|
||||
!strings.Contains(low, "3d controller") {
|
||||
continue
|
||||
}
|
||||
// `lspci -mm` quotes fields; device name is usually field 3.
|
||||
fields := splitQuoted(line)
|
||||
if len(fields) >= 4 {
|
||||
gpus = append(gpus, spec.GPUSpec{Model: fmt.Sprintf("%s %s", fields[2], fields[3])})
|
||||
}
|
||||
}
|
||||
return gpus
|
||||
}
|
||||
|
||||
// splitQuoted tokenizes one line of shell-style output such as
// `lspci -mm`, where fields are separated by whitespace and fields that
// may contain spaces are wrapped in double quotes.
//
// Rules:
//   - a double-quoted run becomes one field, quotes removed; an empty
//     pair ("") yields an empty field;
//   - inside quotes a backslash makes the next character literal, so
//     \" and \\ are decoded;
//   - an unquoted run of non-blank characters (e.g. the slot "00:02.0" or
//     an option such as "-r02") is a field of its own;
//   - an unterminated quote yields whatever was collected, if non-empty.
//
// It returns nil for an empty or all-blank line.
func splitQuoted(line string) []string {
	var (
		fields  []string
		cur     strings.Builder
		inQuote bool
		escaped bool
	)
	// flush emits the pending token, if any, as a field.
	flush := func() {
		if cur.Len() > 0 {
			fields = append(fields, cur.String())
			cur.Reset()
		}
	}

	for _, r := range line {
		switch {
		case escaped:
			cur.WriteRune(r)
			escaped = false
		case inQuote && r == '\\':
			escaped = true
		case r == '"':
			if inQuote {
				// Closing quote: always emit, so "" stays an empty field.
				fields = append(fields, cur.String())
				cur.Reset()
			} else {
				// Opening quote: terminate any unquoted token before it.
				flush()
			}
			inQuote = !inQuote
		case !inQuote && (r == ' ' || r == '\t'):
			flush()
		default:
			cur.WriteRune(r)
		}
	}
	flush()
	return fields
}
|
||||
|
||||
// ----- shared helpers ---------------------------------------------------
|
||||
|
||||
// readLine returns the content of the file at path with leading and
// trailing whitespace removed, or "" if the file cannot be read. Despite
// the name it reads the whole file; it is meant for single-value sysfs
// attributes.
func readLine(path string) string {
	data, err := os.ReadFile(path)
	if err == nil {
		return strings.TrimSpace(string(data))
	}
	return ""
}
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
package probes
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ThermalSample is a single sensor reading taken from /sys/class/hwmon.
type ThermalSample struct {
	// Kind is the reading category ("temp" for temperatures). Key names
	// the sensor: its hwmon label when there is one, otherwise a
	// chip-relative name.
	Kind, Key string
	// Value is the reading expressed in Unit (degrees Celsius for
	// temperatures).
	Value float64
	// Unit is the unit Value is expressed in, e.g. "C".
	Unit string
}
|
||||
|
||||
// Thermals walks /sys/class/hwmon looking for temp*_input files. The
|
||||
// kernel reports millidegrees C; we divide by 1000. Labels come from
|
||||
// temp*_label (preferred) or a chip-relative fallback.
|
||||
//
|
||||
// This is also used by the thermal sidecar; it re-reads on each tick
|
||||
// rather than holding open handles so hot-plugged sensors (e.g. a PCIe
|
||||
// card enumerating late) get picked up.
|
||||
func Thermals() []ThermalSample {
|
||||
root := "/sys/class/hwmon"
|
||||
chips, err := os.ReadDir(root)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var out []ThermalSample
|
||||
for _, c := range chips {
|
||||
base := filepath.Join(root, c.Name())
|
||||
chipName := strings.TrimSpace(readFileStr(filepath.Join(base, "name")))
|
||||
files, err := os.ReadDir(base)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
for _, f := range files {
|
||||
name := f.Name()
|
||||
if !strings.HasPrefix(name, "temp") || !strings.HasSuffix(name, "_input") {
|
||||
continue
|
||||
}
|
||||
idx := strings.TrimSuffix(strings.TrimPrefix(name, "temp"), "_input")
|
||||
label := strings.TrimSpace(readFileStr(filepath.Join(base, "temp"+idx+"_label")))
|
||||
if label == "" {
|
||||
label = chipName + "/temp" + idx
|
||||
}
|
||||
raw := strings.TrimSpace(readFileStr(filepath.Join(base, name)))
|
||||
milli, err := strconv.Atoi(raw)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out = append(out, ThermalSample{Kind: "temp", Key: label, Value: float64(milli) / 1000, Unit: "C"})
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// readFileStr returns the full, untrimmed content of the file at p as a
// string, or "" if the file cannot be read.
func readFileStr(p string) string {
	if data, err := os.ReadFile(p); err == nil {
		return string(data)
	}
	return ""
}
|
||||
Reference in New Issue
Block a user