Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
@@ -0,0 +1,86 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// GPU enumerates VGA / 3D PCI devices. No devices → skip cleanly (a
|
||||
// CPU-only server passes this stage by virtue of having nothing to
|
||||
// stress). Devices present → try nvidia-smi for NVIDIA cards, else
|
||||
// accept PCI presence.
|
||||
func GPU(ctx context.Context, d Deps) Outcome {
|
||||
devices := listGPUPCI(ctx)
|
||||
if len(devices) == 0 {
|
||||
d.Info("GPU: no VGA/3D PCI devices found — skipping stage")
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: "skipped (no GPU present)",
|
||||
Extras: map[string]any{"skipped": true, "reason": "no_gpu_present"},
|
||||
}
|
||||
}
|
||||
d.Info("GPU: found " + joinDevices(devices))
|
||||
|
||||
nvidia := nvidiaSmiList(ctx)
|
||||
extras := map[string]any{
|
||||
"pci_devices": devices,
|
||||
"skipped": false,
|
||||
}
|
||||
if len(nvidia) > 0 {
|
||||
extras["nvidia"] = nvidia
|
||||
d.Info("GPU: nvidia-smi reports: " + strings.Join(nvidia, ", "))
|
||||
}
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: formatCount(len(devices), "GPU present"),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
|
||||
// listGPUPCI runs `lspci -mm` (lspci's machine-readable output mode)
// and returns every output line that mentions a "VGA compatible
// controller" or a "3D controller", with surrounding whitespace
// trimmed. The match is case-insensitive and is made against the whole
// line, not just the device-class field.
//
// Any failure to run lspci (binary missing, non-zero exit, cancelled
// context) yields nil, which callers interpret as "no GPU present".
func listGPUPCI(ctx context.Context) []string {
	stdout, err := exec.CommandContext(ctx, "lspci", "-mm").Output()
	if err != nil {
		return nil
	}

	// Lower-cased class descriptions that mark a line as a GPU.
	gpuClasses := [...]string{"vga compatible controller", "3d controller"}

	var matched []string
	for _, row := range strings.Split(string(stdout), "\n") {
		folded := strings.ToLower(row)
		for _, class := range gpuClasses {
			if strings.Contains(folded, class) {
				matched = append(matched, strings.TrimSpace(row))
				break // one entry per line even if several markers match
			}
		}
	}
	return matched
}
|
||||
|
||||
// nvidiaSmiList runs `nvidia-smi -L` and returns its output as a slice
// of non-empty, whitespace-trimmed lines, one per line printed by the
// tool (typically of the form "GPU 0: <name> (UUID: ...)" — confirm
// against the nvidia-smi version shipped in the image).
//
// The result is nil when nvidia-smi is not installed, exits with an
// error, or the context is already done.
func nvidiaSmiList(ctx context.Context) []string {
	stdout, err := exec.CommandContext(ctx, "nvidia-smi", "-L").Output()
	if err != nil {
		return nil
	}

	var cards []string
	remaining := string(stdout)
	for {
		// Peel off one line at a time; more is false on the last segment.
		entry, tail, more := strings.Cut(remaining, "\n")
		if entry = strings.TrimSpace(entry); entry != "" {
			cards = append(cards, entry)
		}
		if !more {
			break
		}
		remaining = tail
	}
	return cards
}
|
||||
|
||||
func joinDevices(devs []string) string {
|
||||
if len(devs) == 0 {
|
||||
return ""
|
||||
}
|
||||
if len(devs) == 1 {
|
||||
return devs[0]
|
||||
}
|
||||
return devs[0] + " (+" + strings.TrimSpace(formatCount(len(devs)-1, "more")) + ")"
|
||||
}
|
||||
Reference in New Issue
Block a user