Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
@@ -0,0 +1,152 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// SMART runs smartctl -a on each block device the kernel exposes. We
|
||||
// pass each device's result through smartctl --json output and key on:
|
||||
//
|
||||
// smart_status.passed -> overall-health PASSED
|
||||
// ata_smart_attributes -> per-attribute raw + threshold (ATA only)
|
||||
// nvme_smart_health_information_log -> NVMe health flags
|
||||
//
|
||||
// Missing smartctl / unsupported device (e.g. QEMU virtio-blk) just
|
||||
// surfaces as a per-disk "skipped" entry; the stage only fails if at
|
||||
// least one disk reports !passed.
|
||||
func SMART(ctx context.Context, d Deps) Outcome {
|
||||
disks, err := listBlockDisks()
|
||||
if err != nil {
|
||||
d.Warn("SMART: failed to enumerate /sys/class/block: " + err.Error())
|
||||
return Outcome{Passed: true, Summary: "skipped (no block devices enumerable)", Extras: map[string]any{"skipped": true}}
|
||||
}
|
||||
if len(disks) == 0 {
|
||||
d.Info("SMART: no physical disks found — skipping stage")
|
||||
return Outcome{Passed: true, Summary: "skipped (no disks)", Extras: map[string]any{"skipped": true}}
|
||||
}
|
||||
|
||||
type diskReport struct {
|
||||
Device string `json:"device"`
|
||||
Passed bool `json:"passed"`
|
||||
Skipped bool `json:"skipped,omitempty"`
|
||||
Reason string `json:"reason,omitempty"`
|
||||
Raw map[string]any `json:"raw,omitempty"`
|
||||
}
|
||||
|
||||
var reports []diskReport
|
||||
failed := 0
|
||||
usable := 0
|
||||
for _, dev := range disks {
|
||||
rep := diskReport{Device: dev}
|
||||
out, err := runSmartctl(ctx, dev)
|
||||
if err != nil {
|
||||
rep.Skipped = true
|
||||
rep.Reason = err.Error()
|
||||
reports = append(reports, rep)
|
||||
d.Info("SMART: " + dev + " skipped (" + err.Error() + ")")
|
||||
continue
|
||||
}
|
||||
usable++
|
||||
rep.Raw = out
|
||||
if passed, ok := smartPassed(out); ok {
|
||||
rep.Passed = passed
|
||||
if !passed {
|
||||
failed++
|
||||
d.Error(fmt.Sprintf("SMART: %s reports FAILED", dev))
|
||||
} else {
|
||||
d.Info(fmt.Sprintf("SMART: %s PASSED", dev))
|
||||
}
|
||||
} else {
|
||||
rep.Skipped = true
|
||||
rep.Reason = "no smart_status in output"
|
||||
}
|
||||
reports = append(reports, rep)
|
||||
}
|
||||
|
||||
extras := map[string]any{
|
||||
"disks": reports,
|
||||
"tested": usable,
|
||||
"failing": failed,
|
||||
}
|
||||
if failed > 0 {
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
|
||||
Summary: fmt.Sprintf("%d/%d failing", failed, usable),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
summary := fmt.Sprintf("%d disks, %d SMART-reporting, all PASSED", len(disks), usable)
|
||||
if usable == 0 {
|
||||
summary = "skipped (no smartctl data on any disk)"
|
||||
extras["skipped"] = true
|
||||
}
|
||||
return Outcome{Passed: true, Summary: summary, Extras: extras}
|
||||
}
|
||||
|
||||
func listBlockDisks() ([]string, error) {
|
||||
entries, err := os.ReadDir("/sys/class/block")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var out []string
|
||||
for _, e := range entries {
|
||||
name := e.Name()
|
||||
if !isRealBlockDisk(name) {
|
||||
continue
|
||||
}
|
||||
out = append(out, "/dev/"+name)
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// isRealBlockDisk reports whether the /sys/class/block entry called name
// should be treated as a whole disk. Names starting with "loop", "ram",
// "zram" or "dm-" are rejected outright. Any other name is rejected only if
// /sys/class/block/<name>/partition can be stat'ed (presumably marking the
// entry as a partition rather than a whole device); if the stat fails for
// any reason the name is accepted.
func isRealBlockDisk(name string) bool {
	for _, virtual := range []string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, virtual) {
			return false
		}
	}

	_, statErr := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return statErr != nil
}
|
||||
|
||||
// runSmartctl executes `smartctl -aj <dev>` under ctx and returns its
// standard output decoded as a JSON object.
//
// The command's exit status is not inspected on its own: whenever smartctl
// wrote something to stdout, that output is parsed and returned even if the
// process exited non-zero. An error is returned only when stdout is empty
// (wrapping the exec error if there was one) or when the output is not
// valid JSON. Callers treat such errors as "skip this disk".
func runSmartctl(ctx context.Context, dev string) (map[string]any, error) {
	stdout, runErr := exec.CommandContext(ctx, "smartctl", "-aj", dev).Output()

	switch {
	case len(stdout) == 0 && runErr != nil:
		return nil, fmt.Errorf("smartctl: %w", runErr)
	case len(stdout) == 0:
		return nil, fmt.Errorf("empty smartctl output")
	}

	var doc map[string]any
	if err := json.Unmarshal(stdout, &doc); err != nil {
		return nil, fmt.Errorf("parse smartctl output: %w", err)
	}
	// runErr is deliberately dropped here: structured output takes
	// precedence over the process exit status.
	return doc, nil
}
|
||||
|
||||
// smartPassed looks up smart_status.passed in a decoded smartctl JSON
// document. The first result is the verdict; the second reports whether a
// boolean verdict was actually present, letting callers tell an explicit
// "passed": false apart from a missing or malformed smart_status section.
func smartPassed(out map[string]any) (bool, bool) {
	if section, isMap := out["smart_status"].(map[string]any); isMap {
		verdict, isBool := section["passed"].(bool)
		return verdict, isBool
	}
	return false, false
}
|
||||
Reference in New Issue
Block a user