9bb4b09a04
CI / Lint + build + test (push) Has been cancelled
Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
153 lines
4.3 KiB
Go
153 lines
4.3 KiB
Go
package tests
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// SMART runs smartctl -a on each block device the kernel exposes. We
|
|
// pass each device's result through smartctl --json output and key on:
|
|
//
|
|
// smart_status.passed -> overall-health PASSED
|
|
// ata_smart_attributes -> per-attribute raw + threshold (ATA only)
|
|
// nvme_smart_health_information_log -> NVMe health flags
|
|
//
|
|
// Missing smartctl / unsupported device (e.g. QEMU virtio-blk) just
|
|
// surfaces as a per-disk "skipped" entry; the stage only fails if at
|
|
// least one disk reports !passed.
|
|
func SMART(ctx context.Context, d Deps) Outcome {
|
|
disks, err := listBlockDisks()
|
|
if err != nil {
|
|
d.Warn("SMART: failed to enumerate /sys/class/block: " + err.Error())
|
|
return Outcome{Passed: true, Summary: "skipped (no block devices enumerable)", Extras: map[string]any{"skipped": true}}
|
|
}
|
|
if len(disks) == 0 {
|
|
d.Info("SMART: no physical disks found — skipping stage")
|
|
return Outcome{Passed: true, Summary: "skipped (no disks)", Extras: map[string]any{"skipped": true}}
|
|
}
|
|
|
|
type diskReport struct {
|
|
Device string `json:"device"`
|
|
Passed bool `json:"passed"`
|
|
Skipped bool `json:"skipped,omitempty"`
|
|
Reason string `json:"reason,omitempty"`
|
|
Raw map[string]any `json:"raw,omitempty"`
|
|
}
|
|
|
|
var reports []diskReport
|
|
failed := 0
|
|
usable := 0
|
|
for _, dev := range disks {
|
|
rep := diskReport{Device: dev}
|
|
out, err := runSmartctl(ctx, dev)
|
|
if err != nil {
|
|
rep.Skipped = true
|
|
rep.Reason = err.Error()
|
|
reports = append(reports, rep)
|
|
d.Info("SMART: " + dev + " skipped (" + err.Error() + ")")
|
|
continue
|
|
}
|
|
usable++
|
|
rep.Raw = out
|
|
if passed, ok := smartPassed(out); ok {
|
|
rep.Passed = passed
|
|
if !passed {
|
|
failed++
|
|
d.Error(fmt.Sprintf("SMART: %s reports FAILED", dev))
|
|
} else {
|
|
d.Info(fmt.Sprintf("SMART: %s PASSED", dev))
|
|
}
|
|
} else {
|
|
rep.Skipped = true
|
|
rep.Reason = "no smart_status in output"
|
|
}
|
|
reports = append(reports, rep)
|
|
}
|
|
|
|
extras := map[string]any{
|
|
"disks": reports,
|
|
"tested": usable,
|
|
"failing": failed,
|
|
}
|
|
if failed > 0 {
|
|
return Outcome{
|
|
Passed: false,
|
|
Message: fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
|
|
Summary: fmt.Sprintf("%d/%d failing", failed, usable),
|
|
Extras: extras,
|
|
}
|
|
}
|
|
summary := fmt.Sprintf("%d disks, %d SMART-reporting, all PASSED", len(disks), usable)
|
|
if usable == 0 {
|
|
summary = "skipped (no smartctl data on any disk)"
|
|
extras["skipped"] = true
|
|
}
|
|
return Outcome{Passed: true, Summary: summary, Extras: extras}
|
|
}
|
|
|
|
func listBlockDisks() ([]string, error) {
|
|
entries, err := os.ReadDir("/sys/class/block")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
var out []string
|
|
for _, e := range entries {
|
|
name := e.Name()
|
|
if !isRealBlockDisk(name) {
|
|
continue
|
|
}
|
|
out = append(out, "/dev/"+name)
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// isRealBlockDisk reports whether the /sys/class/block entry called name
// should be treated as a whole disk. Names starting with "loop", "ram",
// "zram" or "dm-" are rejected outright. Any other entry is rejected
// only if a "partition" file can be stat'ed inside its sysfs directory;
// a failing stat (for whatever reason) means the entry is accepted.
func isRealBlockDisk(name string) bool {
	for _, virtual := range []string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, virtual) {
			return false
		}
	}

	_, statErr := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return statErr != nil
}
|
|
|
|
// runSmartctl executes `smartctl -aj <dev>` under ctx and returns its
// standard output decoded as a JSON object.
//
// The exit status alone never fails the call: whenever stdout is
// non-empty it is parsed and, if it is valid JSON, returned even though
// the command itself reported an error. An error is returned only when
//
//   - stdout is empty and the command failed (the command error is
//     wrapped as "smartctl: ..."),
//   - stdout is empty although the command succeeded, or
//   - stdout cannot be decoded as a JSON object.
//
// Callers decide what a result without smart_status means.
func runSmartctl(ctx context.Context, dev string) (map[string]any, error) {
	raw, runErr := exec.CommandContext(ctx, "smartctl", "-aj", dev).Output()

	switch {
	case len(raw) == 0 && runErr != nil:
		return nil, fmt.Errorf("smartctl: %w", runErr)
	case len(raw) == 0:
		return nil, fmt.Errorf("empty smartctl output")
	}

	var doc map[string]any
	if decodeErr := json.Unmarshal(raw, &doc); decodeErr != nil {
		return nil, fmt.Errorf("parse smartctl output: %w", decodeErr)
	}
	// runErr is deliberately ignored from here on: parseable JSON is
	// trusted over the process exit status.
	return doc, nil
}
|
|
|
|
// smartPassed looks up smart_status.passed in a decoded smartctl JSON
// document. The first result is the verdict; the second reports whether
// a boolean verdict was found at all, so that an explicit passed=false
// can be told apart from a document where smart_status is absent, is
// not an object, or holds no boolean "passed" field. When no verdict is
// found both results are false.
func smartPassed(out map[string]any) (bool, bool) {
	if section, isObject := out["smart_status"].(map[string]any); isObject {
		if verdict, isBool := section["passed"].(bool); isBool {
			return verdict, true
		}
	}
	return false, false
}
|