Files
Vetting/agent/tests/smart.go
T
josh 9bb4b09a04
CI / Lint + build + test (push) Has been cancelled
Initial commit: full Phases 1-6 implementation
Post-repair hardware validation pipeline for Proxmox cluster hosts.
Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq
PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
2026-04-17 21:32:10 -04:00

153 lines
4.3 KiB
Go

package tests
import (
"context"
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
)
// SMART runs `smartctl -aj` (via runSmartctl) against every disk returned
// by listBlockDisks and derives the stage verdict from the
// smart_status.passed field of each device's JSON output.
//
// Per-disk handling:
//
//   - runSmartctl returns an error (smartctl missing, empty output, or
//     output that is not valid JSON): the disk is recorded as skipped with
//     the error text as the reason.
//   - The JSON parses but carries no boolean smart_status.passed: the disk
//     is recorded as skipped and is not counted as tested.
//   - smart_status.passed is present: the disk counts as tested, and as
//     failing when the value is false.
//
// The stage fails only if at least one disk reports passed=false. If no
// disk yields a SMART verdict at all, the stage passes and is flagged as
// skipped through Extras["skipped"].
//
// Extras keys: "disks" (per-disk reports, including the raw smartctl JSON
// when it parsed), "tested" (disks with a SMART verdict), "failing", and
// "skipped" when the whole stage was skipped.
func SMART(ctx context.Context, d Deps) Outcome {
	disks, err := listBlockDisks()
	if err != nil {
		d.Warn("SMART: failed to enumerate /sys/class/block: " + err.Error())
		return Outcome{Passed: true, Summary: "skipped (no block devices enumerable)", Extras: map[string]any{"skipped": true}}
	}
	if len(disks) == 0 {
		d.Info("SMART: no physical disks found — skipping stage")
		return Outcome{Passed: true, Summary: "skipped (no disks)", Extras: map[string]any{"skipped": true}}
	}

	type diskReport struct {
		Device  string         `json:"device"`
		Passed  bool           `json:"passed"`
		Skipped bool           `json:"skipped,omitempty"`
		Reason  string         `json:"reason,omitempty"`
		Raw     map[string]any `json:"raw,omitempty"`
	}

	// Exactly one report is appended per disk, so the final length is known.
	reports := make([]diskReport, 0, len(disks))
	failed := 0
	usable := 0 // disks that produced an actual smart_status verdict

	for _, dev := range disks {
		rep := diskReport{Device: dev}

		out, err := runSmartctl(ctx, dev)
		if err != nil {
			rep.Skipped = true
			rep.Reason = err.Error()
			reports = append(reports, rep)
			d.Info("SMART: " + dev + " skipped (" + err.Error() + ")")
			continue
		}
		rep.Raw = out

		passed, ok := smartPassed(out)
		if !ok {
			// Parsed JSON without a verdict must not inflate the "tested"
			// count, otherwise the summary claims "all PASSED" for a disk
			// that was never evaluated.
			rep.Skipped = true
			rep.Reason = "no smart_status in output"
			reports = append(reports, rep)
			d.Info("SMART: " + dev + " skipped (" + rep.Reason + ")")
			continue
		}

		usable++
		rep.Passed = passed
		if passed {
			d.Info(fmt.Sprintf("SMART: %s PASSED", dev))
		} else {
			failed++
			d.Error(fmt.Sprintf("SMART: %s reports FAILED", dev))
		}
		reports = append(reports, rep)
	}

	extras := map[string]any{
		"disks":   reports,
		"tested":  usable,
		"failing": failed,
	}
	if failed > 0 {
		return Outcome{
			Passed:  false,
			Message: fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
			Summary: fmt.Sprintf("%d/%d failing", failed, usable),
			Extras:  extras,
		}
	}

	summary := fmt.Sprintf("%d disks, %d SMART-reporting, all PASSED", len(disks), usable)
	if usable == 0 {
		summary = "skipped (no smartctl data on any disk)"
		extras["skipped"] = true
	}
	return Outcome{Passed: true, Summary: summary, Extras: extras}
}
// listBlockDisks reads /sys/class/block and returns a "/dev/<name>" path
// for every entry that isRealBlockDisk accepts. The directory read error,
// if any, is returned unchanged with a nil slice.
func listBlockDisks() ([]string, error) {
	nodes, readErr := os.ReadDir("/sys/class/block")
	if readErr != nil {
		return nil, readErr
	}

	var devices []string
	for i := range nodes {
		if base := nodes[i].Name(); isRealBlockDisk(base) {
			devices = append(devices, "/dev/"+base)
		}
	}
	return devices, nil
}
// isRealBlockDisk reports whether the /sys/class/block entry called name
// should be treated as a whole disk. It returns false for names starting
// with "loop", "ram", "zram" or "dm-", and for entries that have a
// "partition" file under /sys/class/block/<name>. Everything else,
// including names whose sysfs entry cannot be stat'ed, is accepted.
func isRealBlockDisk(name string) bool {
	for _, virtualPrefix := range []string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, virtualPrefix) {
			return false
		}
	}

	// Only a successful stat of the partition marker disqualifies the entry.
	_, statErr := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return statErr != nil
}
// runSmartctl executes `smartctl -aj <dev>` under ctx and returns its
// stdout decoded as a generic JSON object.
//
// Errors:
//   - stdout is empty and the command failed: the command error, wrapped
//     with a "smartctl: " prefix (this covers a missing smartctl binary).
//   - stdout is empty and the command succeeded: "empty smartctl output".
//   - stdout is not valid JSON: the decode error, wrapped.
//
// The command's exit status is deliberately not inspected once stdout is
// non-empty: any parseable JSON is handed back and the caller decides what
// to do with it.
//
// NOTE(review): an earlier comment tied the "no device info" case to exit
// code 4, but nothing here looks at the exit code — confirm the intended
// mapping against smartctl(8) before relying on it.
func runSmartctl(ctx context.Context, dev string) (map[string]any, error) {
	stdout, runErr := exec.CommandContext(ctx, "smartctl", "-aj", dev).Output()

	switch {
	case len(stdout) == 0 && runErr != nil:
		return nil, fmt.Errorf("smartctl: %w", runErr)
	case len(stdout) == 0:
		return nil, fmt.Errorf("empty smartctl output")
	}

	var report map[string]any
	if decodeErr := json.Unmarshal(stdout, &report); decodeErr != nil {
		return nil, fmt.Errorf("parse smartctl output: %w", decodeErr)
	}
	return report, nil
}
// smartPassed looks up smart_status.passed in a decoded smartctl JSON
// object. The first result is the verdict; the second reports whether a
// boolean verdict was actually found. It is false when smart_status is
// absent or not an object, or when passed is absent or not a bool, which
// lets callers tell "passed=false" apart from "no verdict available".
func smartPassed(out map[string]any) (bool, bool) {
	if section, isObject := out["smart_status"].(map[string]any); isObject {
		verdict, isBool := section["passed"].(bool)
		return verdict, isBool
	}
	return false, false
}