package tests

import (
	"context"
	"encoding/json"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"time"
)

// SMART runs smartctl on each block device the kernel exposes. Each
// device's smartctl --json output is parsed and the stage keys on:
//
//	smart_status.passed               -> overall-health PASSED
//	ata_smart_attributes              -> per-attribute raw + threshold (ATA only)
//	nvme_smart_health_information_log -> NVMe health flags
//
// A device that smartctl cannot report on (e.g. QEMU virtio-blk) surfaces
// as a per-disk "skipped" entry and is not counted as tested. The stage
// fails when the smartctl binary itself is missing or when at least one
// disk reports smart_status.passed == false.
//
// The returned Outcome carries one SubStepReport per disk, and Extras
// holds "disks" (per-disk reports), "tested" (disks that produced a SMART
// verdict) and "failing" (disks whose verdict was not passed).
func SMART(ctx context.Context, d Deps) Outcome {
	// smartctl absence is a packaging defect, not a per-disk skip. The
	// per-disk skip paths below cover "this device doesn't support
	// SMART"; pre-checking the binary up front keeps those skips
	// legitimate and fails the stage loudly if the live image lost its
	// smartmontools package.
	if _, err := exec.LookPath("smartctl"); err != nil {
		d.Error("SMART: smartctl not found — live image is missing required tool")
		return Outcome{
			Passed:  false,
			Message: "smartctl binary missing from live image",
			Summary: "failed (smartctl missing)",
			Extras:  map[string]any{"reason": "smartctl_missing"},
		}
	}

	disks, err := listBlockDisks()
	if err != nil {
		d.Warn("SMART: failed to enumerate /sys/class/block: " + err.Error())
		return Outcome{Passed: true, Summary: "skipped (no block devices enumerable)", Extras: map[string]any{"skipped": true}}
	}
	if len(disks) == 0 {
		d.Info("SMART: no physical disks found — skipping stage")
		return Outcome{Passed: true, Summary: "skipped (no disks)", Extras: map[string]any{"skipped": true}}
	}

	const noStatusReason = "no smart_status in output"

	var reports []smartDiskReport
	var subs []SubStepReport
	failed := 0
	usable := 0 // disks that actually produced a SMART verdict
	for _, dev := range disks {
		rep := smartDiskReport{Device: dev}
		started := time.Now()
		out, err := runSmartctl(ctx, dev)
		ended := time.Now()
		if err != nil {
			rep.Skipped = true
			rep.Reason = err.Error()
			reports = append(reports, rep)
			d.Info("SMART: " + dev + " skipped (" + err.Error() + ")")
			subs = append(subs, subStepFromSMART(dev, rep, started, ended))
			continue
		}
		rep.Raw = out

		passed, ok := smartPassed(out)
		switch {
		case !ok:
			// Parseable JSON without a verdict: the device is skipped and
			// must NOT be counted as usable, otherwise the summary claims
			// it was "SMART-reporting" and the all-skipped path below can
			// never trigger for such devices.
			rep.Skipped = true
			rep.Reason = noStatusReason
			d.Info("SMART: " + dev + " skipped (" + noStatusReason + ")")
		case passed:
			usable++
			rep.Passed = true
			d.Info(fmt.Sprintf("SMART: %s PASSED", dev))
		default:
			usable++
			failed++
			d.Error(fmt.Sprintf("SMART: %s reports FAILED", dev))
		}
		reports = append(reports, rep)
		subs = append(subs, subStepFromSMART(dev, rep, started, ended))
	}

	extras := map[string]any{
		"disks":   reports,
		"tested":  usable,
		"failing": failed,
	}
	if failed > 0 {
		return Outcome{
			Passed:   false,
			Message:  fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
			Summary:  fmt.Sprintf("%d/%d failing", failed, usable),
			Extras:   extras,
			SubSteps: subs,
		}
	}

	summary := fmt.Sprintf("%d disks, %d SMART-reporting, all PASSED", len(disks), usable)
	if usable == 0 {
		// Every disk was skipped, so the stage as a whole is a skip.
		summary = "skipped (no smartctl data on any disk)"
		extras["skipped"] = true
	}
	return Outcome{Passed: true, Summary: summary, Extras: extras, SubSteps: subs}
}

// smartDiskReport is the per-disk probe result. Lifted to package scope
// so subStepFromSMART can accept it by value.
type smartDiskReport struct {
	Device  string         `json:"device"`
	Passed  bool           `json:"passed"`
	Skipped bool           `json:"skipped,omitempty"`
	Reason  string         `json:"reason,omitempty"`
	Raw     map[string]any `json:"raw,omitempty"`
}

// subStepFromSMART builds a per-disk sub-step row from the in-flight
// report. "skipped" takes precedence over passed so virtio-blk etc.
// render as skipped rather than failed in the UI.
func subStepFromSMART(dev string, rep smartDiskReport, started, ended time.Time) SubStepReport { summary, _ := json.Marshal(map[string]any{ "device": rep.Device, "reason": rep.Reason, "skipped": rep.Skipped, }) return SubStepReport{ Name: fmt.Sprintf("smartctl %s", dev), Passed: rep.Passed || rep.Skipped, Skipped: rep.Skipped, StartedAt: started, CompletedAt: ended, SummaryJSON: summary, } } func listBlockDisks() ([]string, error) { entries, err := os.ReadDir("/sys/class/block") if err != nil { return nil, err } var out []string for _, e := range entries { name := e.Name() if !isRealBlockDisk(name) { continue } out = append(out, "/dev/"+name) } return out, nil } func isRealBlockDisk(name string) bool { if strings.HasPrefix(name, "loop") || strings.HasPrefix(name, "ram") || strings.HasPrefix(name, "zram") || strings.HasPrefix(name, "dm-") { return false } partPath := filepath.Join("/sys/class/block", name, "partition") if _, err := os.Stat(partPath); err == nil { return false } return true } // runSmartctl invokes `smartctl -aj ` and returns the parsed JSON. // Exit code 4 means smartctl found no device info (e.g. virtio), which // we surface as a skip rather than a failure. func runSmartctl(ctx context.Context, dev string) (map[string]any, error) { cmd := exec.CommandContext(ctx, "smartctl", "-aj", dev) out, err := cmd.Output() if len(out) == 0 { if err != nil { return nil, fmt.Errorf("smartctl: %w", err) } return nil, fmt.Errorf("empty smartctl output") } var parsed map[string]any if jerr := json.Unmarshal(out, &parsed); jerr != nil { return nil, fmt.Errorf("parse smartctl output: %w", jerr) } // Even with a non-zero exit code, if we got valid JSON with // smart_status, trust the structured result. return parsed, nil } // smartPassed extracts smart_status.passed from a smartctl --json blob. // Returns (passed, present) so callers can distinguish "passed=false" // from "attribute missing". 
func smartPassed(out map[string]any) (bool, bool) {
	// Both lookups must succeed with the expected type; anything else
	// (missing key, wrong type, nil map) counts as "verdict not present".
	if status, isObject := out["smart_status"].(map[string]any); isObject {
		if verdict, isBool := status["passed"].(bool); isBool {
			return verdict, true
		}
	}
	return false, false
}