e73e31af92
The live image was still carrying the Phase 2 package list, so SMART, CPUStress, and Network each hit a LookPath miss and returned pass-with-skip. A run that skipped every real check still ended in "completed" — nothing on the report said the image was broken. Add smartmontools, stress-ng, fio, iperf3, lshw, lm-sensors, e2fsprogs, and util-linux to mkosi.conf. Flip the three stages from skip-pass to fail when their binary is missing so any future packaging regression blocks the run instead of whispering past it. Legitimate "no hardware" skips (no GPU, no hwmon, no disks, non-destructive) are untouched. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
168 lines
4.9 KiB
Go
168 lines
4.9 KiB
Go
package tests
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// SMART runs smartctl -a on each block device the kernel exposes. We
|
|
// pass each device's result through smartctl --json output and key on:
|
|
//
|
|
// smart_status.passed -> overall-health PASSED
|
|
// ata_smart_attributes -> per-attribute raw + threshold (ATA only)
|
|
// nvme_smart_health_information_log -> NVMe health flags
|
|
//
|
|
// Missing smartctl / unsupported device (e.g. QEMU virtio-blk) just
|
|
// surfaces as a per-disk "skipped" entry; the stage only fails if at
|
|
// least one disk reports !passed.
|
|
func SMART(ctx context.Context, d Deps) Outcome {
|
|
// smartctl absence is a packaging defect, not a per-disk skip. The
|
|
// per-disk `err != nil` path below catches "this device doesn't
|
|
// support SMART" (virtio-blk, exit 4); pre-checking the binary up
|
|
// front keeps that skip legitimate and fails the stage loudly if
|
|
// the live image lost its smartmontools package.
|
|
if _, err := exec.LookPath("smartctl"); err != nil {
|
|
d.Error("SMART: smartctl not found — live image is missing required tool")
|
|
return Outcome{
|
|
Passed: false,
|
|
Message: "smartctl binary missing from live image",
|
|
Summary: "failed (smartctl missing)",
|
|
Extras: map[string]any{"reason": "smartctl_missing"},
|
|
}
|
|
}
|
|
|
|
disks, err := listBlockDisks()
|
|
if err != nil {
|
|
d.Warn("SMART: failed to enumerate /sys/class/block: " + err.Error())
|
|
return Outcome{Passed: true, Summary: "skipped (no block devices enumerable)", Extras: map[string]any{"skipped": true}}
|
|
}
|
|
if len(disks) == 0 {
|
|
d.Info("SMART: no physical disks found — skipping stage")
|
|
return Outcome{Passed: true, Summary: "skipped (no disks)", Extras: map[string]any{"skipped": true}}
|
|
}
|
|
|
|
type diskReport struct {
|
|
Device string `json:"device"`
|
|
Passed bool `json:"passed"`
|
|
Skipped bool `json:"skipped,omitempty"`
|
|
Reason string `json:"reason,omitempty"`
|
|
Raw map[string]any `json:"raw,omitempty"`
|
|
}
|
|
|
|
var reports []diskReport
|
|
failed := 0
|
|
usable := 0
|
|
for _, dev := range disks {
|
|
rep := diskReport{Device: dev}
|
|
out, err := runSmartctl(ctx, dev)
|
|
if err != nil {
|
|
rep.Skipped = true
|
|
rep.Reason = err.Error()
|
|
reports = append(reports, rep)
|
|
d.Info("SMART: " + dev + " skipped (" + err.Error() + ")")
|
|
continue
|
|
}
|
|
usable++
|
|
rep.Raw = out
|
|
if passed, ok := smartPassed(out); ok {
|
|
rep.Passed = passed
|
|
if !passed {
|
|
failed++
|
|
d.Error(fmt.Sprintf("SMART: %s reports FAILED", dev))
|
|
} else {
|
|
d.Info(fmt.Sprintf("SMART: %s PASSED", dev))
|
|
}
|
|
} else {
|
|
rep.Skipped = true
|
|
rep.Reason = "no smart_status in output"
|
|
}
|
|
reports = append(reports, rep)
|
|
}
|
|
|
|
extras := map[string]any{
|
|
"disks": reports,
|
|
"tested": usable,
|
|
"failing": failed,
|
|
}
|
|
if failed > 0 {
|
|
return Outcome{
|
|
Passed: false,
|
|
Message: fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
|
|
Summary: fmt.Sprintf("%d/%d failing", failed, usable),
|
|
Extras: extras,
|
|
}
|
|
}
|
|
summary := fmt.Sprintf("%d disks, %d SMART-reporting, all PASSED", len(disks), usable)
|
|
if usable == 0 {
|
|
summary = "skipped (no smartctl data on any disk)"
|
|
extras["skipped"] = true
|
|
}
|
|
return Outcome{Passed: true, Summary: summary, Extras: extras}
|
|
}
|
|
|
|
func listBlockDisks() ([]string, error) {
|
|
entries, err := os.ReadDir("/sys/class/block")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
var out []string
|
|
for _, e := range entries {
|
|
name := e.Name()
|
|
if !isRealBlockDisk(name) {
|
|
continue
|
|
}
|
|
out = append(out, "/dev/"+name)
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// isRealBlockDisk reports whether the /sys/class/block entry called name
// should be treated as a whole disk. Names starting with "loop", "ram",
// "zram" or "dm-" are rejected, as is any entry that has a "partition"
// file under /sys/class/block/<name>. Everything else is accepted.
func isRealBlockDisk(name string) bool {
	for _, excluded := range [...]string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, excluded) {
			return false
		}
	}

	// A successful stat of the "partition" file means this entry is a
	// partition; any stat error lets the entry through as a disk.
	_, statErr := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return statErr != nil
}
|
|
|
|
// runSmartctl executes `smartctl -aj <dev>` under ctx and returns the
// command's stdout decoded as a JSON object.
//
// The process exit status is deliberately not treated as fatal on its
// own: whenever stdout is non-empty and decodes as JSON, the decoded
// document is returned with a nil error and the caller decides what to
// make of it. An error is returned only when
//   - stdout is empty (wrapping the run error if there was one), or
//   - stdout cannot be decoded as JSON.
func runSmartctl(ctx context.Context, dev string) (map[string]any, error) {
	stdout, runErr := exec.CommandContext(ctx, "smartctl", "-aj", dev).Output()

	switch {
	case len(stdout) == 0 && runErr != nil:
		return nil, fmt.Errorf("smartctl: %w", runErr)
	case len(stdout) == 0:
		return nil, fmt.Errorf("empty smartctl output")
	}

	var report map[string]any
	if decodeErr := json.Unmarshal(stdout, &report); decodeErr != nil {
		return nil, fmt.Errorf("parse smartctl output: %w", decodeErr)
	}
	// runErr is intentionally dropped here: structured output wins over
	// a non-zero exit status.
	return report, nil
}
|
|
|
|
// smartPassed looks up smart_status.passed in a decoded smartctl JSON
// document. The first result is the verdict; the second reports whether
// a boolean verdict was actually present, letting callers tell
// "passed=false" apart from "no verdict in the document".
func smartPassed(out map[string]any) (bool, bool) {
	if section, isObject := out["smart_status"].(map[string]any); isObject {
		verdict, isBool := section["passed"].(bool)
		return verdict, isBool
	}
	return false, false
}
|