f79fe0f0db
Reshapes the detail page into a run-view: hybrid horizontal pipeline
+ expanded active-step pane with sub-steps, a per-step log pane with
line-numbered permalinks and client-side search, and a runs-history
sidebar that navigates via ?run=N. Default step is server-picked
(running → failed → Reporting) so the operator lands on the thing
that's moving.
Adds a sub_steps table + SSE topic (substep-{run}-{stage}-{ordinal})
so per-disk and per-pass work (SMART, CPUStress CPU/RAM, Storage,
GPU) is visible in the UI instead of buried in stage summary JSON.
Agent emits sub-step reports from existing per-iteration loops.
Dashboard tiles become a mini run-view with a 9-dot step strip so
the operator reads run health across the whole grid at a glance.
Register page gets the same card shell + button styling.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
196 lines
5.9 KiB
Go
196 lines
5.9 KiB
Go
package tests
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// SMART runs smartctl -a on each block device the kernel exposes. We
|
|
// pass each device's result through smartctl --json output and key on:
|
|
//
|
|
// smart_status.passed -> overall-health PASSED
|
|
// ata_smart_attributes -> per-attribute raw + threshold (ATA only)
|
|
// nvme_smart_health_information_log -> NVMe health flags
|
|
//
|
|
// Missing smartctl / unsupported device (e.g. QEMU virtio-blk) just
|
|
// surfaces as a per-disk "skipped" entry; the stage only fails if at
|
|
// least one disk reports !passed.
|
|
func SMART(ctx context.Context, d Deps) Outcome {
|
|
// smartctl absence is a packaging defect, not a per-disk skip. The
|
|
// per-disk `err != nil` path below catches "this device doesn't
|
|
// support SMART" (virtio-blk, exit 4); pre-checking the binary up
|
|
// front keeps that skip legitimate and fails the stage loudly if
|
|
// the live image lost its smartmontools package.
|
|
if _, err := exec.LookPath("smartctl"); err != nil {
|
|
d.Error("SMART: smartctl not found — live image is missing required tool")
|
|
return Outcome{
|
|
Passed: false,
|
|
Message: "smartctl binary missing from live image",
|
|
Summary: "failed (smartctl missing)",
|
|
Extras: map[string]any{"reason": "smartctl_missing"},
|
|
}
|
|
}
|
|
|
|
disks, err := listBlockDisks()
|
|
if err != nil {
|
|
d.Warn("SMART: failed to enumerate /sys/class/block: " + err.Error())
|
|
return Outcome{Passed: true, Summary: "skipped (no block devices enumerable)", Extras: map[string]any{"skipped": true}}
|
|
}
|
|
if len(disks) == 0 {
|
|
d.Info("SMART: no physical disks found — skipping stage")
|
|
return Outcome{Passed: true, Summary: "skipped (no disks)", Extras: map[string]any{"skipped": true}}
|
|
}
|
|
|
|
var reports []smartDiskReport
|
|
var subs []SubStepReport
|
|
failed := 0
|
|
usable := 0
|
|
for _, dev := range disks {
|
|
rep := smartDiskReport{Device: dev}
|
|
started := time.Now()
|
|
out, err := runSmartctl(ctx, dev)
|
|
ended := time.Now()
|
|
if err != nil {
|
|
rep.Skipped = true
|
|
rep.Reason = err.Error()
|
|
reports = append(reports, rep)
|
|
d.Info("SMART: " + dev + " skipped (" + err.Error() + ")")
|
|
subs = append(subs, subStepFromSMART(dev, rep, started, ended))
|
|
continue
|
|
}
|
|
usable++
|
|
rep.Raw = out
|
|
if passed, ok := smartPassed(out); ok {
|
|
rep.Passed = passed
|
|
if !passed {
|
|
failed++
|
|
d.Error(fmt.Sprintf("SMART: %s reports FAILED", dev))
|
|
} else {
|
|
d.Info(fmt.Sprintf("SMART: %s PASSED", dev))
|
|
}
|
|
} else {
|
|
rep.Skipped = true
|
|
rep.Reason = "no smart_status in output"
|
|
}
|
|
reports = append(reports, rep)
|
|
subs = append(subs, subStepFromSMART(dev, rep, started, ended))
|
|
}
|
|
|
|
extras := map[string]any{
|
|
"disks": reports,
|
|
"tested": usable,
|
|
"failing": failed,
|
|
}
|
|
if failed > 0 {
|
|
return Outcome{
|
|
Passed: false,
|
|
Message: fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
|
|
Summary: fmt.Sprintf("%d/%d failing", failed, usable),
|
|
Extras: extras,
|
|
SubSteps: subs,
|
|
}
|
|
}
|
|
summary := fmt.Sprintf("%d disks, %d SMART-reporting, all PASSED", len(disks), usable)
|
|
if usable == 0 {
|
|
summary = "skipped (no smartctl data on any disk)"
|
|
extras["skipped"] = true
|
|
}
|
|
return Outcome{Passed: true, Summary: summary, Extras: extras, SubSteps: subs}
|
|
}
|
|
|
|
// smartDiskReport captures the outcome of probing a single disk. It
// lives at package scope so that helpers such as subStepFromSMART can
// take it by value. Field order is significant: it determines the key
// order of the marshalled JSON.
type smartDiskReport struct {
	// Device is the device node that was probed, e.g. "/dev/sda".
	Device string `json:"device"`
	// Passed holds the smart_status.passed verdict; it stays false
	// for skipped disks.
	Passed bool `json:"passed"`
	// Skipped marks a disk for which no verdict could be obtained.
	Skipped bool `json:"skipped,omitempty"`
	// Reason explains why the disk was skipped.
	Reason string `json:"reason,omitempty"`
	// Raw is the parsed smartctl JSON document, when one was produced.
	Raw map[string]any `json:"raw,omitempty"`
}
|
|
|
|
// subStepFromSMART builds a per-disk sub-step row from the in-flight
|
|
// report. "skipped" takes precedence over passed so virtio-blk etc.
|
|
// render as skipped rather than failed in the UI.
|
|
func subStepFromSMART(dev string, rep smartDiskReport, started, ended time.Time) SubStepReport {
|
|
summary, _ := json.Marshal(map[string]any{
|
|
"device": rep.Device,
|
|
"reason": rep.Reason,
|
|
"skipped": rep.Skipped,
|
|
})
|
|
return SubStepReport{
|
|
Name: fmt.Sprintf("smartctl %s", dev),
|
|
Passed: rep.Passed || rep.Skipped,
|
|
Skipped: rep.Skipped,
|
|
StartedAt: started,
|
|
CompletedAt: ended,
|
|
SummaryJSON: summary,
|
|
}
|
|
}
|
|
|
|
func listBlockDisks() ([]string, error) {
|
|
entries, err := os.ReadDir("/sys/class/block")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
var out []string
|
|
for _, e := range entries {
|
|
name := e.Name()
|
|
if !isRealBlockDisk(name) {
|
|
continue
|
|
}
|
|
out = append(out, "/dev/"+name)
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// isRealBlockDisk reports whether the named /sys/class/block entry
// should be probed. Entries whose name starts with "loop", "ram",
// "zram" or "dm-" are rejected, as is any entry that has a "partition"
// file in its sysfs directory.
func isRealBlockDisk(name string) bool {
	for _, prefix := range [...]string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, prefix) {
			return false
		}
	}

	// Accept the entry only when its "partition" file cannot be
	// stat'ed.
	_, err := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return err != nil
}
|
|
|
|
// runSmartctl executes `smartctl -aj <dev>` and returns its standard
// output decoded as a JSON object.
//
// The command's exit status matters only when nothing was written to
// stdout: in that case the run error is wrapped and returned (or a
// plain "empty smartctl output" error when the run itself succeeded).
// Whenever stdout holds parseable JSON the exit status is ignored and
// the structured result is trusted; callers decide what a document
// without smart_status means.
func runSmartctl(ctx context.Context, dev string) (map[string]any, error) {
	stdout, runErr := exec.CommandContext(ctx, "smartctl", "-aj", dev).Output()

	switch {
	case len(stdout) == 0 && runErr != nil:
		return nil, fmt.Errorf("smartctl: %w", runErr)
	case len(stdout) == 0:
		return nil, fmt.Errorf("empty smartctl output")
	}

	var doc map[string]any
	if err := json.Unmarshal(stdout, &doc); err != nil {
		return nil, fmt.Errorf("parse smartctl output: %w", err)
	}
	return doc, nil
}
|
|
|
|
// smartPassed reads smart_status.passed out of a decoded smartctl JSON
// document. The second result tells whether the value was present as a
// boolean, which lets callers tell an explicit "passed: false" apart
// from a document that carries no verdict at all.
func smartPassed(out map[string]any) (bool, bool) {
	if status, isObject := out["smart_status"].(map[string]any); isObject {
		verdict, isBool := status["passed"].(bool)
		return verdict, isBool
	}
	return false, false
}
|