Files
josh f79fe0f0db
CI / Lint + build + test (push) Successful in 1m26s
Release / release (push) Successful in 6m47s
ui: GitHub-Actions-style detail page, sub-steps, mini-tile run-view
Reshapes the detail page into a run-view: hybrid horizontal pipeline
+ expanded active-step pane with sub-steps, a per-step log pane with
line-numbered permalinks and client-side search, and a runs-history
sidebar that navigates via ?run=N. Default step is server-picked
(running → failed → Reporting) so the operator lands on the thing
that's moving.

Adds a sub_steps table + SSE topic (substep-{run}-{stage}-{ordinal})
so per-disk and per-pass work (SMART, CPUStress CPU/RAM, Storage,
GPU) is visible in the UI instead of buried in stage summary JSON.
Agent emits sub-step reports from existing per-iteration loops.

Dashboard tiles become a mini run-view with a 9-dot step strip so
the operator reads run health across the whole grid at a glance.
Register page gets the same card shell + button styling.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 19:00:11 -04:00

196 lines
5.9 KiB
Go

package tests
import (
"context"
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
)
// SMART probes every physical block device with `smartctl -aj` and
// derives the stage outcome from the smart_status.passed flag in each
// device's JSON report.
//
// Stage outcomes:
//   - smartctl is not on PATH: the stage fails (packaging defect).
//   - /sys/class/block cannot be read, or no disks are found: the stage
//     passes and is marked skipped.
//   - at least one disk reports smart_status.passed == false: the stage
//     fails.
//   - otherwise the stage passes; when no disk produced a smart_status
//     verdict at all it is additionally marked skipped.
//
// A disk whose probe errors out, or whose output carries no
// smart_status (e.g. QEMU virtio-blk), is recorded as a per-disk
// "skipped" entry and does not count toward the "tested" total.
func SMART(ctx context.Context, d Deps) Outcome {
	// smartctl absence is a packaging defect, not a per-disk skip.
	// Checking for the binary up front keeps the per-disk skip paths
	// below limited to "this device gave us no SMART verdict" and fails
	// the stage loudly if the live image lost its smartmontools package.
	if _, err := exec.LookPath("smartctl"); err != nil {
		d.Error("SMART: smartctl not found — live image is missing required tool")
		return Outcome{
			Passed:  false,
			Message: "smartctl binary missing from live image",
			Summary: "failed (smartctl missing)",
			Extras:  map[string]any{"reason": "smartctl_missing"},
		}
	}

	disks, err := listBlockDisks()
	if err != nil {
		d.Warn("SMART: failed to enumerate /sys/class/block: " + err.Error())
		return Outcome{Passed: true, Summary: "skipped (no block devices enumerable)", Extras: map[string]any{"skipped": true}}
	}
	if len(disks) == 0 {
		d.Info("SMART: no physical disks found — skipping stage")
		return Outcome{Passed: true, Summary: "skipped (no disks)", Extras: map[string]any{"skipped": true}}
	}

	reports := make([]smartDiskReport, 0, len(disks))
	subs := make([]SubStepReport, 0, len(disks))
	failed := 0 // disks whose smart_status.passed is false
	usable := 0 // disks that produced a smart_status verdict at all
	for _, dev := range disks {
		rep := smartDiskReport{Device: dev}
		started := time.Now()
		out, err := runSmartctl(ctx, dev)
		ended := time.Now()

		var passed, hasVerdict bool
		if err == nil {
			rep.Raw = out
			passed, hasVerdict = smartPassed(out)
		}

		switch {
		case err != nil:
			rep.Skipped = true
			rep.Reason = err.Error()
			d.Info("SMART: " + dev + " skipped (" + rep.Reason + ")")
		case !hasVerdict:
			// Parsable output without smart_status is a skip, not a
			// tested disk: counting it as usable would let the stage
			// claim "all PASSED" for devices that never reported.
			rep.Skipped = true
			rep.Reason = "no smart_status in output"
			d.Info("SMART: " + dev + " skipped (" + rep.Reason + ")")
		case !passed:
			usable++
			failed++
			d.Error(fmt.Sprintf("SMART: %s reports FAILED", dev))
		default:
			usable++
			rep.Passed = true
			d.Info(fmt.Sprintf("SMART: %s PASSED", dev))
		}

		reports = append(reports, rep)
		subs = append(subs, subStepFromSMART(dev, rep, started, ended))
	}

	extras := map[string]any{
		"disks":   reports,
		"tested":  usable,
		"failing": failed,
	}
	if failed > 0 {
		return Outcome{
			Passed:   false,
			Message:  fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
			Summary:  fmt.Sprintf("%d/%d failing", failed, usable),
			Extras:   extras,
			SubSteps: subs,
		}
	}

	summary := fmt.Sprintf("%d disks, %d SMART-reporting, all PASSED", len(disks), usable)
	if usable == 0 {
		summary = "skipped (no smartctl data on any disk)"
		extras["skipped"] = true
	}
	return Outcome{Passed: true, Summary: summary, Extras: extras, SubSteps: subs}
}
// smartDiskReport is the per-disk probe result. Lifted to package scope
// so subStepFromSMART can accept it by value. It is also embedded in the
// stage Extras under "disks", so the JSON tags (and field order) shape
// the reported payload.
type smartDiskReport struct {
// Device is the /dev path that was probed.
Device string `json:"device"`
// Passed mirrors smart_status.passed; it stays false for skipped disks.
Passed bool `json:"passed"`
// Skipped is set when the probe errored or returned no smart_status.
Skipped bool `json:"skipped,omitempty"`
// Reason explains a skip: the probe error text, or a fixed note when
// smart_status was absent from the output.
Reason string `json:"reason,omitempty"`
// Raw is the parsed smartctl JSON document, when one was obtained.
Raw map[string]any `json:"raw,omitempty"`
}
// subStepFromSMART converts one disk's probe report into a sub-step row
// named "smartctl <dev>", stamped with the probe's start and end times.
//
// A skipped report is also flagged Passed, so a device that could not be
// probed (virtio-blk etc.) is never presented as a failure. The summary
// JSON carries the device, the skip flag and the skip reason.
func subStepFromSMART(dev string, rep smartDiskReport, started, ended time.Time) SubStepReport {
	payload := map[string]any{
		"skipped": rep.Skipped,
		"reason":  rep.Reason,
		"device":  rep.Device,
	}
	// A map holding only strings and a bool always marshals, so the
	// error is deliberately discarded.
	encoded, _ := json.Marshal(payload)

	var step SubStepReport
	step.Name = fmt.Sprintf("smartctl %s", dev)
	step.Skipped = rep.Skipped
	step.Passed = rep.Skipped || rep.Passed
	step.StartedAt = started
	step.CompletedAt = ended
	step.SummaryJSON = encoded
	return step
}
// listBlockDisks returns the /dev path of every entry in /sys/class/block
// that isRealBlockDisk accepts. It returns the os.ReadDir error unchanged
// when the sysfs directory cannot be read, and a nil slice when nothing
// qualifies.
func listBlockDisks() ([]string, error) {
	dirents, err := os.ReadDir("/sys/class/block")
	if err != nil {
		return nil, err
	}

	var devices []string
	for i := range dirents {
		if n := dirents[i].Name(); isRealBlockDisk(n) {
			devices = append(devices, "/dev/"+n)
		}
	}
	return devices, nil
}
// isRealBlockDisk reports whether the /sys/class/block entry called name
// should be probed as a whole disk. Loop, ram, zram and device-mapper
// nodes are rejected by name prefix; any entry that has a "partition"
// file under /sys/class/block/<name> is rejected as a partition.
func isRealBlockDisk(name string) bool {
	for _, virtual := range []string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, virtual) {
			return false
		}
	}

	// Only a successful stat marks the entry as a partition; any stat
	// error (including "not exist") leaves it classified as a disk.
	_, err := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return err != nil
}
// runSmartctl invokes `smartctl -aj <dev>` and returns its stdout parsed
// as a JSON object.
//
// The process exit status is not inspected when stdout is non-empty: a
// document that parses is returned with a nil error even if smartctl
// exited non-zero, leaving it to the caller to look for smart_status.
// An error is returned when stdout is empty (wrapping the run error if
// there was one) or when stdout does not parse.
func runSmartctl(ctx context.Context, dev string) (map[string]any, error) {
	raw, runErr := exec.CommandContext(ctx, "smartctl", "-aj", dev).Output()

	switch {
	case len(raw) == 0 && runErr != nil:
		return nil, fmt.Errorf("smartctl: %w", runErr)
	case len(raw) == 0:
		return nil, fmt.Errorf("empty smartctl output")
	}

	var doc map[string]any
	if err := json.Unmarshal(raw, &doc); err != nil {
		return nil, fmt.Errorf("parse smartctl output: %w", err)
	}
	// runErr is intentionally ignored from here on: structured output
	// wins over the exit status.
	return doc, nil
}
// smartPassed looks up smart_status.passed in a parsed smartctl --json
// document. The second result is true only when smart_status is an
// object and its "passed" member is a boolean; in every other case both
// results are false, so callers can tell "passed=false" apart from
// "no verdict present".
func smartPassed(out map[string]any) (bool, bool) {
	if section, isObject := out["smart_status"].(map[string]any); isObject {
		if verdict, isBool := section["passed"].(bool); isBool {
			return verdict, true
		}
	}
	return false, false
}