ui: GitHub-Actions-style detail page, sub-steps, mini-tile run-view
CI / Lint + build + test (push) Successful in 1m26s
Release / release (push) Successful in 6m47s

Reshapes the detail page into a run-view: hybrid horizontal pipeline
+ expanded active-step pane with sub-steps, a per-step log pane with
line-numbered permalinks and client-side search, and a runs-history
sidebar that navigates via ?run=N. Default step is server-picked
(running → failed → Reporting) so the operator lands on the thing
that's moving.

Adds a sub_steps table + SSE topic (substep-{run}-{stage}-{ordinal})
so per-disk and per-pass work (SMART, CPUStress CPU/RAM, Storage,
GPU) is visible in the UI instead of buried in stage summary JSON.
Agent emits sub-step reports from existing per-iteration loops.

Dashboard tiles become a mini run-view with a 9-dot step strip so
the operator reads run health across the whole grid at a glance.
Register page gets the same card shell + button styling.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 19:00:11 -04:00
parent 5c00edd7b6
commit f79fe0f0db
38 changed files with 3972 additions and 936 deletions
+13
View File
@@ -150,6 +150,19 @@ type ResultResponse struct {
NextState string `json:"next_state"`
}
// SubStepReport is the JSON payload for one granular sub-step (a CPU or
// memory pass, a per-disk SMART probe, a per-device GPU row, …) that the
// agent sends inside its /result POST. There is deliberately no ordinal
// field: the server numbers entries by their position in the slice, so
// the agent never sets one.
type SubStepReport struct {
	// Name is the label of the sub-step row.
	Name string `json:"name"`
	// Passed is always serialized, including when it is false.
	Passed bool `json:"passed"`
	// Skipped is dropped from the payload when false.
	Skipped bool `json:"skipped,omitempty"`
	// StartedAt and CompletedAt are timestamp strings; an empty string is
	// dropped from the payload.
	StartedAt   string `json:"started_at,omitempty"`
	CompletedAt string `json:"completed_at,omitempty"`
	// Summary is opaque JSON that the UI may render later; it is dropped
	// from the payload when empty.
	Summary json.RawMessage `json:"summary,omitempty"`
}
type HoldResponse struct {
AuthorizedKey string `json:"authorized_key"`
RunID int64 `json:"run_id"`
+19
View File
@@ -276,6 +276,25 @@ func postResult(ctx context.Context, c *Client, stage string, s stageOutcome) (*
if s.Inventory != nil {
body["inventory"] = s.Inventory
}
if len(s.Outcome.SubSteps) > 0 {
wire := make([]SubStepReport, 0, len(s.Outcome.SubSteps))
for _, ss := range s.Outcome.SubSteps {
w := SubStepReport{
Name: ss.Name,
Passed: ss.Passed,
Skipped: ss.Skipped,
Summary: ss.SummaryJSON,
}
if !ss.StartedAt.IsZero() {
w.StartedAt = ss.StartedAt.UTC().Format(time.RFC3339Nano)
}
if !ss.CompletedAt.IsZero() {
w.CompletedAt = ss.CompletedAt.UTC().Format(time.RFC3339Nano)
}
wire = append(wire, w)
}
body["sub_steps"] = wire
}
return c.Result(ctx, body)
}
+57 -23
View File
@@ -3,6 +3,7 @@ package tests
import (
"bufio"
"context"
"encoding/json"
"fmt"
"io"
"os"
@@ -52,6 +53,7 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
cores := runtime.NumCPU()
extras := map[string]any{"cores": cores}
var subs []SubStepReport
// Pass 1: CPU
cpu := runStressPass(ctx, d, "CPU", cpuPassDuration, []string{
@@ -62,12 +64,14 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
"--verify",
})
extras["cpu_pass"] = cpu
subs = append(subs, subStepFromPass("CPU pass", cpu))
if !cpu.Passed {
return Outcome{
Passed: false,
Message: "CPU pass failed: " + cpu.Err,
Summary: fmt.Sprintf("CPU pass failed after %ds", cpu.ElapsedSecs),
Extras: extras,
Passed: false,
Message: "CPU pass failed: " + cpu.Err,
Summary: fmt.Sprintf("CPU pass failed after %ds", cpu.ElapsedSecs),
Extras: extras,
SubSteps: subs,
}
}
@@ -77,10 +81,11 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
if err != nil {
d.Error("CPUStress: read MemAvailable: " + err.Error())
return Outcome{
Passed: false,
Message: "read MemAvailable: " + err.Error(),
Summary: "failed (meminfo unreadable)",
Extras: extras,
Passed: false,
Message: "read MemAvailable: " + err.Error(),
Summary: "failed (meminfo unreadable)",
Extras: extras,
SubSteps: subs,
}
}
cap := avail - memHeadroomBytes
@@ -92,10 +97,11 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
avail, memFloorBytes, memHeadroomBytes)
d.Error("CPUStress: " + msg)
return Outcome{
Passed: false,
Message: msg,
Summary: "failed (insufficient free RAM for memory pass)",
Extras: extras,
Passed: false,
Message: msg,
Summary: "failed (insufficient free RAM for memory pass)",
Extras: extras,
SubSteps: subs,
}
}
mem := runStressPass(ctx, d, "memory", memPassDuration, []string{
@@ -107,12 +113,14 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
"--verify",
})
extras["mem_pass"] = mem
subs = append(subs, subStepFromPass(fmt.Sprintf("Memory pass (cap %s)", humanBytes(cap)), mem))
if !mem.Passed {
return Outcome{
Passed: false,
Message: "memory pass failed: " + mem.Err,
Summary: fmt.Sprintf("memory pass failed after %ds", mem.ElapsedSecs),
Extras: extras,
Passed: false,
Message: "memory pass failed: " + mem.Err,
Summary: fmt.Sprintf("memory pass failed after %ds", mem.ElapsedSecs),
Extras: extras,
SubSteps: subs,
}
}
@@ -120,7 +128,26 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
Passed: true,
Summary: fmt.Sprintf("CPU+RAM PASSED (%d cores, %s cap)",
cores, humanBytes(cap)),
Extras: extras,
Extras: extras,
SubSteps: subs,
}
}
// subStepFromPass converts the result of one stress pass into the sub-step
// row reported for it. The row carries the pass's verdict and its start and
// completion times, plus a small JSON summary holding the elapsed seconds,
// the target seconds and the error text. One helper serves every pass so
// each pass maps to exactly one row, whether it passed or failed.
func subStepFromPass(name string, p stressPass) SubStepReport {
	details := map[string]any{
		"elapsed_secs": p.ElapsedSecs,
		"target_secs":  p.TargetSecs,
		"err":          p.Err,
	}
	// The map only holds plain scalar fields copied from the pass, so the
	// marshal error is intentionally discarded.
	encoded, _ := json.Marshal(details)

	var row SubStepReport
	row.Name = name
	row.Passed = p.Passed
	row.StartedAt = p.StartedAt
	row.CompletedAt = p.CompletedAt
	row.SummaryJSON = encoded
	return row
}
@@ -140,12 +167,16 @@ const (
// stressPass is the per-pass result embedded in CPUStress's Extras.
// Passed==true and Elapsed close to target is the only happy path.
// StartedAt/CompletedAt are not serialized (the summary already has
// ElapsedSecs) but are used by the caller to emit SubStepReport rows.
type stressPass struct {
Passed bool `json:"passed"`
Err string `json:"err,omitempty"`
ElapsedSecs int `json:"elapsed_secs"`
TargetSecs int `json:"target_secs"`
OutputTail string `json:"output_tail,omitempty"`
Passed bool `json:"passed"`
Err string `json:"err,omitempty"`
ElapsedSecs int `json:"elapsed_secs"`
TargetSecs int `json:"target_secs"`
OutputTail string `json:"output_tail,omitempty"`
StartedAt time.Time `json:"-"`
CompletedAt time.Time `json:"-"`
}
// runStressPass invokes stress-ng and validates both exit code and
@@ -159,12 +190,15 @@ func runStressPass(ctx context.Context, d Deps, label string, target time.Durati
cmd := exec.CommandContext(runCtx, "stress-ng", args...)
start := time.Now()
out, err := cmd.CombinedOutput()
elapsed := time.Since(start)
end := time.Now()
elapsed := end.Sub(start)
res := stressPass{
ElapsedSecs: int(elapsed.Round(time.Second).Seconds()),
TargetSecs: int(target.Round(time.Second).Seconds()),
OutputTail: tailLines(string(out), 20),
StartedAt: start,
CompletedAt: end,
}
if err != nil {
res.Err = err.Error()
+39 -3
View File
@@ -2,8 +2,11 @@ package tests
import (
"context"
"encoding/json"
"fmt"
"os/exec"
"strings"
"time"
)
// GPU enumerates VGA / 3D PCI devices. No devices → skip cleanly (a
@@ -11,7 +14,9 @@ import (
// stress). Devices present → try nvidia-smi for NVIDIA cards, else
// accept PCI presence.
func GPU(ctx context.Context, d Deps) Outcome {
pciStart := time.Now()
devices := listGPUPCI(ctx)
pciEnd := time.Now()
if len(devices) == 0 {
d.Info("GPU: no VGA/3D PCI devices found — skipping stage")
return Outcome{
@@ -22,7 +27,9 @@ func GPU(ctx context.Context, d Deps) Outcome {
}
d.Info("GPU: found " + joinDevices(devices))
nvStart := time.Now()
nvidia := nvidiaSmiList(ctx)
nvEnd := time.Now()
extras := map[string]any{
"pci_devices": devices,
"skipped": false,
@@ -31,10 +38,39 @@ func GPU(ctx context.Context, d Deps) Outcome {
extras["nvidia"] = nvidia
d.Info("GPU: nvidia-smi reports: " + strings.Join(nvidia, ", "))
}
// Sub-step rows: one per enumerated PCI device, plus (optionally) one
// per NVIDIA card when nvidia-smi sees anything. PCI enumeration runs
// once for all devices — we bracket that single invocation by
// pciStart/pciEnd and attribute the window to each device row so the
// UI can still slice the log per row by time.
var subs []SubStepReport
for i, dev := range devices {
summary, _ := json.Marshal(map[string]any{"pci": dev, "ordinal": i})
subs = append(subs, SubStepReport{
Name: fmt.Sprintf("pci #%d", i),
Passed: true,
StartedAt: pciStart,
CompletedAt: pciEnd,
SummaryJSON: summary,
})
}
for i, line := range nvidia {
summary, _ := json.Marshal(map[string]any{"nvidia_smi": line})
subs = append(subs, SubStepReport{
Name: fmt.Sprintf("nvidia #%d", i),
Passed: true,
StartedAt: nvStart,
CompletedAt: nvEnd,
SummaryJSON: summary,
})
}
return Outcome{
Passed: true,
Summary: formatCount(len(devices), "GPU present"),
Extras: extras,
Passed: true,
Summary: formatCount(len(devices), "GPU present"),
Extras: extras,
SubSteps: subs,
}
}
+43 -15
View File
@@ -8,6 +8,7 @@ import (
"os/exec"
"path/filepath"
"strings"
"time"
)
// SMART runs smartctl -a on each block device the kernel exposes. We
@@ -46,25 +47,21 @@ func SMART(ctx context.Context, d Deps) Outcome {
return Outcome{Passed: true, Summary: "skipped (no disks)", Extras: map[string]any{"skipped": true}}
}
type diskReport struct {
Device string `json:"device"`
Passed bool `json:"passed"`
Skipped bool `json:"skipped,omitempty"`
Reason string `json:"reason,omitempty"`
Raw map[string]any `json:"raw,omitempty"`
}
var reports []diskReport
var reports []smartDiskReport
var subs []SubStepReport
failed := 0
usable := 0
for _, dev := range disks {
rep := diskReport{Device: dev}
rep := smartDiskReport{Device: dev}
started := time.Now()
out, err := runSmartctl(ctx, dev)
ended := time.Now()
if err != nil {
rep.Skipped = true
rep.Reason = err.Error()
reports = append(reports, rep)
d.Info("SMART: " + dev + " skipped (" + err.Error() + ")")
subs = append(subs, subStepFromSMART(dev, rep, started, ended))
continue
}
usable++
@@ -82,6 +79,7 @@ func SMART(ctx context.Context, d Deps) Outcome {
rep.Reason = "no smart_status in output"
}
reports = append(reports, rep)
subs = append(subs, subStepFromSMART(dev, rep, started, ended))
}
extras := map[string]any{
@@ -91,10 +89,11 @@ func SMART(ctx context.Context, d Deps) Outcome {
}
if failed > 0 {
return Outcome{
Passed: false,
Message: fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
Summary: fmt.Sprintf("%d/%d failing", failed, usable),
Extras: extras,
Passed: false,
Message: fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
Summary: fmt.Sprintf("%d/%d failing", failed, usable),
Extras: extras,
SubSteps: subs,
}
}
summary := fmt.Sprintf("%d disks, %d SMART-reporting, all PASSED", len(disks), usable)
@@ -102,7 +101,36 @@ func SMART(ctx context.Context, d Deps) Outcome {
summary = "skipped (no smartctl data on any disk)"
extras["skipped"] = true
}
return Outcome{Passed: true, Summary: summary, Extras: extras}
return Outcome{Passed: true, Summary: summary, Extras: extras, SubSteps: subs}
}
// smartDiskReport holds the outcome of probing a single disk. It lives at
// package scope, rather than inside SMART, so that subStepFromSMART can
// take it as a by-value parameter.
type smartDiskReport struct {
	// Device names the disk this report describes.
	Device string `json:"device"`
	// Passed is always serialized, including when it is false.
	Passed bool `json:"passed"`
	// Skipped marks a disk that was not evaluated; omitted when false.
	Skipped bool `json:"skipped,omitempty"`
	// Reason explains a skip or a missing verdict; omitted when empty.
	Reason string `json:"reason,omitempty"`
	// Raw is free-form per-disk data, omitted when empty — presumably the
	// decoded smartctl output; confirm against SMART.
	Raw map[string]any `json:"raw,omitempty"`
}
// subStepFromSMART turns one disk's report into its sub-step row, named
// "smartctl <dev>" and bracketed by the started/ended probe times. A
// skipped disk is reported with Passed set in addition to Skipped, so
// disks that yield no SMART data (virtio-blk etc.) render as skipped
// rather than failed in the UI.
func subStepFromSMART(dev string, rep smartDiskReport, started, ended time.Time) SubStepReport {
	// The summary holds two strings and a bool, so the marshal error is
	// intentionally discarded.
	payload, _ := json.Marshal(map[string]any{
		"device":  rep.Device,
		"reason":  rep.Reason,
		"skipped": rep.Skipped,
	})

	// A skip counts as a non-failure.
	ok := rep.Passed
	if rep.Skipped {
		ok = true
	}

	row := SubStepReport{Name: fmt.Sprintf("smartctl %s", dev)}
	row.Passed = ok
	row.Skipped = rep.Skipped
	row.StartedAt = started
	row.CompletedAt = ended
	row.SummaryJSON = payload
	return row
}
func listBlockDisks() ([]string, error) {
+23 -4
View File
@@ -16,11 +16,30 @@ import (
// - Message is only used on failure; the UI displays it in the log.
// - Extras is merged into the posted summary so stages can add
// their own shape (e.g. Storage returns per-disk probe results).
// - SubSteps carries agent-authored sub-step rows (CPU/Memory passes,
// per-disk SMART, per-device GPU, …). Empty for stages with no
// natural breakdown; persisted verbatim by the /result handler.
type Outcome struct {
Passed bool
Message string
Summary string // short human-readable one-liner
Extras map[string]any // merged into posted summary JSON
Passed bool
Message string
Summary string // short human-readable one-liner
Extras map[string]any // merged into posted summary JSON
SubSteps []SubStepReport // agent-authored granular rows
}
// SubStepReport is a single row that a stage adds to its sub-step list.
//
// There is no ordinal field: rows are numbered by their position in the
// slice, so a stage never sets one by hand. A row's state is meant to be
// derived from Passed and Skipped: Skipped wins when set, otherwise
// Passed selects between passed and failed.
type SubStepReport struct {
	// Name is the label of the row.
	Name string
	// Passed and Skipped together determine the row's state.
	Passed  bool
	Skipped bool
	// StartedAt and CompletedAt bracket the work behind the row. Stages
	// are expected to fill both so the UI can order rows and slice the
	// stage log by time window.
	StartedAt   time.Time
	CompletedAt time.Time
	// SummaryJSON is pre-encoded JSON detail attached to the row.
	SummaryJSON json.RawMessage
}
// MarshalSummary builds the summary JSON body POSTed to /result.
+32 -7
View File
@@ -91,12 +91,35 @@ func Storage(ctx context.Context, d Deps) Outcome {
// Per target: short badblocks write sample + fio random-read/write.
var samples []Sample
var subs []SubStepReport
perDisk := map[string]any{}
for _, t := range targets {
d.Info("Storage: running badblocks write sample on " + t.Device)
bbStart := time.Now()
bb := runBadblocks(ctx, t.Device)
bbEnd := time.Now()
bbSummary, _ := json.Marshal(bb)
subs = append(subs, SubStepReport{
Name: fmt.Sprintf("badblocks %s", t.Device),
Passed: bb.OK,
StartedAt: bbStart,
CompletedAt: bbEnd,
SummaryJSON: bbSummary,
})
d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
fioStart := time.Now()
fr := runFio(ctx, t.Device)
fioEnd := time.Now()
fioSummary, _ := json.Marshal(fr)
subs = append(subs, SubStepReport{
Name: fmt.Sprintf("fio %s", t.Device),
Passed: fr.Error == "",
StartedAt: fioStart,
CompletedAt: fioEnd,
SummaryJSON: fioSummary,
})
perDisk[t.Device] = map[string]any{
"badblocks": bb,
"fio": fr,
@@ -107,10 +130,11 @@ func Storage(ctx context.Context, d Deps) Outcome {
)
if !bb.OK {
return Outcome{
Passed: false,
Message: "badblocks found errors on " + t.Device,
Summary: "badblocks failed on " + t.Device,
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes},
Passed: false,
Message: "badblocks found errors on " + t.Device,
Summary: "badblocks failed on " + t.Device,
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes},
SubSteps: subs,
}
}
}
@@ -120,9 +144,10 @@ func Storage(ctx context.Context, d Deps) Outcome {
d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets)))
return Outcome{
Passed: true,
Summary: fmt.Sprintf("%d disks passed", len(targets)),
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes},
Passed: true,
Summary: fmt.Sprintf("%d disks passed", len(targets)),
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes},
SubSteps: subs,
}
}