ui: GitHub-Actions-style detail page, sub-steps, mini-tile run-view

Reshapes the detail page into a run-view: hybrid horizontal pipeline + expanded active-step pane with sub-steps, a per-step log pane with line-numbered permalinks and client-side search, and a runs-history sidebar that navigates via ?run=N. Default step is server-picked (running → failed → Reporting) so the operator lands on the thing that's moving. Adds a sub_steps table + SSE topic (substep-{run}-{stage}-{ordinal}) so per-disk and per-pass work (SMART, CPUStress CPU/RAM, Storage, GPU) is visible in the UI instead of buried in stage summary JSON. Agent emits sub-step reports from existing per-iteration loops. Dashboard tiles become a mini run-view with a 9-dot step strip so the operator reads run health across the whole grid at a glance. Register page gets the same card shell + button styling. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 19:00:11 -04:00
parent 5c00edd7b6
commit f79fe0f0db
38 changed files with 3972 additions and 936 deletions
@@ -3,6 +3,7 @@ package tests
 import (
 	"bufio"
 	"context"
+	"encoding/json"
 	"fmt"
 	"io"
 	"os"
@@ -52,6 +53,7 @@ func CPUStress(ctx context.Context, d Deps) Outcome {

 	cores := runtime.NumCPU()
 	extras := map[string]any{"cores": cores}
+	var subs []SubStepReport

 	// Pass 1: CPU
 	cpu := runStressPass(ctx, d, "CPU", cpuPassDuration, []string{
@@ -62,12 +64,14 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
 		"--verify",
 	})
 	extras["cpu_pass"] = cpu
+	subs = append(subs, subStepFromPass("CPU pass", cpu))
 	if !cpu.Passed {
 		return Outcome{
-			Passed:  false,
-			Message: "CPU pass failed: " + cpu.Err,
-			Summary: fmt.Sprintf("CPU pass failed after %ds", cpu.ElapsedSecs),
-			Extras:  extras,
+			Passed:   false,
+			Message:  "CPU pass failed: " + cpu.Err,
+			Summary:  fmt.Sprintf("CPU pass failed after %ds", cpu.ElapsedSecs),
+			Extras:   extras,
+			SubSteps: subs,
 		}
 	}

@@ -77,10 +81,11 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
 	if err != nil {
 		d.Error("CPUStress: read MemAvailable: " + err.Error())
 		return Outcome{
-			Passed:  false,
-			Message: "read MemAvailable: " + err.Error(),
-			Summary: "failed (meminfo unreadable)",
-			Extras:  extras,
+			Passed:   false,
+			Message:  "read MemAvailable: " + err.Error(),
+			Summary:  "failed (meminfo unreadable)",
+			Extras:   extras,
+			SubSteps: subs,
 		}
 	}
 	cap := avail - memHeadroomBytes
@@ -92,10 +97,11 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
 			avail, memFloorBytes, memHeadroomBytes)
 		d.Error("CPUStress: " + msg)
 		return Outcome{
-			Passed:  false,
-			Message: msg,
-			Summary: "failed (insufficient free RAM for memory pass)",
-			Extras:  extras,
+			Passed:   false,
+			Message:  msg,
+			Summary:  "failed (insufficient free RAM for memory pass)",
+			Extras:   extras,
+			SubSteps: subs,
 		}
 	}
 	mem := runStressPass(ctx, d, "memory", memPassDuration, []string{
@@ -107,12 +113,14 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
 		"--verify",
 	})
 	extras["mem_pass"] = mem
+	subs = append(subs, subStepFromPass(fmt.Sprintf("Memory pass (cap %s)", humanBytes(cap)), mem))
 	if !mem.Passed {
 		return Outcome{
-			Passed:  false,
-			Message: "memory pass failed: " + mem.Err,
-			Summary: fmt.Sprintf("memory pass failed after %ds", mem.ElapsedSecs),
-			Extras:  extras,
+			Passed:   false,
+			Message:  "memory pass failed: " + mem.Err,
+			Summary:  fmt.Sprintf("memory pass failed after %ds", mem.ElapsedSecs),
+			Extras:   extras,
+			SubSteps: subs,
 		}
 	}

@@ -120,7 +128,26 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
 		Passed: true,
 		Summary: fmt.Sprintf("CPU+RAM PASSED (%d cores, %s cap)",
 			cores, humanBytes(cap)),
-		Extras: extras,
+		Extras:   extras,
+		SubSteps: subs,
+	}
+}
+
+// subStepFromPass projects a stressPass into a SubStepReport. CPUStress
+// calls it once per pass that actually ran, so the UI gets exactly one
+// row per executed pass — including a pass that failed.
+func subStepFromPass(name string, p stressPass) SubStepReport {
+	summary, _ := json.Marshal(map[string]any{ // Marshal error discarded; the values are two ints and a string
+		"elapsed_secs": p.ElapsedSecs,
+		"target_secs":  p.TargetSecs,
+		"err":          p.Err, // always emitted, even when empty (no omitempty in this map)
+	})
+	return SubStepReport{
+		Name:        name,
+		Passed:      p.Passed,
+		StartedAt:   p.StartedAt,
+		CompletedAt: p.CompletedAt,
+		SummaryJSON: summary,
 	}
 }

@@ -140,12 +167,16 @@ const (

 // stressPass is the per-pass result embedded in CPUStress's Extras.
 // Passed==true and Elapsed close to target is the only happy path.
+// StartedAt/CompletedAt are excluded from JSON (the summary already has
+// ElapsedSecs); subStepFromPass copies them into the SubStepReport row.
 type stressPass struct {
-	Passed      bool   `json:"passed"`
-	Err         string `json:"err,omitempty"`
-	ElapsedSecs int    `json:"elapsed_secs"`
-	TargetSecs  int    `json:"target_secs"`
-	OutputTail  string `json:"output_tail,omitempty"`
+	Passed      bool      `json:"passed"`                // pass verdict; CPUStress fails the stage when false
+	Err         string    `json:"err,omitempty"`         // failure detail; set at least when the stress-ng command errors
+	ElapsedSecs int       `json:"elapsed_secs"`          // wall-clock run time, rounded to whole seconds
+	TargetSecs  int       `json:"target_secs"`           // target pass duration, rounded to whole seconds
+	OutputTail  string    `json:"output_tail,omitempty"` // tailLines(output, 20) of the combined stdout+stderr
+	StartedAt   time.Time `json:"-"`                     // time.Now() taken just before the command runs
+	CompletedAt time.Time `json:"-"`                     // time.Now() taken just after the command returns
 }

 // runStressPass invokes stress-ng and validates both exit code and
@@ -159,12 +190,15 @@ func runStressPass(ctx context.Context, d Deps, label string, target time.Durati
 	cmd := exec.CommandContext(runCtx, "stress-ng", args...)
 	start := time.Now()
 	out, err := cmd.CombinedOutput()
-	elapsed := time.Since(start)
+	end := time.Now()
+	elapsed := end.Sub(start)

 	res := stressPass{
 		ElapsedSecs: int(elapsed.Round(time.Second).Seconds()),
 		TargetSecs:  int(target.Round(time.Second).Seconds()),
 		OutputTail:  tailLines(string(out), 20),
+		StartedAt:   start,
+		CompletedAt: end,
 	}
 	if err != nil {
 		res.Err = err.Error()
@@ -2,8 +2,11 @@ package tests

 import (
 	"context"
+	"encoding/json"
+	"fmt"
 	"os/exec"
 	"strings"
+	"time"
 )

 // GPU enumerates VGA / 3D PCI devices. No devices → skip cleanly (a
@@ -11,7 +14,9 @@ import (
 // stress). Devices present → try nvidia-smi for NVIDIA cards, else
 // accept PCI presence.
 func GPU(ctx context.Context, d Deps) Outcome {
+	pciStart := time.Now()
 	devices := listGPUPCI(ctx)
+	pciEnd := time.Now()
 	if len(devices) == 0 {
 		d.Info("GPU: no VGA/3D PCI devices found — skipping stage")
 		return Outcome{
@@ -22,7 +27,9 @@ func GPU(ctx context.Context, d Deps) Outcome {
 	}
 	d.Info("GPU: found " + joinDevices(devices))

+	nvStart := time.Now()
 	nvidia := nvidiaSmiList(ctx)
+	nvEnd := time.Now()
 	extras := map[string]any{
 		"pci_devices": devices,
 		"skipped":     false,
@@ -31,10 +38,39 @@ func GPU(ctx context.Context, d Deps) Outcome {
 		extras["nvidia"] = nvidia
 		d.Info("GPU: nvidia-smi reports: " + strings.Join(nvidia, ", "))
 	}
+
+	// Sub-step rows: one per enumerated PCI device, plus (optionally) one
+	// per NVIDIA card when nvidia-smi sees anything. PCI enumeration runs
+	// once for all devices — we bracket that single invocation by
+	// pciStart/pciEnd and attribute the window to each device row so the
+	// UI can still slice the log per row by time.
+	var subs []SubStepReport
+	for i, dev := range devices {
+		summary, _ := json.Marshal(map[string]any{"pci": dev, "ordinal": i})
+		subs = append(subs, SubStepReport{
+			Name:        fmt.Sprintf("pci #%d", i),
+			Passed:      true,
+			StartedAt:   pciStart,
+			CompletedAt: pciEnd,
+			SummaryJSON: summary,
+		})
+	}
+	for i, line := range nvidia {
+		summary, _ := json.Marshal(map[string]any{"nvidia_smi": line})
+		subs = append(subs, SubStepReport{
+			Name:        fmt.Sprintf("nvidia #%d", i),
+			Passed:      true,
+			StartedAt:   nvStart,
+			CompletedAt: nvEnd,
+			SummaryJSON: summary,
+		})
+	}
+
 	return Outcome{
-		Passed:  true,
-		Summary: formatCount(len(devices), "GPU present"),
-		Extras:  extras,
+		Passed:   true,
+		Summary:  formatCount(len(devices), "GPU present"),
+		Extras:   extras,
+		SubSteps: subs,
 	}
 }

@@ -8,6 +8,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"strings"
+	"time"
 )

 // SMART runs smartctl -a on each block device the kernel exposes. We
@@ -46,25 +47,21 @@ func SMART(ctx context.Context, d Deps) Outcome {
 		return Outcome{Passed: true, Summary: "skipped (no disks)", Extras: map[string]any{"skipped": true}}
 	}

-	type diskReport struct {
-		Device  string         `json:"device"`
-		Passed  bool           `json:"passed"`
-		Skipped bool           `json:"skipped,omitempty"`
-		Reason  string         `json:"reason,omitempty"`
-		Raw     map[string]any `json:"raw,omitempty"`
-	}
-
-	var reports []diskReport
+	var reports []smartDiskReport
+	var subs []SubStepReport
 	failed := 0
 	usable := 0
 	for _, dev := range disks {
-		rep := diskReport{Device: dev}
+		rep := smartDiskReport{Device: dev}
+		started := time.Now()
 		out, err := runSmartctl(ctx, dev)
+		ended := time.Now()
 		if err != nil {
 			rep.Skipped = true
 			rep.Reason = err.Error()
 			reports = append(reports, rep)
 			d.Info("SMART: " + dev + " skipped (" + err.Error() + ")")
+			subs = append(subs, subStepFromSMART(dev, rep, started, ended))
 			continue
 		}
 		usable++
@@ -82,6 +79,7 @@ func SMART(ctx context.Context, d Deps) Outcome {
 			rep.Reason = "no smart_status in output"
 		}
 		reports = append(reports, rep)
+		subs = append(subs, subStepFromSMART(dev, rep, started, ended))
 	}

 	extras := map[string]any{
@@ -91,10 +89,11 @@ func SMART(ctx context.Context, d Deps) Outcome {
 	}
 	if failed > 0 {
 		return Outcome{
-			Passed:  false,
-			Message: fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
-			Summary: fmt.Sprintf("%d/%d failing", failed, usable),
-			Extras:  extras,
+			Passed:   false,
+			Message:  fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
+			Summary:  fmt.Sprintf("%d/%d failing", failed, usable),
+			Extras:   extras,
+			SubSteps: subs,
 		}
 	}
 	summary := fmt.Sprintf("%d disks, %d SMART-reporting, all PASSED", len(disks), usable)
@@ -102,7 +101,36 @@ func SMART(ctx context.Context, d Deps) Outcome {
 		summary = "skipped (no smartctl data on any disk)"
 		extras["skipped"] = true
 	}
-	return Outcome{Passed: true, Summary: summary, Extras: extras}
+	return Outcome{Passed: true, Summary: summary, Extras: extras, SubSteps: subs}
+}
+
+// smartDiskReport is the per-disk probe result. Lifted to package scope
+// so subStepFromSMART can accept it by value.
+type smartDiskReport struct {
+	Device  string         `json:"device"`            // device identifier that was passed to runSmartctl
+	Passed  bool           `json:"passed"`            // SMART verdict — presumably from smart_status; confirm in SMART()
+	Skipped bool           `json:"skipped,omitempty"` // set (at least) when runSmartctl returns an error
+	Reason  string         `json:"reason,omitempty"`  // why the disk was skipped or could not be judged
+	Raw     map[string]any `json:"raw,omitempty"`     // NOTE(review): presumably the decoded smartctl output — confirm
+}
+
+// subStepFromSMART builds the sub-step row for one disk; started/ended
+// bracket its runSmartctl call. A skipped disk (virtio-blk etc.) is sent
+// as Passed+Skipped so the UI renders it as skipped rather than failed.
+func subStepFromSMART(dev string, rep smartDiskReport, started, ended time.Time) SubStepReport {
+	summary, _ := json.Marshal(map[string]any{ // Marshal error discarded; the values are two strings and a bool
+		"device":  rep.Device,
+		"reason":  rep.Reason,
+		"skipped": rep.Skipped,
+	})
+	return SubStepReport{
+		Name:        fmt.Sprintf("smartctl %s", dev),
+		Passed:      rep.Passed || rep.Skipped, // a skipped disk never counts as a failure
+		Skipped:     rep.Skipped,
+		StartedAt:   started,
+		CompletedAt: ended,
+		SummaryJSON: summary,
+	}
 }

 func listBlockDisks() ([]string, error) {
@@ -16,11 +16,30 @@ import (
 //   - Message is only used on failure; the UI displays it in the log.
 //   - Extras is merged into the posted summary so stages can add
 //     their own shape (e.g. Storage returns per-disk probe results).
+//   - SubSteps carries agent-authored sub-step rows (CPU/Memory passes,
+//     per-disk SMART, per-device GPU, …). Empty for stages with no
+//     natural breakdown; persisted verbatim by the /result handler.
 type Outcome struct {
-	Passed  bool
-	Message string
-	Summary string         // short human-readable one-liner
-	Extras  map[string]any // merged into posted summary JSON
+	Passed   bool             // overall stage verdict
+	Message  string           // failure detail; only used when the stage failed
+	Summary  string           // short human-readable one-liner
+	Extras   map[string]any   // merged into posted summary JSON
+	SubSteps []SubStepReport  // agent-authored sub-step rows; slice order defines their ordinals
+}
+
+// SubStepReport is one entry a stage contributes to its sub-step list.
+// It has no ordinal field: ordinals are assigned from the order of
+// entries in Outcome.SubSteps, so stages just append in sequence. State
+// is derived from Passed/Skipped the same way Outcome is: Skipped wins
+// if set, else Passed selects passed vs. failed. StartedAt/CompletedAt
+// are required so the UI can order rows and slice the stage log by time.
+type SubStepReport struct {
+	Name        string          // row label, e.g. "CPU pass" or "smartctl <dev>"
+	Passed      bool            // false marks the row failed unless Skipped is set
+	Skipped     bool            // takes precedence over Passed when deriving state
+	StartedAt   time.Time       // when the sub-step's work began
+	CompletedAt time.Time       // when the sub-step's work finished
+	SummaryJSON json.RawMessage // pre-marshaled JSON detail for the row; shape varies per stage
 }

 // MarshalSummary builds the summary JSON body POSTed to /result.
@@ -91,12 +91,35 @@ func Storage(ctx context.Context, d Deps) Outcome {

 	// Per target: short badblocks write sample + fio random-read/write.
 	var samples []Sample
+	var subs []SubStepReport
 	perDisk := map[string]any{}
 	for _, t := range targets {
 		d.Info("Storage: running badblocks write sample on " + t.Device)
+		bbStart := time.Now()
 		bb := runBadblocks(ctx, t.Device)
+		bbEnd := time.Now()
+		bbSummary, _ := json.Marshal(bb)
+		subs = append(subs, SubStepReport{
+			Name:        fmt.Sprintf("badblocks %s", t.Device),
+			Passed:      bb.OK,
+			StartedAt:   bbStart,
+			CompletedAt: bbEnd,
+			SummaryJSON: bbSummary,
+		})
+
 		d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
+		fioStart := time.Now()
 		fr := runFio(ctx, t.Device)
+		fioEnd := time.Now()
+		fioSummary, _ := json.Marshal(fr)
+		subs = append(subs, SubStepReport{
+			Name:        fmt.Sprintf("fio %s", t.Device),
+			Passed:      fr.Error == "",
+			StartedAt:   fioStart,
+			CompletedAt: fioEnd,
+			SummaryJSON: fioSummary,
+		})
+
 		perDisk[t.Device] = map[string]any{
 			"badblocks": bb,
 			"fio":       fr,
@@ -107,10 +130,11 @@ func Storage(ctx context.Context, d Deps) Outcome {
 		)
 		if !bb.OK {
 			return Outcome{
-				Passed:  false,
-				Message: "badblocks found errors on " + t.Device,
-				Summary: "badblocks failed on " + t.Device,
-				Extras:  map[string]any{"per_disk": perDisk, "wipe_probe": probes},
+				Passed:   false,
+				Message:  "badblocks found errors on " + t.Device,
+				Summary:  "badblocks failed on " + t.Device,
+				Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes},
+				SubSteps: subs,
 			}
 		}
 	}
@@ -120,9 +144,10 @@ func Storage(ctx context.Context, d Deps) Outcome {

 	d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets)))
 	return Outcome{
-		Passed:  true,
-		Summary: fmt.Sprintf("%d disks passed", len(targets)),
-		Extras:  map[string]any{"per_disk": perDisk, "wipe_probe": probes},
+		Passed:   true,
+		Summary:  fmt.Sprintf("%d disks passed", len(targets)),
+		Extras:   map[string]any{"per_disk": perDisk, "wipe_probe": probes},
+		SubSteps: subs,
 	}
 }