Files
Vetting/internal/web/templates/host_tile.templ
T
josh 62bddac110
CI / Lint + build + test (push) Successful in 1m38s
Release / release (push) Successful in 6m10s
feat(cancel): allow cancel from FailedHolding, reboot to local disk
A held run sits indefinitely at an SSH prompt waiting for operator
investigation. Previously the only exits were Override (re-enter the
failed stage) or leaving the host on forever — Cancel rejected any
terminal state, including FailedHolding, and there was no button in
the UI anyway.

Add a dedicated exit path:
  - statemachine: TriggerOperatorCancelled now accepts FailedHolding
    as a valid source, transitioning to Cancelled like any other
    live state.
  - CancelRun handler: treats FailedHolding as cancellable even
    though IsTerminal reports true.
  - heartbeat: Cancelled runs fork on FailedStage. Set means the
    agent is parked in waitForOverride with no subprocess in
    flight, so cmd=reboot tells it to systemctl reboot; the host
    falls through iPXE's no-active-run script to the local disk.
    Empty FailedStage keeps the pre-existing cmd=cancel_stage path
    for mid-stage cancels (kill stage ctx, then power off).
  - UI: canCancel now returns true for FailedHolding, and the
    run-detail page renders a distinct "Cancel & reboot" button
    with a hold-specific confirm message so the action doesn't
    look identical to a mid-run cancel.

Tests cover the new statemachine transition, the heartbeat fork
(reboot vs cancel_stage), and keep the pre-existing mid-run cancel
behaviour locked in.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-19 22:59:34 -04:00

192 lines
6.0 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package templates
import (
"bytes"
"context"
"fmt"
"time"
"vetting/internal/model"
)
// HostTile renders a single dashboard card: hostname, heartbeat badge,
// latest run status, and the primary action (Start / Cancel / View
// report). The whole tile is a link to /hosts/{id} via a CSS-overlay
// <a>; every deeper control lives on the host page or the run page.
// It's the SSE-swap target for per-host tile refreshes (`tile-N`).
templ HostTile(t TileData) {
<article
id={ fmt.Sprintf("host-%d", t.Host.ID) }
class={ "tile", "tile-" + tileMood(t.Latest) }
sse-swap={ fmt.Sprintf("tile-%d", t.Host.ID) }
hx-swap="outerHTML"
>
<a class="tile-link" href={ templ.SafeURL(fmt.Sprintf("/hosts/%d", t.Host.ID)) } aria-label={ "Open " + t.Host.Name }></a>
<header class="tile-head">
<div class="tile-name">{ t.Host.Name }</div>
<div class="tile-header-right">
<span class={ "tile-last-seen", lastSeenClass(t.LastSeenAt) }>{ lastSeenLabel(t.LastSeenAt) }</span>
<div class="tile-status">{ tileStatus(t.Latest) }</div>
</div>
</header>
<div class="tile-primary-action">
if canStart(t) {
<form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/start", t.Host.ID)) } class="inline tile-start-form">
<label class="tile-nd-toggle">
<input type="checkbox" name="non_destructive" value="1"/>
Non-destructive
</label>
<button type="submit">Start vetting</button>
</form>
} else if canStartIfOnline(t.Latest) {
<button type="button" disabled title="host is not heartbeating — install the reporter via /register/quick.sh on the target host">Start vetting</button>
} else if canCancel(t.Latest) {
<form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/cancel", t.Host.ID)) } class="inline tile-cancel-form" onsubmit="return confirm('Cancel run? Destructive stages may leave the host in an intermediate state requiring manual cleanup.');">
<button type="submit" class="danger">Cancel run</button>
</form>
} else if hasReport(t.Latest) {
<a class="button-like" href={ templ.SafeURL(fmt.Sprintf("/reports/%d", t.Latest.ID)) } target="_blank" rel="noopener">View report</a>
}
</div>
</article>
}
func canOverrideWipe(r *model.Run) bool {
if r == nil {
return false
}
return r.State == model.StateFailedHolding && r.FailedStage == "Storage"
}
// hasReport is true once the reporting stage has produced an HTML
// artifact. We cheat slightly: Completed runs always have one, and
// that's the only state in which the tile wants to surface a link.
func hasReport(r *model.Run) bool {
return r != nil && r.State == model.StateCompleted
}
// canStart gates the Start button on two things: the run is in a state
// that accepts a fresh start, AND the host is currently heartbeating.
// The heartbeat check mirrors the StartRun handler's preflight so the
// button never offers a click that the server would reject with 409.
func canStart(t TileData) bool {
if !canStartIfOnline(t.Latest) {
return false
}
if t.LastSeenAt == nil {
return false
}
return time.Since(*t.LastSeenAt) <= 60*time.Second
}
// canStartIfOnline is the run-state half of canStart, split out so the
// template can distinguish "waiting on run to end" (no button) from
// "run is done but host is offline" (disabled button with tooltip).
func canStartIfOnline(r *model.Run) bool {
if r == nil {
return true
}
return r.State.IsTerminal()
}
// canCancel is true for any non-terminal run, plus FailedHolding —
// a held run technically classifies as terminal for the pipeline but
// the host is still live on the SSH hold prompt, and the operator
// can walk away from it via Cancel (which reboots to local disk).
// Every other terminal state is truly done, so no Cancel button.
// The server-side CancelRun handler mirrors this predicate.
func canCancel(r *model.Run) bool {
if r == nil {
return false
}
if !r.State.IsTerminal() {
return true
}
return r.State == model.StateFailedHolding
}
func tileStatus(r *model.Run) string {
if r == nil {
return "Idle"
}
switch r.State {
case model.StateWaitingReboot:
return "Waiting for reboot"
}
return string(r.State)
}
func tileMood(r *model.Run) string {
if r == nil {
return "idle"
}
switch r.State {
case model.StateCompleted:
return "pass"
case model.StateFailed, model.StateFailedHolding:
return "fail"
case model.StateReleased, model.StateCancelled:
return "idle"
}
return "active"
}
func sshInvocation(keyPath, ip string) string {
if keyPath == "" {
return "ssh root@" + ip + " (hold key not yet recorded)"
}
return fmt.Sprintf("ssh -i %s root@%s", keyPath, ip)
}
// RenderTileString renders a single tile fragment so the orchestrator
// can publish it over SSE without threading a context through every
// event publisher.
func RenderTileString(t TileData) string {
var buf bytes.Buffer
_ = HostTile(t).Render(context.Background(), &buf)
return buf.String()
}
// lastSeenLabel renders the host-mode agent's liveness into a short
// badge: "never" if the host has never heartbeated, "online" within
// a 2×heartbeat grace window (60s, since agents heartbeat every 30s),
// "Nm ago" / "Nh ago" / "Nd ago" otherwise.
func lastSeenLabel(t *time.Time) string {
if t == nil {
return "never"
}
return humanAgoFrom(time.Now(), *t)
}
// lastSeenClass pairs with lastSeenLabel to drive the badge color
// without the template having to carry its own logic.
func lastSeenClass(t *time.Time) string {
if t == nil {
return "offline"
}
if time.Since(*t) < 60*time.Second {
return "online"
}
return "stale"
}
// humanAgoFrom formats (now - t) as a short "Nm ago" style string.
// Buckets: <60s -> "online", <60m -> minutes, <24h -> hours, else days.
// Split on `now` so callers can hold time for tests.
func humanAgoFrom(now time.Time, t time.Time) string {
d := now.Sub(t)
if d < 0 {
d = 0
}
if d < 60*time.Second {
return "online"
}
if d < time.Hour {
return fmt.Sprintf("%dm ago", int(d/time.Minute))
}
if d < 24*time.Hour {
return fmt.Sprintf("%dh ago", int(d/time.Hour))
}
return fmt.Sprintf("%dd ago", int(d/(24*time.Hour)))
}