62bddac110
A held run sits indefinitely at an SSH prompt waiting for operator
investigation. Previously the only exits were Override (re-enter the
failed stage) or leaving the host on forever — Cancel rejected any
terminal state, including FailedHolding, and there was no button in
the UI anyway.
Add a dedicated exit path:
- statemachine: TriggerOperatorCancelled now accepts FailedHolding
as a valid source, transitioning to Cancelled like any other
live state.
- CancelRun handler: treats FailedHolding as cancellable even
though IsTerminal reports true.
- heartbeat: Cancelled runs fork on FailedStage. A non-empty
FailedStage means the agent is parked in waitForOverride with
no subprocess in flight, so cmd=reboot tells it to run
`systemctl reboot`; the host then falls through iPXE's
no-active-run script to the local disk. An empty FailedStage
keeps the pre-existing cmd=cancel_stage path for mid-stage
cancels (kill stage ctx, then power off).
- UI: canCancel now returns true for FailedHolding, and the
run-detail page renders a distinct "Cancel & reboot" button
with a hold-specific confirm message so the action doesn't
look identical to a mid-run cancel.
Tests cover the new statemachine transition, the heartbeat fork
(reboot vs cancel_stage), and keep the pre-existing mid-run cancel
behaviour locked in.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
192 lines
6.0 KiB
Plaintext
192 lines
6.0 KiB
Plaintext
package templates
|
||
|
||
import (
|
||
"bytes"
|
||
"context"
|
||
"fmt"
|
||
"time"
|
||
|
||
"vetting/internal/model"
|
||
)
|
||
|
||
// HostTile renders a single dashboard card as an <article>: hostname,
// last-seen (heartbeat) badge, latest run status, and one primary
// action chosen from Start / Cancel / View report.
|
||
// The whole tile is a link to /hosts/{id} via a CSS-overlay <a>;
// every deeper control lives on the host page or the run page.
|
||
// It's the SSE-swap target for per-host tile refreshes (`tile-N`);
// the element is replaced wholesale (hx-swap="outerHTML").
|
||
// The primary action is the first match, in order, of: canStart
// (Start form with an optional non_destructive checkbox),
// canStartIfOnline (disabled Start button with an offline tooltip),
// canCancel (Cancel form behind a confirm dialog), hasReport (link to
// /reports/{run id}). If none match the action area stays empty.
|
||
// NOTE(review): canStartIfOnline is true for every terminal state and
// is tested before canCancel and hasReport. FailedHolding is treated
// as terminal (see canCancel), so a held run gets a Start button here
// rather than Cancel; presumably Completed is terminal too, which
// would make the View report branch unreachable — confirm the branch
// order is intended.
|
||
templ HostTile(t TileData) {
|
||
<article
|
||
id={ fmt.Sprintf("host-%d", t.Host.ID) }
|
||
class={ "tile", "tile-" + tileMood(t.Latest) }
|
||
sse-swap={ fmt.Sprintf("tile-%d", t.Host.ID) }
|
||
hx-swap="outerHTML"
|
||
>
|
||
<a class="tile-link" href={ templ.SafeURL(fmt.Sprintf("/hosts/%d", t.Host.ID)) } aria-label={ "Open " + t.Host.Name }></a>
|
||
<header class="tile-head">
|
||
<div class="tile-name">{ t.Host.Name }</div>
|
||
<div class="tile-header-right">
|
||
<span class={ "tile-last-seen", lastSeenClass(t.LastSeenAt) }>{ lastSeenLabel(t.LastSeenAt) }</span>
|
||
<div class="tile-status">{ tileStatus(t.Latest) }</div>
|
||
</div>
|
||
</header>
|
||
<div class="tile-primary-action">
|
||
if canStart(t) {
|
||
<form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/start", t.Host.ID)) } class="inline tile-start-form">
|
||
<label class="tile-nd-toggle">
|
||
<input type="checkbox" name="non_destructive" value="1"/>
|
||
Non-destructive
|
||
</label>
|
||
<button type="submit">Start vetting</button>
|
||
</form>
|
||
} else if canStartIfOnline(t.Latest) {
|
||
<button type="button" disabled title="host is not heartbeating — install the reporter via /register/quick.sh on the target host">Start vetting</button>
|
||
} else if canCancel(t.Latest) {
|
||
<form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/cancel", t.Host.ID)) } class="inline tile-cancel-form" onsubmit="return confirm('Cancel run? Destructive stages may leave the host in an intermediate state requiring manual cleanup.');">
|
||
<button type="submit" class="danger">Cancel run</button>
|
||
</form>
|
||
} else if hasReport(t.Latest) {
|
||
<a class="button-like" href={ templ.SafeURL(fmt.Sprintf("/reports/%d", t.Latest.ID)) } target="_blank" rel="noopener">View report</a>
|
||
}
|
||
</div>
|
||
</article>
|
||
}
|
||
|
||
func canOverrideWipe(r *model.Run) bool {
|
||
if r == nil {
|
||
return false
|
||
}
|
||
return r.State == model.StateFailedHolding && r.FailedStage == "Storage"
|
||
}
|
||
|
||
// hasReport is true once the reporting stage has produced an HTML
|
||
// artifact. We cheat slightly: Completed runs always have one, and
|
||
// that's the only state in which the tile wants to surface a link.
|
||
func hasReport(r *model.Run) bool {
|
||
return r != nil && r.State == model.StateCompleted
|
||
}
|
||
|
||
// canStart gates the Start button on two things: the run is in a state
|
||
// that accepts a fresh start, AND the host is currently heartbeating.
|
||
// The heartbeat check mirrors the StartRun handler's preflight so the
|
||
// button never offers a click that the server would reject with 409.
|
||
func canStart(t TileData) bool {
|
||
if !canStartIfOnline(t.Latest) {
|
||
return false
|
||
}
|
||
if t.LastSeenAt == nil {
|
||
return false
|
||
}
|
||
return time.Since(*t.LastSeenAt) <= 60*time.Second
|
||
}
|
||
|
||
// canStartIfOnline is the run-state half of canStart, split out so the
|
||
// template can distinguish "waiting on run to end" (no button) from
|
||
// "run is done but host is offline" (disabled button with tooltip).
|
||
func canStartIfOnline(r *model.Run) bool {
|
||
if r == nil {
|
||
return true
|
||
}
|
||
return r.State.IsTerminal()
|
||
}
|
||
|
||
// canCancel is true for any non-terminal run, plus FailedHolding —
|
||
// a held run technically classifies as terminal for the pipeline but
|
||
// the host is still live on the SSH hold prompt, and the operator
|
||
// can walk away from it via Cancel (which reboots to local disk).
|
||
// Every other terminal state is truly done, so no Cancel button.
|
||
// The server-side CancelRun handler mirrors this predicate.
|
||
func canCancel(r *model.Run) bool {
|
||
if r == nil {
|
||
return false
|
||
}
|
||
if !r.State.IsTerminal() {
|
||
return true
|
||
}
|
||
return r.State == model.StateFailedHolding
|
||
}
|
||
|
||
func tileStatus(r *model.Run) string {
|
||
if r == nil {
|
||
return "Idle"
|
||
}
|
||
switch r.State {
|
||
case model.StateWaitingReboot:
|
||
return "Waiting for reboot"
|
||
}
|
||
return string(r.State)
|
||
}
|
||
|
||
func tileMood(r *model.Run) string {
|
||
if r == nil {
|
||
return "idle"
|
||
}
|
||
switch r.State {
|
||
case model.StateCompleted:
|
||
return "pass"
|
||
case model.StateFailed, model.StateFailedHolding:
|
||
return "fail"
|
||
case model.StateReleased, model.StateCancelled:
|
||
return "idle"
|
||
}
|
||
return "active"
|
||
}
|
||
|
||
// sshInvocation builds the ssh command line shown to the operator for
// logging in to ip as root. With a key path it returns
// "ssh -i <keyPath> root@<ip>"; with an empty key path it returns the
// bare command followed by a note that the hold key is not yet
// recorded. Neither argument is quoted or escaped.
func sshInvocation(keyPath, ip string) string {
	target := "root@" + ip
	if keyPath != "" {
		return fmt.Sprintf("ssh -i %s %s", keyPath, target)
	}
	return fmt.Sprintf("ssh %s (hold key not yet recorded)", target)
}
|
||
|
||
// RenderTileString renders a single tile fragment so the orchestrator
|
||
// can publish it over SSE without threading a context through every
|
||
// event publisher.
|
||
func RenderTileString(t TileData) string {
|
||
var buf bytes.Buffer
|
||
_ = HostTile(t).Render(context.Background(), &buf)
|
||
return buf.String()
|
||
}
|
||
|
||
// lastSeenLabel renders the host-mode agent's liveness into a short
|
||
// badge: "never" if the host has never heartbeated, "online" within
|
||
// a 2×heartbeat grace window (60s, since agents heartbeat every 30s),
|
||
// "Nm ago" / "Nh ago" / "Nd ago" otherwise.
|
||
func lastSeenLabel(t *time.Time) string {
|
||
if t == nil {
|
||
return "never"
|
||
}
|
||
return humanAgoFrom(time.Now(), *t)
|
||
}
|
||
|
||
// lastSeenClass is the companion of lastSeenLabel: it picks the CSS
// class that colors the badge so the template carries no logic of its
// own. It returns "offline" for a host that has never been seen (nil),
// "online" when the last heartbeat is less than 60 seconds old, and
// "stale" otherwise.
func lastSeenClass(t *time.Time) string {
	switch {
	case t == nil:
		return "offline"
	case time.Since(*t) < time.Minute:
		return "online"
	default:
		return "stale"
	}
}
|
||
|
||
// humanAgoFrom describes how long before now the instant t occurred,
// as a short badge string. Anything under a minute — including a t
// that lies in the future — is "online"; under an hour is "Nm ago";
// under a day is "Nh ago"; everything else is "Nd ago". Units are
// truncated, not rounded. `now` is a parameter so tests can pin the
// clock.
func humanAgoFrom(now time.Time, t time.Time) string {
	elapsed := now.Sub(t)
	switch {
	case elapsed < time.Minute:
		// Negative durations (t after now) land here as well.
		return "online"
	case elapsed < time.Hour:
		return fmt.Sprintf("%dm ago", int(elapsed/time.Minute))
	case elapsed < 24*time.Hour:
		return fmt.Sprintf("%dh ago", int(elapsed/time.Hour))
	default:
		return fmt.Sprintf("%dd ago", int(elapsed/(24*time.Hour)))
	}
}
|