Heartbeat-first dispatch: retire WoL-as-default, add WaitingReboot
CI / Lint + build + test (push) Has been cancelled

Every supported host runs vetting-reporter in-OS and heartbeats every
30s. WoL was never the thing that started vetting — the heartbeat
response's reboot_for_vetting command was. Firing WoL first only
crowded the run log with misleading diagnostics when the real failure
mode is "reporter isn't installed."

- StartRun 409s if the host hasn't heartbeated within 60s, pointing
  the operator at /register/quick.sh.
- Dispatcher re-checks LastSeenAt at dispatch time (run may sit in
  Queued long enough for the host to go offline); stale hosts mark
  the run Failed with failed_stage=dispatch instead of looping.
- New StateWaitingReboot + TriggerRebootCommanded capture the actual
  semantics. StateWaitingWoL kept as the hook point for a future
  manual-override button.
- Tile disables the Start button with a quick.sh tooltip when the
  host is offline, matching the server-side 409.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 01:10:34 -04:00
parent c9927ca2bf
commit d0bfae14c8
17 changed files with 632 additions and 155 deletions
+26 -3
View File
@@ -28,10 +28,12 @@ templ HostTile(t TileData) {
</div>
</header>
<div class="tile-primary-action">
if canStart(t.Latest) {
if canStart(t) {
<form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/start", t.Host.ID)) } class="inline">
<button type="submit">Start vetting</button>
</form>
} else if canStartIfOnline(t.Latest) {
<button type="button" disabled title="host is not heartbeating — install the reporter via /register/quick.sh on the target host">Start vetting</button>
} else if hasReport(t.Latest) {
<a class="button-like" href={ templ.SafeURL(fmt.Sprintf("/reports/%d", t.Latest.ID)) } target="_blank" rel="noopener">View report</a>
}
@@ -53,12 +55,29 @@ func hasReport(r *model.Run) bool {
return r != nil && r.State == model.StateCompleted
}
func canStart(r *model.Run) bool {
// canStart gates the Start button on two things: the run is in a state
// that accepts a fresh start, AND the host is currently heartbeating.
// The heartbeat check mirrors the StartRun handler's preflight so the
// button never offers a click that the server would reject with 409.
func canStart(t TileData) bool {
if !canStartIfOnline(t.Latest) {
return false
}
if t.LastSeenAt == nil {
return false
}
return time.Since(*t.LastSeenAt) <= 60*time.Second
}
// canStartIfOnline is the run-state half of canStart, split out so the
// template can distinguish "waiting on run to end" (no button) from
// "run is done but host is offline" (disabled button with tooltip).
func canStartIfOnline(r *model.Run) bool {
if r == nil {
return true
}
switch r.State {
case model.StateCompleted, model.StateReleased, model.StateFailedHolding:
case model.StateCompleted, model.StateReleased, model.StateFailed, model.StateFailedHolding:
return true
}
return false
@@ -68,6 +87,10 @@ func tileStatus(r *model.Run) string {
if r == nil {
return "Idle"
}
switch r.State {
case model.StateWaitingReboot:
return "Waiting for reboot"
}
return string(r.State)
}