Heartbeat-first dispatch: retire WoL-as-default, add WaitingReboot
CI / Lint + build + test (push) Has been cancelled

Every supported host runs vetting-reporter in-OS and heartbeats every
30s. WoL was never the thing that started vetting — the heartbeat
response's reboot_for_vetting command was. Firing WoL first only
crowded the run log with misleading diagnostics when the real failure
mode is "reporter isn't installed."

- StartRun 409s if the host hasn't heartbeated within 60s, pointing
  the operator at /register/quick.sh.
- Dispatcher re-checks LastSeenAt at dispatch time (a run may sit in
  Queued long enough for the host to go offline); a run whose host has
  gone stale is marked Failed with failed_stage=dispatch instead of
  looping.
- New StateWaitingReboot + TriggerRebootCommanded capture the actual
  semantics. StateWaitingWoL kept as the hook point for a future
  manual-override button.
- Tile disables the Start button with a quick.sh tooltip when the
  host is offline, matching the server-side 409.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 01:10:34 -04:00
parent c9927ca2bf
commit d0bfae14c8
17 changed files with 632 additions and 155 deletions
+28 -13
View File
@@ -130,7 +130,10 @@ func (u *UI) HostDetail(w http.ResponseWriter, r *http.Request) {
// StartRun creates a new Run for the host, issues an agent token, and
// transitions Registered→Queued. The dispatcher goroutine picks it up
// and fires WoL.
// on its next tick; the happy path is heartbeat-driven (the reporter's
// next heartbeat fetches reboot_for_vetting). Refuses the click outright
// if the host isn't currently heartbeating — there is no path from
// Queued to live-image without an in-OS reporter on the target.
func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
idStr := chi.URLParam(r, "id")
hostID, err := strconv.ParseInt(idStr, 10, 64)
@@ -138,7 +141,8 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
http.Error(w, "bad host id", http.StatusBadRequest)
return
}
if _, err := u.Hosts.Get(r.Context(), hostID); err != nil {
host, err := u.Hosts.Get(r.Context(), hostID)
if err != nil {
if errors.Is(err, store.ErrNotFound) {
http.NotFound(w, r)
return
@@ -147,10 +151,20 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
return
}
// Preflight: host must be heartbeating. The dispatcher re-checks at
// dispatch time (belt-and-braces for the gap between click and tick),
// but rejecting here gives the operator an immediate, actionable
// error instead of a mysterious Failed run 2s later.
if host.LastSeenAt == nil || time.Since(*host.LastSeenAt) > orchestrator.HostHeartbeatStaleAfter {
writeJSONError(w, http.StatusConflict,
"host is not heartbeating — install the reporter via /register/quick.sh on the target host, then retry")
return
}
// Guard: refuse to start a second run while one is still active.
if latest, err := u.Runs.LatestForHost(r.Context(), hostID); err == nil && latest != nil {
switch latest.State {
case model.StateCompleted, model.StateReleased, model.StateFailedHolding:
case model.StateCompleted, model.StateReleased, model.StateFailed, model.StateFailedHolding:
// ok to start fresh
default:
http.Error(w, "host already has an active run", http.StatusConflict)
@@ -343,11 +357,12 @@ type heartbeatResponse struct {
// pickHostCommand decides what the host-mode agent should do on the
// back of this heartbeat. Returns ("", 0) when there's nothing to do.
//
// - Queued run → Transition(Dispatched) and tell the agent to reboot.
// The dispatcher would have WoL'd it anyway; we beat it to the
// punch so the host skips the WoL dance.
// - WaitingWoL run created <10min ago → also return reboot, covering
// "host crashed mid-reboot, systemd brought the reporter back".
// - Queued run → Transition(RebootCommanded) and tell the agent to
// reboot. Beats the dispatcher's 2s poll to the punch, but either
// path ends at WaitingReboot.
// - WaitingReboot (or legacy WaitingWoL) run <10min old → also return
// reboot, covering "host crashed mid-reboot, systemd brought the
// reporter back".
// - anything else → idle.
func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64) {
if u.Runs == nil || u.Runner == nil {
@@ -363,7 +378,7 @@ func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64)
}
switch run.State {
case model.StateQueued:
if _, err := u.Runner.Transition(ctx, run.ID, orchestrator.TriggerDispatched); err != nil {
if _, err := u.Runner.Transition(ctx, run.ID, orchestrator.TriggerRebootCommanded); err != nil {
// Benign race with the dispatcher's own 2s poll — the
// state machine refuses the second transition; we just
// log and return idle so the agent doesn't reboot on a
@@ -371,13 +386,13 @@ func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64)
log.Printf("heartbeat: transition run %d: %v", run.ID, err)
return "", 0
}
log.Printf("heartbeat: dispatched run %d for host %d via heartbeat (no WoL)", run.ID, hostID)
log.Printf("heartbeat: dispatched run %d for host %d (reboot commanded)", run.ID, hostID)
return cmdRebootForVetting, run.ID
case model.StateWaitingWoL:
case model.StateWaitingReboot, model.StateWaitingWoL:
// Tolerate a crashed-mid-reboot retry: the reporter is the
// only thing that could be telling us about this host right
// now, and WoL is only the fallback anyway. Bound it so a
// perpetually-broken PXE doesn't reboot-loop the box.
// now. Bound it so a perpetually-broken PXE doesn't
// reboot-loop the box.
if time.Since(run.StartedAt) < 10*time.Minute {
return cmdRebootForVetting, run.ID
}