Heartbeat-first dispatch: retire WoL-as-default, add WaitingReboot
CI / Lint + build + test (push) Has been cancelled
CI / Lint + build + test (push) Has been cancelled
Every supported host runs vetting-reporter in-OS and heartbeats every 30s. WoL was never the thing that started vetting — the heartbeat response's reboot_for_vetting command was. Firing WoL first only crowded the run log with misleading diagnostics when the real failure mode is "reporter isn't installed."

- StartRun returns HTTP 409 if the host hasn't sent a heartbeat within the last 60s, pointing the operator at /register/quick.sh.
- Dispatcher re-checks LastSeenAt at dispatch time (a run may sit in Queued long enough for the host to go offline); stale hosts mark the run Failed with failed_stage=dispatch instead of looping.
- New StateWaitingReboot + TriggerRebootCommanded capture the actual semantics. StateWaitingWoL is kept as the hook point for a future manual-override button.
- Tile disables the Start button with a quick.sh tooltip when the host is offline, matching the server-side 409.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -12,14 +12,25 @@ import (
|
||||
"vetting/internal/store"
|
||||
)
|
||||
|
||||
// Dispatcher picks Queued runs off the DB and drives them through
|
||||
// WaitingWoL (sending a WoL packet). Concurrency is capped at Max.
|
||||
// HostHeartbeatStaleAfter is how long we tolerate a host's last_seen_at
|
||||
// being in the past before treating the host as offline. Set to 2× the
|
||||
// default reporter heartbeat interval (30s) so a single dropped heartbeat
|
||||
// doesn't block dispatch. Used by the StartRun preflight and the
|
||||
// dispatcher itself — both must agree or the operator's click-time
|
||||
// validation wouldn't match the dispatch-time check.
|
||||
const HostHeartbeatStaleAfter = 60 * time.Second
|
||||
|
||||
// Dispatcher picks Queued runs off the DB and drives them to
|
||||
// WaitingReboot — the happy path is heartbeat-first: we transition and
|
||||
// rely on the host-mode reporter's next heartbeat to fetch the
|
||||
// reboot_for_vetting command. WoL is not fired in the default flow
|
||||
// because every supported host already runs the reporter in-OS.
|
||||
//
|
||||
// Pre-stage log lines (picked, WoL-sent, heartbeat, agent-claimed)
|
||||
// are written into the per-run log via Logs so the detail page's
|
||||
// log pane can show what's happening before the agent is alive.
|
||||
// Pre-stage log lines (picked, heartbeating, agent-claimed) are
|
||||
// written into the per-run log via Logs so the detail page's log pane
|
||||
// can show what's happening before the agent is alive.
|
||||
//
|
||||
// For Phase 2 the dispatcher's job ends at WaitingWoL; further
|
||||
// For Phase 2 the dispatcher's job ends at WaitingReboot; further
|
||||
// transitions are driven by iPXE and agent callbacks. Phase 4+ will
|
||||
// return here and shepherd each run through stage execution.
|
||||
type Dispatcher struct {
|
||||
@@ -107,10 +118,10 @@ func (d *Dispatcher) pickNext(ctx context.Context) {
|
||||
if queued == nil {
|
||||
queued = &runs[i]
|
||||
}
|
||||
case model.StateWaitingWoL, model.StateBooting, model.StateInventoryCheck,
|
||||
model.StateSpecValidate, model.StateSMART, model.StateCPUStress,
|
||||
model.StateStorage, model.StateNetwork, model.StateGPU,
|
||||
model.StatePSU, model.StateReporting:
|
||||
case model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting,
|
||||
model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART,
|
||||
model.StateCPUStress, model.StateStorage, model.StateNetwork,
|
||||
model.StateGPU, model.StatePSU, model.StateReporting:
|
||||
inFlight++
|
||||
}
|
||||
}
|
||||
@@ -124,23 +135,43 @@ func (d *Dispatcher) pickNext(ctx context.Context) {
|
||||
log.Printf("dispatcher: get host %d: %v", queued.HostID, err)
|
||||
return
|
||||
}
|
||||
d.runLog(queued.ID, "info", fmt.Sprintf("dispatcher: picked run for host %s (mac=%s wol=%s:%d)",
|
||||
host.Name, host.MAC, host.WoLBroadcastIP, host.WoLPort))
|
||||
if _, err := d.Runner.Transition(ctx, queued.ID, TriggerDispatched); err != nil {
|
||||
|
||||
// Heartbeat gate: the StartRun preflight catches this at click time,
|
||||
// but a run can sit in Queued long enough for the host to go offline
|
||||
// between click and dispatch. Re-check here so we never fire a
|
||||
// reboot command at a host that can't receive it.
|
||||
if host.LastSeenAt == nil || time.Since(*host.LastSeenAt) > HostHeartbeatStaleAfter {
|
||||
var ageMsg string
|
||||
if host.LastSeenAt == nil {
|
||||
ageMsg = "never heartbeated"
|
||||
} else {
|
||||
ageMsg = fmt.Sprintf("last heartbeat %s ago", time.Since(*host.LastSeenAt).Truncate(time.Second))
|
||||
}
|
||||
d.runLog(queued.ID, "error", fmt.Sprintf(
|
||||
"dispatcher: host %s is offline (%s) — refusing to dispatch; install the reporter via /register/quick.sh on the target and retry",
|
||||
host.Name, ageMsg))
|
||||
if err := d.Runs.MarkDispatchFailed(ctx, queued.ID, "dispatch", "host stopped heartbeating before dispatch"); err != nil {
|
||||
log.Printf("dispatcher: mark run %d dispatch-failed: %v", queued.ID, err)
|
||||
}
|
||||
if d.Runner != nil {
|
||||
d.Runner.PublishTileUpdate(ctx, host.ID)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
age := time.Since(*host.LastSeenAt).Truncate(time.Second)
|
||||
d.runLog(queued.ID, "info", fmt.Sprintf(
|
||||
"dispatcher: picked run for host %s (mac=%s, heartbeating, last seen %s ago)",
|
||||
host.Name, host.MAC, age))
|
||||
if _, err := d.Runner.Transition(ctx, queued.ID, TriggerRebootCommanded); err != nil {
|
||||
log.Printf("dispatcher: transition run %d: %v", queued.ID, err)
|
||||
d.runLog(queued.ID, "error", fmt.Sprintf("dispatcher: transition to WaitingWoL failed: %v", err))
|
||||
d.runLog(queued.ID, "error", fmt.Sprintf("dispatcher: transition to WaitingReboot failed: %v", err))
|
||||
return
|
||||
}
|
||||
if err := SendWoL(host.MAC, host.WoLBroadcastIP, host.WoLPort); err != nil {
|
||||
log.Printf("dispatcher: WoL run %d host %s: %v", queued.ID, host.Name, err)
|
||||
d.runLog(queued.ID, "error", fmt.Sprintf("dispatcher: WoL send failed: %v — check broadcast %s:%d is reachable",
|
||||
err, host.WoLBroadcastIP, host.WoLPort))
|
||||
// Stay in WaitingWoL; operator can retry or investigate.
|
||||
return
|
||||
}
|
||||
log.Printf("dispatcher: WoL sent for run %d (host=%s mac=%s)", queued.ID, host.Name, host.MAC)
|
||||
d.runLog(queued.ID, "info", fmt.Sprintf("dispatcher: sent WoL packet to %s via %s:%d — waiting for agent claim",
|
||||
host.MAC, host.WoLBroadcastIP, host.WoLPort))
|
||||
log.Printf("dispatcher: run %d host %s → WaitingReboot (heartbeat-driven)", queued.ID, host.Name)
|
||||
d.runLog(queued.ID, "info", fmt.Sprintf(
|
||||
"dispatcher: host %s heartbeating — waiting for next reporter heartbeat to deliver reboot_for_vetting",
|
||||
host.Name))
|
||||
|
||||
// Prime the heartbeat so the first "still waiting" fires 30s after
|
||||
// dispatch, not immediately.
|
||||
@@ -155,8 +186,8 @@ func (d *Dispatcher) pickNext(ctx context.Context) {
|
||||
}
|
||||
|
||||
// heartbeatWaiting emits a "still waiting" log line every beatEvery for
|
||||
// each run still sitting in WaitingWoL. Helps the operator spot hangs
|
||||
// without having to tail journalctl on the LXC.
|
||||
// each run still sitting in WaitingReboot (or legacy WaitingWoL). Helps
|
||||
// the operator spot hangs without having to tail journalctl on the LXC.
|
||||
func (d *Dispatcher) heartbeatWaiting(ctx context.Context) {
|
||||
if d.Logs == nil {
|
||||
return
|
||||
@@ -172,13 +203,13 @@ func (d *Dispatcher) heartbeatWaiting(ctx context.Context) {
|
||||
for i := range runs {
|
||||
r := &runs[i]
|
||||
seen[r.ID] = true
|
||||
if r.State != model.StateWaitingWoL {
|
||||
if r.State != model.StateWaitingReboot && r.State != model.StateWaitingWoL {
|
||||
continue
|
||||
}
|
||||
last, ok := d.lastBeat[r.ID]
|
||||
if !ok {
|
||||
// Run already in WaitingWoL from a previous process lifetime
|
||||
// — prime so we don't spam immediately.
|
||||
// Run already waiting from a previous process lifetime — prime
|
||||
// so we don't spam immediately.
|
||||
d.lastBeat[r.ID] = now
|
||||
continue
|
||||
}
|
||||
@@ -187,11 +218,11 @@ func (d *Dispatcher) heartbeatWaiting(ctx context.Context) {
|
||||
}
|
||||
elapsed := now.Sub(r.StartedAt).Truncate(time.Second)
|
||||
d.runLog(r.ID, "info", fmt.Sprintf(
|
||||
"still waiting for agent claim (%s) — check BIOS WoL, pxe.enabled, and live-image presence",
|
||||
"waiting for reporter to reboot + PXE-boot into live image (%s) — if this exceeds 2m, verify pxe.enabled in vetting.yaml and that the reporter actually invoked systemctl reboot",
|
||||
elapsed))
|
||||
d.lastBeat[r.ID] = now
|
||||
}
|
||||
// Garbage-collect entries for runs that have left WaitingWoL.
|
||||
// Garbage-collect entries for runs that have left the waiting states.
|
||||
for id := range d.lastBeat {
|
||||
if !seen[id] {
|
||||
delete(d.lastBeat, id)
|
||||
|
||||
Reference in New Issue
Block a user