Heartbeat-first dispatch: retire WoL-as-default, add WaitingReboot
CI / Lint + build + test (push) Has been cancelled

Every supported host runs vetting-reporter in-OS and heartbeats every
30s. WoL was never the thing that started vetting — the heartbeat
response's reboot_for_vetting command was. Firing WoL first only
crowded the run log with misleading diagnostics when the real failure
mode was "reporter isn't installed."

- StartRun returns 409 if the host hasn't sent a heartbeat within the
  last 60s, pointing the operator at /register/quick.sh.
- Dispatcher re-checks LastSeenAt at dispatch time (a run may sit in
  Queued long enough for the host to go offline); if the host is stale,
  the run is marked Failed with failed_stage=dispatch instead of looping.
- New StateWaitingReboot + TriggerRebootCommanded capture the actual
  semantics. StateWaitingWoL is kept as the hook point for a future
  manual-override button.
- Tile disables the Start button with a quick.sh tooltip when the
  host is offline, matching the server-side 409.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 01:10:34 -04:00
parent c9927ca2bf
commit d0bfae14c8
17 changed files with 632 additions and 155 deletions
+10 -9
View File
@@ -37,8 +37,8 @@ func setupHeartbeat(t *testing.T) (*api.UI, *store.Hosts) {
}
// setupHeartbeatWithRunner also wires a Runs store + Runner so
// Phase-2 tests can exercise the Queued → WaitingWoL transition and
// the 10-minute WaitingWoL re-issue window.
// Phase-2 tests can exercise the Queued → WaitingReboot transition and
// the 10-minute retry window on waiting states.
func setupHeartbeatWithRunner(t *testing.T) (*api.UI, *store.Hosts, *store.Runs) {
t.Helper()
conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
@@ -158,17 +158,20 @@ func TestUIHeartbeat_QueuedDispatches(t *testing.T) {
if resp.Cmd != "reboot_for_vetting" || resp.RunID != runID {
t.Fatalf("response = %+v, want cmd=reboot_for_vetting run_id=%d", resp, runID)
}
// Run advanced Queued → WaitingWoL via the state machine.
// Run advanced Queued → WaitingReboot via the state machine.
got, err := runs.Get(ctx, runID)
if err != nil {
t.Fatalf("get run: %v", err)
}
if got.State != model.StateWaitingWoL {
t.Fatalf("state = %s, want WaitingWoL", got.State)
if got.State != model.StateWaitingReboot {
t.Fatalf("state = %s, want WaitingReboot", got.State)
}
}
func TestUIHeartbeat_WaitingWoLRetries(t *testing.T) {
// TestUIHeartbeat_WaitingRebootRetries covers the reporter crashing
// mid-reboot and coming back: the heartbeat must keep returning the
// reboot command while the run is still young (<10min).
func TestUIHeartbeat_WaitingRebootRetries(t *testing.T) {
ui, hosts, runs := setupHeartbeatWithRunner(t)
ctx := context.Background()
hostID, err := hosts.Create(ctx, model.Host{
@@ -185,9 +188,7 @@ func TestUIHeartbeat_WaitingWoLRetries(t *testing.T) {
if err != nil {
t.Fatalf("create run: %v", err)
}
// Simulate: dispatcher already moved the run to WaitingWoL, now
// the host's reporter comes back from a crashed reboot.
if err := runs.SetState(ctx, runID, model.StateWaitingWoL); err != nil {
if err := runs.SetState(ctx, runID, model.StateWaitingReboot); err != nil {
t.Fatalf("set state: %v", err)
}