Heartbeat-first dispatch: retire WoL-as-default, add WaitingReboot
CI / Lint + build + test (push) Has been cancelled

Every supported host runs vetting-reporter in-OS and heartbeats every
30s. WoL was never the thing that started vetting — the heartbeat
response's reboot_for_vetting command was. Firing WoL first only
crowded the run log with misleading diagnostics when the real failure
mode is "reporter isn't installed."

- StartRun returns 409 if the host hasn't sent a heartbeat within the
  last 60s, pointing the operator at /register/quick.sh.
- Dispatcher re-checks LastSeenAt at dispatch time (run may sit in
  Queued long enough for the host to go offline); stale hosts mark
  the run Failed with failed_stage=dispatch instead of looping.
- New StateWaitingReboot + TriggerRebootCommanded capture the actual
  semantics. StateWaitingWoL is kept as the hook point for a future
  manual-override button.
- Tile disables the Start button with a quick.sh tooltip when the
  host is offline, matching the server-side 409.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 01:10:34 -04:00
parent c9927ca2bf
commit d0bfae14c8
17 changed files with 632 additions and 155 deletions
+36 -19
View File
@@ -9,19 +9,19 @@ import (
// node indexes for the default pipeline layout: pre-stages (3) + stage
// rows (9) + terminal Completed (1) = 13 nodes.
const (
idxQueued = 0
idxWaitingWoL = 1
idxBooting = 2
idxInventory = 3
idxSpecValidate = 4
idxSMART = 5
idxCPUStress = 6
idxStorage = 7
idxNetwork = 8
idxGPU = 9
idxPSU = 10
idxReporting = 11
idxCompleted = 12
idxQueued = 0
idxWaitingReboot = 1
idxBooting = 2
idxInventory = 3
idxSpecValidate = 4
idxSMART = 5
idxCPUStress = 6
idxStorage = 7
idxNetwork = 8
idxGPU = 9
idxPSU = 10
idxReporting = 11
idxCompleted = 12
)
// seedStages returns a fresh all-pending stage slice in the canonical order.
@@ -48,12 +48,12 @@ func TestBuildPipeline_NoRun(t *testing.T) {
}
}
// TestBuildPipeline_GhostStagesBeforeClaim models the real WaitingWoL
// TestBuildPipeline_GhostStagesBeforeClaim models the real WaitingReboot
// case: the run exists but agent hasn't called /claim yet, so there are
// no stage rows. Pipeline must still render all 9 stage nodes as ghosts
// so the operator sees the full timeline ahead of them.
func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
run := &model.Run{State: model.StateWaitingWoL}
run := &model.Run{State: model.StateWaitingReboot}
nodes := BuildPipeline(run, nil)
if len(nodes) != 13 {
t.Fatalf("len = %d, want 13", len(nodes))
@@ -61,8 +61,8 @@ func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
if nodes[idxQueued].State != "passed" {
t.Errorf("Queued = %q, want passed", nodes[idxQueued].State)
}
if nodes[idxWaitingWoL].State != "running" {
t.Errorf("WaitingWoL = %q, want running", nodes[idxWaitingWoL].State)
if nodes[idxWaitingReboot].State != "running" {
t.Errorf("WaitingReboot = %q, want running", nodes[idxWaitingReboot].State)
}
// All 9 stage ghosts must be pending — nothing has started yet.
for i := idxInventory; i <= idxReporting; i++ {
@@ -179,7 +179,24 @@ func TestBuildPipeline_QueuedNow(t *testing.T) {
if nodes[idxQueued].State != "running" {
t.Errorf("Queued = %q, want running", nodes[idxQueued].State)
}
if nodes[idxWaitingWoL].State != "pending" {
t.Errorf("WaitingWoL = %q, want pending", nodes[idxWaitingWoL].State)
if nodes[idxWaitingReboot].State != "pending" {
t.Errorf("WaitingReboot = %q, want pending", nodes[idxWaitingReboot].State)
}
}
// TestBuildPipeline_PreStageRunning_WaitingReboot confirms the pre-stage
// node for WaitingReboot lights up while the run sits there — the new
// happy-path state must map onto its pipeline slot.
//
// The run is built with seedStages() rows (all pending), and the test
// checks the three pre-stage nodes: Queued is "passed", WaitingReboot is
// "running", and Booting is still "pending".
func TestBuildPipeline_PreStageRunning_WaitingReboot(t *testing.T) {
	run := &model.Run{State: model.StateWaitingReboot}
	nodes := BuildPipeline(run, seedStages())

	// Guard the indexed reads below: a short slice should surface as a
	// readable test failure rather than an index-out-of-range panic.
	if len(nodes) <= idxBooting {
		t.Fatalf("len = %d, want > %d", len(nodes), idxBooting)
	}

	if nodes[idxQueued].State != "passed" {
		t.Errorf("Queued = %q, want passed", nodes[idxQueued].State)
	}
	if nodes[idxWaitingReboot].State != "running" {
		t.Errorf("WaitingReboot = %q, want running", nodes[idxWaitingReboot].State)
	}
	if nodes[idxBooting].State != "pending" {
		t.Errorf("Booting = %q, want pending", nodes[idxBooting].State)
	}
}