Heartbeat-first dispatch: retire WoL-as-default, add WaitingReboot
CI / Lint + build + test (push) Has been cancelled
CI / Lint + build + test (push) Has been cancelled
Every supported host runs vetting-reporter in-OS and heartbeats every 30s. WoL was never the thing that started vetting — the heartbeat response's reboot_for_vetting command was. Firing WoL first only crowded the run log with misleading diagnostics when the real failure mode is "reporter isn't installed."

- StartRun 409s if the host hasn't heartbeated within 60s, pointing the operator at /register/quick.sh.
- Dispatcher re-checks LastSeenAt at dispatch time (run may sit in Queued long enough for the host to go offline); stale hosts mark the run Failed with failed_stage=dispatch instead of looping.
- New StateWaitingReboot + TriggerRebootCommanded capture the actual semantics. StateWaitingWoL kept as the hook point for a future manual-override button.
- Tile disables the Start button with a quick.sh tooltip when the host is offline, matching the server-side 409.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -37,8 +37,8 @@ func setupHeartbeat(t *testing.T) (*api.UI, *store.Hosts) {
|
||||
}
|
||||
|
||||
// setupHeartbeatWithRunner also wires a Runs store + Runner so
|
||||
// Phase-2 tests can exercise the Queued → WaitingWoL transition and
|
||||
// the 10-minute WaitingWoL re-issue window.
|
||||
// Phase-2 tests can exercise the Queued → WaitingReboot transition and
|
||||
// the 10-minute retry window on waiting states.
|
||||
func setupHeartbeatWithRunner(t *testing.T) (*api.UI, *store.Hosts, *store.Runs) {
|
||||
t.Helper()
|
||||
conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
|
||||
@@ -158,17 +158,20 @@ func TestUIHeartbeat_QueuedDispatches(t *testing.T) {
|
||||
if resp.Cmd != "reboot_for_vetting" || resp.RunID != runID {
|
||||
t.Fatalf("response = %+v, want cmd=reboot_for_vetting run_id=%d", resp, runID)
|
||||
}
|
||||
// Run advanced Queued → WaitingWoL via the state machine.
|
||||
// Run advanced Queued → WaitingReboot via the state machine.
|
||||
got, err := runs.Get(ctx, runID)
|
||||
if err != nil {
|
||||
t.Fatalf("get run: %v", err)
|
||||
}
|
||||
if got.State != model.StateWaitingWoL {
|
||||
t.Fatalf("state = %s, want WaitingWoL", got.State)
|
||||
if got.State != model.StateWaitingReboot {
|
||||
t.Fatalf("state = %s, want WaitingReboot", got.State)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUIHeartbeat_WaitingWoLRetries(t *testing.T) {
|
||||
// TestUIHeartbeat_WaitingRebootRetries covers the reporter crashing
|
||||
// mid-reboot and coming back: the heartbeat must keep returning the
|
||||
// reboot command while the run is still young (<10min).
|
||||
func TestUIHeartbeat_WaitingRebootRetries(t *testing.T) {
|
||||
ui, hosts, runs := setupHeartbeatWithRunner(t)
|
||||
ctx := context.Background()
|
||||
hostID, err := hosts.Create(ctx, model.Host{
|
||||
@@ -185,9 +188,7 @@ func TestUIHeartbeat_WaitingWoLRetries(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("create run: %v", err)
|
||||
}
|
||||
// Simulate: dispatcher already moved the run to WaitingWoL, now
|
||||
// the host's reporter comes back from a crashed reboot.
|
||||
if err := runs.SetState(ctx, runID, model.StateWaitingWoL); err != nil {
|
||||
if err := runs.SetState(ctx, runID, model.StateWaitingReboot); err != nil {
|
||||
t.Fatalf("set state: %v", err)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user