Heartbeat-first dispatch: retire WoL-as-default, add WaitingReboot
CI / Lint + build + test (push) Has been cancelled
CI / Lint + build + test (push) Has been cancelled
Every supported host runs vetting-reporter in-OS and heartbeats every 30s. WoL was never the thing that started vetting — the heartbeat response's reboot_for_vetting command was. Firing WoL first only crowded the run log with misleading diagnostics when the real failure mode is "reporter isn't installed."

- StartRun 409s if the host hasn't heartbeated within 60s, pointing the operator at /register/quick.sh.
- Dispatcher re-checks LastSeenAt at dispatch time (run may sit in Queued long enough for the host to go offline); stale hosts mark the run Failed with failed_stage=dispatch instead of looping.
- New StateWaitingReboot + TriggerRebootCommanded capture the actual semantics. StateWaitingWoL kept as the hook point for a future manual-override button.
- Tile disables the Start button with a quick.sh tooltip when the host is offline, matching the server-side 409.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -37,8 +37,8 @@ func setupHeartbeat(t *testing.T) (*api.UI, *store.Hosts) {
|
||||
}
|
||||
|
||||
// setupHeartbeatWithRunner also wires a Runs store + Runner so
|
||||
// Phase-2 tests can exercise the Queued → WaitingWoL transition and
|
||||
// the 10-minute WaitingWoL re-issue window.
|
||||
// Phase-2 tests can exercise the Queued → WaitingReboot transition and
|
||||
// the 10-minute retry window on waiting states.
|
||||
func setupHeartbeatWithRunner(t *testing.T) (*api.UI, *store.Hosts, *store.Runs) {
|
||||
t.Helper()
|
||||
conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
|
||||
@@ -158,17 +158,20 @@ func TestUIHeartbeat_QueuedDispatches(t *testing.T) {
|
||||
if resp.Cmd != "reboot_for_vetting" || resp.RunID != runID {
|
||||
t.Fatalf("response = %+v, want cmd=reboot_for_vetting run_id=%d", resp, runID)
|
||||
}
|
||||
// Run advanced Queued → WaitingWoL via the state machine.
|
||||
// Run advanced Queued → WaitingReboot via the state machine.
|
||||
got, err := runs.Get(ctx, runID)
|
||||
if err != nil {
|
||||
t.Fatalf("get run: %v", err)
|
||||
}
|
||||
if got.State != model.StateWaitingWoL {
|
||||
t.Fatalf("state = %s, want WaitingWoL", got.State)
|
||||
if got.State != model.StateWaitingReboot {
|
||||
t.Fatalf("state = %s, want WaitingReboot", got.State)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUIHeartbeat_WaitingWoLRetries(t *testing.T) {
|
||||
// TestUIHeartbeat_WaitingRebootRetries covers the reporter crashing
|
||||
// mid-reboot and coming back: the heartbeat must keep returning the
|
||||
// reboot command while the run is still young (<10min).
|
||||
func TestUIHeartbeat_WaitingRebootRetries(t *testing.T) {
|
||||
ui, hosts, runs := setupHeartbeatWithRunner(t)
|
||||
ctx := context.Background()
|
||||
hostID, err := hosts.Create(ctx, model.Host{
|
||||
@@ -185,9 +188,7 @@ func TestUIHeartbeat_WaitingWoLRetries(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("create run: %v", err)
|
||||
}
|
||||
// Simulate: dispatcher already moved the run to WaitingWoL, now
|
||||
// the host's reporter comes back from a crashed reboot.
|
||||
if err := runs.SetState(ctx, runID, model.StateWaitingWoL); err != nil {
|
||||
if err := runs.SetState(ctx, runID, model.StateWaitingReboot); err != nil {
|
||||
t.Fatalf("set state: %v", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
package api_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"vetting/internal/api"
|
||||
"vetting/internal/db"
|
||||
"vetting/internal/events"
|
||||
"vetting/internal/model"
|
||||
"vetting/internal/orchestrator"
|
||||
"vetting/internal/store"
|
||||
)
|
||||
|
||||
func setupStartRun(t *testing.T) (*api.UI, *store.Hosts, *store.Runs) {
|
||||
t.Helper()
|
||||
conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open db: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { _ = conn.Close() })
|
||||
hosts := &store.Hosts{DB: conn}
|
||||
runs := &store.Runs{DB: conn}
|
||||
stages := &store.Stages{DB: conn}
|
||||
hub := events.NewHub()
|
||||
runner := &orchestrator.Runner{Runs: runs, Hosts: hosts, Stages: stages, EventHub: hub}
|
||||
ui := &api.UI{Hosts: hosts, Runs: runs, Runner: runner, EventHub: hub}
|
||||
return ui, hosts, runs
|
||||
}
|
||||
|
||||
func startRunReq(id int64) *http.Request {
|
||||
req := httptest.NewRequest(http.MethodPost, fmt.Sprintf("/hosts/%d/start", id), nil)
|
||||
rctx := chi.NewRouteContext()
|
||||
rctx.URLParams.Add("id", fmt.Sprintf("%d", id))
|
||||
return req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx))
|
||||
}
|
||||
|
||||
// TestStartRun_RefusesWhenNeverHeartbeated: operator clicks Start on a
|
||||
// host whose reporter has never phoned home. The handler must 409 with
|
||||
// a message pointing at the quick.sh install path so the operator knows
|
||||
// exactly what to fix.
|
||||
func TestStartRun_RefusesWhenNeverHeartbeated(t *testing.T) {
|
||||
ui, hosts, _ := setupStartRun(t)
|
||||
id, err := hosts.Create(context.Background(), model.Host{
|
||||
Name: "sr-never",
|
||||
MAC: "aa:bb:cc:dd:ee:60",
|
||||
WoLBroadcastIP: "10.0.0.255",
|
||||
WoLPort: 9,
|
||||
ExpectedSpecYAML: "memory:\n total_gib: 16\n",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
}
|
||||
|
||||
rr := httptest.NewRecorder()
|
||||
ui.StartRun(rr, startRunReq(id))
|
||||
if rr.Code != http.StatusConflict {
|
||||
t.Fatalf("status = %d, want 409, body=%q", rr.Code, rr.Body.String())
|
||||
}
|
||||
if !strings.Contains(rr.Body.String(), "quick.sh") {
|
||||
t.Fatalf("body missing quick.sh hint: %s", rr.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestStartRun_RefusesWhenHeartbeatStale: last heartbeat was 5 minutes
|
||||
// ago — well past the 60s staleness cutoff. Same 409 path as never-seen.
|
||||
func TestStartRun_RefusesWhenHeartbeatStale(t *testing.T) {
|
||||
ui, hosts, _ := setupStartRun(t)
|
||||
ctx := context.Background()
|
||||
id, err := hosts.Create(ctx, model.Host{
|
||||
Name: "sr-stale",
|
||||
MAC: "aa:bb:cc:dd:ee:61",
|
||||
WoLBroadcastIP: "10.0.0.255",
|
||||
WoLPort: 9,
|
||||
ExpectedSpecYAML: "memory:\n total_gib: 16\n",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
}
|
||||
if err := hosts.UpdateLastSeen(ctx, "aa:bb:cc:dd:ee:61", time.Now().UTC().Add(-5*time.Minute)); err != nil {
|
||||
t.Fatalf("stamp: %v", err)
|
||||
}
|
||||
|
||||
rr := httptest.NewRecorder()
|
||||
ui.StartRun(rr, startRunReq(id))
|
||||
if rr.Code != http.StatusConflict {
|
||||
t.Fatalf("status = %d, want 409", rr.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// TestStartRun_AcceptsWhenHeartbeating: fresh heartbeat within the
|
||||
// staleness window → 303 redirect + a Queued run in the DB.
|
||||
func TestStartRun_AcceptsWhenHeartbeating(t *testing.T) {
|
||||
ui, hosts, runs := setupStartRun(t)
|
||||
ctx := context.Background()
|
||||
id, err := hosts.Create(ctx, model.Host{
|
||||
Name: "sr-live",
|
||||
MAC: "aa:bb:cc:dd:ee:62",
|
||||
WoLBroadcastIP: "10.0.0.255",
|
||||
WoLPort: 9,
|
||||
ExpectedSpecYAML: "memory:\n total_gib: 16\n",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
}
|
||||
if err := hosts.UpdateLastSeen(ctx, "aa:bb:cc:dd:ee:62", time.Now().UTC()); err != nil {
|
||||
t.Fatalf("stamp: %v", err)
|
||||
}
|
||||
|
||||
rr := httptest.NewRecorder()
|
||||
ui.StartRun(rr, startRunReq(id))
|
||||
if rr.Code != http.StatusSeeOther {
|
||||
t.Fatalf("status = %d, want 303, body=%q", rr.Code, rr.Body.String())
|
||||
}
|
||||
latest, err := runs.LatestForHost(ctx, id)
|
||||
if err != nil {
|
||||
t.Fatalf("latest: %v", err)
|
||||
}
|
||||
if latest == nil || latest.State != model.StateQueued {
|
||||
t.Fatalf("latest run = %+v, want Queued", latest)
|
||||
}
|
||||
}
|
||||
+28
-13
@@ -130,7 +130,10 @@ func (u *UI) HostDetail(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
// StartRun creates a new Run for the host, issues an agent token, and
|
||||
// transitions Registered→Queued. The dispatcher goroutine picks it up
|
||||
// and fires WoL.
|
||||
// on its next tick; the happy path is heartbeat-driven (the reporter's
|
||||
// next heartbeat fetches reboot_for_vetting). Refuses the click outright
|
||||
// if the host isn't currently heartbeating — there is no path from
|
||||
// Queued to live-image without an in-OS reporter on the target.
|
||||
func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
|
||||
idStr := chi.URLParam(r, "id")
|
||||
hostID, err := strconv.ParseInt(idStr, 10, 64)
|
||||
@@ -138,7 +141,8 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "bad host id", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
if _, err := u.Hosts.Get(r.Context(), hostID); err != nil {
|
||||
host, err := u.Hosts.Get(r.Context(), hostID)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
@@ -147,10 +151,20 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
// Preflight: host must be heartbeating. The dispatcher re-checks at
|
||||
// dispatch time (belt-and-braces for the gap between click and tick),
|
||||
// but rejecting here gives the operator an immediate, actionable
|
||||
// error instead of a mysterious Failed run 2s later.
|
||||
if host.LastSeenAt == nil || time.Since(*host.LastSeenAt) > orchestrator.HostHeartbeatStaleAfter {
|
||||
writeJSONError(w, http.StatusConflict,
|
||||
"host is not heartbeating — install the reporter via /register/quick.sh on the target host, then retry")
|
||||
return
|
||||
}
|
||||
|
||||
// Guard: refuse to start a second run while one is still active.
|
||||
if latest, err := u.Runs.LatestForHost(r.Context(), hostID); err == nil && latest != nil {
|
||||
switch latest.State {
|
||||
case model.StateCompleted, model.StateReleased, model.StateFailedHolding:
|
||||
case model.StateCompleted, model.StateReleased, model.StateFailed, model.StateFailedHolding:
|
||||
// ok to start fresh
|
||||
default:
|
||||
http.Error(w, "host already has an active run", http.StatusConflict)
|
||||
@@ -343,11 +357,12 @@ type heartbeatResponse struct {
|
||||
// pickHostCommand decides what the host-mode agent should do on the
|
||||
// back of this heartbeat. Returns ("", 0) when there's nothing to do.
|
||||
//
|
||||
// - Queued run → Transition(Dispatched) and tell the agent to reboot.
|
||||
// The dispatcher would have WoL'd it anyway; we beat it to the
|
||||
// punch so the host skips the WoL dance.
|
||||
// - WaitingWoL run created <10min ago → also return reboot, covering
|
||||
// "host crashed mid-reboot, systemd brought the reporter back".
|
||||
// - Queued run → Transition(RebootCommanded) and tell the agent to
|
||||
// reboot. Beats the dispatcher's 2s poll to the punch, but either
|
||||
// path ends at WaitingReboot.
|
||||
// - WaitingReboot (or legacy WaitingWoL) run <10min old → also return
|
||||
// reboot, covering "host crashed mid-reboot, systemd brought the
|
||||
// reporter back".
|
||||
// - anything else → idle.
|
||||
func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64) {
|
||||
if u.Runs == nil || u.Runner == nil {
|
||||
@@ -363,7 +378,7 @@ func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64)
|
||||
}
|
||||
switch run.State {
|
||||
case model.StateQueued:
|
||||
if _, err := u.Runner.Transition(ctx, run.ID, orchestrator.TriggerDispatched); err != nil {
|
||||
if _, err := u.Runner.Transition(ctx, run.ID, orchestrator.TriggerRebootCommanded); err != nil {
|
||||
// Benign race with the dispatcher's own 2s poll — the
|
||||
// state machine refuses the second transition; we just
|
||||
// log and return idle so the agent doesn't reboot on a
|
||||
@@ -371,13 +386,13 @@ func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64)
|
||||
log.Printf("heartbeat: transition run %d: %v", run.ID, err)
|
||||
return "", 0
|
||||
}
|
||||
log.Printf("heartbeat: dispatched run %d for host %d via heartbeat (no WoL)", run.ID, hostID)
|
||||
log.Printf("heartbeat: dispatched run %d for host %d (reboot commanded)", run.ID, hostID)
|
||||
return cmdRebootForVetting, run.ID
|
||||
case model.StateWaitingWoL:
|
||||
case model.StateWaitingReboot, model.StateWaitingWoL:
|
||||
// Tolerate a crashed-mid-reboot retry: the reporter is the
|
||||
// only thing that could be telling us about this host right
|
||||
// now, and WoL is only the fallback anyway. Bound it so a
|
||||
// perpetually-broken PXE doesn't reboot-loop the box.
|
||||
// now. Bound it so a perpetually-broken PXE doesn't
|
||||
// reboot-loop the box.
|
||||
if time.Since(run.StartedAt) < 10*time.Minute {
|
||||
return cmdRebootForVetting, run.ID
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user