diff --git a/internal/api/heartbeat_test.go b/internal/api/heartbeat_test.go
index 08257eb..8e4a9e9 100644
--- a/internal/api/heartbeat_test.go
+++ b/internal/api/heartbeat_test.go
@@ -37,8 +37,8 @@ func setupHeartbeat(t *testing.T) (*api.UI, *store.Hosts) {
}
// setupHeartbeatWithRunner also wires a Runs store + Runner so
-// Phase-2 tests can exercise the Queued → WaitingWoL transition and
-// the 10-minute WaitingWoL re-issue window.
+// Phase-2 tests can exercise the Queued → WaitingReboot transition and
+// the 10-minute retry window on waiting states.
func setupHeartbeatWithRunner(t *testing.T) (*api.UI, *store.Hosts, *store.Runs) {
t.Helper()
conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
@@ -158,17 +158,20 @@ func TestUIHeartbeat_QueuedDispatches(t *testing.T) {
if resp.Cmd != "reboot_for_vetting" || resp.RunID != runID {
t.Fatalf("response = %+v, want cmd=reboot_for_vetting run_id=%d", resp, runID)
}
- // Run advanced Queued → WaitingWoL via the state machine.
+ // Run advanced Queued → WaitingReboot via the state machine.
got, err := runs.Get(ctx, runID)
if err != nil {
t.Fatalf("get run: %v", err)
}
- if got.State != model.StateWaitingWoL {
- t.Fatalf("state = %s, want WaitingWoL", got.State)
+ if got.State != model.StateWaitingReboot {
+ t.Fatalf("state = %s, want WaitingReboot", got.State)
}
}
-func TestUIHeartbeat_WaitingWoLRetries(t *testing.T) {
+// TestUIHeartbeat_WaitingRebootRetries covers the reporter crashing
+// mid-reboot and coming back: the heartbeat must keep returning the
+// reboot command while the run is still young (<10min).
+func TestUIHeartbeat_WaitingRebootRetries(t *testing.T) {
ui, hosts, runs := setupHeartbeatWithRunner(t)
ctx := context.Background()
hostID, err := hosts.Create(ctx, model.Host{
@@ -185,9 +188,7 @@ func TestUIHeartbeat_WaitingWoLRetries(t *testing.T) {
if err != nil {
t.Fatalf("create run: %v", err)
}
- // Simulate: dispatcher already moved the run to WaitingWoL, now
- // the host's reporter comes back from a crashed reboot.
- if err := runs.SetState(ctx, runID, model.StateWaitingWoL); err != nil {
+ if err := runs.SetState(ctx, runID, model.StateWaitingReboot); err != nil {
t.Fatalf("set state: %v", err)
}
diff --git a/internal/api/start_run_test.go b/internal/api/start_run_test.go
new file mode 100644
index 0000000..60a9c1b
--- /dev/null
+++ b/internal/api/start_run_test.go
@@ -0,0 +1,130 @@
+package api_test
+
+import (
+ "context"
+ "fmt"
+ "net/http"
+ "net/http/httptest"
+ "path/filepath"
+ "strings"
+ "testing"
+ "time"
+
+ "github.com/go-chi/chi/v5"
+
+ "vetting/internal/api"
+ "vetting/internal/db"
+ "vetting/internal/events"
+ "vetting/internal/model"
+ "vetting/internal/orchestrator"
+ "vetting/internal/store"
+)
+
+func setupStartRun(t *testing.T) (*api.UI, *store.Hosts, *store.Runs) {
+ t.Helper()
+ conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
+ if err != nil {
+ t.Fatalf("open db: %v", err)
+ }
+ t.Cleanup(func() { _ = conn.Close() })
+ hosts := &store.Hosts{DB: conn}
+ runs := &store.Runs{DB: conn}
+ stages := &store.Stages{DB: conn}
+ hub := events.NewHub()
+ runner := &orchestrator.Runner{Runs: runs, Hosts: hosts, Stages: stages, EventHub: hub}
+ ui := &api.UI{Hosts: hosts, Runs: runs, Runner: runner, EventHub: hub}
+ return ui, hosts, runs
+}
+
+func startRunReq(id int64) *http.Request {
+ req := httptest.NewRequest(http.MethodPost, fmt.Sprintf("/hosts/%d/start", id), nil)
+ rctx := chi.NewRouteContext()
+ rctx.URLParams.Add("id", fmt.Sprintf("%d", id))
+ return req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx))
+}
+
+// TestStartRun_RefusesWhenNeverHeartbeated: operator clicks Start on a
+// host whose reporter has never phoned home. The handler must 409 with
+// a message pointing at the quick.sh install path so the operator knows
+// exactly what to fix.
+func TestStartRun_RefusesWhenNeverHeartbeated(t *testing.T) {
+ ui, hosts, _ := setupStartRun(t)
+ id, err := hosts.Create(context.Background(), model.Host{
+ Name: "sr-never",
+ MAC: "aa:bb:cc:dd:ee:60",
+ WoLBroadcastIP: "10.0.0.255",
+ WoLPort: 9,
+ ExpectedSpecYAML: "memory:\n total_gib: 16\n",
+ })
+ if err != nil {
+ t.Fatalf("create host: %v", err)
+ }
+
+ rr := httptest.NewRecorder()
+ ui.StartRun(rr, startRunReq(id))
+ if rr.Code != http.StatusConflict {
+ t.Fatalf("status = %d, want 409, body=%q", rr.Code, rr.Body.String())
+ }
+ if !strings.Contains(rr.Body.String(), "quick.sh") {
+ t.Fatalf("body missing quick.sh hint: %s", rr.Body.String())
+ }
+}
+
+// TestStartRun_RefusesWhenHeartbeatStale: last heartbeat was 5 minutes
+// ago — well past the 60s staleness cutoff. Same 409 path as never-seen.
+func TestStartRun_RefusesWhenHeartbeatStale(t *testing.T) {
+ ui, hosts, _ := setupStartRun(t)
+ ctx := context.Background()
+ id, err := hosts.Create(ctx, model.Host{
+ Name: "sr-stale",
+ MAC: "aa:bb:cc:dd:ee:61",
+ WoLBroadcastIP: "10.0.0.255",
+ WoLPort: 9,
+ ExpectedSpecYAML: "memory:\n total_gib: 16\n",
+ })
+ if err != nil {
+ t.Fatalf("create host: %v", err)
+ }
+ if err := hosts.UpdateLastSeen(ctx, "aa:bb:cc:dd:ee:61", time.Now().UTC().Add(-5*time.Minute)); err != nil {
+ t.Fatalf("stamp: %v", err)
+ }
+
+ rr := httptest.NewRecorder()
+ ui.StartRun(rr, startRunReq(id))
+ if rr.Code != http.StatusConflict {
+ t.Fatalf("status = %d, want 409", rr.Code)
+ }
+}
+
+// TestStartRun_AcceptsWhenHeartbeating: fresh heartbeat within the
+// staleness window → 303 redirect + a Queued run in the DB.
+func TestStartRun_AcceptsWhenHeartbeating(t *testing.T) {
+ ui, hosts, runs := setupStartRun(t)
+ ctx := context.Background()
+ id, err := hosts.Create(ctx, model.Host{
+ Name: "sr-live",
+ MAC: "aa:bb:cc:dd:ee:62",
+ WoLBroadcastIP: "10.0.0.255",
+ WoLPort: 9,
+ ExpectedSpecYAML: "memory:\n total_gib: 16\n",
+ })
+ if err != nil {
+ t.Fatalf("create host: %v", err)
+ }
+ if err := hosts.UpdateLastSeen(ctx, "aa:bb:cc:dd:ee:62", time.Now().UTC()); err != nil {
+ t.Fatalf("stamp: %v", err)
+ }
+
+ rr := httptest.NewRecorder()
+ ui.StartRun(rr, startRunReq(id))
+ if rr.Code != http.StatusSeeOther {
+ t.Fatalf("status = %d, want 303, body=%q", rr.Code, rr.Body.String())
+ }
+ latest, err := runs.LatestForHost(ctx, id)
+ if err != nil {
+ t.Fatalf("latest: %v", err)
+ }
+ if latest == nil || latest.State != model.StateQueued {
+ t.Fatalf("latest run = %+v, want Queued", latest)
+ }
+}
diff --git a/internal/api/ui_handlers.go b/internal/api/ui_handlers.go
index 2c95f3c..aff370e 100644
--- a/internal/api/ui_handlers.go
+++ b/internal/api/ui_handlers.go
@@ -130,7 +130,10 @@ func (u *UI) HostDetail(w http.ResponseWriter, r *http.Request) {
// StartRun creates a new Run for the host, issues an agent token, and
// transitions Registered→Queued. The dispatcher goroutine picks it up
-// and fires WoL.
+// on its next tick; the happy path is heartbeat-driven (the reporter's
+// next heartbeat fetches reboot_for_vetting). Refuses the click outright
+// if the host isn't currently heartbeating — there is no path from
+// Queued to live-image without an in-OS reporter on the target.
func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
idStr := chi.URLParam(r, "id")
hostID, err := strconv.ParseInt(idStr, 10, 64)
@@ -138,7 +141,8 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
http.Error(w, "bad host id", http.StatusBadRequest)
return
}
- if _, err := u.Hosts.Get(r.Context(), hostID); err != nil {
+ host, err := u.Hosts.Get(r.Context(), hostID)
+ if err != nil {
if errors.Is(err, store.ErrNotFound) {
http.NotFound(w, r)
return
@@ -147,10 +151,20 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
return
}
+ // Preflight: host must be heartbeating. The dispatcher re-checks at
+ // dispatch time (belt-and-braces for the gap between click and tick),
+ // but rejecting here gives the operator an immediate, actionable
+ // error instead of a mysterious Failed run 2s later.
+ if host.LastSeenAt == nil || time.Since(*host.LastSeenAt) > orchestrator.HostHeartbeatStaleAfter {
+ writeJSONError(w, http.StatusConflict,
+ "host is not heartbeating — install the reporter via /register/quick.sh on the target host, then retry")
+ return
+ }
+
// Guard: refuse to start a second run while one is still active.
if latest, err := u.Runs.LatestForHost(r.Context(), hostID); err == nil && latest != nil {
switch latest.State {
- case model.StateCompleted, model.StateReleased, model.StateFailedHolding:
+ case model.StateCompleted, model.StateReleased, model.StateFailed, model.StateFailedHolding:
// ok to start fresh
default:
http.Error(w, "host already has an active run", http.StatusConflict)
@@ -343,11 +357,12 @@ type heartbeatResponse struct {
// pickHostCommand decides what the host-mode agent should do on the
// back of this heartbeat. Returns ("", 0) when there's nothing to do.
//
-// - Queued run → Transition(Dispatched) and tell the agent to reboot.
-// The dispatcher would have WoL'd it anyway; we beat it to the
-// punch so the host skips the WoL dance.
-// - WaitingWoL run created <10min ago → also return reboot, covering
-// "host crashed mid-reboot, systemd brought the reporter back".
+// - Queued run → Transition(RebootCommanded) and tell the agent to
+// reboot. Beats the dispatcher's 2s poll to the punch, but either
+// path ends at WaitingReboot.
+// - WaitingReboot (or legacy WaitingWoL) run <10min old → also return
+// reboot, covering "host crashed mid-reboot, systemd brought the
+// reporter back".
// - anything else → idle.
func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64) {
if u.Runs == nil || u.Runner == nil {
@@ -363,7 +378,7 @@ func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64)
}
switch run.State {
case model.StateQueued:
- if _, err := u.Runner.Transition(ctx, run.ID, orchestrator.TriggerDispatched); err != nil {
+ if _, err := u.Runner.Transition(ctx, run.ID, orchestrator.TriggerRebootCommanded); err != nil {
// Benign race with the dispatcher's own 2s poll — the
// state machine refuses the second transition; we just
// log and return idle so the agent doesn't reboot on a
@@ -371,13 +386,13 @@ func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64)
log.Printf("heartbeat: transition run %d: %v", run.ID, err)
return "", 0
}
- log.Printf("heartbeat: dispatched run %d for host %d via heartbeat (no WoL)", run.ID, hostID)
+ log.Printf("heartbeat: dispatched run %d for host %d (reboot commanded)", run.ID, hostID)
return cmdRebootForVetting, run.ID
- case model.StateWaitingWoL:
+ case model.StateWaitingReboot, model.StateWaitingWoL:
// Tolerate a crashed-mid-reboot retry: the reporter is the
// only thing that could be telling us about this host right
- // now, and WoL is only the fallback anyway. Bound it so a
- // perpetually-broken PXE doesn't reboot-loop the box.
+ // now. Bound it so a perpetually-broken PXE doesn't
+ // reboot-loop the box.
if time.Since(run.StartedAt) < 10*time.Minute {
return cmdRebootForVetting, run.ID
}
diff --git a/internal/model/model.go b/internal/model/model.go
index f7a5103..9f8528e 100644
--- a/internal/model/model.go
+++ b/internal/model/model.go
@@ -23,6 +23,7 @@ const (
StateRegistered RunState = "Registered"
StateQueued RunState = "Queued"
StateWaitingWoL RunState = "WaitingWoL"
+ StateWaitingReboot RunState = "WaitingReboot"
StateBooting RunState = "Booting"
StateInventoryCheck RunState = "InventoryCheck"
StateSpecValidate RunState = "SpecValidate"
diff --git a/internal/orchestrator/dispatcher.go b/internal/orchestrator/dispatcher.go
index 4b1e2e8..85c637f 100644
--- a/internal/orchestrator/dispatcher.go
+++ b/internal/orchestrator/dispatcher.go
@@ -12,14 +12,25 @@ import (
"vetting/internal/store"
)
-// Dispatcher picks Queued runs off the DB and drives them through
-// WaitingWoL (sending a WoL packet). Concurrency is capped at Max.
+// HostHeartbeatStaleAfter is how long we tolerate a host's last_seen_at
+// being in the past before treating the host as offline. Set to 2× the
+// default reporter heartbeat interval (30s) so a single dropped heartbeat
+// doesn't block dispatch. Used by the StartRun preflight and the
+// dispatcher itself — both must agree or the operator's click-time
+// validation wouldn't match the dispatch-time check.
+const HostHeartbeatStaleAfter = 60 * time.Second
+
+// Dispatcher picks Queued runs off the DB and drives them to
+// WaitingReboot — the happy path is heartbeat-first: we transition and
+// rely on the host-mode reporter's next heartbeat to fetch the
+// reboot_for_vetting command. WoL is not fired in the default flow
+// because every supported host already runs the reporter in-OS.
//
-// Pre-stage log lines (picked, WoL-sent, heartbeat, agent-claimed)
-// are written into the per-run log via Logs so the detail page's
-// log pane can show what's happening before the agent is alive.
+// Pre-stage log lines (picked, heartbeating, agent-claimed) are
+// written into the per-run log via Logs so the detail page's log pane
+// can show what's happening before the agent is alive.
//
-// For Phase 2 the dispatcher's job ends at WaitingWoL; further
+// For Phase 2 the dispatcher's job ends at WaitingReboot; further
// transitions are driven by iPXE and agent callbacks. Phase 4+ will
// return here and shepherd each run through stage execution.
type Dispatcher struct {
@@ -107,10 +118,10 @@ func (d *Dispatcher) pickNext(ctx context.Context) {
if queued == nil {
queued = &runs[i]
}
- case model.StateWaitingWoL, model.StateBooting, model.StateInventoryCheck,
- model.StateSpecValidate, model.StateSMART, model.StateCPUStress,
- model.StateStorage, model.StateNetwork, model.StateGPU,
- model.StatePSU, model.StateReporting:
+ case model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting,
+ model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART,
+ model.StateCPUStress, model.StateStorage, model.StateNetwork,
+ model.StateGPU, model.StatePSU, model.StateReporting:
inFlight++
}
}
@@ -124,23 +135,43 @@ func (d *Dispatcher) pickNext(ctx context.Context) {
log.Printf("dispatcher: get host %d: %v", queued.HostID, err)
return
}
- d.runLog(queued.ID, "info", fmt.Sprintf("dispatcher: picked run for host %s (mac=%s wol=%s:%d)",
- host.Name, host.MAC, host.WoLBroadcastIP, host.WoLPort))
- if _, err := d.Runner.Transition(ctx, queued.ID, TriggerDispatched); err != nil {
+
+ // Heartbeat gate: the StartRun preflight catches this at click time,
+ // but a run can sit in Queued long enough for the host to go offline
+ // between click and dispatch. Re-check here so we never fire a
+ // reboot command at a host that can't receive it.
+ if host.LastSeenAt == nil || time.Since(*host.LastSeenAt) > HostHeartbeatStaleAfter {
+ var ageMsg string
+ if host.LastSeenAt == nil {
+ ageMsg = "never heartbeated"
+ } else {
+ ageMsg = fmt.Sprintf("last heartbeat %s ago", time.Since(*host.LastSeenAt).Truncate(time.Second))
+ }
+ d.runLog(queued.ID, "error", fmt.Sprintf(
+ "dispatcher: host %s is offline (%s) — refusing to dispatch; install the reporter via /register/quick.sh on the target and retry",
+ host.Name, ageMsg))
+ if err := d.Runs.MarkDispatchFailed(ctx, queued.ID, "dispatch", "host stopped heartbeating before dispatch"); err != nil {
+ log.Printf("dispatcher: mark run %d dispatch-failed: %v", queued.ID, err)
+ }
+ if d.Runner != nil {
+ d.Runner.PublishTileUpdate(ctx, host.ID)
+ }
+ return
+ }
+
+ age := time.Since(*host.LastSeenAt).Truncate(time.Second)
+ d.runLog(queued.ID, "info", fmt.Sprintf(
+ "dispatcher: picked run for host %s (mac=%s, heartbeating, last seen %s ago)",
+ host.Name, host.MAC, age))
+ if _, err := d.Runner.Transition(ctx, queued.ID, TriggerRebootCommanded); err != nil {
log.Printf("dispatcher: transition run %d: %v", queued.ID, err)
- d.runLog(queued.ID, "error", fmt.Sprintf("dispatcher: transition to WaitingWoL failed: %v", err))
+ d.runLog(queued.ID, "error", fmt.Sprintf("dispatcher: transition to WaitingReboot failed: %v", err))
return
}
- if err := SendWoL(host.MAC, host.WoLBroadcastIP, host.WoLPort); err != nil {
- log.Printf("dispatcher: WoL run %d host %s: %v", queued.ID, host.Name, err)
- d.runLog(queued.ID, "error", fmt.Sprintf("dispatcher: WoL send failed: %v — check broadcast %s:%d is reachable",
- err, host.WoLBroadcastIP, host.WoLPort))
- // Stay in WaitingWoL; operator can retry or investigate.
- return
- }
- log.Printf("dispatcher: WoL sent for run %d (host=%s mac=%s)", queued.ID, host.Name, host.MAC)
- d.runLog(queued.ID, "info", fmt.Sprintf("dispatcher: sent WoL packet to %s via %s:%d — waiting for agent claim",
- host.MAC, host.WoLBroadcastIP, host.WoLPort))
+ log.Printf("dispatcher: run %d host %s → WaitingReboot (heartbeat-driven)", queued.ID, host.Name)
+ d.runLog(queued.ID, "info", fmt.Sprintf(
+ "dispatcher: host %s heartbeating — waiting for next reporter heartbeat to deliver reboot_for_vetting",
+ host.Name))
// Prime the heartbeat so the first "still waiting" fires 30s after
// dispatch, not immediately.
@@ -155,8 +186,8 @@ func (d *Dispatcher) pickNext(ctx context.Context) {
}
// heartbeatWaiting emits a "still waiting" log line every beatEvery for
-// each run still sitting in WaitingWoL. Helps the operator spot hangs
-// without having to tail journalctl on the LXC.
+// each run still sitting in WaitingReboot (or legacy WaitingWoL). Helps
+// the operator spot hangs without having to tail journalctl on the LXC.
func (d *Dispatcher) heartbeatWaiting(ctx context.Context) {
if d.Logs == nil {
return
@@ -172,13 +203,13 @@ func (d *Dispatcher) heartbeatWaiting(ctx context.Context) {
for i := range runs {
r := &runs[i]
seen[r.ID] = true
- if r.State != model.StateWaitingWoL {
+ if r.State != model.StateWaitingReboot && r.State != model.StateWaitingWoL {
continue
}
last, ok := d.lastBeat[r.ID]
if !ok {
- // Run already in WaitingWoL from a previous process lifetime
- // — prime so we don't spam immediately.
+ // Run already waiting from a previous process lifetime — prime
+ // so we don't spam immediately.
d.lastBeat[r.ID] = now
continue
}
@@ -187,11 +218,11 @@ func (d *Dispatcher) heartbeatWaiting(ctx context.Context) {
}
elapsed := now.Sub(r.StartedAt).Truncate(time.Second)
d.runLog(r.ID, "info", fmt.Sprintf(
- "still waiting for agent claim (%s) — check BIOS WoL, pxe.enabled, and live-image presence",
+ "waiting for reporter to reboot + PXE-boot into live image (%s) — if this exceeds 2m, verify pxe.enabled in vetting.yaml and that the reporter actually invoked systemctl reboot",
elapsed))
d.lastBeat[r.ID] = now
}
- // Garbage-collect entries for runs that have left WaitingWoL.
+ // Garbage-collect entries for runs that have left the waiting states.
for id := range d.lastBeat {
if !seen[id] {
delete(d.lastBeat, id)
diff --git a/internal/orchestrator/dispatcher_test.go b/internal/orchestrator/dispatcher_test.go
index 4dea228..699c697 100644
--- a/internal/orchestrator/dispatcher_test.go
+++ b/internal/orchestrator/dispatcher_test.go
@@ -1,15 +1,64 @@
package orchestrator
import (
+ "context"
"os"
"path/filepath"
"strings"
"testing"
+ "time"
+ "vetting/internal/db"
"vetting/internal/events"
"vetting/internal/logs"
+ "vetting/internal/model"
+ "vetting/internal/store"
)
+// setupPickNext wires a real SQLite DB so pickNext can exercise the
+// full Hosts/Runs/Runner path. Returns the dispatcher + seeded host ID +
+// a cleanup. Host starts with a fresh heartbeat stamp so the default is
+// "dispatch would succeed"; callers stale it out as needed.
+func setupPickNext(t *testing.T) (*Dispatcher, *store.Hosts, *store.Runs, int64, func()) {
+ t.Helper()
+ conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
+ if err != nil {
+ t.Fatalf("open db: %v", err)
+ }
+ hosts := &store.Hosts{DB: conn}
+ runs := &store.Runs{DB: conn}
+ stages := &store.Stages{DB: conn}
+ hub := events.NewHub()
+ runner := &Runner{Runs: runs, Hosts: hosts, Stages: stages, EventHub: hub}
+ logDir := t.TempDir()
+ lh, err := logs.NewHub(logDir, hub)
+ if err != nil {
+ t.Fatalf("NewHub: %v", err)
+ }
+ d := NewDispatcher(3, runs, hosts, runner, lh)
+
+ ctx := context.Background()
+ hostID, err := hosts.Create(ctx, model.Host{
+ Name: "pn-host",
+ MAC: "aa:bb:cc:dd:ee:50",
+ WoLBroadcastIP: "10.0.0.255",
+ WoLPort: 9,
+ ExpectedSpecYAML: "memory:\n total_gib: 16\n",
+ })
+ if err != nil {
+ t.Fatalf("create host: %v", err)
+ }
+ // Default: heartbeating now.
+ if err := hosts.UpdateLastSeen(ctx, "aa:bb:cc:dd:ee:50", time.Now().UTC()); err != nil {
+ t.Fatalf("stamp: %v", err)
+ }
+ cleanup := func() {
+ lh.Close()
+ _ = conn.Close()
+ }
+ return d, hosts, runs, hostID, cleanup
+}
+
// TestDispatcher_RunLogWritesToHub verifies the plumbing between the
// dispatcher and the per-run log hub: runLog must persist to the on-disk
// file so the detail page's replay + SSE fan-out see the same
@@ -45,3 +94,105 @@ func TestDispatcher_RunLogNilHubDoesNotPanic(t *testing.T) {
d := &Dispatcher{}
d.runLog(1, "info", "fallback path")
}
+
+// TestDispatcher_TransitionsToWaitingRebootNoWoL: happy path. Host is
+// heartbeating, run is Queued — one pickNext tick must transition to
+// WaitingReboot via the new RebootCommanded trigger and log that the
+// host is heartbeating. No "sent WoL packet" line allowed.
+func TestDispatcher_TransitionsToWaitingRebootNoWoL(t *testing.T) {
+ d, _, runs, hostID, cleanup := setupPickNext(t)
+ defer cleanup()
+ ctx := context.Background()
+ runID, err := runs.Create(ctx, hostID, "deadbeef")
+ if err != nil {
+ t.Fatalf("create run: %v", err)
+ }
+
+ d.pickNext(ctx)
+
+ got, err := runs.Get(ctx, runID)
+ if err != nil {
+ t.Fatalf("get run: %v", err)
+ }
+ if got.State != model.StateWaitingReboot {
+ t.Fatalf("state = %s, want WaitingReboot", got.State)
+ }
+ body, err := os.ReadFile(filepath.Join(d.Logs.PathFor(runID))) //nolint:staticcheck
+ if err != nil {
+ t.Fatalf("read log: %v", err)
+ }
+ text := string(body)
+ if strings.Contains(text, "sent WoL packet") {
+ t.Fatalf("dispatcher should not fire WoL on heartbeating host: %s", text)
+ }
+ if !strings.Contains(text, "heartbeating") {
+ t.Fatalf("missing heartbeating log line: %s", text)
+ }
+}
+
+// TestDispatcher_FailsStaleHeartbeat: host hasn't heartbeat for >60s.
+// Dispatcher must refuse, mark the run Failed with failed_stage=dispatch,
+// and log at error level — not loop forever on an unreachable box.
+func TestDispatcher_FailsStaleHeartbeat(t *testing.T) {
+ d, hosts, runs, hostID, cleanup := setupPickNext(t)
+ defer cleanup()
+ ctx := context.Background()
+ // Stale: 5m ago is well past the 60s cutoff.
+ if err := hosts.UpdateLastSeen(ctx, "aa:bb:cc:dd:ee:50", time.Now().UTC().Add(-5*time.Minute)); err != nil {
+ t.Fatalf("stamp stale: %v", err)
+ }
+ runID, err := runs.Create(ctx, hostID, "deadbeef")
+ if err != nil {
+ t.Fatalf("create run: %v", err)
+ }
+
+ d.pickNext(ctx)
+
+ got, err := runs.Get(ctx, runID)
+ if err != nil {
+ t.Fatalf("get run: %v", err)
+ }
+ if got.State != model.StateFailed {
+ t.Fatalf("state = %s, want Failed", got.State)
+ }
+ if got.FailedStage != "dispatch" {
+ t.Fatalf("failed_stage = %q, want dispatch", got.FailedStage)
+ }
+ body, _ := os.ReadFile(d.Logs.PathFor(runID))
+ if !strings.Contains(string(body), "quick.sh") {
+ t.Fatalf("expected quick.sh hint in run log: %s", body)
+ }
+}
+
+// TestDispatcher_FailsNeverSeenHost mirrors the stale-heartbeat test for
+// a host that has never heartbeated at all — LastSeenAt is NULL.
+func TestDispatcher_FailsNeverSeenHost(t *testing.T) {
+ d, hosts, runs, _, cleanup := setupPickNext(t)
+ defer cleanup()
+ ctx := context.Background()
+ // Create a fresh host with no heartbeat stamp.
+ neverID, err := hosts.Create(ctx, model.Host{
+ Name: "pn-never",
+ MAC: "aa:bb:cc:dd:ee:51",
+ WoLBroadcastIP: "10.0.0.255",
+ WoLPort: 9,
+ ExpectedSpecYAML: "memory:\n total_gib: 16\n",
+ })
+ if err != nil {
+ t.Fatalf("create host: %v", err)
+ }
+ runID, err := runs.Create(ctx, neverID, "deadbeef")
+ if err != nil {
+ t.Fatalf("create run: %v", err)
+ }
+
+ d.pickNext(ctx)
+
+ got, err := runs.Get(ctx, runID)
+ if err != nil {
+ t.Fatalf("get run: %v", err)
+ }
+ if got.State != model.StateFailed {
+ t.Fatalf("state = %s, want Failed", got.State)
+ }
+}
diff --git a/internal/orchestrator/statemachine.go b/internal/orchestrator/statemachine.go
index d8921b6..b40eaf9 100644
--- a/internal/orchestrator/statemachine.go
+++ b/internal/orchestrator/statemachine.go
@@ -11,7 +11,8 @@ type Trigger string
const (
TriggerStartRequested Trigger = "StartRequested" // user clicks Start Vetting
- TriggerDispatched Trigger = "Dispatched" // dispatcher picked this run
+ TriggerDispatched Trigger = "Dispatched" // dispatcher picked this run (manual-WoL override path; dormant in happy path)
+ TriggerRebootCommanded Trigger = "RebootCommanded" // dispatcher (or heartbeat race) told the reporter to reboot
TriggerPXEObserved Trigger = "PXEObserved" // iPXE fetched cmdline for MAC
TriggerAgentClaimed Trigger = "AgentClaimed" // agent POSTed /claim with valid token
TriggerStageFailed Trigger = "StageFailed" // a stage reported failure
@@ -59,8 +60,9 @@ type transition struct {
var table = map[Trigger]transition{
TriggerStartRequested: {from: []model.RunState{model.StateRegistered}, to: model.StateQueued},
TriggerDispatched: {from: []model.RunState{model.StateQueued}, to: model.StateWaitingWoL},
- TriggerPXEObserved: {from: []model.RunState{model.StateWaitingWoL, model.StateBooting}, to: model.StateBooting},
- TriggerAgentClaimed: {from: []model.RunState{model.StateBooting, model.StateWaitingWoL}, to: model.StateInventoryCheck},
+ TriggerRebootCommanded: {from: []model.RunState{model.StateQueued}, to: model.StateWaitingReboot},
+ TriggerPXEObserved: {from: []model.RunState{model.StateWaitingReboot, model.StateWaitingWoL, model.StateBooting}, to: model.StateBooting},
+ TriggerAgentClaimed: {from: []model.RunState{model.StateBooting, model.StateWaitingReboot, model.StateWaitingWoL}, to: model.StateInventoryCheck},
TriggerStageFailed: {from: allActiveStates(), to: model.StateFailedHolding},
TriggerAllStagesPassed: {from: []model.RunState{model.StateReporting}, to: model.StateCompleted},
TriggerOperatorReleased: {from: []model.RunState{model.StateFailedHolding}, to: model.StateReleased},
@@ -121,7 +123,7 @@ func nextStageState(current model.RunState) (model.RunState, error) {
func allActiveStates() []model.RunState {
return []model.RunState{
- model.StateQueued, model.StateWaitingWoL, model.StateBooting,
+ model.StateQueued, model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting,
model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART,
model.StateCPUStress, model.StateStorage, model.StateNetwork,
model.StateGPU, model.StatePSU, model.StateReporting,
diff --git a/internal/orchestrator/statemachine_test.go b/internal/orchestrator/statemachine_test.go
index 33a68c1..b9ad812 100644
--- a/internal/orchestrator/statemachine_test.go
+++ b/internal/orchestrator/statemachine_test.go
@@ -40,6 +40,40 @@ func TestNextForOverride(t *testing.T) {
}
}
+// TestTriggerRebootCommanded exercises the new heartbeat-first trigger:
+// Queued → WaitingReboot, and any other current state is an error.
+func TestTriggerRebootCommanded(t *testing.T) {
+ got, err := orchestrator.Next(model.StateQueued, orchestrator.TriggerRebootCommanded)
+ if err != nil {
+ t.Fatalf("Queued + RebootCommanded: %v", err)
+ }
+ if got != model.StateWaitingReboot {
+ t.Fatalf("got %q, want %q", got, model.StateWaitingReboot)
+ }
+ for _, bad := range []model.RunState{
+ model.StateRegistered, model.StateBooting, model.StateInventoryCheck, model.StateCompleted,
+ } {
+ if _, err := orchestrator.Next(bad, orchestrator.TriggerRebootCommanded); err == nil {
+ t.Fatalf("RebootCommanded from %q: expected error", bad)
+ }
+ }
+}
+
+// TestTriggerAgentClaimedFromWaitingReboot: the agent's /claim must
+// advance the run out of WaitingReboot (new happy path) AND out of
+// legacy WaitingWoL, otherwise live boots wouldn't be recognised.
+func TestTriggerAgentClaimedFromWaitingReboot(t *testing.T) {
+ for _, from := range []model.RunState{model.StateWaitingReboot, model.StateWaitingWoL, model.StateBooting} {
+ got, err := orchestrator.Next(from, orchestrator.TriggerAgentClaimed)
+ if err != nil {
+ t.Fatalf("AgentClaimed from %q: %v", from, err)
+ }
+ if got != model.StateInventoryCheck {
+ t.Fatalf("AgentClaimed from %q = %q, want InventoryCheck", from, got)
+ }
+ }
+}
+
func TestNextStageWalk(t *testing.T) {
// Walking StageCompleted from each stage should land on the next
// one in the canonical order, and from Reporting onto Completed.
diff --git a/internal/store/runs.go b/internal/store/runs.go
index 70c8e14..3897e83 100644
--- a/internal/store/runs.go
+++ b/internal/store/runs.go
@@ -76,6 +76,19 @@ func (r *Runs) MarkFailed(ctx context.Context, runID int64, failedStage, holdIP
return err
}
+// MarkDispatchFailed records a terminal failure discovered before the run
+// ever reached a live image, e.g. the dispatcher refused to start because
+// the host isn't heartbeating. Goes to StateFailed (not FailedHolding)
+// because there's no live image to ssh into.
+func (r *Runs) MarkDispatchFailed(ctx context.Context, runID int64, failedStage, result string) error {
+ now := time.Now().UTC()
+ _, err := r.DB.ExecContext(ctx, `
+ UPDATE runs SET state = ?, result = ?, failed_stage = ?, completed_at = ?
+ WHERE id = ?
+ `, string(model.StateFailed), result, failedStage, now, runID)
+ return err
+}
+
func (r *Runs) MarkCompleted(ctx context.Context, runID int64, reportPath string) error {
now := time.Now().UTC()
_, err := r.DB.ExecContext(ctx, `
diff --git a/internal/web/templates/host_detail.templ b/internal/web/templates/host_detail.templ
index 3415cac..f6f68d0 100644
--- a/internal/web/templates/host_detail.templ
+++ b/internal/web/templates/host_detail.templ
@@ -88,10 +88,12 @@ templ HostDetail(d HostDetailData) {
")
+ templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, " ")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var28 string
templ_7745c5c3_Var28, templ_7745c5c3_Err = templ.JoinStringErrs(d.Tile.Host.Notes)
if templ_7745c5c3_Err != nil {
- return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 139, Col: 29}
+ return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 141, Col: 29}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var28))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
- templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, "Actions
Spec diffs (")
+ templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 38, ">
Spec diffs (")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var22 string
templ_7745c5c3_Var22, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", len(d.SpecDiffs)))
if templ_7745c5c3_Err != nil {
- return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 115, Col: 68}
+ return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 117, Col: 68}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var22))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
- templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 38, ")
")
+ templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 39, ")
")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
@@ -418,7 +423,7 @@ func HostDetail(d HostDetailData) templ.Component {
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
- templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 39, "
")
+ templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 42, "")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var26 string
templ_7745c5c3_Var26, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Expected)
if templ_7745c5c3_Err != nil {
- return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 120, Col: 67}
+ return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 122, Col: 67}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var26))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
- templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 42, "")
+ templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 43, "")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var27 string
templ_7745c5c3_Var27, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Actual)
if templ_7745c5c3_Err != nil {
- return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 121, Col: 61}
+ return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 123, Col: 61}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var27))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
- templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 43, "
")
+ templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 46, "Host details
")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
if d.Tile.Host.Notes != "" {
- templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 46, "Host details
Notes
Notes
Expected spec
")
+ templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 49, "
Expected spec
")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var29 string
templ_7745c5c3_Var29, templ_7745c5c3_Err = templ.JoinStringErrs(d.Tile.Host.ExpectedSpecYAML)
if templ_7745c5c3_Err != nil {
- return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 144, Col: 66}
+ return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 146, Col: 66}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var29))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
- templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 49, "Log
Log