diff --git a/internal/api/heartbeat_test.go b/internal/api/heartbeat_test.go index 08257eb..8e4a9e9 100644 --- a/internal/api/heartbeat_test.go +++ b/internal/api/heartbeat_test.go @@ -37,8 +37,8 @@ func setupHeartbeat(t *testing.T) (*api.UI, *store.Hosts) { } // setupHeartbeatWithRunner also wires a Runs store + Runner so -// Phase-2 tests can exercise the Queued → WaitingWoL transition and -// the 10-minute WaitingWoL re-issue window. +// Phase-2 tests can exercise the Queued → WaitingReboot transition and +// the 10-minute retry window on waiting states. func setupHeartbeatWithRunner(t *testing.T) (*api.UI, *store.Hosts, *store.Runs) { t.Helper() conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db")) @@ -158,17 +158,20 @@ func TestUIHeartbeat_QueuedDispatches(t *testing.T) { if resp.Cmd != "reboot_for_vetting" || resp.RunID != runID { t.Fatalf("response = %+v, want cmd=reboot_for_vetting run_id=%d", resp, runID) } - // Run advanced Queued → WaitingWoL via the state machine. + // Run advanced Queued → WaitingReboot via the state machine. got, err := runs.Get(ctx, runID) if err != nil { t.Fatalf("get run: %v", err) } - if got.State != model.StateWaitingWoL { - t.Fatalf("state = %s, want WaitingWoL", got.State) + if got.State != model.StateWaitingReboot { + t.Fatalf("state = %s, want WaitingReboot", got.State) } } -func TestUIHeartbeat_WaitingWoLRetries(t *testing.T) { +// TestUIHeartbeat_WaitingRebootRetries covers the reporter crashing +// mid-reboot and coming back: the heartbeat must keep returning the +// reboot command while the run is still young (<10min). +func TestUIHeartbeat_WaitingRebootRetries(t *testing.T) { ui, hosts, runs := setupHeartbeatWithRunner(t) ctx := context.Background() hostID, err := hosts.Create(ctx, model.Host{ @@ -185,9 +188,7 @@ func TestUIHeartbeat_WaitingWoLRetries(t *testing.T) { if err != nil { t.Fatalf("create run: %v", err) } - // Simulate: dispatcher already moved the run to WaitingWoL, now - // the host's reporter comes back from a crashed reboot. - if err := runs.SetState(ctx, runID, model.StateWaitingWoL); err != nil { + if err := runs.SetState(ctx, runID, model.StateWaitingReboot); err != nil { t.Fatalf("set state: %v", err) } diff --git a/internal/api/start_run_test.go b/internal/api/start_run_test.go new file mode 100644 index 0000000..60a9c1b --- /dev/null +++ b/internal/api/start_run_test.go @@ -0,0 +1,130 @@ +package api_test + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/go-chi/chi/v5" + + "vetting/internal/api" + "vetting/internal/db" + "vetting/internal/events" + "vetting/internal/model" + "vetting/internal/orchestrator" + "vetting/internal/store" +) + +func setupStartRun(t *testing.T) (*api.UI, *store.Hosts, *store.Runs) { + t.Helper() + conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db")) + if err != nil { + t.Fatalf("open db: %v", err) + } + t.Cleanup(func() { _ = conn.Close() }) + hosts := &store.Hosts{DB: conn} + runs := &store.Runs{DB: conn} + stages := &store.Stages{DB: conn} + hub := events.NewHub() + runner := &orchestrator.Runner{Runs: runs, Hosts: hosts, Stages: stages, EventHub: hub} + ui := &api.UI{Hosts: hosts, Runs: runs, Runner: runner, EventHub: hub} + return ui, hosts, runs +} + +func startRunReq(id int64) *http.Request { + req := httptest.NewRequest(http.MethodPost, fmt.Sprintf("/hosts/%d/start", id), nil) + rctx := chi.NewRouteContext() + rctx.URLParams.Add("id", fmt.Sprintf("%d", id)) + return req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx)) +} + +// TestStartRun_RefusesWhenNeverHeartbeated: operator clicks Start on a +// host whose reporter has never phoned home. The handler must 409 with +// a message pointing at the quick.sh install path so the operator knows +// exactly what to fix. +func TestStartRun_RefusesWhenNeverHeartbeated(t *testing.T) { + ui, hosts, _ := setupStartRun(t) + id, err := hosts.Create(context.Background(), model.Host{ + Name: "sr-never", + MAC: "aa:bb:cc:dd:ee:60", + WoLBroadcastIP: "10.0.0.255", + WoLPort: 9, + ExpectedSpecYAML: "memory:\n total_gib: 16\n", + }) + if err != nil { + t.Fatalf("create host: %v", err) + } + + rr := httptest.NewRecorder() + ui.StartRun(rr, startRunReq(id)) + if rr.Code != http.StatusConflict { + t.Fatalf("status = %d, want 409, body=%q", rr.Code, rr.Body.String()) + } + if !strings.Contains(rr.Body.String(), "quick.sh") { + t.Fatalf("body missing quick.sh hint: %s", rr.Body.String()) + } +} + +// TestStartRun_RefusesWhenHeartbeatStale: last heartbeat was 5 minutes +// ago — well past the 60s staleness cutoff. Same 409 path as never-seen. +func TestStartRun_RefusesWhenHeartbeatStale(t *testing.T) { + ui, hosts, _ := setupStartRun(t) + ctx := context.Background() + id, err := hosts.Create(ctx, model.Host{ + Name: "sr-stale", + MAC: "aa:bb:cc:dd:ee:61", + WoLBroadcastIP: "10.0.0.255", + WoLPort: 9, + ExpectedSpecYAML: "memory:\n total_gib: 16\n", + }) + if err != nil { + t.Fatalf("create host: %v", err) + } + if err := hosts.UpdateLastSeen(ctx, "aa:bb:cc:dd:ee:61", time.Now().UTC().Add(-5*time.Minute)); err != nil { + t.Fatalf("stamp: %v", err) + } + + rr := httptest.NewRecorder() + ui.StartRun(rr, startRunReq(id)) + if rr.Code != http.StatusConflict { + t.Fatalf("status = %d, want 409", rr.Code) + } +} + +// TestStartRun_AcceptsWhenHeartbeating: fresh heartbeat within the +// staleness window → 303 redirect + a Queued run in the DB. +func TestStartRun_AcceptsWhenHeartbeating(t *testing.T) { + ui, hosts, runs := setupStartRun(t) + ctx := context.Background() + id, err := hosts.Create(ctx, model.Host{ + Name: "sr-live", + MAC: "aa:bb:cc:dd:ee:62", + WoLBroadcastIP: "10.0.0.255", + WoLPort: 9, + ExpectedSpecYAML: "memory:\n total_gib: 16\n", + }) + if err != nil { + t.Fatalf("create host: %v", err) + } + if err := hosts.UpdateLastSeen(ctx, "aa:bb:cc:dd:ee:62", time.Now().UTC()); err != nil { + t.Fatalf("stamp: %v", err) + } + + rr := httptest.NewRecorder() + ui.StartRun(rr, startRunReq(id)) + if rr.Code != http.StatusSeeOther { + t.Fatalf("status = %d, want 303, body=%q", rr.Code, rr.Body.String()) + } + latest, err := runs.LatestForHost(ctx, id) + if err != nil { + t.Fatalf("latest: %v", err) + } + if latest == nil || latest.State != model.StateQueued { + t.Fatalf("latest run = %+v, want Queued", latest) + } +} diff --git a/internal/api/ui_handlers.go b/internal/api/ui_handlers.go index 2c95f3c..aff370e 100644 --- a/internal/api/ui_handlers.go +++ b/internal/api/ui_handlers.go @@ -130,7 +130,10 @@ func (u *UI) HostDetail(w http.ResponseWriter, r *http.Request) { // StartRun creates a new Run for the host, issues an agent token, and // transitions Registered→Queued. The dispatcher goroutine picks it up -// and fires WoL. +// on its next tick; the happy path is heartbeat-driven (the reporter's +// next heartbeat fetches reboot_for_vetting). Refuses the click outright +// if the host isn't currently heartbeating — there is no path from +// Queued to live-image without an in-OS reporter on the target. func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) { idStr := chi.URLParam(r, "id") hostID, err := strconv.ParseInt(idStr, 10, 64) @@ -138,7 +141,8 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) { http.Error(w, "bad host id", http.StatusBadRequest) return } - if _, err := u.Hosts.Get(r.Context(), hostID); err != nil { + host, err := u.Hosts.Get(r.Context(), hostID) + if err != nil { if errors.Is(err, store.ErrNotFound) { http.NotFound(w, r) return @@ -147,10 +151,20 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) { return } + // Preflight: host must be heartbeating. The dispatcher re-checks at + // dispatch time (belt-and-braces for the gap between click and tick), + // but rejecting here gives the operator an immediate, actionable + // error instead of a mysterious Failed run 2s later. + if host.LastSeenAt == nil || time.Since(*host.LastSeenAt) > orchestrator.HostHeartbeatStaleAfter { + writeJSONError(w, http.StatusConflict, + "host is not heartbeating — install the reporter via /register/quick.sh on the target host, then retry") + return + } + // Guard: refuse to start a second run while one is still active. if latest, err := u.Runs.LatestForHost(r.Context(), hostID); err == nil && latest != nil { switch latest.State { - case model.StateCompleted, model.StateReleased, model.StateFailedHolding: + case model.StateCompleted, model.StateReleased, model.StateFailed, model.StateFailedHolding: // ok to start fresh default: http.Error(w, "host already has an active run", http.StatusConflict) @@ -343,11 +357,12 @@ type heartbeatResponse struct { // pickHostCommand decides what the host-mode agent should do on the // back of this heartbeat. Returns ("", 0) when there's nothing to do. // -// - Queued run → Transition(Dispatched) and tell the agent to reboot. -// The dispatcher would have WoL'd it anyway; we beat it to the -// punch so the host skips the WoL dance. -// - WaitingWoL run created <10min ago → also return reboot, covering -// "host crashed mid-reboot, systemd brought the reporter back". +// - Queued run → Transition(RebootCommanded) and tell the agent to +// reboot. Beats the dispatcher's 2s poll to the punch, but either +// path ends at WaitingReboot. +// - WaitingReboot (or legacy WaitingWoL) run <10min old → also return +// reboot, covering "host crashed mid-reboot, systemd brought the +// reporter back". // - anything else → idle. func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64) { if u.Runs == nil || u.Runner == nil { @@ -363,7 +378,7 @@ func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64) } switch run.State { case model.StateQueued: - if _, err := u.Runner.Transition(ctx, run.ID, orchestrator.TriggerDispatched); err != nil { + if _, err := u.Runner.Transition(ctx, run.ID, orchestrator.TriggerRebootCommanded); err != nil { // Benign race with the dispatcher's own 2s poll — the // state machine refuses the second transition; we just // log and return idle so the agent doesn't reboot on a @@ -371,13 +386,13 @@ func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64) log.Printf("heartbeat: transition run %d: %v", run.ID, err) return "", 0 } - log.Printf("heartbeat: dispatched run %d for host %d via heartbeat (no WoL)", run.ID, hostID) + log.Printf("heartbeat: dispatched run %d for host %d (reboot commanded)", run.ID, hostID) return cmdRebootForVetting, run.ID - case model.StateWaitingWoL: + case model.StateWaitingReboot, model.StateWaitingWoL: // Tolerate a crashed-mid-reboot retry: the reporter is the // only thing that could be telling us about this host right - // now, and WoL is only the fallback anyway. Bound it so a - // perpetually-broken PXE doesn't reboot-loop the box. + // now. Bound it so a perpetually-broken PXE doesn't + // reboot-loop the box. if time.Since(run.StartedAt) < 10*time.Minute { return cmdRebootForVetting, run.ID } diff --git a/internal/model/model.go b/internal/model/model.go index f7a5103..9f8528e 100644 --- a/internal/model/model.go +++ b/internal/model/model.go @@ -23,6 +23,7 @@ const ( StateRegistered RunState = "Registered" StateQueued RunState = "Queued" StateWaitingWoL RunState = "WaitingWoL" + StateWaitingReboot RunState = "WaitingReboot" StateBooting RunState = "Booting" StateInventoryCheck RunState = "InventoryCheck" StateSpecValidate RunState = "SpecValidate" diff --git a/internal/orchestrator/dispatcher.go b/internal/orchestrator/dispatcher.go index 4b1e2e8..85c637f 100644 --- a/internal/orchestrator/dispatcher.go +++ b/internal/orchestrator/dispatcher.go @@ -12,14 +12,25 @@ import ( "vetting/internal/store" ) -// Dispatcher picks Queued runs off the DB and drives them through -// WaitingWoL (sending a WoL packet). Concurrency is capped at Max. +// HostHeartbeatStaleAfter is how long we tolerate a host's last_seen_at +// being in the past before treating the host as offline. Set to 2× the +// default reporter heartbeat interval (30s) so a single dropped heartbeat +// doesn't block dispatch. Used by the StartRun preflight and the +// dispatcher itself — both must agree or the operator's click-time +// validation wouldn't match the dispatch-time check. +const HostHeartbeatStaleAfter = 60 * time.Second + +// Dispatcher picks Queued runs off the DB and drives them to +// WaitingReboot — the happy path is heartbeat-first: we transition and +// rely on the host-mode reporter's next heartbeat to fetch the +// reboot_for_vetting command. WoL is not fired in the default flow +// because every supported host already runs the reporter in-OS. // -// Pre-stage log lines (picked, WoL-sent, heartbeat, agent-claimed) -// are written into the per-run log via Logs so the detail page's -// log pane can show what's happening before the agent is alive. +// Pre-stage log lines (picked, heartbeating, agent-claimed) are +// written into the per-run log via Logs so the detail page's log pane +// can show what's happening before the agent is alive. // -// For Phase 2 the dispatcher's job ends at WaitingWoL; further +// For Phase 2 the dispatcher's job ends at WaitingReboot; further // transitions are driven by iPXE and agent callbacks. Phase 4+ will // return here and shepherd each run through stage execution. type Dispatcher struct { @@ -107,10 +118,10 @@ func (d *Dispatcher) pickNext(ctx context.Context) { if queued == nil { queued = &runs[i] } - case model.StateWaitingWoL, model.StateBooting, model.StateInventoryCheck, - model.StateSpecValidate, model.StateSMART, model.StateCPUStress, - model.StateStorage, model.StateNetwork, model.StateGPU, - model.StatePSU, model.StateReporting: + case model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting, + model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART, + model.StateCPUStress, model.StateStorage, model.StateNetwork, + model.StateGPU, model.StatePSU, model.StateReporting: inFlight++ } } @@ -124,23 +135,43 @@ func (d *Dispatcher) pickNext(ctx context.Context) { log.Printf("dispatcher: get host %d: %v", queued.HostID, err) return } - d.runLog(queued.ID, "info", fmt.Sprintf("dispatcher: picked run for host %s (mac=%s wol=%s:%d)", - host.Name, host.MAC, host.WoLBroadcastIP, host.WoLPort)) - if _, err := d.Runner.Transition(ctx, queued.ID, TriggerDispatched); err != nil { + + // Heartbeat gate: the StartRun preflight catches this at click time, + // but a run can sit in Queued long enough for the host to go offline + // between click and dispatch. Re-check here so we never fire a + // reboot command at a host that can't receive it. + if host.LastSeenAt == nil || time.Since(*host.LastSeenAt) > HostHeartbeatStaleAfter { + var ageMsg string + if host.LastSeenAt == nil { + ageMsg = "never heartbeated" + } else { + ageMsg = fmt.Sprintf("last heartbeat %s ago", time.Since(*host.LastSeenAt).Truncate(time.Second)) + } + d.runLog(queued.ID, "error", fmt.Sprintf( + "dispatcher: host %s is offline (%s) — refusing to dispatch; install the reporter via /register/quick.sh on the target and retry", + host.Name, ageMsg)) + if err := d.Runs.MarkDispatchFailed(ctx, queued.ID, "dispatch", "host stopped heartbeating before dispatch"); err != nil { + log.Printf("dispatcher: mark run %d dispatch-failed: %v", queued.ID, err) + } + if d.Runner != nil { + d.Runner.PublishTileUpdate(ctx, host.ID) + } + return + } + + age := time.Since(*host.LastSeenAt).Truncate(time.Second) + d.runLog(queued.ID, "info", fmt.Sprintf( + "dispatcher: picked run for host %s (mac=%s, heartbeating, last seen %s ago)", + host.Name, host.MAC, age)) + if _, err := d.Runner.Transition(ctx, queued.ID, TriggerRebootCommanded); err != nil { log.Printf("dispatcher: transition run %d: %v", queued.ID, err) - d.runLog(queued.ID, "error", fmt.Sprintf("dispatcher: transition to WaitingWoL failed: %v", err)) + d.runLog(queued.ID, "error", fmt.Sprintf("dispatcher: transition to WaitingReboot failed: %v", err)) return } - if err := SendWoL(host.MAC, host.WoLBroadcastIP, host.WoLPort); err != nil { - log.Printf("dispatcher: WoL run %d host %s: %v", queued.ID, host.Name, err) - d.runLog(queued.ID, "error", fmt.Sprintf("dispatcher: WoL send failed: %v — check broadcast %s:%d is reachable", - err, host.WoLBroadcastIP, host.WoLPort)) - // Stay in WaitingWoL; operator can retry or investigate. - return - } - log.Printf("dispatcher: WoL sent for run %d (host=%s mac=%s)", queued.ID, host.Name, host.MAC) - d.runLog(queued.ID, "info", fmt.Sprintf("dispatcher: sent WoL packet to %s via %s:%d — waiting for agent claim", - host.MAC, host.WoLBroadcastIP, host.WoLPort)) + log.Printf("dispatcher: run %d host %s → WaitingReboot (heartbeat-driven)", queued.ID, host.Name) + d.runLog(queued.ID, "info", fmt.Sprintf( + "dispatcher: host %s heartbeating — waiting for next reporter heartbeat to deliver reboot_for_vetting", + host.Name)) // Prime the heartbeat so the first "still waiting" fires 30s after // dispatch, not immediately. @@ -155,8 +186,8 @@ func (d *Dispatcher) pickNext(ctx context.Context) { } // heartbeatWaiting emits a "still waiting" log line every beatEvery for -// each run still sitting in WaitingWoL. Helps the operator spot hangs -// without having to tail journalctl on the LXC. +// each run still sitting in WaitingReboot (or legacy WaitingWoL). Helps +// the operator spot hangs without having to tail journalctl on the LXC. func (d *Dispatcher) heartbeatWaiting(ctx context.Context) { if d.Logs == nil { return @@ -172,13 +203,13 @@ func (d *Dispatcher) heartbeatWaiting(ctx context.Context) { for i := range runs { r := &runs[i] seen[r.ID] = true - if r.State != model.StateWaitingWoL { + if r.State != model.StateWaitingReboot && r.State != model.StateWaitingWoL { continue } last, ok := d.lastBeat[r.ID] if !ok { - // Run already in WaitingWoL from a previous process lifetime - // — prime so we don't spam immediately. + // Run already waiting from a previous process lifetime — prime + // so we don't spam immediately. d.lastBeat[r.ID] = now continue } @@ -187,11 +218,11 @@ func (d *Dispatcher) heartbeatWaiting(ctx context.Context) { } elapsed := now.Sub(r.StartedAt).Truncate(time.Second) d.runLog(r.ID, "info", fmt.Sprintf( - "still waiting for agent claim (%s) — check BIOS WoL, pxe.enabled, and live-image presence", + "waiting for reporter to reboot + PXE-boot into live image (%s) — if this exceeds 2m, verify pxe.enabled in vetting.yaml and that the reporter actually invoked systemctl reboot", elapsed)) d.lastBeat[r.ID] = now } - // Garbage-collect entries for runs that have left WaitingWoL. + // Garbage-collect entries for runs that have left the waiting states. for id := range d.lastBeat { if !seen[id] { delete(d.lastBeat, id) diff --git a/internal/orchestrator/dispatcher_test.go b/internal/orchestrator/dispatcher_test.go index 4dea228..699c697 100644 --- a/internal/orchestrator/dispatcher_test.go +++ b/internal/orchestrator/dispatcher_test.go @@ -1,15 +1,64 @@ package orchestrator import ( + "context" "os" "path/filepath" "strings" "testing" + "time" + "vetting/internal/db" "vetting/internal/events" "vetting/internal/logs" + "vetting/internal/model" + "vetting/internal/store" ) +// setupPickNext wires a real SQLite DB so pickNext can exercise the +// full Hosts/Runs/Runner path. Returns the dispatcher + seeded host ID + +// a cleanup. Host starts with a fresh heartbeat stamp so the default is +// "dispatch would succeed"; callers stale it out as needed. +func setupPickNext(t *testing.T) (*Dispatcher, *store.Hosts, *store.Runs, int64, func()) { + t.Helper() + conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db")) + if err != nil { + t.Fatalf("open db: %v", err) + } + hosts := &store.Hosts{DB: conn} + runs := &store.Runs{DB: conn} + stages := &store.Stages{DB: conn} + hub := events.NewHub() + runner := &Runner{Runs: runs, Hosts: hosts, Stages: stages, EventHub: hub} + logDir := t.TempDir() + lh, err := logs.NewHub(logDir, hub) + if err != nil { + t.Fatalf("NewHub: %v", err) + } + d := NewDispatcher(3, runs, hosts, runner, lh) + + ctx := context.Background() + hostID, err := hosts.Create(ctx, model.Host{ + Name: "pn-host", + MAC: "aa:bb:cc:dd:ee:50", + WoLBroadcastIP: "10.0.0.255", + WoLPort: 9, + ExpectedSpecYAML: "memory:\n total_gib: 16\n", + }) + if err != nil { + t.Fatalf("create host: %v", err) + } + // Default: heartbeating now. + if err := hosts.UpdateLastSeen(ctx, "aa:bb:cc:dd:ee:50", time.Now().UTC()); err != nil { + t.Fatalf("stamp: %v", err) + } + cleanup := func() { + lh.Close() + _ = conn.Close() + } + return d, hosts, runs, hostID, cleanup +} + // TestDispatcher_RunLogWritesToHub verifies the plumbing between the // dispatcher and the per-run log hub: runLog must persist to the on-disk // file so the detail page's replay + SSE fan-out see the same @@ -45,3 +94,105 @@ func TestDispatcher_RunLogNilHubDoesNotPanic(t *testing.T) { d := &Dispatcher{} d.runLog(1, "info", "fallback path") } + +// TestDispatcher_TransitionsToWaitingRebootNoWoL: happy path. Host is +// heartbeating, run is Queued — one pickNext tick must transition to +// WaitingReboot via the new RebootCommanded trigger and log that the +// host is heartbeating. No "sent WoL packet" line allowed. +func TestDispatcher_TransitionsToWaitingRebootNoWoL(t *testing.T) { + d, _, runs, hostID, cleanup := setupPickNext(t) + defer cleanup() + ctx := context.Background() + runID, err := runs.Create(ctx, hostID, "deadbeef") + if err != nil { + t.Fatalf("create run: %v", err) + } + + d.pickNext(ctx) + + got, err := runs.Get(ctx, runID) + if err != nil { + t.Fatalf("get run: %v", err) + } + if got.State != model.StateWaitingReboot { + t.Fatalf("state = %s, want WaitingReboot", got.State) + } + body, err := os.ReadFile(filepath.Join(d.Logs.PathFor(runID))) //nolint:staticcheck + if err != nil { + t.Fatalf("read log: %v", err) + } + text := string(body) + if strings.Contains(text, "sent WoL packet") { + t.Fatalf("dispatcher should not fire WoL on heartbeating host: %s", text) + } + if !strings.Contains(text, "heartbeating") { + t.Fatalf("missing heartbeating log line: %s", text) + } +} + +// TestDispatcher_FailsStaleHeartbeat: host hasn't heartbeat for >60s. +// Dispatcher must refuse, mark the run Failed with failed_stage=dispatch, +// and log at error level — not loop forever on an unreachable box. +func TestDispatcher_FailsStaleHeartbeat(t *testing.T) { + d, hosts, runs, hostID, cleanup := setupPickNext(t) + defer cleanup() + ctx := context.Background() + // Stale: 5m ago is well past the 60s cutoff. + if err := hosts.UpdateLastSeen(ctx, "aa:bb:cc:dd:ee:50", time.Now().UTC().Add(-5*time.Minute)); err != nil { + t.Fatalf("stamp stale: %v", err) + } + runID, err := runs.Create(ctx, hostID, "deadbeef") + if err != nil { + t.Fatalf("create run: %v", err) + } + + d.pickNext(ctx) + + got, err := runs.Get(ctx, runID) + if err != nil { + t.Fatalf("get run: %v", err) + } + if got.State != model.StateFailed { + t.Fatalf("state = %s, want Failed", got.State) + } + if got.FailedStage != "dispatch" { + t.Fatalf("failed_stage = %q, want dispatch", got.FailedStage) + } + body, _ := os.ReadFile(d.Logs.PathFor(runID)) + if !strings.Contains(string(body), "quick.sh") { + t.Fatalf("expected quick.sh hint in run log: %s", body) + } +} + +// TestDispatcher_FailsNeverSeenHost mirrors the stale-heartbeat test for +// a host that has never heartbeated at all — LastSeenAt is NULL. +func TestDispatcher_FailsNeverSeenHost(t *testing.T) { + d, hosts, runs, _, cleanup := setupPickNext(t) + defer cleanup() + ctx := context.Background() + // Create a fresh host with no heartbeat stamp. + neverID, err := hosts.Create(ctx, model.Host{ + Name: "pn-never", + MAC: "aa:bb:cc:dd:ee:51", + WoLBroadcastIP: "10.0.0.255", + WoLPort: 9, + ExpectedSpecYAML: "memory:\n total_gib: 16\n", + }) + if err != nil { + t.Fatalf("create host: %v", err) + } + runID, err := runs.Create(ctx, neverID, "deadbeef") + if err != nil { + t.Fatalf("create run: %v", err) + } + + d.pickNext(ctx) + + got, err := runs.Get(ctx, runID) + if err != nil { + t.Fatalf("get run: %v", err) + } + if got.State != model.StateFailed { + t.Fatalf("state = %s, want Failed", got.State) + } +} diff --git a/internal/orchestrator/statemachine.go b/internal/orchestrator/statemachine.go index d8921b6..b40eaf9 100644 --- a/internal/orchestrator/statemachine.go +++ b/internal/orchestrator/statemachine.go @@ -11,7 +11,8 @@ type Trigger string const ( TriggerStartRequested Trigger = "StartRequested" // user clicks Start Vetting - TriggerDispatched Trigger = "Dispatched" // dispatcher picked this run + TriggerDispatched Trigger = "Dispatched" // dispatcher picked this run (manual-WoL override path; dormant in happy path) + TriggerRebootCommanded Trigger = "RebootCommanded" // dispatcher (or heartbeat race) told the reporter to reboot TriggerPXEObserved Trigger = "PXEObserved" // iPXE fetched cmdline for MAC TriggerAgentClaimed Trigger = "AgentClaimed" // agent POSTed /claim with valid token TriggerStageFailed Trigger = "StageFailed" // a stage reported failure @@ -59,8 +60,9 @@ type transition struct { var table = map[Trigger]transition{ TriggerStartRequested: {from: []model.RunState{model.StateRegistered}, to: model.StateQueued}, TriggerDispatched: {from: []model.RunState{model.StateQueued}, to: model.StateWaitingWoL}, - TriggerPXEObserved: {from: []model.RunState{model.StateWaitingWoL, model.StateBooting}, to: model.StateBooting}, - TriggerAgentClaimed: {from: []model.RunState{model.StateBooting, model.StateWaitingWoL}, to: model.StateInventoryCheck}, + TriggerRebootCommanded: {from: []model.RunState{model.StateQueued}, to: model.StateWaitingReboot}, + TriggerPXEObserved: {from: []model.RunState{model.StateWaitingReboot, model.StateWaitingWoL, model.StateBooting}, to: model.StateBooting}, + TriggerAgentClaimed: {from: []model.RunState{model.StateBooting, model.StateWaitingReboot, model.StateWaitingWoL}, to: model.StateInventoryCheck}, TriggerStageFailed: {from: allActiveStates(), to: model.StateFailedHolding}, TriggerAllStagesPassed: {from: []model.RunState{model.StateReporting}, to: model.StateCompleted}, TriggerOperatorReleased: {from: []model.RunState{model.StateFailedHolding}, to: model.StateReleased}, @@ -121,7 +123,7 @@ func nextStageState(current model.RunState) (model.RunState, error) { func allActiveStates() []model.RunState { return []model.RunState{ - model.StateQueued, model.StateWaitingWoL, model.StateBooting, + model.StateQueued, model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting, model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART, model.StateCPUStress, model.StateStorage, model.StateNetwork, model.StateGPU, model.StatePSU, model.StateReporting, diff --git a/internal/orchestrator/statemachine_test.go b/internal/orchestrator/statemachine_test.go index 33a68c1..b9ad812 100644 --- a/internal/orchestrator/statemachine_test.go +++ b/internal/orchestrator/statemachine_test.go @@ -40,6 +40,40 @@ func TestNextForOverride(t *testing.T) { } } +// TestTriggerRebootCommanded exercises the new heartbeat-first trigger: +// Queued → WaitingReboot, and any other current state is an error. +func TestTriggerRebootCommanded(t *testing.T) { + got, err := orchestrator.Next(model.StateQueued, orchestrator.TriggerRebootCommanded) + if err != nil { + t.Fatalf("Queued + RebootCommanded: %v", err) + } + if got != model.StateWaitingReboot { + t.Fatalf("got %q, want %q", got, model.StateWaitingReboot) + } + for _, bad := range []model.RunState{ + model.StateRegistered, model.StateBooting, model.StateInventoryCheck, model.StateCompleted, + } { + if _, err := orchestrator.Next(bad, orchestrator.TriggerRebootCommanded); err == nil { + t.Fatalf("RebootCommanded from %q: expected error", bad) + } + } +} + +// TestTriggerAgentClaimedFromWaitingReboot: the agent's /claim must +// advance the run out of WaitingReboot (new happy path) AND out of +// legacy WaitingWoL, otherwise live boots wouldn't be recognised. +func TestTriggerAgentClaimedFromWaitingReboot(t *testing.T) { + for _, from := range []model.RunState{model.StateWaitingReboot, model.StateWaitingWoL, model.StateBooting} { + got, err := orchestrator.Next(from, orchestrator.TriggerAgentClaimed) + if err != nil { + t.Fatalf("AgentClaimed from %q: %v", from, err) + } + if got != model.StateInventoryCheck { + t.Fatalf("AgentClaimed from %q = %q, want InventoryCheck", from, got) + } + } +} + func TestNextStageWalk(t *testing.T) { // Walking StageCompleted from each stage should land on the next // one in the canonical order, and from Reporting onto Completed. diff --git a/internal/store/runs.go b/internal/store/runs.go index 70c8e14..3897e83 100644 --- a/internal/store/runs.go +++ b/internal/store/runs.go @@ -76,6 +76,19 @@ func (r *Runs) MarkFailed(ctx context.Context, runID int64, failedStage, holdIP return err } +// MarkDispatchFailed records a terminal failure discovered before the run +// ever reached a live image, e.g. the dispatcher refused to start because +// the host isn't heartbeating. Goes to StateFailed (not FailedHolding) +// because there's no live image to ssh into. +func (r *Runs) MarkDispatchFailed(ctx context.Context, runID int64, failedStage, result string) error { + now := time.Now().UTC() + _, err := r.DB.ExecContext(ctx, ` + UPDATE runs SET state = ?, result = ?, failed_stage = ?, completed_at = ? + WHERE id = ? + `, string(model.StateFailed), result, failedStage, now, runID) + return err +} + func (r *Runs) MarkCompleted(ctx context.Context, runID int64, reportPath string) error { now := time.Now().UTC() _, err := r.DB.ExecContext(ctx, ` diff --git a/internal/web/templates/host_detail.templ b/internal/web/templates/host_detail.templ index 3415cac..f6f68d0 100644 --- a/internal/web/templates/host_detail.templ +++ b/internal/web/templates/host_detail.templ @@ -88,10 +88,12 @@ templ HostDetail(d HostDetailData) {

Actions

- if canStart(d.Tile.Latest) { + if canStart(d.Tile) {
+ } else if canStartIfOnline(d.Tile.Latest) { + } else { } diff --git a/internal/web/templates/host_detail_templ.go b/internal/web/templates/host_detail_templ.go index f5fc45d..a278a6c 100644 --- a/internal/web/templates/host_detail_templ.go +++ b/internal/web/templates/host_detail_templ.go @@ -305,7 +305,7 @@ func HostDetail(d HostDetailData) templ.Component { if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - if canStart(d.Tile.Latest) { + if canStart(d.Tile) { templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 26, "
Start vetting ") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } } else { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 28, " ") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 29, " ") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } if canOverrideWipe(d.Tile.Latest) { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 29, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 31, "\" class=\"inline\">") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } if hasReport(d.Tile.Latest) { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 31, "View report") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 33, "\" target=\"_blank\" rel=\"noopener\">View report") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 33, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 35, "\" class=\"inline\">") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } if len(d.SpecDiffs) > 0 { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 35, "

Spec diffs (") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 38, ">

Spec diffs (") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } var templ_7745c5c3_Var22 string templ_7745c5c3_Var22, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", len(d.SpecDiffs))) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 115, Col: 68} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 117, Col: 68} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var22)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 38, ")

    ") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 39, ")

    ") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -418,7 +423,7 @@ func HostDetail(d HostDetailData) templ.Component { if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 39, "
  • ") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 41, "\">
    ") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } var templ_7745c5c3_Var25 string templ_7745c5c3_Var25, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Field) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 119, Col: 45} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 121, Col: 45} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var25)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 41, "
    expected: ") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 42, "
    expected: ") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } var templ_7745c5c3_Var26 string templ_7745c5c3_Var26, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Expected) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 120, Col: 67} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 122, Col: 67} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var26)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 42, "
    actual: ") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 43, "
    actual: ") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } var templ_7745c5c3_Var27 string templ_7745c5c3_Var27, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Actual) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 121, Col: 61} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 123, Col: 61} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var27)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 43, "
  • ") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 44, "") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 44, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 45, "") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -486,43 +491,43 @@ func HostDetail(d HostDetailData) templ.Component { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 45, "

Host details

") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 46, "

Host details

") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } if d.Tile.Host.Notes != "" { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 46, "

Notes

") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, "

Notes

") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } var templ_7745c5c3_Var28 string templ_7745c5c3_Var28, templ_7745c5c3_Err = templ.JoinStringErrs(d.Tile.Host.Notes) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 139, Col: 29} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 141, Col: 29} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var28)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, "

") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 48, "

") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 48, "

Expected spec

")
+			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 49, "

Expected spec

")
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
 			var templ_7745c5c3_Var29 string
 			templ_7745c5c3_Var29, templ_7745c5c3_Err = templ.JoinStringErrs(d.Tile.Host.ExpectedSpecYAML)
 			if templ_7745c5c3_Err != nil {
-				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 144, Col: 66}
+				return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_detail.templ`, Line: 146, Col: 66}
 			}
 			_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var29))
 			if templ_7745c5c3_Err != nil {
 				return templ_7745c5c3_Err
 			}
-			templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 49, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 50, "") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -573,46 +578,46 @@ func LogTabs(runID int64, replay string) templ.Component { templ_7745c5c3_Var30 = templ.NopComponent } ctx = templ.ClearChildren(ctx) - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 50, "

Log

Log

") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -622,33 +627,33 @@ func LogTabs(runID int64, replay string) templ.Component { if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 54, " ") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 60, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 63, "\" hx-swap=\"beforeend show:bottom\">") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -726,7 +731,7 @@ func LogTabs(runID int64, replay string) templ.Component { if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 63, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 64, "
") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -736,7 +741,7 @@ func LogTabs(runID int64, replay string) templ.Component { if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 64, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 68, "\" hx-swap=\"beforeend show:bottom\">
") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 68, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 69, "") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } diff --git a/internal/web/templates/host_tile.templ b/internal/web/templates/host_tile.templ index 81c1450..b23df4b 100644 --- a/internal/web/templates/host_tile.templ +++ b/internal/web/templates/host_tile.templ @@ -28,10 +28,12 @@ templ HostTile(t TileData) {
- if canStart(t.Latest) { + if canStart(t) {
+ } else if canStartIfOnline(t.Latest) { + } else if hasReport(t.Latest) { View report } @@ -53,12 +55,29 @@ func hasReport(r *model.Run) bool { return r != nil && r.State == model.StateCompleted } -func canStart(r *model.Run) bool { +// canStart gates the Start button on two things: the run is in a state +// that accepts a fresh start, AND the host is currently heartbeating. +// The heartbeat check mirrors the StartRun handler's preflight so the +// button never offers a click that the server would reject with 409. +func canStart(t TileData) bool { + if !canStartIfOnline(t.Latest) { + return false + } + if t.LastSeenAt == nil { + return false + } + return time.Since(*t.LastSeenAt) <= 60*time.Second +} + +// canStartIfOnline is the run-state half of canStart, split out so the +// template can distinguish "waiting on run to end" (no button) from +// "run is done but host is offline" (disabled button with tooltip). +func canStartIfOnline(r *model.Run) bool { if r == nil { return true } switch r.State { - case model.StateCompleted, model.StateReleased, model.StateFailedHolding: + case model.StateCompleted, model.StateReleased, model.StateFailed, model.StateFailedHolding: return true } return false @@ -68,6 +87,10 @@ func tileStatus(r *model.Run) string { if r == nil { return "Idle" } + switch r.State { + case model.StateWaitingReboot: + return "Waiting for reboot" + } return string(r.State) } diff --git a/internal/web/templates/host_tile_templ.go b/internal/web/templates/host_tile_templ.go index ba3067d..95ed279 100644 --- a/internal/web/templates/host_tile_templ.go +++ b/internal/web/templates/host_tile_templ.go @@ -176,7 +176,7 @@ func HostTile(t TileData) templ.Component { if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - if canStart(t.Latest) { + if canStart(t) { templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 12, "
Start vetting") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } } else if hasReport(t.Latest) { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 14, "View report") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 16, "\" target=\"_blank\" rel=\"noopener\">View report") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 16, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 17, "") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -235,12 +240,29 @@ func hasReport(r *model.Run) bool { return r != nil && r.State == model.StateCompleted } -func canStart(r *model.Run) bool { +// canStart gates the Start button on two things: the run is in a state +// that accepts a fresh start, AND the host is currently heartbeating. +// The heartbeat check mirrors the StartRun handler's preflight so the +// button never offers a click that the server would reject with 409. +func canStart(t TileData) bool { + if !canStartIfOnline(t.Latest) { + return false + } + if t.LastSeenAt == nil { + return false + } + return time.Since(*t.LastSeenAt) <= 60*time.Second +} + +// canStartIfOnline is the run-state half of canStart, split out so the +// template can distinguish "waiting on run to end" (no button) from +// "run is done but host is offline" (disabled button with tooltip). +func canStartIfOnline(r *model.Run) bool { if r == nil { return true } switch r.State { - case model.StateCompleted, model.StateReleased, model.StateFailedHolding: + case model.StateCompleted, model.StateReleased, model.StateFailed, model.StateFailedHolding: return true } return false @@ -250,6 +272,10 @@ func tileStatus(r *model.Run) string { if r == nil { return "Idle" } + switch r.State { + case model.StateWaitingReboot: + return "Waiting for reboot" + } return string(r.State) } diff --git a/internal/web/templates/host_tile_test.go b/internal/web/templates/host_tile_test.go index f497e38..8b8006f 100644 --- a/internal/web/templates/host_tile_test.go +++ b/internal/web/templates/host_tile_test.go @@ -42,9 +42,15 @@ func TestHumanAgoFrom(t *testing.T) { // TestHostTile_OverlayLink asserts the tile includes the tile-link // that makes the whole card clickable. The action button stays a // sibling element, so CSS (z-index) keeps it on top of the overlay. +// +// Heartbeat must be fresh because canStart now gates on LastSeenAt — +// an offline host renders a disabled button (no form), which is +// covered by TestHostTile_DisabledStartWhenOffline below. func TestHostTile_OverlayLink(t *testing.T) { + now := time.Now() data := TileData{ - Host: model.Host{ID: 42, Name: "tile-test", MAC: "aa:bb:cc:dd:ee:ff"}, + Host: model.Host{ID: 42, Name: "tile-test", MAC: "aa:bb:cc:dd:ee:ff"}, + LastSeenAt: &now, } var buf strings.Builder if err := HostTile(data).Render(context.Background(), &buf); err != nil { @@ -57,7 +63,7 @@ func TestHostTile_OverlayLink(t *testing.T) { if !strings.Contains(html, `class="tile-link"`) { t.Fatalf("tile missing tile-link class: %s", html) } - // canStart(nil) is true → Start form must be present. + // Fresh heartbeat + no run → Start form must render. if !strings.Contains(html, `/hosts/42/start`) { t.Fatalf("expected Start vetting form in tile: %s", html) } @@ -70,6 +76,26 @@ func TestHostTile_OverlayLink(t *testing.T) { } } +// TestHostTile_DisabledStartWhenOffline: no heartbeat → disabled button +// with the quick.sh tooltip, not a submittable form. Mirrors the +// server-side StartRun 409 so the UI matches the handler. +func TestHostTile_DisabledStartWhenOffline(t *testing.T) { + data := TileData{ + Host: model.Host{ID: 42, Name: "tile-test", MAC: "aa:bb:cc:dd:ee:ff"}, + } + var buf strings.Builder + if err := HostTile(data).Render(context.Background(), &buf); err != nil { + t.Fatalf("render: %v", err) + } + html := buf.String() + if strings.Contains(html, `/hosts/42/start`) { + t.Fatalf("offline host should not expose a Start form: %s", html) + } + if !strings.Contains(html, `disabled`) || !strings.Contains(html, `quick.sh`) { + t.Fatalf("expected disabled Start button with quick.sh tooltip: %s", html) + } +} + func TestLastSeenLabelAndClass(t *testing.T) { if got := lastSeenLabel(nil); got != "never" { t.Fatalf("label nil = %q, want never", got) diff --git a/internal/web/templates/pipeline.templ b/internal/web/templates/pipeline.templ index ba86737..8b0b1e6 100644 --- a/internal/web/templates/pipeline.templ +++ b/internal/web/templates/pipeline.templ @@ -25,7 +25,7 @@ type PipelineNode struct { // pre-stage timestamps. var preStageOrder = []model.RunState{ model.StateQueued, - model.StateWaitingWoL, + model.StateWaitingReboot, model.StateBooting, } @@ -37,7 +37,7 @@ func runStateRank(s model.RunState) int { order := []model.RunState{ model.StateRegistered, model.StateQueued, - model.StateWaitingWoL, + model.StateWaitingReboot, model.StateBooting, model.StateInventoryCheck, model.StateSpecValidate, diff --git a/internal/web/templates/pipeline_templ.go b/internal/web/templates/pipeline_templ.go index 3831f4e..f8de28a 100644 --- a/internal/web/templates/pipeline_templ.go +++ b/internal/web/templates/pipeline_templ.go @@ -33,7 +33,7 @@ type PipelineNode struct { // pre-stage timestamps. var preStageOrder = []model.RunState{ model.StateQueued, - model.StateWaitingWoL, + model.StateWaitingReboot, model.StateBooting, } @@ -45,7 +45,7 @@ func runStateRank(s model.RunState) int { order := []model.RunState{ model.StateRegistered, model.StateQueued, - model.StateWaitingWoL, + model.StateWaitingReboot, model.StateBooting, model.StateInventoryCheck, model.StateSpecValidate, diff --git a/internal/web/templates/pipeline_test.go b/internal/web/templates/pipeline_test.go index d88ea8e..9e0c9aa 100644 --- a/internal/web/templates/pipeline_test.go +++ b/internal/web/templates/pipeline_test.go @@ -9,19 +9,19 @@ import ( // node indexes for the default pipeline layout: pre-stages (3) + stage // rows (9) + terminal Completed (1) = 13 nodes. const ( - idxQueued = 0 - idxWaitingWoL = 1 - idxBooting = 2 - idxInventory = 3 - idxSpecValidate = 4 - idxSMART = 5 - idxCPUStress = 6 - idxStorage = 7 - idxNetwork = 8 - idxGPU = 9 - idxPSU = 10 - idxReporting = 11 - idxCompleted = 12 + idxQueued = 0 + idxWaitingReboot = 1 + idxBooting = 2 + idxInventory = 3 + idxSpecValidate = 4 + idxSMART = 5 + idxCPUStress = 6 + idxStorage = 7 + idxNetwork = 8 + idxGPU = 9 + idxPSU = 10 + idxReporting = 11 + idxCompleted = 12 ) // seedStages returns a fresh all-pending stage slice in the canonical order. @@ -48,12 +48,12 @@ func TestBuildPipeline_NoRun(t *testing.T) { } } -// TestBuildPipeline_GhostStagesBeforeClaim models the real WaitingWoL +// TestBuildPipeline_GhostStagesBeforeClaim models the real WaitingReboot // case: the run exists but agent hasn't called /claim yet, so there are // no stage rows. Pipeline must still render all 9 stage nodes as ghosts // so the operator sees the full timeline ahead of them. func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) { - run := &model.Run{State: model.StateWaitingWoL} + run := &model.Run{State: model.StateWaitingReboot} nodes := BuildPipeline(run, nil) if len(nodes) != 13 { t.Fatalf("len = %d, want 13", len(nodes)) @@ -61,8 +61,8 @@ func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) { if nodes[idxQueued].State != "passed" { t.Errorf("Queued = %q, want passed", nodes[idxQueued].State) } - if nodes[idxWaitingWoL].State != "running" { - t.Errorf("WaitingWoL = %q, want running", nodes[idxWaitingWoL].State) + if nodes[idxWaitingReboot].State != "running" { + t.Errorf("WaitingReboot = %q, want running", nodes[idxWaitingReboot].State) } // All 9 stage ghosts must be pending — nothing has started yet. for i := idxInventory; i <= idxReporting; i++ { @@ -179,7 +179,24 @@ func TestBuildPipeline_QueuedNow(t *testing.T) { if nodes[idxQueued].State != "running" { t.Errorf("Queued = %q, want running", nodes[idxQueued].State) } - if nodes[idxWaitingWoL].State != "pending" { - t.Errorf("WaitingWoL = %q, want pending", nodes[idxWaitingWoL].State) + if nodes[idxWaitingReboot].State != "pending" { + t.Errorf("WaitingReboot = %q, want pending", nodes[idxWaitingReboot].State) + } +} + +// TestBuildPipeline_PreStageRunning_WaitingReboot confirms the pre-stage +// node for WaitingReboot lights up while the run sits there — the new +// happy-path state must map onto its pipeline slot. +func TestBuildPipeline_PreStageRunning_WaitingReboot(t *testing.T) { + run := &model.Run{State: model.StateWaitingReboot} + nodes := BuildPipeline(run, seedStages()) + if nodes[idxQueued].State != "passed" { + t.Errorf("Queued = %q, want passed", nodes[idxQueued].State) + } + if nodes[idxWaitingReboot].State != "running" { + t.Errorf("WaitingReboot = %q, want running", nodes[idxWaitingReboot].State) + } + if nodes[idxBooting].State != "pending" { + t.Errorf("Booting = %q, want pending", nodes[idxBooting].State) } }