diff --git a/internal/api/agent_handlers.go b/internal/api/agent_handlers.go index 3b1d23c..b5c52dd 100644 --- a/internal/api/agent_handlers.go +++ b/internal/api/agent_handlers.go @@ -286,8 +286,14 @@ func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) { } else { cmd = "cancel_stage" } - case run.State == model.StateFailedHolding || run.State == model.StateReleased: + case run.State == model.StateReleased: + // Operator accepted the failure outcome. No further agent + // action is possible — stop the heartbeat loop. cmd = "abort" + // FailedHolding intentionally falls through to cmd=continue: the + // agent is parked in waitForOverride awaiting operator action + // (Cancel → reboot, Override → retry_stage). Keeping the + // heartbeat loop alive is what lets those commands reach it. case run.FailedStage == "Storage" && overrideWipeSet(run.OverrideFlagsJSON): // Operator pressed "Override wipe & retry". Agent should // re-enter Storage with the wipe-probe bypass armed. diff --git a/internal/api/agent_handlers_test.go b/internal/api/agent_handlers_test.go index bb8de96..ee42171 100644 --- a/internal/api/agent_handlers_test.go +++ b/internal/api/agent_handlers_test.go @@ -188,6 +188,37 @@ func TestHeartbeatCancelStageWhenCancelledMidRun(t *testing.T) { } } +// TestHeartbeatContinuesDuringFailedHolding: while a run sits in +// FailedHolding the agent is parked in waitForOverride waiting on an +// operator. The heartbeat handler must answer cmd=continue (not +// cmd=abort) so the heartbeat loop stays alive — otherwise a later +// Cancel (→ cmd=reboot) or Override (→ cmd=retry_stage) has no +// recipient and the host sits idle forever. 
+func TestHeartbeatContinuesDuringFailedHolding(t *testing.T) { // heartbeat for a FailedHolding run must answer cmd=continue
+	a, runID, token := setupAgent(t) // helper defined elsewhere; presumably seeds a run and its agent token — confirm
+	a.Runner = &orchestrator.Runner{Runs: a.Runs, Hosts: a.Hosts, Stages: &store.Stages{DB: a.Runs.DB}, EventHub: events.NewHub()} // NOTE(review): Heartbeat presumably needs a non-nil Runner — confirm
+	if err := a.Runs.SetFailedStage(context.Background(), runID, "Storage"); err != nil { // "Storage" is the stage the handler's override-wipe case checks; this test sets no override flags
+		t.Fatalf("set failed stage: %v", err)
+	}
+	if err := a.Runs.SetState(context.Background(), runID, model.StateFailedHolding); err != nil { // the run state under test
+		t.Fatalf("set state: %v", err)
+	}
+	req := routedRequest(runID, http.MethodPost, "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/heartbeat", nil) // POST with no body; routedRequest presumably attaches the run-ID route parameter — confirm
+	req.Header.Set("Authorization", "Bearer "+token) // authenticate with the token returned by setupAgent
+	rr := httptest.NewRecorder() // captures the handler's status code and body in memory
+	a.Heartbeat(rr, req) // invoke the handler method directly; no HTTP server is started
+	if rr.Code != http.StatusOK { // any non-200 status fails the test before the body is inspected
+		t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
+	}
+	var resp map[string]any // generic map: only the "cmd" key is inspected below
+	if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil { // the response body must be valid JSON
+		t.Fatalf("decode: %v", err)
+	}
+	if resp["cmd"] != "continue" { // interface-vs-string comparison: also fails when "cmd" is missing or not a string
+		t.Fatalf("cmd = %v, want continue (abort would kill the heartbeat loop before operator acts)", resp["cmd"])
+	}
+}
+
 // TestResult_RejectsMismatchedStage is the silent-skip guard's unit
 // test. The Orion failure mode: agent crashes mid-CPUStress, systemd
 // restarts it, restarted agent replays Inventory and /results it.