fix(agent): keep heartbeat loop alive during FailedHolding
CI / Lint + build + test (push) Successful in 1m51s
Release / release (push) Failing after 4m28s

The heartbeat handler was returning cmd=abort for FailedHolding, which
caused the agent's heartbeat goroutine to exit after ~10s in hold.
Subsequent state changes (Cancel -> reboot, Override -> retry_stage)
then had no recipient, so the host sat idle at the SSH hold prompt
forever. Narrowed cmd=abort to StateReleased only; FailedHolding falls
through to cmd=continue so the loop keeps polling and can receive the
operator's eventual command.
This commit is contained in:
2026-04-20 18:28:43 -04:00
parent 62bddac110
commit 73f727b4c1
2 changed files with 38 additions and 1 deletions
+7 -1
View File
@@ -286,8 +286,14 @@ func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) {
} else { } else {
cmd = "cancel_stage" cmd = "cancel_stage"
} }
case run.State == model.StateFailedHolding || run.State == model.StateReleased: case run.State == model.StateReleased:
// Operator accepted the failure outcome. No further agent
// action is possible — stop the heartbeat loop.
cmd = "abort" cmd = "abort"
// FailedHolding intentionally falls through to cmd=continue: the
// agent is parked in waitForOverride awaiting operator action
// (Cancel → reboot, Override → retry_stage). Keeping the
// heartbeat loop alive is what lets those commands reach it.
case run.FailedStage == "Storage" && overrideWipeSet(run.OverrideFlagsJSON): case run.FailedStage == "Storage" && overrideWipeSet(run.OverrideFlagsJSON):
// Operator pressed "Override wipe & retry". Agent should // Operator pressed "Override wipe & retry". Agent should
// re-enter Storage with the wipe-probe bypass armed. // re-enter Storage with the wipe-probe bypass armed.
+31
View File
@@ -188,6 +188,37 @@ func TestHeartbeatCancelStageWhenCancelledMidRun(t *testing.T) {
} }
} }
// TestHeartbeatContinuesDuringFailedHolding pins the heartbeat reply for
// a run that is sitting in FailedHolding. In that state the agent is
// parked in waitForOverride until an operator acts, so the handler has
// to reply cmd=continue rather than cmd=abort. An abort would end the
// agent's heartbeat loop, and a later Cancel (→ cmd=reboot) or Override
// (→ cmd=retry_stage) would then have nobody listening — the host would
// sit idle forever.
func TestHeartbeatContinuesDuringFailedHolding(t *testing.T) {
	a, runID, token := setupAgent(t)
	a.Runner = &orchestrator.Runner{
		Runs:     a.Runs,
		Hosts:    a.Hosts,
		Stages:   &store.Stages{DB: a.Runs.DB},
		EventHub: events.NewHub(),
	}

	// Put the run into the hold state: record the failed stage first,
	// then flip the run state to FailedHolding.
	ctx := context.Background()
	if err := a.Runs.SetFailedStage(ctx, runID, "Storage"); err != nil {
		t.Fatalf("set failed stage: %v", err)
	}
	if err := a.Runs.SetState(ctx, runID, model.StateFailedHolding); err != nil {
		t.Fatalf("set state: %v", err)
	}

	// Issue an authenticated heartbeat for the held run.
	path := "/api/v1/runs/" + strconv.FormatInt(runID, 10) + "/heartbeat"
	req := routedRequest(runID, http.MethodPost, path, nil)
	req.Header.Set("Authorization", "Bearer "+token)
	rec := httptest.NewRecorder()
	a.Heartbeat(rec, req)

	if status := rec.Code; status != http.StatusOK {
		t.Fatalf("status = %d, body = %s", status, rec.Body.String())
	}

	var payload map[string]any
	if err := json.Unmarshal(rec.Body.Bytes(), &payload); err != nil {
		t.Fatalf("decode: %v", err)
	}
	// The loop must be told to keep polling while the operator decides.
	if got := payload["cmd"]; got != "continue" {
		t.Fatalf("cmd = %v, want continue (abort would kill the heartbeat loop before operator acts)", got)
	}
}
// TestResult_RejectsMismatchedStage is the silent-skip guard's unit // TestResult_RejectsMismatchedStage is the silent-skip guard's unit
// test. The Orion failure mode: agent crashes mid-CPUStress, systemd // test. The Orion failure mode: agent crashes mid-CPUStress, systemd
// restarts it, restarted agent replays Inventory and /results it. // restarts it, restarted agent replays Inventory and /results it.