fix(agent): keep heartbeat loop alive during FailedHolding
The heartbeat handler was returning cmd=abort for FailedHolding, which caused the agent's heartbeat goroutine to exit after ~10s in hold. Subsequent state changes (Cancel -> reboot, Override -> retry_stage) then had no recipient, so the host sat idle at the SSH hold prompt forever. Narrowed cmd=abort to StateReleased only; FailedHolding falls through to cmd=continue so the loop keeps polling and can receive the operator's eventual command.
This commit is contained in:
@@ -286,8 +286,14 @@ func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) {
|
||||
} else {
|
||||
cmd = "cancel_stage"
|
||||
}
|
||||
case run.State == model.StateFailedHolding || run.State == model.StateReleased:
|
||||
case run.State == model.StateReleased:
|
||||
// Operator accepted the failure outcome. No further agent
|
||||
// action is possible — stop the heartbeat loop.
|
||||
cmd = "abort"
|
||||
// FailedHolding intentionally falls through to cmd=continue: the
|
||||
// agent is parked in waitForOverride awaiting operator action
|
||||
// (Cancel → reboot, Override → retry_stage). Keeping the
|
||||
// heartbeat loop alive is what lets those commands reach it.
|
||||
case run.FailedStage == "Storage" && overrideWipeSet(run.OverrideFlagsJSON):
|
||||
// Operator pressed "Override wipe & retry". Agent should
|
||||
// re-enter Storage with the wipe-probe bypass armed.
|
||||
|
||||
@@ -188,6 +188,37 @@ func TestHeartbeatCancelStageWhenCancelledMidRun(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestHeartbeatContinuesDuringFailedHolding: while a run sits in
|
||||
// FailedHolding the agent is parked in waitForOverride waiting on an
|
||||
// operator. The heartbeat handler must answer cmd=continue (not
|
||||
// cmd=abort) so the heartbeat loop stays alive — otherwise a later
|
||||
// Cancel (→ cmd=reboot) or Override (→ cmd=retry_stage) has no
|
||||
// recipient and the host sits idle forever.
|
||||
func TestHeartbeatContinuesDuringFailedHolding(t *testing.T) {
|
||||
a, runID, token := setupAgent(t)
|
||||
a.Runner = &orchestrator.Runner{Runs: a.Runs, Hosts: a.Hosts, Stages: &store.Stages{DB: a.Runs.DB}, EventHub: events.NewHub()}
|
||||
if err := a.Runs.SetFailedStage(context.Background(), runID, "Storage"); err != nil {
|
||||
t.Fatalf("set failed stage: %v", err)
|
||||
}
|
||||
if err := a.Runs.SetState(context.Background(), runID, model.StateFailedHolding); err != nil {
|
||||
t.Fatalf("set state: %v", err)
|
||||
}
|
||||
req := routedRequest(runID, http.MethodPost, "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/heartbeat", nil)
|
||||
req.Header.Set("Authorization", "Bearer "+token)
|
||||
rr := httptest.NewRecorder()
|
||||
a.Heartbeat(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
var resp map[string]any
|
||||
if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
if resp["cmd"] != "continue" {
|
||||
t.Fatalf("cmd = %v, want continue (abort would kill the heartbeat loop before operator acts)", resp["cmd"])
|
||||
}
|
||||
}
|
||||
|
||||
// TestResult_RejectsMismatchedStage is the silent-skip guard's unit
|
||||
// test. The Orion failure mode: agent crashes mid-CPUStress, systemd
|
||||
// restarts it, restarted agent replays Inventory and /results it.
|
||||
|
||||
Reference in New Issue
Block a user