diff --git a/internal/api/agent_handlers.go b/internal/api/agent_handlers.go index 1a28cf2..3b1d23c 100644 --- a/internal/api/agent_handlers.go +++ b/internal/api/agent_handlers.go @@ -272,9 +272,20 @@ func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) { // intervention. cmd = "reboot" case run.State == model.StateCancelled: - // Operator clicked Cancel — agent cancels the active stage ctx, - // posts a cancelled outcome, and powers off. - cmd = "cancel_stage" + // Operator clicked Cancel. Two sub-cases: + // - FailedStage set → run was sitting in FailedHolding with no + // in-flight stage subprocess; the agent is parked in + // waitForOverride. Send cmd=reboot so the heartbeat loop + // reboots the host, falls through iPXE's no-active-run + // script and boots local disk. + // - FailedStage empty → cancel mid-stage; kill the stage ctx + // first so the running subprocess exits cleanly, then the + // agent powers off via its existing cancel path. + if run.FailedStage != "" { + cmd = "reboot" + } else { + cmd = "cancel_stage" + } case run.State == model.StateFailedHolding || run.State == model.StateReleased: cmd = "abort" case run.FailedStage == "Storage" && overrideWipeSet(run.OverrideFlagsJSON): diff --git a/internal/api/agent_handlers_test.go b/internal/api/agent_handlers_test.go index a105e7c..bb8de96 100644 --- a/internal/api/agent_handlers_test.go +++ b/internal/api/agent_handlers_test.go @@ -131,6 +131,63 @@ func TestHeartbeatRebootWhenCompleted(t *testing.T) { } } +// TestHeartbeatRebootWhenCancelledFromHold: operator hit Cancel on a +// FailedHolding run. Because there's no in-flight stage subprocess (the +// agent is parked in waitForOverride), the heartbeat must answer with +// cmd=reboot — not cmd=cancel_stage which only makes sense mid-stage. +// The FailedStage marker is the discriminator: set means we came +// through hold; empty means a mid-stage cancel. +func TestHeartbeatRebootWhenCancelledFromHold(t *testing.T) { + a, runID, token := setupAgent(t) + a.Runner = &orchestrator.Runner{Runs: a.Runs, Hosts: a.Hosts, Stages: &store.Stages{DB: a.Runs.DB}, EventHub: events.NewHub()} + if err := a.Runs.SetFailedStage(context.Background(), runID, "Storage"); err != nil { + t.Fatalf("set failed stage: %v", err) + } + if err := a.Runs.SetState(context.Background(), runID, model.StateCancelled); err != nil { + t.Fatalf("set state: %v", err) + } + req := routedRequest(runID, http.MethodPost, "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/heartbeat", nil) + req.Header.Set("Authorization", "Bearer "+token) + rr := httptest.NewRecorder() + a.Heartbeat(rr, req) + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String()) + } + var resp map[string]any + if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil { + t.Fatalf("decode: %v", err) + } + if resp["cmd"] != "reboot" { + t.Fatalf("cmd = %v, want reboot", resp["cmd"]) + } +} + +// TestHeartbeatCancelStageWhenCancelledMidRun: the mid-stage cancel +// path (no FailedStage marker) still answers cmd=cancel_stage so the +// agent kills its in-flight subprocess before powering off. This is +// the pre-existing behaviour; the hold-cancel branch is additive. +func TestHeartbeatCancelStageWhenCancelledMidRun(t *testing.T) { + a, runID, token := setupAgent(t) + a.Runner = &orchestrator.Runner{Runs: a.Runs, Hosts: a.Hosts, Stages: &store.Stages{DB: a.Runs.DB}, EventHub: events.NewHub()} + if err := a.Runs.SetState(context.Background(), runID, model.StateCancelled); err != nil { + t.Fatalf("set state: %v", err) + } + req := routedRequest(runID, http.MethodPost, "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/heartbeat", nil) + req.Header.Set("Authorization", "Bearer "+token) + rr := httptest.NewRecorder() + a.Heartbeat(rr, req) + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String()) + } + var resp map[string]any + if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil { + t.Fatalf("decode: %v", err) + } + if resp["cmd"] != "cancel_stage" { + t.Fatalf("cmd = %v, want cancel_stage", resp["cmd"]) + } +} + // TestResult_RejectsMismatchedStage is the silent-skip guard's unit // test. The Orion failure mode: agent crashes mid-CPUStress, systemd // restarts it, restarted agent replays Inventory and /results it. diff --git a/internal/api/ui_handlers.go b/internal/api/ui_handlers.go index 745b6db..fd5829a 100644 --- a/internal/api/ui_handlers.go +++ b/internal/api/ui_handlers.go @@ -660,7 +660,11 @@ func (u *UI) CancelRun(w http.ResponseWriter, r *http.Request) { http.Error(w, err.Error(), http.StatusInternalServerError) return } - if latest == nil || latest.State.IsTerminal() { + // FailedHolding is "terminal" for the pipeline but the host is still + // alive and holding at an SSH prompt; the operator can walk away from + // the hold via Cancel (which reboots to local disk). Every other + // terminal state is truly done — nothing to cancel. + if latest == nil || (latest.State.IsTerminal() && latest.State != model.StateFailedHolding) { http.Error(w, "no active run to cancel", http.StatusConflict) return } diff --git a/internal/orchestrator/statemachine.go b/internal/orchestrator/statemachine.go index c94c96d..497e5b0 100644 --- a/internal/orchestrator/statemachine.go +++ b/internal/orchestrator/statemachine.go @@ -73,7 +73,7 @@ var table = map[Trigger]transition{ TriggerStageMismatch: {from: stageExecutionStates(), to: model.StateFailedHolding}, TriggerAllStagesPassed: {from: []model.RunState{model.StateReporting}, to: model.StateCompleted}, TriggerOperatorReleased: {from: []model.RunState{model.StateFailedHolding}, to: model.StateReleased}, - TriggerOperatorCancelled: {from: allActiveStates(), to: model.StateCancelled}, + TriggerOperatorCancelled: {from: append(allActiveStates(), model.StateFailedHolding), to: model.StateCancelled}, } // Next computes the target state for a trigger against the current state. diff --git a/internal/orchestrator/statemachine_test.go b/internal/orchestrator/statemachine_test.go index 32231a9..620086a 100644 --- a/internal/orchestrator/statemachine_test.go +++ b/internal/orchestrator/statemachine_test.go @@ -142,6 +142,30 @@ func TestStageNameForState(t *testing.T) { } } +// TestTriggerOperatorCancelledFromHold: cancelling a held run must be +// allowed so the operator can walk away from a FailedHolding SSH prompt +// and have the host reboot to local disk. Before this, FailedHolding +// was considered terminal and Cancel errored out with "trigger not +// allowed from FailedHolding". +func TestTriggerOperatorCancelledFromHold(t *testing.T) { + got, err := orchestrator.Next(model.StateFailedHolding, orchestrator.TriggerOperatorCancelled) + if err != nil { + t.Fatalf("FailedHolding + OperatorCancelled: %v", err) + } + if got != model.StateCancelled { + t.Fatalf("got %q, want %q", got, model.StateCancelled) + } + // Sanity: other terminal states still reject the trigger so we don't + // accidentally allow Cancel after Completed/Cancelled/Released. + for _, bad := range []model.RunState{ + model.StateCompleted, model.StateCancelled, model.StateReleased, model.StateFailed, + } { + if _, err := orchestrator.Next(bad, orchestrator.TriggerOperatorCancelled); err == nil { + t.Fatalf("OperatorCancelled from %q: expected error, got none", bad) + } + } +} + func TestNextStageWalk(t *testing.T) { // Walking StageCompleted from each stage should land on the next // one in the canonical order, and from Reporting onto Completed. diff --git a/internal/web/templates/host_tile.templ b/internal/web/templates/host_tile.templ index d4ac3c4..56b92d5 100644 --- a/internal/web/templates/host_tile.templ +++ b/internal/web/templates/host_tile.templ @@ -89,12 +89,20 @@ func canStartIfOnline(r *model.Run) bool { return r.State.IsTerminal() } -// canCancel is true for any non-terminal run — the Cancel button shows -// whenever the pipeline is live (Queued through the stage states). The -// handler refuses the action once the run enters a terminal state, so -// the render decision just has to mirror that. +// canCancel is true for any non-terminal run, plus FailedHolding — +// a held run technically classifies as terminal for the pipeline but +// the host is still live on the SSH hold prompt, and the operator +// can walk away from it via Cancel (which reboots to local disk). +// Every other terminal state is truly done, so no Cancel button. +// The server-side CancelRun handler mirrors this predicate. func canCancel(r *model.Run) bool { - return r != nil && !r.State.IsTerminal() + if r == nil { + return false + } + if !r.State.IsTerminal() { + return true + } + return r.State == model.StateFailedHolding } func tileStatus(r *model.Run) string { diff --git a/internal/web/templates/host_tile_templ.go b/internal/web/templates/host_tile_templ.go index c3663b0..17e7434 100644 --- a/internal/web/templates/host_tile_templ.go +++ b/internal/web/templates/host_tile_templ.go @@ -284,12 +284,20 @@ func canStartIfOnline(r *model.Run) bool { return r.State.IsTerminal() } -// canCancel is true for any non-terminal run — the Cancel button shows -// whenever the pipeline is live (Queued through the stage states). The -// handler refuses the action once the run enters a terminal state, so -// the render decision just has to mirror that. +// canCancel is true for any non-terminal run, plus FailedHolding — +// a held run technically classifies as terminal for the pipeline but +// the host is still live on the SSH hold prompt, and the operator +// can walk away from it via Cancel (which reboots to local disk). +// Every other terminal state is truly done, so no Cancel button. +// The server-side CancelRun handler mirrors this predicate. func canCancel(r *model.Run) bool { - return r != nil && !r.State.IsTerminal() + if r == nil { + return false + } + if !r.State.IsTerminal() { + return true + } + return r.State == model.StateFailedHolding } func tileStatus(r *model.Run) string { diff --git a/internal/web/templates/run_detail.templ b/internal/web/templates/run_detail.templ index 4075e40..ef57703 100644 --- a/internal/web/templates/run_detail.templ +++ b/internal/web/templates/run_detail.templ @@ -100,9 +100,15 @@ templ RunHeader(d RunPageData) {
")
+ templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 40, "\" class=\"hold-banner\" sse-swap=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var29 string
- templ_7745c5c3_Var29, templ_7745c5c3_Err = templ.JoinStringErrs(sshInvocation(d.HoldKeyPath, d.Run.HoldIP))
+ templ_7745c5c3_Var29, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
if templ_7745c5c3_Err != nil {
- return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 137, Col: 70}
+ return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 139, Col: 53}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var29))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
- templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 40, "")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var30 string
- templ_7745c5c3_Var30, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
+ templ_7745c5c3_Var30, templ_7745c5c3_Err = templ.JoinStringErrs(sshInvocation(d.HoldKeyPath, d.Run.HoldIP))
if templ_7745c5c3_Err != nil {
- return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 141, Col: 47}
+ return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 143, Col: 70}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var30))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
- templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 42, "\" class=\"detail-hold-placeholder\" sse-swap=\"")
+ templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 42, "")
+ templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 54, "\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var39 string
- templ_7745c5c3_Var39, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Expected)
+ templ_7745c5c3_Var39, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Field)
if templ_7745c5c3_Err != nil {
- return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 166, Col: 65}
+ return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 171, Col: 43}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var39))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
- templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 54, "actual: ")
+ templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 55, "expected: ")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var40 string
- templ_7745c5c3_Var40, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Actual)
+ templ_7745c5c3_Var40, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Expected)
if templ_7745c5c3_Err != nil {
- return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 167, Col: 59}
+ return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 172, Col: 65}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var40))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
- templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 55, "")
+ templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 56, "")
+ if templ_7745c5c3_Err != nil {
+ return templ_7745c5c3_Err
+ }
+ var templ_7745c5c3_Var41 string
+ templ_7745c5c3_Var41, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Actual)
+ if templ_7745c5c3_Err != nil {
+ return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 173, Col: 59}
+ }
+ _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var41))
+ if templ_7745c5c3_Err != nil {
+ return templ_7745c5c3_Err
+ }
+ templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 57, "