diff --git a/internal/api/agent_handlers.go b/internal/api/agent_handlers.go index 1a28cf2..3b1d23c 100644 --- a/internal/api/agent_handlers.go +++ b/internal/api/agent_handlers.go @@ -272,9 +272,20 @@ func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) { // intervention. cmd = "reboot" case run.State == model.StateCancelled: - // Operator clicked Cancel — agent cancels the active stage ctx, - // posts a cancelled outcome, and powers off. - cmd = "cancel_stage" + // Operator clicked Cancel. Two sub-cases: + // - FailedStage set → run was sitting in FailedHolding with no + // in-flight stage subprocess; the agent is parked in + // waitForOverride. Send cmd=reboot so the heartbeat loop + // reboots the host, falls through iPXE's no-active-run + // script and boots local disk. + // - FailedStage empty → cancel mid-stage; kill the stage ctx + // first so the running subprocess exits cleanly, then the + // agent powers off via its existing cancel path. + if run.FailedStage != "" { + cmd = "reboot" + } else { + cmd = "cancel_stage" + } case run.State == model.StateFailedHolding || run.State == model.StateReleased: cmd = "abort" case run.FailedStage == "Storage" && overrideWipeSet(run.OverrideFlagsJSON): diff --git a/internal/api/agent_handlers_test.go b/internal/api/agent_handlers_test.go index a105e7c..bb8de96 100644 --- a/internal/api/agent_handlers_test.go +++ b/internal/api/agent_handlers_test.go @@ -131,6 +131,63 @@ func TestHeartbeatRebootWhenCompleted(t *testing.T) { } } +// TestHeartbeatRebootWhenCancelledFromHold: operator hit Cancel on a +// FailedHolding run. Because there's no in-flight stage subprocess (the +// agent is parked in waitForOverride), the heartbeat must answer with +// cmd=reboot — not cmd=cancel_stage which only makes sense mid-stage. +// The FailedStage marker is the discriminator: set means we came +// through hold; empty means a mid-stage cancel. +func TestHeartbeatRebootWhenCancelledFromHold(t *testing.T) { + a, runID, token := setupAgent(t) + a.Runner = &orchestrator.Runner{Runs: a.Runs, Hosts: a.Hosts, Stages: &store.Stages{DB: a.Runs.DB}, EventHub: events.NewHub()} + if err := a.Runs.SetFailedStage(context.Background(), runID, "Storage"); err != nil { + t.Fatalf("set failed stage: %v", err) + } + if err := a.Runs.SetState(context.Background(), runID, model.StateCancelled); err != nil { + t.Fatalf("set state: %v", err) + } + req := routedRequest(runID, http.MethodPost, "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/heartbeat", nil) + req.Header.Set("Authorization", "Bearer "+token) + rr := httptest.NewRecorder() + a.Heartbeat(rr, req) + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String()) + } + var resp map[string]any + if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil { + t.Fatalf("decode: %v", err) + } + if resp["cmd"] != "reboot" { + t.Fatalf("cmd = %v, want reboot", resp["cmd"]) + } +} + +// TestHeartbeatCancelStageWhenCancelledMidRun: the mid-stage cancel +// path (no FailedStage marker) still answers cmd=cancel_stage so the +// agent kills its in-flight subprocess before powering off. This is +// the pre-existing behaviour; the hold-cancel branch is additive. +func TestHeartbeatCancelStageWhenCancelledMidRun(t *testing.T) { + a, runID, token := setupAgent(t) + a.Runner = &orchestrator.Runner{Runs: a.Runs, Hosts: a.Hosts, Stages: &store.Stages{DB: a.Runs.DB}, EventHub: events.NewHub()} + if err := a.Runs.SetState(context.Background(), runID, model.StateCancelled); err != nil { + t.Fatalf("set state: %v", err) + } + req := routedRequest(runID, http.MethodPost, "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/heartbeat", nil) + req.Header.Set("Authorization", "Bearer "+token) + rr := httptest.NewRecorder() + a.Heartbeat(rr, req) + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String()) + } + var resp map[string]any + if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil { + t.Fatalf("decode: %v", err) + } + if resp["cmd"] != "cancel_stage" { + t.Fatalf("cmd = %v, want cancel_stage", resp["cmd"]) + } +} + // TestResult_RejectsMismatchedStage is the silent-skip guard's unit // test. The Orion failure mode: agent crashes mid-CPUStress, systemd // restarts it, restarted agent replays Inventory and /results it. diff --git a/internal/api/ui_handlers.go b/internal/api/ui_handlers.go index 745b6db..fd5829a 100644 --- a/internal/api/ui_handlers.go +++ b/internal/api/ui_handlers.go @@ -660,7 +660,11 @@ func (u *UI) CancelRun(w http.ResponseWriter, r *http.Request) { http.Error(w, err.Error(), http.StatusInternalServerError) return } - if latest == nil || latest.State.IsTerminal() { + // FailedHolding is "terminal" for the pipeline but the host is still + // alive and holding at an SSH prompt; the operator can walk away from + // the hold via Cancel (which reboots to local disk). Every other + // terminal state is truly done — nothing to cancel. + if latest == nil || (latest.State.IsTerminal() && latest.State != model.StateFailedHolding) { http.Error(w, "no active run to cancel", http.StatusConflict) return } diff --git a/internal/orchestrator/statemachine.go b/internal/orchestrator/statemachine.go index c94c96d..497e5b0 100644 --- a/internal/orchestrator/statemachine.go +++ b/internal/orchestrator/statemachine.go @@ -73,7 +73,7 @@ var table = map[Trigger]transition{ TriggerStageMismatch: {from: stageExecutionStates(), to: model.StateFailedHolding}, TriggerAllStagesPassed: {from: []model.RunState{model.StateReporting}, to: model.StateCompleted}, TriggerOperatorReleased: {from: []model.RunState{model.StateFailedHolding}, to: model.StateReleased}, - TriggerOperatorCancelled: {from: allActiveStates(), to: model.StateCancelled}, + TriggerOperatorCancelled: {from: append(allActiveStates(), model.StateFailedHolding), to: model.StateCancelled}, } // Next computes the target state for a trigger against the current state. diff --git a/internal/orchestrator/statemachine_test.go b/internal/orchestrator/statemachine_test.go index 32231a9..620086a 100644 --- a/internal/orchestrator/statemachine_test.go +++ b/internal/orchestrator/statemachine_test.go @@ -142,6 +142,30 @@ func TestStageNameForState(t *testing.T) { } } +// TestTriggerOperatorCancelledFromHold: cancelling a held run must be +// allowed so the operator can walk away from a FailedHolding SSH prompt +// and have the host reboot to local disk. Before this, FailedHolding +// was considered terminal and Cancel errored out with "trigger not +// allowed from FailedHolding". +func TestTriggerOperatorCancelledFromHold(t *testing.T) { + got, err := orchestrator.Next(model.StateFailedHolding, orchestrator.TriggerOperatorCancelled) + if err != nil { + t.Fatalf("FailedHolding + OperatorCancelled: %v", err) + } + if got != model.StateCancelled { + t.Fatalf("got %q, want %q", got, model.StateCancelled) + } + // Sanity: other terminal states still reject the trigger so we don't + // accidentally allow Cancel after Completed/Cancelled/Released. + for _, bad := range []model.RunState{ + model.StateCompleted, model.StateCancelled, model.StateReleased, model.StateFailed, + } { + if _, err := orchestrator.Next(bad, orchestrator.TriggerOperatorCancelled); err == nil { + t.Fatalf("OperatorCancelled from %q: expected error, got none", bad) + } + } +} + func TestNextStageWalk(t *testing.T) { // Walking StageCompleted from each stage should land on the next // one in the canonical order, and from Reporting onto Completed. diff --git a/internal/web/templates/host_tile.templ b/internal/web/templates/host_tile.templ index d4ac3c4..56b92d5 100644 --- a/internal/web/templates/host_tile.templ +++ b/internal/web/templates/host_tile.templ @@ -89,12 +89,20 @@ func canStartIfOnline(r *model.Run) bool { return r.State.IsTerminal() } -// canCancel is true for any non-terminal run — the Cancel button shows -// whenever the pipeline is live (Queued through the stage states). The -// handler refuses the action once the run enters a terminal state, so -// the render decision just has to mirror that. +// canCancel is true for any non-terminal run, plus FailedHolding — +// a held run technically classifies as terminal for the pipeline but +// the host is still live on the SSH hold prompt, and the operator +// can walk away from it via Cancel (which reboots to local disk). +// Every other terminal state is truly done, so no Cancel button. +// The server-side CancelRun handler mirrors this predicate. func canCancel(r *model.Run) bool { - return r != nil && !r.State.IsTerminal() + if r == nil { + return false + } + if !r.State.IsTerminal() { + return true + } + return r.State == model.StateFailedHolding } func tileStatus(r *model.Run) string { diff --git a/internal/web/templates/host_tile_templ.go b/internal/web/templates/host_tile_templ.go index c3663b0..17e7434 100644 --- a/internal/web/templates/host_tile_templ.go +++ b/internal/web/templates/host_tile_templ.go @@ -284,12 +284,20 @@ func canStartIfOnline(r *model.Run) bool { return r.State.IsTerminal() } -// canCancel is true for any non-terminal run — the Cancel button shows -// whenever the pipeline is live (Queued through the stage states). The -// handler refuses the action once the run enters a terminal state, so -// the render decision just has to mirror that. +// canCancel is true for any non-terminal run, plus FailedHolding — +// a held run technically classifies as terminal for the pipeline but +// the host is still live on the SSH hold prompt, and the operator +// can walk away from it via Cancel (which reboots to local disk). +// Every other terminal state is truly done, so no Cancel button. +// The server-side CancelRun handler mirrors this predicate. func canCancel(r *model.Run) bool { - return r != nil && !r.State.IsTerminal() + if r == nil { + return false + } + if !r.State.IsTerminal() { + return true + } + return r.State == model.StateFailedHolding } func tileStatus(r *model.Run) string { diff --git a/internal/web/templates/run_detail.templ b/internal/web/templates/run_detail.templ index 4075e40..ef57703 100644 --- a/internal/web/templates/run_detail.templ +++ b/internal/web/templates/run_detail.templ @@ -100,9 +100,15 @@ templ RunHeader(d RunPageData) {
if canCancel(&d.Run) { -
- -
+ if d.Run.State == model.StateFailedHolding { +
+ +
+ } else { +
+ +
+ } } if canOverrideWipe(&d.Run) {
diff --git a/internal/web/templates/run_detail_templ.go b/internal/web/templates/run_detail_templ.go index 0ec7e6c..83e841f 100644 --- a/internal/web/templates/run_detail_templ.go +++ b/internal/web/templates/run_detail_templ.go @@ -405,82 +405,102 @@ func RunHeader(d RunPageData) templ.Component { return templ_7745c5c3_Err } if canCancel(&d.Run) { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 28, "
") - if templ_7745c5c3_Err != nil { - return templ_7745c5c3_Err + if d.Run.State == model.StateFailedHolding { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 28, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } else { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 30, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } } } if canOverrideWipe(&d.Run) { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 30, "
") - if templ_7745c5c3_Err != nil { - return templ_7745c5c3_Err - } - } - if hasReport(&d.Run) { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 32, "View report ") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 33, "\" class=\"inline\">") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - if d.Run.State.IsTerminal() { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 34, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 35, "\" target=\"_blank\" rel=\"noopener\">View report ") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 36, "
") + if d.Run.State.IsTerminal() { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 36, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 38, "") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -508,83 +528,83 @@ func HoldBanner(d RunPageData) templ.Component { }() } ctx = templ.InitializeContext(ctx) - templ_7745c5c3_Var26 := templ.GetChildren(ctx) - if templ_7745c5c3_Var26 == nil { - templ_7745c5c3_Var26 = templ.NopComponent + templ_7745c5c3_Var27 := templ.GetChildren(ctx) + if templ_7745c5c3_Var27 == nil { + templ_7745c5c3_Var27 = templ.NopComponent } ctx = templ.ClearChildren(ctx) if d.Run.State == model.StateFailedHolding && d.Run.HoldIP != "" { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 37, "
Host is holding — SSH available: ") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 40, "\" class=\"hold-banner\" sse-swap=\"") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } var templ_7745c5c3_Var29 string - templ_7745c5c3_Var29, templ_7745c5c3_Err = templ.JoinStringErrs(sshInvocation(d.HoldKeyPath, d.Run.HoldIP)) + templ_7745c5c3_Var29, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID)) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 137, Col: 70} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 139, Col: 53} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var29)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 40, "
") - if templ_7745c5c3_Err != nil { - return templ_7745c5c3_Err - } - } else { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 41, "
Host is holding — SSH available: ") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } var templ_7745c5c3_Var30 string - templ_7745c5c3_Var30, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID)) + templ_7745c5c3_Var30, templ_7745c5c3_Err = templ.JoinStringErrs(sshInvocation(d.HoldKeyPath, d.Run.HoldIP)) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 141, Col: 47} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 143, Col: 70} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var30)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 42, "\" class=\"detail-hold-placeholder\" sse-swap=\"") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 42, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } else { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 43, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 44, "\" class=\"detail-hold-placeholder\" sse-swap=\"") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var32 string + templ_7745c5c3_Var32, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID)) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 149, Col: 53} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var32)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 45, "\" hx-swap=\"outerHTML\">") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -612,138 +632,138 @@ func RunSpecDiffs(d RunPageData) templ.Component { }() } ctx = templ.InitializeContext(ctx) - templ_7745c5c3_Var32 := templ.GetChildren(ctx) - if templ_7745c5c3_Var32 == nil { - templ_7745c5c3_Var32 = templ.NopComponent + templ_7745c5c3_Var33 := templ.GetChildren(ctx) + if templ_7745c5c3_Var33 == nil { + templ_7745c5c3_Var33 = templ.NopComponent } ctx = templ.ClearChildren(ctx) - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 44, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, "\" class=\"detail-section detail-diffs\" sse-swap=\"") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var35 string + templ_7745c5c3_Var35, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-specdiffs-%d", d.Run.ID)) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 162, Col: 57} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var35)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 48, "\" hx-swap=\"outerHTML\">") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } if len(d.SpecDiffs) > 0 { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, "

Spec diffs (") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 51, ">

Spec diffs (") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - var templ_7745c5c3_Var35 string - templ_7745c5c3_Var35, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", len(d.SpecDiffs))) + var templ_7745c5c3_Var36 string + templ_7745c5c3_Var36, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", len(d.SpecDiffs))) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 161, Col: 66} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 167, Col: 66} } - _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var35)) + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var36)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 50, ")

    ") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 52, ")

") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 58, "") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 57, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 59, "") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err }