runs: add non-destructive flag + operator Cancel button
Non-destructive pre-declares "don't touch the disks" on Start: the Storage stage skips wipe-probe, badblocks -w, and write-mode fio, and reports a read-only summary. Runs gain a new non_destructive column; the flag is threaded through Claim → agent tests.Deps → Storage stage. Cancel halts an in-flight run. The orchestrator transitions to a new StateCancelled via TriggerOperatorCancelled (valid from any active state); the agent's next heartbeat returns cmd=cancel_stage, which fires a stored CancelFunc on the per-stage context. Stage subprocesses spawned with exec.CommandContext die with the context, the agent posts a cancelled outcome, then powers the host off. Destructive stages mid-run may leave the host in an intermediate state — the UI confirm dialog warns the operator; recovery is manual for now. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -207,11 +207,12 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
|
||||
iperfPort = 5201
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{
|
||||
"ok": true,
|
||||
"run_id": runID,
|
||||
"stages": store.DefaultStageOrder,
|
||||
"expected_disks": expectedDisks,
|
||||
"iperf_port": iperfPort,
|
||||
"ok": true,
|
||||
"run_id": runID,
|
||||
"stages": store.DefaultStageOrder,
|
||||
"expected_disks": expectedDisks,
|
||||
"iperf_port": iperfPort,
|
||||
"non_destructive": run.NonDestructive,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -236,6 +237,10 @@ func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) {
|
||||
case run.State == model.StateCompleted:
|
||||
// Pipeline succeeded — agent should power the host down.
|
||||
cmd = "shutdown"
|
||||
case run.State == model.StateCancelled:
|
||||
// Operator clicked Cancel — agent cancels the active stage ctx,
|
||||
// posts a cancelled outcome, and powers off.
|
||||
cmd = "cancel_stage"
|
||||
case run.State == model.StateFailedHolding || run.State == model.StateReleased:
|
||||
cmd = "abort"
|
||||
case run.FailedStage == "Storage" && overrideWipeSet(run.OverrideFlagsJSON):
|
||||
|
||||
@@ -46,7 +46,7 @@ func setupAgent(t *testing.T) (*api.Agent, int64, string) {
|
||||
if err != nil {
|
||||
t.Fatalf("issue token: %v", err)
|
||||
}
|
||||
runID, err := runs.Create(context.Background(), hostID, hash)
|
||||
runID, err := runs.Create(context.Background(), hostID, hash, false)
|
||||
if err != nil {
|
||||
t.Fatalf("create run: %v", err)
|
||||
}
|
||||
|
||||
@@ -137,7 +137,7 @@ func TestUIHeartbeat_QueuedDispatches(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
}
|
||||
runID, err := runs.Create(ctx, hostID, "deadbeef")
|
||||
runID, err := runs.Create(ctx, hostID, "deadbeef", false)
|
||||
if err != nil {
|
||||
t.Fatalf("create run: %v", err)
|
||||
}
|
||||
@@ -184,7 +184,7 @@ func TestUIHeartbeat_WaitingRebootRetries(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
}
|
||||
runID, err := runs.Create(ctx, hostID, "deadbeef")
|
||||
runID, err := runs.Create(ctx, hostID, "deadbeef", false)
|
||||
if err != nil {
|
||||
t.Fatalf("create run: %v", err)
|
||||
}
|
||||
@@ -241,7 +241,7 @@ func TestUIHeartbeat_CompletedRunIsIdle(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
}
|
||||
runID, err := runs.Create(ctx, hostID, "deadbeef")
|
||||
runID, err := runs.Create(ctx, hostID, "deadbeef", false)
|
||||
if err != nil {
|
||||
t.Fatalf("create run: %v", err)
|
||||
}
|
||||
|
||||
@@ -67,7 +67,7 @@ func TestHostDetail_OK(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
}
|
||||
runID, err := runs.Create(ctx, id, "deadbeef")
|
||||
runID, err := runs.Create(ctx, id, "deadbeef", false)
|
||||
if err != nil {
|
||||
t.Fatalf("create run: %v", err)
|
||||
}
|
||||
@@ -137,7 +137,7 @@ func TestHostDetail_LogTabsRendered(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
}
|
||||
runID, err := runs.Create(ctx, id, "cafef00d")
|
||||
runID, err := runs.Create(ctx, id, "cafef00d", false)
|
||||
if err != nil {
|
||||
t.Fatalf("create run: %v", err)
|
||||
}
|
||||
|
||||
@@ -118,7 +118,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
|
||||
if err != nil {
|
||||
t.Fatalf("issue token: %v", err)
|
||||
}
|
||||
runID, err := runStore.Create(context.Background(), hostID, hash)
|
||||
runID, err := runStore.Create(context.Background(), hostID, hash, false)
|
||||
if err != nil {
|
||||
t.Fatalf("create run: %v", err)
|
||||
}
|
||||
|
||||
@@ -191,21 +191,20 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
// Guard: refuse to start a second run while one is still active.
|
||||
if latest, err := u.Runs.LatestForHost(r.Context(), hostID); err == nil && latest != nil {
|
||||
switch latest.State {
|
||||
case model.StateCompleted, model.StateReleased, model.StateFailed, model.StateFailedHolding:
|
||||
// ok to start fresh
|
||||
default:
|
||||
if !latest.State.IsTerminal() {
|
||||
http.Error(w, "host already has an active run", http.StatusConflict)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
nonDestructive := r.PostFormValue("non_destructive") == "1"
|
||||
|
||||
_, hash, err := orchestrator.IssueRunToken()
|
||||
if err != nil {
|
||||
http.Error(w, "token: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
runID, err := u.Runs.Create(r.Context(), hostID, hash)
|
||||
runID, err := u.Runs.Create(r.Context(), hostID, hash, nonDestructive)
|
||||
if err != nil {
|
||||
http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
@@ -471,6 +470,35 @@ func (u *UI) OverrideWipeStorage(w http.ResponseWriter, r *http.Request) {
|
||||
http.Redirect(w, r, "/", http.StatusSeeOther)
|
||||
}
|
||||
|
||||
// CancelRun halts an in-flight run. Transitions the run to
|
||||
// StateCancelled; the next agent heartbeat receives cmd=cancel_stage
|
||||
// which cancels the stage ctx on the agent side. Destructive stages
|
||||
// mid-run can leave the host in an intermediate state — the confirm
|
||||
// dialog in the UI warns the operator.
|
||||
func (u *UI) CancelRun(w http.ResponseWriter, r *http.Request) {
|
||||
idStr := chi.URLParam(r, "id")
|
||||
hostID, err := strconv.ParseInt(idStr, 10, 64)
|
||||
if err != nil {
|
||||
http.Error(w, "bad host id", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
latest, err := u.Runs.LatestForHost(r.Context(), hostID)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
if latest == nil || latest.State.IsTerminal() {
|
||||
http.Error(w, "no active run to cancel", http.StatusConflict)
|
||||
return
|
||||
}
|
||||
if _, err := u.Runner.Transition(r.Context(), latest.ID, orchestrator.TriggerOperatorCancelled); err != nil {
|
||||
http.Error(w, "cancel: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
log.Printf("ui: cancelled run %d for host %d", latest.ID, hostID)
|
||||
http.Redirect(w, r, "/", http.StatusSeeOther)
|
||||
}
|
||||
|
||||
func (u *UI) DeleteHost(w http.ResponseWriter, r *http.Request) {
|
||||
idStr := chi.URLParam(r, "id")
|
||||
id, err := strconv.ParseInt(idStr, 10, 64)
|
||||
|
||||
Reference in New Issue
Block a user