runs: add non-destructive flag + operator Cancel button
CI / Lint + build + test (push) Successful in 2m5s
Release / release (push) Successful in 3m5s

Non-destructive pre-declares "don't touch the disks" on Start: the
Storage stage skips wipe-probe, badblocks -w, and write-mode fio,
and reports a read-only summary. Runs gain a new non_destructive column,
threaded through Claim → agent tests.Deps → Storage stage.

Cancel halts an in-flight run. The orchestrator transitions to a
new StateCancelled via TriggerOperatorCancelled (valid from any
active state); the agent's next heartbeat returns cmd=cancel_stage,
which fires a stored CancelFunc on the per-stage context. Stage
subprocesses spawned with exec.CommandContext die with the context,
the agent posts a cancelled outcome, then powers the host off.

Destructive stages mid-run may leave the host in an intermediate
state — the UI confirm dialog warns the operator; recovery is
manual for now.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 13:01:42 -04:00
parent 2c440fce8a
commit 4524ab8dc0
22 changed files with 434 additions and 230 deletions
+33 -5
View File
@@ -191,21 +191,20 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
// Guard: refuse to start a second run while one is still active.
if latest, err := u.Runs.LatestForHost(r.Context(), hostID); err == nil && latest != nil {
switch latest.State {
case model.StateCompleted, model.StateReleased, model.StateFailed, model.StateFailedHolding:
// ok to start fresh
default:
if !latest.State.IsTerminal() {
http.Error(w, "host already has an active run", http.StatusConflict)
return
}
}
nonDestructive := r.PostFormValue("non_destructive") == "1"
_, hash, err := orchestrator.IssueRunToken()
if err != nil {
http.Error(w, "token: "+err.Error(), http.StatusInternalServerError)
return
}
runID, err := u.Runs.Create(r.Context(), hostID, hash)
runID, err := u.Runs.Create(r.Context(), hostID, hash, nonDestructive)
if err != nil {
http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError)
return
@@ -471,6 +470,35 @@ func (u *UI) OverrideWipeStorage(w http.ResponseWriter, r *http.Request) {
http.Redirect(w, r, "/", http.StatusSeeOther)
}
// CancelRun is the operator-facing handler that stops a host's run
// while it is still in progress. It resolves the host from the "id"
// URL parameter, looks up that host's most recent run, and fires
// TriggerOperatorCancelled on it so the run moves to StateCancelled.
// The agent picks the cancel up on its next heartbeat (cmd=cancel_stage)
// and cancels the stage ctx on its side. A destructive stage that is
// interrupted mid-run can leave the host in an intermediate state; the
// UI's confirm dialog warns the operator about that.
//
// Responses:
//   - 400 when the host id does not parse as an integer
//   - 500 when the run lookup or the transition returns an error
//   - 409 when the host has no run, or its latest run is already terminal
//   - 303 redirect to "/" once the cancel has been recorded
func (u *UI) CancelRun(w http.ResponseWriter, r *http.Request) {
	hostID, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
	if err != nil {
		http.Error(w, "bad host id", http.StatusBadRequest)
		return
	}

	ctx := r.Context()
	run, err := u.Runs.LatestForHost(ctx, hostID)
	switch {
	case err != nil:
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	case run == nil || run.State.IsTerminal():
		// Nothing in flight: either the host never ran, or the last run
		// has already reached a terminal state.
		http.Error(w, "no active run to cancel", http.StatusConflict)
		return
	}

	_, err = u.Runner.Transition(ctx, run.ID, orchestrator.TriggerOperatorCancelled)
	if err != nil {
		http.Error(w, "cancel: "+err.Error(), http.StatusInternalServerError)
		return
	}

	log.Printf("ui: cancelled run %d for host %d", run.ID, hostID)
	http.Redirect(w, r, "/", http.StatusSeeOther)
}
func (u *UI) DeleteHost(w http.ResponseWriter, r *http.Request) {
idStr := chi.URLParam(r, "id")
id, err := strconv.ParseInt(idStr, 10, 64)