runs: add non-destructive flag + operator Cancel button
Non-destructive pre-declares "don't touch the disks" on Start: the Storage stage skips wipe-probe, badblocks -w, and write-mode fio, and reports a read-only summary. Runs gain a new non_destructive column, which is threaded through Claim → agent tests.Deps → Storage stage. Cancel halts an in-flight run. The orchestrator transitions to a new StateCancelled via TriggerOperatorCancelled (valid from any active state); the agent's next heartbeat returns cmd=cancel_stage, which fires a stored CancelFunc on the per-stage context. Stage subprocesses spawned with exec.CommandContext die with the context, the agent posts a cancelled outcome, then powers the host off. Destructive stages mid-run may leave the host in an intermediate state — the UI confirm dialog warns the operator; recovery is manual for now. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+81
-9
@@ -27,6 +27,7 @@ import (
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"vetting/agent/bootstate"
|
||||
@@ -35,6 +36,12 @@ import (
|
||||
"vetting/internal/spec"
|
||||
)
|
||||
|
||||
// stageCancel holds the cancel func for the in-flight stage ctx so the
|
||||
// heartbeat loop can fire it when the orchestrator returns
|
||||
// cmd=cancel_stage. Stored as an atomic.Value so the heartbeat goroutine
|
||||
// can read without locking; writes happen only on the main loop.
//
// Every Store uses the concrete type context.CancelFunc: the live cancel
// func while a stage runs, and a nil context.CancelFunc once it has
// finished. Readers must therefore type-assert the loaded value and
// nil-check the func before calling it.
|
||||
var stageCancel atomic.Value // context.CancelFunc
|
||||
|
||||
// Run is the long-lived entry point. It blocks until ctx is cancelled
|
||||
// or a fatal error makes progress impossible.
|
||||
func Run(ctx context.Context, p *bootstate.Params) error {
|
||||
@@ -81,7 +88,12 @@ func Run(ctx context.Context, p *bootstate.Params) error {
|
||||
default:
|
||||
}
|
||||
fwd.info("stage: starting " + nextStage)
|
||||
outcome := runStage(ctx, nextStage, claim, fwd, c, overrideFlags{})
|
||||
outcome := runStageCancellable(ctx, nextStage, claim, fwd, c, overrideFlags{})
|
||||
if outcome.Cancelled {
|
||||
fwd.warn("stage cancelled by operator; posting result and exiting")
|
||||
_, _ = postResult(ctx, c, nextStage, outcome)
|
||||
return powerOffAndReturn(fwd)
|
||||
}
|
||||
resp, err := postResult(ctx, c, nextStage, outcome)
|
||||
if err != nil {
|
||||
fwd.error("submit result for " + nextStage + ": " + err.Error())
|
||||
@@ -164,6 +176,46 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF
|
||||
// stageOutcome bundles everything one stage run hands back to the main
// loop: the test outcome, an optional inventory, and a flag recording
// whether an operator cancel interrupted the stage.
type stageOutcome struct {
|
||||
// Outcome is the stage's result record; this file reads and writes its
// Passed, Message and Summary fields.
Outcome tests.Outcome
|
||||
Inventory *spec.Inventory // only for Inventory stage
|
||||
Cancelled bool // set when the stage was cut short by operator cancel
|
||||
}
|
||||
|
||||
// runStageCancellable wraps runStage in a per-stage context so the
|
||||
// heartbeat loop's cancel_stage directive can kill whatever subprocess
|
||||
// is currently running. If the derived context was cancelled while the
|
||||
// stage executed, the outcome is rewritten as a cancellation record so
|
||||
// the orchestrator has something to persist.
|
||||
func runStageCancellable(parent context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, ovr overrideFlags) stageOutcome {
|
||||
stageCtx, cancel := context.WithCancel(parent)
|
||||
stageCancel.Store(cancel)
|
||||
defer func() {
|
||||
cancel()
|
||||
stageCancel.Store(context.CancelFunc(nil))
|
||||
}()
|
||||
out := runStage(stageCtx, stage, claim, fwd, c, ovr)
|
||||
// If the parent is still live but the stage ctx was cancelled, the
|
||||
// operator fired a cancel — mark the outcome so the caller can exit
|
||||
// the pipeline cleanly. Plain ctx-cancel on ctx.Done (e.g. shutdown)
|
||||
// is handled elsewhere by the main loop's select.
|
||||
if parent.Err() == nil && stageCtx.Err() != nil {
|
||||
out.Cancelled = true
|
||||
out.Outcome.Passed = false
|
||||
if out.Outcome.Message == "" {
|
||||
out.Outcome.Message = "stage cancelled by operator"
|
||||
}
|
||||
out.Outcome.Summary = "cancelled"
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// powerOffAndReturn shuts the host down after an operator cancel. Same
|
||||
// best-effort poweroff path as the shutdown heartbeat cmd.
|
||||
func powerOffAndReturn(fwd *logForwarder) error {
|
||||
fwd.info("cancel: powering off host")
|
||||
if err := exec.Command("systemctl", "poweroff").Run(); err != nil {
|
||||
fwd.warn("systemctl poweroff failed: " + err.Error())
|
||||
_ = exec.Command("shutdown", "-h", "now").Run()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type overrideFlags struct {
|
||||
@@ -176,12 +228,13 @@ func newDeps(ctx context.Context, c *Client, fwd *logForwarder, ovr overrideFlag
|
||||
expected = append(expected, tests.ExpectedDisk{Serial: e.Serial, SizeGB: e.SizeGB})
|
||||
}
|
||||
return tests.Deps{
|
||||
Info: fwd.info,
|
||||
Warn: fwd.warn,
|
||||
Error: fwd.error,
|
||||
OverrideWipe: ovr.Wipe,
|
||||
ExpectedDisks: expected,
|
||||
StageTimeout: 2 * time.Minute,
|
||||
Info: fwd.info,
|
||||
Warn: fwd.warn,
|
||||
Error: fwd.error,
|
||||
OverrideWipe: ovr.Wipe,
|
||||
NonDestructive: claim.NonDestructive,
|
||||
ExpectedDisks: expected,
|
||||
StageTimeout: 2 * time.Minute,
|
||||
Sensor: func(ctx context.Context, samples []tests.Sample) error {
|
||||
out := make([]SensorSample, 0, len(samples))
|
||||
for _, s := range samples {
|
||||
@@ -248,7 +301,12 @@ func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-cha
|
||||
if len(cmd.OverrideFlags) > 0 {
|
||||
_ = json.Unmarshal(cmd.OverrideFlags, &ovr)
|
||||
}
|
||||
outcome := runStage(ctx, cmd.Stage, claim, fwd, c, ovr)
|
||||
outcome := runStageCancellable(ctx, cmd.Stage, claim, fwd, c, ovr)
|
||||
if outcome.Cancelled {
|
||||
fwd.warn("stage cancelled by operator; posting result and exiting")
|
||||
_, _ = postResult(ctx, c, cmd.Stage, outcome)
|
||||
return powerOffAndReturn(fwd)
|
||||
}
|
||||
resp, err := postResult(ctx, c, cmd.Stage, outcome)
|
||||
if err != nil {
|
||||
fwd.error("override: submit result: " + err.Error())
|
||||
@@ -272,7 +330,12 @@ func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-cha
|
||||
default:
|
||||
}
|
||||
fwd.info("stage: starting " + nextStage)
|
||||
out := runStage(ctx, nextStage, claim, fwd, c, overrideFlags{})
|
||||
out := runStageCancellable(ctx, nextStage, claim, fwd, c, overrideFlags{})
|
||||
if out.Cancelled {
|
||||
fwd.warn("stage cancelled by operator; posting result and exiting")
|
||||
_, _ = postResult(ctx, c, nextStage, out)
|
||||
return powerOffAndReturn(fwd)
|
||||
}
|
||||
rr, err := postResult(ctx, c, nextStage, out)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -380,6 +443,15 @@ func heartbeatLoop(ctx context.Context, c *Client, fwd *logForwarder, out chan<-
|
||||
}
|
||||
return
|
||||
}
|
||||
if resp.Cmd == "cancel_stage" {
|
||||
fwd.warn("orchestrator said cancel_stage; cancelling in-flight stage ctx")
|
||||
if v := stageCancel.Load(); v != nil {
|
||||
if fn, ok := v.(context.CancelFunc); ok && fn != nil {
|
||||
fn()
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
if resp.Cmd == "retry_stage" {
|
||||
select {
|
||||
case out <- *resp:
|
||||
|
||||
Reference in New Issue
Block a user