runs: add non-destructive flag + operator Cancel button
CI / Lint + build + test (push) Successful in 2m5s
Release / release (push) Successful in 3m5s

Non-destructive pre-declares "don't touch the disks" on Start: the
Storage stage skips wipe-probe, badblocks -w, and write-mode fio,
and reports a read-only summary. Runs gain a new non_destructive column,
threaded through Claim → agent tests.Deps → Storage stage.

Cancel halts an in-flight run. The orchestrator transitions to a
new StateCancelled via TriggerOperatorCancelled (valid from any
active state); the agent's next heartbeat returns cmd=cancel_stage,
which fires a stored CancelFunc on the per-stage context. Stage
subprocesses spawned with exec.CommandContext die with the context,
the agent posts a cancelled outcome, then powers the host off.

Cancelling a destructive stage mid-run may leave the host in an
intermediate state — the UI confirm dialog warns the operator; recovery
is manual for now.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 13:01:42 -04:00
parent 2c440fce8a
commit 4524ab8dc0
22 changed files with 434 additions and 230 deletions
+6 -5
View File
@@ -112,11 +112,12 @@ type SensorSample struct {
}
// ClaimResponse is the JSON-tagged claim payload the agent hands to its
// stage runners (runStage and newDeps receive it as `claim`).
// NOTE(review): presumably this is the orchestrator's reply to the
// agent's claim request — confirm against the client code.
type ClaimResponse struct {
OK bool `json:"ok"`
RunID int64 `json:"run_id"`
Stages []string `json:"stages"`
ExpectedDisks []ClaimExpectedDiskSpec `json:"expected_disks"`
IperfPort int `json:"iperf_port"`
OK bool `json:"ok"`
RunID int64 `json:"run_id"`
Stages []string `json:"stages"`
ExpectedDisks []ClaimExpectedDiskSpec `json:"expected_disks"`
IperfPort int `json:"iperf_port"`
NonDestructive bool `json:"non_destructive"` // copied into tests.Deps.NonDestructive by newDeps
}
type ClaimExpectedDiskSpec struct {
+81 -9
View File
@@ -27,6 +27,7 @@ import (
"os/exec"
"path/filepath"
"sync"
"sync/atomic"
"time"
"vetting/agent/bootstate"
@@ -35,6 +36,12 @@ import (
"vetting/internal/spec"
)
// stageCancel holds the cancel func for the in-flight stage ctx so the
// heartbeat loop can fire it when the orchestrator returns
// cmd=cancel_stage. Stored as an atomic.Value so the heartbeat goroutine
// can read without locking; writes happen only on the main loop.
//
// Every Store uses the concrete type context.CancelFunc — including the
// typed-nil "no stage in flight" marker — because atomic.Value panics on
// a nil interface value and on a change of concrete type. Readers must
// therefore check both the type assertion and fn != nil before calling.
var stageCancel atomic.Value // context.CancelFunc
// Run is the long-lived entry point. It blocks until ctx is cancelled
// or a fatal error makes progress impossible.
func Run(ctx context.Context, p *bootstate.Params) error {
@@ -81,7 +88,12 @@ func Run(ctx context.Context, p *bootstate.Params) error {
default:
}
fwd.info("stage: starting " + nextStage)
outcome := runStage(ctx, nextStage, claim, fwd, c, overrideFlags{})
outcome := runStageCancellable(ctx, nextStage, claim, fwd, c, overrideFlags{})
if outcome.Cancelled {
fwd.warn("stage cancelled by operator; posting result and exiting")
_, _ = postResult(ctx, c, nextStage, outcome)
return powerOffAndReturn(fwd)
}
resp, err := postResult(ctx, c, nextStage, outcome)
if err != nil {
fwd.error("submit result for " + nextStage + ": " + err.Error())
@@ -164,6 +176,46 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF
// stageOutcome bundles what a single stage run produced: the test
// Outcome, the Inventory (Inventory stage only), and a flag recording
// whether an operator cancel cut the stage short.
type stageOutcome struct {
Outcome tests.Outcome
Inventory *spec.Inventory // only for Inventory stage
Cancelled bool // set when the stage was cut short by operator cancel
}
// runStageCancellable runs one stage under its own cancellable context.
// The context's CancelFunc is published through stageCancel for the
// duration of the stage so the heartbeat loop can abort the running
// stage when the orchestrator answers cmd=cancel_stage. If the stage
// context ended up cancelled while the parent is still live, the
// returned outcome is rewritten into a failed "cancelled" record so the
// orchestrator has something to persist.
func runStageCancellable(parent context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, ovr overrideFlags) stageOutcome {
	stageCtx, abort := context.WithCancel(parent)
	stageCancel.Store(abort)
	// Deferred calls run last-in-first-out: the stage context is released
	// first, then the published cancel func is replaced by a typed nil.
	defer stageCancel.Store(context.CancelFunc(nil))
	defer abort()

	result := runStage(stageCtx, stage, claim, fwd, c, ovr)

	// Nothing to rewrite when the parent itself is done (e.g. shutdown,
	// which the main loop's select handles elsewhere) or when the stage
	// context was never cancelled.
	if parent.Err() != nil || stageCtx.Err() == nil {
		return result
	}

	// Parent live + stage ctx cancelled: the operator fired a cancel.
	// Flag it so the caller can leave the pipeline cleanly.
	result.Cancelled = true
	result.Outcome.Passed = false
	if result.Outcome.Message == "" {
		result.Outcome.Message = "stage cancelled by operator"
	}
	result.Outcome.Summary = "cancelled"
	return result
}
// powerOffAndReturn shuts the host down after an operator cancel. Same
// best-effort poweroff path as the shutdown heartbeat cmd: it tries
// `systemctl poweroff` first and falls back to `shutdown -h now` when
// that fails. Failures of either command are logged as warnings, never
// returned — the function always returns nil so callers can simply
// `return powerOffAndReturn(fwd)` to leave the pipeline.
func powerOffAndReturn(fwd *logForwarder) error {
	fwd.info("cancel: powering off host")
	if err := exec.Command("systemctl", "poweroff").Run(); err != nil {
		fwd.warn("systemctl poweroff failed: " + err.Error())
		// Still best-effort, but record a failed fallback too: if both
		// commands fail the host stays powered on, and the log is the
		// only place an operator can find out why.
		if fallbackErr := exec.Command("shutdown", "-h", "now").Run(); fallbackErr != nil {
			fwd.warn("shutdown -h now failed: " + fallbackErr.Error())
		}
	}
	return nil
}
type overrideFlags struct {
@@ -176,12 +228,13 @@ func newDeps(ctx context.Context, c *Client, fwd *logForwarder, ovr overrideFlag
expected = append(expected, tests.ExpectedDisk{Serial: e.Serial, SizeGB: e.SizeGB})
}
return tests.Deps{
Info: fwd.info,
Warn: fwd.warn,
Error: fwd.error,
OverrideWipe: ovr.Wipe,
ExpectedDisks: expected,
StageTimeout: 2 * time.Minute,
Info: fwd.info,
Warn: fwd.warn,
Error: fwd.error,
OverrideWipe: ovr.Wipe,
NonDestructive: claim.NonDestructive,
ExpectedDisks: expected,
StageTimeout: 2 * time.Minute,
Sensor: func(ctx context.Context, samples []tests.Sample) error {
out := make([]SensorSample, 0, len(samples))
for _, s := range samples {
@@ -248,7 +301,12 @@ func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-cha
if len(cmd.OverrideFlags) > 0 {
_ = json.Unmarshal(cmd.OverrideFlags, &ovr)
}
outcome := runStage(ctx, cmd.Stage, claim, fwd, c, ovr)
outcome := runStageCancellable(ctx, cmd.Stage, claim, fwd, c, ovr)
if outcome.Cancelled {
fwd.warn("stage cancelled by operator; posting result and exiting")
_, _ = postResult(ctx, c, cmd.Stage, outcome)
return powerOffAndReturn(fwd)
}
resp, err := postResult(ctx, c, cmd.Stage, outcome)
if err != nil {
fwd.error("override: submit result: " + err.Error())
@@ -272,7 +330,12 @@ func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-cha
default:
}
fwd.info("stage: starting " + nextStage)
out := runStage(ctx, nextStage, claim, fwd, c, overrideFlags{})
out := runStageCancellable(ctx, nextStage, claim, fwd, c, overrideFlags{})
if out.Cancelled {
fwd.warn("stage cancelled by operator; posting result and exiting")
_, _ = postResult(ctx, c, nextStage, out)
return powerOffAndReturn(fwd)
}
rr, err := postResult(ctx, c, nextStage, out)
if err != nil {
return err
@@ -380,6 +443,15 @@ func heartbeatLoop(ctx context.Context, c *Client, fwd *logForwarder, out chan<-
}
return
}
if resp.Cmd == "cancel_stage" {
fwd.warn("orchestrator said cancel_stage; cancelling in-flight stage ctx")
if v := stageCancel.Load(); v != nil {
if fn, ok := v.(context.CancelFunc); ok && fn != nil {
fn()
}
}
continue
}
if resp.Cmd == "retry_stage" {
select {
case out <- *resp:
+1
View File
@@ -46,6 +46,7 @@ type Deps struct {
Error func(string)
Sensor func(ctx context.Context, samples []Sample) error
OverrideWipe bool
NonDestructive bool // skip wipe-probe + writes in Storage
ExpectedDisks []ExpectedDisk // serials + sizes from host.expected_spec
StageTimeout time.Duration
}
+17
View File
@@ -44,6 +44,23 @@ func Storage(ctx context.Context, d Deps) Outcome {
}
}
// Non-destructive runs skip wipe-probe (nothing to refuse), badblocks
// -w, and write-mode fio. Every expected disk is still asserted
// present + readable by listing /sys/block and reading SMART-accessible
// identity; the per-disk map flags the shortcut so the report is clear.
if d.NonDestructive {
perDisk := map[string]any{}
for _, t := range targets {
perDisk[t.Device] = map[string]any{"mode": "non_destructive", "serial": t.Serial}
}
d.Info(fmt.Sprintf("Storage: non-destructive — verified %d disk(s) present", len(targets)))
return Outcome{
Passed: true,
Summary: fmt.Sprintf("non-destructive: read-only checks only (%d disks)", len(targets)),
Extras: map[string]any{"per_disk": perDisk, "non_destructive": true},
}
}
// Wipe probe on every target. A single dirty disk halts the stage
// unless the operator has set OverrideWipe via the UI.
probes := map[string]wipeProbeResult{}