feat(end-of-run): reboot to local disk instead of powering off
CI / Lint + build + test (push) Successful in 1m47s
Release / release (push) Successful in 10m8s

Completed runs now reboot the host and fall through iPXE to the next
boot device (local disk) instead of powering off. Three coordinated
changes:

- pxe/ipxe: NoActiveRunScript exits iPXE (drops to next boot entry)
  instead of running `sleep 10; poweroff`. Without this, a host
  rebooting after a Completed run would just loop back through PXE and
  be told to power off.
- api/agent_handlers: heartbeat returns cmd=reboot (was cmd=shutdown)
  when the run reaches Completed.
- agent/runner: runs `systemctl reboot` (with `shutdown -r now`
  fallback) in response to cmd=reboot.

Operator cancel still powers off — powerOffAndReturn is unchanged
because a cancel means the operator wants the host idle so they can
walk up to it, rather than put back in rotation.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-19 22:45:11 -04:00
parent 8acef92a60
commit 3656af9823
4 changed files with 23 additions and 17 deletions
+8 -7
View File
@@ -10,8 +10,9 @@
// Terminal states: // Terminal states:
// - FailedHolding → request hold key, install authorized_keys, wait // - FailedHolding → request hold key, install authorized_keys, wait
// on heartbeats for a retry_stage directive. // on heartbeats for a retry_stage directive.
// - Completed → heartbeat carries cmd=shutdown; agent runs // - Completed → heartbeat carries cmd=reboot; agent runs
// `systemctl poweroff` and exits. // `systemctl reboot` and exits. The host comes back through iPXE,
// finds no active run, and exits iPXE into the next boot device.
// //
// Thermal sidecar runs from the moment the agent claims until ctx // Thermal sidecar runs from the moment the agent claims until ctx
// cancel; it posts a handful of /sys/class/hwmon samples every 5s. // cancel; it posts a handful of /sys/class/hwmon samples every 5s.
@@ -604,13 +605,13 @@ func heartbeatLoop(ctx context.Context, c *Client, fwd *logForwarder, out chan<-
fwd.warn("orchestrator said abort; stopping loop") fwd.warn("orchestrator said abort; stopping loop")
return return
} }
if resp.Cmd == "shutdown" { if resp.Cmd == "reboot" {
fwd.info("orchestrator said shutdown; powering off host") fwd.info("orchestrator said reboot; rebooting host")
// Best effort: systemd then sysvinit fallback. Either way, // Best effort: systemd then sysvinit fallback. Either way,
// return so the agent process stops issuing heartbeats. // return so the agent process stops issuing heartbeats.
if err := exec.Command("systemctl", "poweroff").Run(); err != nil { if err := exec.Command("systemctl", "reboot").Run(); err != nil {
fwd.warn("systemctl poweroff failed: " + err.Error()) fwd.warn("systemctl reboot failed: " + err.Error())
_ = exec.Command("shutdown", "-h", "now").Run() _ = exec.Command("shutdown", "-r", "now").Run()
} }
return return
} }
+5 -2
View File
@@ -266,8 +266,11 @@ func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) {
resp := map[string]any{"state": run.State} resp := map[string]any{"state": run.State}
switch { switch {
case run.State == model.StateCompleted: case run.State == model.StateCompleted:
// Pipeline succeeded — agent should power the host down. // Pipeline succeeded — agent reboots so the host falls through
cmd = "shutdown" // iPXE's no-active-run script to the next boot device (local
// disk), landing back on the installed OS without operator
// intervention.
cmd = "reboot"
case run.State == model.StateCancelled: case run.State == model.StateCancelled:
// Operator clicked Cancel — agent cancels the active stage ctx, // Operator clicked Cancel — agent cancels the active stage ctx,
// posts a cancelled outcome, and powers off. // posts a cancelled outcome, and powers off.
+5 -5
View File
@@ -105,10 +105,10 @@ func TestSensorRejectsBadToken(t *testing.T) {
} }
} }
// TestHeartbeatShutdownWhenCompleted: once the orchestrator has flipped // TestHeartbeatRebootWhenCompleted: once the orchestrator has flipped
// the run into Completed, the next heartbeat response must carry // the run into Completed, the next heartbeat response must carry
// cmd=shutdown so the agent powers the host down. // cmd=reboot so the agent reboots the host back to local disk.
func TestHeartbeatShutdownWhenCompleted(t *testing.T) { func TestHeartbeatRebootWhenCompleted(t *testing.T) {
a, runID, token := setupAgent(t) a, runID, token := setupAgent(t)
// Wire a runner so Heartbeat's TouchHeartbeat call doesn't nil-panic. // Wire a runner so Heartbeat's TouchHeartbeat call doesn't nil-panic.
a.Runner = &orchestrator.Runner{Runs: a.Runs, Hosts: a.Hosts, Stages: &store.Stages{DB: a.Runs.DB}, EventHub: events.NewHub()} a.Runner = &orchestrator.Runner{Runs: a.Runs, Hosts: a.Hosts, Stages: &store.Stages{DB: a.Runs.DB}, EventHub: events.NewHub()}
@@ -126,8 +126,8 @@ func TestHeartbeatShutdownWhenCompleted(t *testing.T) {
if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil { if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
t.Fatalf("decode: %v", err) t.Fatalf("decode: %v", err)
} }
if resp["cmd"] != "shutdown" { if resp["cmd"] != "reboot" {
t.Fatalf("cmd = %v, want shutdown", resp["cmd"]) t.Fatalf("cmd = %v, want reboot", resp["cmd"])
} }
} }
+5 -3
View File
@@ -82,10 +82,12 @@ func NotRegisteredScript(mac string) string {
} }
// NoActiveRunScript is served when a registered MAC PXE-boots but has // NoActiveRunScript is served when a registered MAC PXE-boots but has
// no currently active run. The host is told to shut down rather than // no currently active run. `exit` drops back to the firmware so the
// loop forever. // next configured boot entry (local disk) fires — this is what makes a
// post-Completed reboot come back up on the installed OS instead of
// looping through PXE and powering off.
func NoActiveRunScript(mac string) string { func NoActiveRunScript(mac string) string {
return fmt.Sprintf("#!ipxe\necho MAC %s has no active run — powering off in 10s.\nsleep 10\npoweroff\n", mac) return fmt.Sprintf("#!ipxe\necho MAC %s has no active run — exiting to next boot device.\nsleep 2\nexit\n", mac)
} }
// Used by handlers to compose URLs; exposed for tests. // Used by handlers to compose URLs; exposed for tests.