From 3656af9823961d57309d4e1c8abc315639aa85bf Mon Sep 17 00:00:00 2001 From: josh Date: Sun, 19 Apr 2026 22:45:11 -0400 Subject: [PATCH] feat(end-of-run): reboot to local disk instead of powering off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completed runs now reboot the host and fall through iPXE to the next boot device (local disk) instead of powering off. Three coordinated changes: - pxe/ipxe: NoActiveRunScript exits iPXE (drops to next boot entry) instead of `sleep 10; poweroff`. Without this, a Completed reboot just loops through PXE and gets told to poweroff. - api/agent_handlers: heartbeat returns cmd=reboot (was cmd=shutdown) when the run reaches Completed. - agent/runner: runs `systemctl reboot` (with `shutdown -r now` fallback) in response to cmd=reboot. Operator cancel still powers off — powerOffAndReturn is unchanged because a cancel means the operator wants the host idle so they can walk up to it, not back in rotation. Co-Authored-By: Claude Opus 4.7 --- agent/runner.go | 15 ++++++++------- internal/api/agent_handlers.go | 7 +++++-- internal/api/agent_handlers_test.go | 10 +++++----- internal/pxe/ipxe.go | 8 +++++--- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/agent/runner.go b/agent/runner.go index 35f26ba..25a6e3e 100644 --- a/agent/runner.go +++ b/agent/runner.go @@ -10,8 +10,9 @@ // Terminal states: // - FailedHolding → request hold key, install authorized_keys, wait // on heartbeats for a retry_stage directive. -// - Completed → heartbeat carries cmd=shutdown; agent runs -// `systemctl poweroff` and exits. +// - Completed → heartbeat carries cmd=reboot; agent runs +// `systemctl reboot` and exits. The host comes back through iPXE, +// finds no active run, and exits iPXE into the next boot device. // // Thermal sidecar runs from the moment the agent claims until ctx // cancel; it posts a handful of /sys/class/hwmon samples every 5s. @@ -604,13 +605,13 @@ func heartbeatLoop(ctx context.Context, c *Client, fwd *logForwarder, out chan<- fwd.warn("orchestrator said abort; stopping loop") return } - if resp.Cmd == "shutdown" { - fwd.info("orchestrator said shutdown; powering off host") + if resp.Cmd == "reboot" { + fwd.info("orchestrator said reboot; rebooting host") // Best effort: systemd then sysvinit fallback. Either way, // return so the agent process stops issuing heartbeats. - if err := exec.Command("systemctl", "poweroff").Run(); err != nil { - fwd.warn("systemctl poweroff failed: " + err.Error()) - _ = exec.Command("shutdown", "-h", "now").Run() + if err := exec.Command("systemctl", "reboot").Run(); err != nil { + fwd.warn("systemctl reboot failed: " + err.Error()) + _ = exec.Command("shutdown", "-r", "now").Run() } return } diff --git a/internal/api/agent_handlers.go b/internal/api/agent_handlers.go index 19a597b..1a28cf2 100644 --- a/internal/api/agent_handlers.go +++ b/internal/api/agent_handlers.go @@ -266,8 +266,11 @@ func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) { resp := map[string]any{"state": run.State} switch { case run.State == model.StateCompleted: - // Pipeline succeeded — agent should power the host down. - cmd = "shutdown" + // Pipeline succeeded — agent reboots so the host falls through + // iPXE's no-active-run script to the next boot device (local + // disk), landing back on the installed OS without operator + // intervention. + cmd = "reboot" case run.State == model.StateCancelled: // Operator clicked Cancel — agent cancels the active stage ctx, // posts a cancelled outcome, and powers off. diff --git a/internal/api/agent_handlers_test.go b/internal/api/agent_handlers_test.go index f9be593..a105e7c 100644 --- a/internal/api/agent_handlers_test.go +++ b/internal/api/agent_handlers_test.go @@ -105,10 +105,10 @@ func TestSensorRejectsBadToken(t *testing.T) { } } -// TestHeartbeatShutdownWhenCompleted: once the orchestrator has flipped +// TestHeartbeatRebootWhenCompleted: once the orchestrator has flipped // the run into Completed, the next heartbeat response must carry -// cmd=shutdown so the agent powers the host down. -func TestHeartbeatShutdownWhenCompleted(t *testing.T) { +// cmd=reboot so the agent reboots the host back to local disk. +func TestHeartbeatRebootWhenCompleted(t *testing.T) { a, runID, token := setupAgent(t) // Wire a runner so Heartbeat's TouchHeartbeat call doesn't nil-panic. a.Runner = &orchestrator.Runner{Runs: a.Runs, Hosts: a.Hosts, Stages: &store.Stages{DB: a.Runs.DB}, EventHub: events.NewHub()} @@ -126,8 +126,8 @@ func TestHeartbeatShutdownWhenCompleted(t *testing.T) { if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil { t.Fatalf("decode: %v", err) } - if resp["cmd"] != "shutdown" { - t.Fatalf("cmd = %v, want shutdown", resp["cmd"]) + if resp["cmd"] != "reboot" { + t.Fatalf("cmd = %v, want reboot", resp["cmd"]) } } diff --git a/internal/pxe/ipxe.go b/internal/pxe/ipxe.go index b488898..8d13000 100644 --- a/internal/pxe/ipxe.go +++ b/internal/pxe/ipxe.go @@ -82,10 +82,12 @@ func NotRegisteredScript(mac string) string { } // NoActiveRunScript is served when a registered MAC PXE-boots but has -// no currently active run. The host is told to shut down rather than -// loop forever. +// no currently active run. `exit` drops back to the firmware so the +// next configured boot entry (local disk) fires — this is what makes a +// post-Completed reboot come back up on the installed OS instead of +// looping through PXE and powering off. func NoActiveRunScript(mac string) string { - return fmt.Sprintf("#!ipxe\necho MAC %s has no active run — powering off in 10s.\nsleep 10\npoweroff\n", mac) + return fmt.Sprintf("#!ipxe\necho MAC %s has no active run — exiting to next boot device.\nsleep 2\nexit\n", mac) } // Used by handlers to compose URLs; exposed for tests.