feat(end-of-run): reboot to local disk instead of powering off
Completed runs now reboot the host and fall through iPXE to the next boot device (local disk) instead of powering off.

Three coordinated changes:

- pxe/ipxe: NoActiveRunScript exits iPXE (drops to next boot entry) instead of `sleep 10; poweroff`. Without this, a Completed reboot just loops through PXE and gets told to power off.
- api/agent_handlers: heartbeat returns cmd=reboot (was cmd=shutdown) when the run reaches Completed.
- agent/runner: runs `systemctl reboot` (with `shutdown -r now` fallback) in response to cmd=reboot.

Operator cancel still powers off — powerOffAndReturn is unchanged because a cancel means the operator wants the host idle so they can walk up to it, not back in rotation.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+8
-7
@@ -10,8 +10,9 @@
|
|||||||
// Terminal states:
|
// Terminal states:
|
||||||
// - FailedHolding → request hold key, install authorized_keys, wait
|
// - FailedHolding → request hold key, install authorized_keys, wait
|
||||||
// on heartbeats for a retry_stage directive.
|
// on heartbeats for a retry_stage directive.
|
||||||
// - Completed → heartbeat carries cmd=shutdown; agent runs
|
// - Completed → heartbeat carries cmd=reboot; agent runs
|
||||||
// `systemctl poweroff` and exits.
|
// `systemctl reboot` and exits. The host comes back through iPXE,
|
||||||
|
// finds no active run, and exits iPXE into the next boot device.
|
||||||
//
|
//
|
||||||
// Thermal sidecar runs from the moment the agent claims until ctx
|
// Thermal sidecar runs from the moment the agent claims until ctx
|
||||||
// cancel; it posts a handful of /sys/class/hwmon samples every 5s.
|
// cancel; it posts a handful of /sys/class/hwmon samples every 5s.
|
||||||
@@ -604,13 +605,13 @@ func heartbeatLoop(ctx context.Context, c *Client, fwd *logForwarder, out chan<-
|
|||||||
fwd.warn("orchestrator said abort; stopping loop")
|
fwd.warn("orchestrator said abort; stopping loop")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if resp.Cmd == "shutdown" {
|
if resp.Cmd == "reboot" {
|
||||||
fwd.info("orchestrator said shutdown; powering off host")
|
fwd.info("orchestrator said reboot; rebooting host")
|
||||||
// Best effort: systemd then sysvinit fallback. Either way,
|
// Best effort: systemd then sysvinit fallback. Either way,
|
||||||
// return so the agent process stops issuing heartbeats.
|
// return so the agent process stops issuing heartbeats.
|
||||||
if err := exec.Command("systemctl", "poweroff").Run(); err != nil {
|
if err := exec.Command("systemctl", "reboot").Run(); err != nil {
|
||||||
fwd.warn("systemctl poweroff failed: " + err.Error())
|
fwd.warn("systemctl reboot failed: " + err.Error())
|
||||||
_ = exec.Command("shutdown", "-h", "now").Run()
|
_ = exec.Command("shutdown", "-r", "now").Run()
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -266,8 +266,11 @@ func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) {
|
|||||||
resp := map[string]any{"state": run.State}
|
resp := map[string]any{"state": run.State}
|
||||||
switch {
|
switch {
|
||||||
case run.State == model.StateCompleted:
|
case run.State == model.StateCompleted:
|
||||||
// Pipeline succeeded — agent should power the host down.
|
// Pipeline succeeded — agent reboots so the host falls through
|
||||||
cmd = "shutdown"
|
// iPXE's no-active-run script to the next boot device (local
|
||||||
|
// disk), landing back on the installed OS without operator
|
||||||
|
// intervention.
|
||||||
|
cmd = "reboot"
|
||||||
case run.State == model.StateCancelled:
|
case run.State == model.StateCancelled:
|
||||||
// Operator clicked Cancel — agent cancels the active stage ctx,
|
// Operator clicked Cancel — agent cancels the active stage ctx,
|
||||||
// posts a cancelled outcome, and powers off.
|
// posts a cancelled outcome, and powers off.
|
||||||
|
|||||||
@@ -105,10 +105,10 @@ func TestSensorRejectsBadToken(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestHeartbeatShutdownWhenCompleted: once the orchestrator has flipped
|
// TestHeartbeatRebootWhenCompleted: once the orchestrator has flipped
|
||||||
// the run into Completed, the next heartbeat response must carry
|
// the run into Completed, the next heartbeat response must carry
|
||||||
// cmd=shutdown so the agent powers the host down.
|
// cmd=reboot so the agent reboots the host back to local disk.
|
||||||
func TestHeartbeatShutdownWhenCompleted(t *testing.T) {
|
func TestHeartbeatRebootWhenCompleted(t *testing.T) {
|
||||||
a, runID, token := setupAgent(t)
|
a, runID, token := setupAgent(t)
|
||||||
// Wire a runner so Heartbeat's TouchHeartbeat call doesn't nil-panic.
|
// Wire a runner so Heartbeat's TouchHeartbeat call doesn't nil-panic.
|
||||||
a.Runner = &orchestrator.Runner{Runs: a.Runs, Hosts: a.Hosts, Stages: &store.Stages{DB: a.Runs.DB}, EventHub: events.NewHub()}
|
a.Runner = &orchestrator.Runner{Runs: a.Runs, Hosts: a.Hosts, Stages: &store.Stages{DB: a.Runs.DB}, EventHub: events.NewHub()}
|
||||||
@@ -126,8 +126,8 @@ func TestHeartbeatShutdownWhenCompleted(t *testing.T) {
|
|||||||
if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
|
if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
|
||||||
t.Fatalf("decode: %v", err)
|
t.Fatalf("decode: %v", err)
|
||||||
}
|
}
|
||||||
if resp["cmd"] != "shutdown" {
|
if resp["cmd"] != "reboot" {
|
||||||
t.Fatalf("cmd = %v, want shutdown", resp["cmd"])
|
t.Fatalf("cmd = %v, want reboot", resp["cmd"])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -82,10 +82,12 @@ func NotRegisteredScript(mac string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// NoActiveRunScript is served when a registered MAC PXE-boots but has
|
// NoActiveRunScript is served when a registered MAC PXE-boots but has
|
||||||
// no currently active run. The host is told to shut down rather than
|
// no currently active run. `exit` drops back to the firmware so the
|
||||||
// loop forever.
|
// next configured boot entry (local disk) fires — this is what makes a
|
||||||
|
// post-Completed reboot come back up on the installed OS instead of
|
||||||
|
// looping through PXE and powering off.
|
||||||
func NoActiveRunScript(mac string) string {
|
func NoActiveRunScript(mac string) string {
|
||||||
return fmt.Sprintf("#!ipxe\necho MAC %s has no active run — powering off in 10s.\nsleep 10\npoweroff\n", mac)
|
return fmt.Sprintf("#!ipxe\necho MAC %s has no active run — exiting to next boot device.\nsleep 2\nexit\n", mac)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used by handlers to compose URLs; exposed for tests.
|
// Used by handlers to compose URLs; exposed for tests.
|
||||||
|
|||||||
Reference in New Issue
Block a user