Files
josh 3656af9823
CI / Lint + build + test (push) Successful in 1m47s
Release / release (push) Successful in 10m8s
feat(end-of-run): reboot to local disk instead of powering off
Completed runs now reboot the host and fall through iPXE to the next
boot device (local disk) instead of powering off. Three coordinated
changes:

- pxe/ipxe: NoActiveRunScript exits iPXE (drops to next boot entry)
  instead of `sleep 10; poweroff`. Without this, a Completed reboot
  just loops through PXE and gets told to poweroff.
- api/agent_handlers: heartbeat returns cmd=reboot (was cmd=shutdown)
  when the run reaches Completed.
- agent/runner: runs `systemctl reboot` (with `shutdown -r now`
  fallback) in response to cmd=reboot.

Operator cancel still powers off — powerOffAndReturn is unchanged
because a cancel means the operator wants the host idle so they can
walk up to it, not back in rotation.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-19 22:45:11 -04:00

112 lines
4.3 KiB
Go

package pxe
import (
"fmt"
"io"
"strings"
"vetting/internal/model"
)
// IPXEParams is everything an iPXE boot script needs; BuildScript turns
// one of these into the script text.
//
// For Phase 2 the boot target is always "linux" — Memtest chain-load
// is not required because we replaced Memtest86+ with stress-ng under
// Linux (see plan §3.2).
type IPXEParams struct {
// OrchestratorURL is passed on the kernel command line as
// vetting.orchestrator.
OrchestratorURL string // e.g. http://10.0.0.5:8080
// LiveKernelURL is the image named on the script's `kernel` line.
LiveKernelURL string // e.g. http://10.0.0.5:8080/live/vmlinuz
// LiveInitrdURL is the image named on the script's `initrd` line.
LiveInitrdURL string // e.g. http://10.0.0.5:8080/live/initrd.img
// TLSCertFPR is passed as vetting.cert_fpr only when non-empty.
TLSCertFPR string // optional; empty = skip pin
// RunID appears in the echo banner and as vetting.run_id.
RunID int64
// MAC appears in the echo banner and as vetting.mac.
MAC string
// Token is passed on the kernel command line as vetting.token.
Token string // plaintext, hashed on server side
}
// BuildScript renders the iPXE script for the run described by p.
//
// The result is plain text whose first line is "#!ipxe", followed by an
// echo banner, the kernel line (kernel URL plus the kernel command
// line), the initrd line and a final boot command. Every line, including
// the last, is newline-terminated.
func BuildScript(p IPXEParams) string {
	// Run-specific kernel arguments, passed under the vetting.* prefix.
	args := []string{
		"initrd=initrd.img",
		"vetting.orchestrator=" + p.OrchestratorURL,
		fmt.Sprintf("vetting.run_id=%d", p.RunID),
		"vetting.mac=" + p.MAC,
		"vetting.token=" + p.Token,
	}
	// The certificate pin is optional: an empty fingerprint omits it.
	if fpr := p.TLSCertFPR; fpr != "" {
		args = append(args, "vetting.cert_fpr="+fpr)
	}

	// Fixed arguments shared by every run. Logging is deliberately
	// verbose on both the video console and the serial port so that
	// first-boot failures on unfamiliar hardware stay visible; `quiet`
	// is left out until boot is stable. Routing systemd's log to kmsg
	// puts early systemd messages in the same dmesg buffer as the
	// kernel, so nothing is lost before journald comes up.
	fixed := []string{
		"console=tty0",
		"console=ttyS0,115200n8",
		"ip=dhcp",
		"loglevel=7",
		"systemd.log_level=info",
		"systemd.log_target=kmsg",
		"systemd.journald.forward_to_console=1",
		// console=ttyS0 makes systemd-getty-generator create
		// serial-getty@ttyS0.service, which waits up to 90s for
		// /dev/ttyS0. Hosts with no serial port would time out and
		// delay boot, so the getty is masked. Kernel logs still go
		// to ttyS0 when the port exists; only the login prompt is
		// dropped.
		"systemd.mask=serial-getty@ttyS0.service",
		// systemd-firstboot.service is the interactive first-boot
		// wizard (locale/timezone/root-password prompts). A PXE
		// live image has nobody at the console to answer it, so it
		// would block sysinit.target forever and the agent would
		// never start. Turning it off skips the wizard entirely.
		"systemd.firstboot=off",
	}
	args = append(args, fixed...)

	// One entry per script line; joined with newlines below.
	lines := []string{
		"#!ipxe",
		fmt.Sprintf("echo Vetting run %d — booting live image for %s", p.RunID, p.MAC),
		"kernel " + p.LiveKernelURL + " " + strings.Join(args, " "),
		"initrd " + p.LiveInitrdURL,
		"boot",
	}
	return strings.Join(lines, "\n") + "\n"
}
// NotRegisteredScript returns the iPXE script served for a MAC that is
// not known to the system: it prints a "not registered" message naming
// mac and then runs the iPXE `shell` command. The dnsmasq-level MAC
// allowlist should keep unknown hosts from ever getting this far, so
// this is purely a belt-and-braces fallback.
func NotRegisteredScript(mac string) string {
	// One iPXE command per line; %s is replaced by the MAC.
	const script = "#!ipxe\n" +
		"echo MAC %s not registered for vetting — halting.\n" +
		"shell\n"
	return fmt.Sprintf(script, mac)
}
// NoActiveRunScript returns the iPXE script served when a registered MAC
// PXE-boots while it has no currently active run. The script prints a
// message naming mac, sleeps for two seconds and then runs `exit`, which
// hands control back to the firmware so the next configured boot entry
// (local disk) is tried. This is what lets a host that reboots after a
// Completed run come back up on its installed OS rather than looping
// through PXE and powering off.
func NoActiveRunScript(mac string) string {
	// One iPXE command per line; %s is replaced by the MAC.
	const script = "#!ipxe\n" +
		"echo MAC %s has no active run — exiting to next boot device.\n" +
		"sleep 2\n" +
		"exit\n"
	return fmt.Sprintf(script, mac)
}
// BuildLiveURLs derives the live kernel and initrd URLs from base by
// stripping any trailing slashes and appending "/live/vmlinuz" and
// "/live/initrd.img" respectively. Handlers use it to compose URLs; it
// is exported so tests can call it.
func BuildLiveURLs(base string) (kernel, initrd string) {
	// Both images live under the same /live/ prefix.
	liveDir := strings.TrimRight(base, "/") + "/live/"
	kernel = liveDir + "vmlinuz"
	initrd = liveDir + "initrd.img"
	return kernel, initrd
}
// WriteNotFound writes the "not registered" iPXE script for mac to w.
// It is a small convenience that lets handlers send a script-shaped
// error straight back to iPXE without doing the mime-type dance
// themselves.
func WriteNotFound(w io.Writer, mac string) {
	script := NotRegisteredScript(mac)
	// The write result is deliberately discarded: this function has no
	// return value through which to report it.
	_, _ = w.Write([]byte(script))
}
// ScriptMarker is the "#!ipxe" first line that iPXE uses to detect that
// a response is a script. The script builders in this file spell the
// same literal inline at the start of their output.
const ScriptMarker = "#!ipxe"
// State reports run's state as a plain string: the compact single-word
// status used for logging. It accepts the Run itself because the iPXE
// handler has already looked it up.
func State(run model.Run) string {
	status := string(run.State)
	return status
}