From a5055b3c7a3c12f5125c25dbd2bac460c7de6d36 Mon Sep 17 00:00:00 2001 From: josh Date: Sat, 18 Apr 2026 01:38:43 -0400 Subject: [PATCH] Automate PXE setup: release bundle + pxe-setup.sh + startup validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Collapses the LXC side of PXE enablement from a six-step manual dance (build, fetch iPXE, scp, bridge, hand-edit yaml) into: make release # dev box (Linux/WSL) scp bundle.tar.gz lxc:/tmp/ sudo ./install.sh # base install, unchanged sudo ./pxe-setup.sh --interface ... --dhcp-range ... --orchestrator-url ... pxe-setup.sh fetches iPXE from boot.ipxe.org, verifies against pinned SHA256s in deploy/ipxe-shas.txt (fail-closed), places vmlinuz/initrd.img from the bundle, and rewrites only the pxe: block of vetting.yaml. Idempotent; --force gates overwriting a hand-edited block. Adds Supervisor.Validate() — called before dnsmasq spawn — so typo'd configs fail at orchestrator startup with clear errors naming the missing file or yaml key, instead of silently serving broken TFTP until a real host tries to PXE-boot. Ten tests cover missing files, bogus interface, malformed dhcp_range, bad orchestrator_url, and aggregate reporting. Hypervisor bridge creation stays documented (LXC can't do it) but everything downstream of the bridge is now scripted. 
Co-Authored-By: Claude Opus 4.7 --- Makefile | 18 ++- cmd/vetting/main.go | 1 + deploy/ipxe-shas.txt | 18 +++ deploy/pxe-setup.sh | 268 +++++++++++++++++++++++++++++++++++ docs/operations.md | 128 +++++++++++------ internal/pxe/dnsmasq.go | 67 +++++++++ internal/pxe/dnsmasq_test.go | 191 +++++++++++++++++++++++++ live-image/README.md | 23 +-- 8 files changed, 660 insertions(+), 54 deletions(-) create mode 100644 deploy/ipxe-shas.txt create mode 100644 deploy/pxe-setup.sh create mode 100644 internal/pxe/dnsmasq_test.go diff --git a/Makefile b/Makefile index d4ee8b6..a84ff53 100644 --- a/Makefile +++ b/Makefile @@ -70,6 +70,22 @@ run: orchestrator ## Build and run orchestrator with example config install: orchestrator-linux agent-linux ## Run deploy/install.sh (must be run on the target LXC as root) sudo ./deploy/install.sh --binary ./bin/vetting-linux-amd64 --agent-binary ./bin/vetting-agent.linux-amd64 +.PHONY: release +release: orchestrator-linux agent-linux live-image ## Build the scp-and-go release tarball (run from Linux/WSL) +ifneq ($(findstring Windows,$(UNAME_S))$(findstring MINGW,$(UNAME_S))$(findstring MSYS,$(UNAME_S)),) + @echo "ERROR: make release must be run from Linux/WSL (live-image dep needs mkosi)." 
&& exit 1 +endif + @stamp=vetting-bundle-$(GIT_SHA); \ + rm -rf build/$$stamp bin/$$stamp.tar.gz; \ + mkdir -p build/$$stamp/bin build/$$stamp/live-image; \ + cp bin/vetting-linux-amd64 bin/vetting-agent.linux-amd64 build/$$stamp/bin/; \ + cp live-image/build/vmlinuz live-image/build/initrd.img build/$$stamp/live-image/; \ + cp deploy/install.sh deploy/pxe-setup.sh deploy/vetting.service \ + deploy/vetting.production.yaml deploy/ipxe-shas.txt build/$$stamp/; \ + echo $(GIT_SHA) > build/$$stamp/VERSION; \ + tar -C build -czf bin/$$stamp.tar.gz $$stamp; \ + echo "wrote bin/$$stamp.tar.gz ($$(du -h bin/$$stamp.tar.gz | cut -f1))" + .PHONY: clean clean: ## Remove build artifacts - rm -rf bin out dist tmp + rm -rf bin out dist tmp build diff --git a/cmd/vetting/main.go b/cmd/vetting/main.go index dde27c6..f4e9edf 100644 --- a/cmd/vetting/main.go +++ b/cmd/vetting/main.go @@ -149,6 +149,7 @@ func main() { OrchestratorURL: cfg.PXE.OrchestratorURL, RuntimeDir: filepath.Join(cfg.Logs.Dir, "..", "pxe"), TFTPRoot: tftpRoot, + LiveDir: cfg.PXE.LiveDir, }) } diff --git a/deploy/ipxe-shas.txt b/deploy/ipxe-shas.txt new file mode 100644 index 0000000..48cc123 --- /dev/null +++ b/deploy/ipxe-shas.txt @@ -0,0 +1,18 @@ +# Pinned iPXE binary checksums. pxe-setup.sh fetches ipxe.efi + +# undionly.kpxe from https://boot.ipxe.org and verifies the SHA256 +# against these pins. Mismatch = hard fail; the script will not place +# mismatched binaries into tftp_root. +# +# Sources (both from the iPXE project's latest-build tree): +# ipxe.efi → https://boot.ipxe.org/x86_64-efi/ipxe.efi +# undionly.kpxe → https://boot.ipxe.org/undionly.kpxe +# +# To bump: fetch fresh binaries, verify via a second trusted source +# (e.g. a checksum published by a distro package, or a second mirror), +# regenerate with `sha256sum ipxe.efi undionly.kpxe > ipxe-shas.txt`, +# and commit. Treat this as a security-sensitive change. 
+# +# Format: compatible with `sha256sum -c ipxe-shas.txt` when run from +# the directory containing both files. +270afb529c4a8c1a89e2b852eca150789d948edaca9ca7099a12f170cc9c82e5 ipxe.efi +a84c7945d5ac941b8284a279bb2c93062bc19370681c9cf9a28b52daa1782a95 undionly.kpxe diff --git a/deploy/pxe-setup.sh b/deploy/pxe-setup.sh new file mode 100644 index 0000000..9e06b8f --- /dev/null +++ b/deploy/pxe-setup.sh @@ -0,0 +1,268 @@ +#!/usr/bin/env bash +# pxe-setup.sh — finish the PXE half of a vetting install. +# +# Run AFTER deploy/install.sh on the LXC (or wherever the orchestrator +# lives). Fetches pinned iPXE binaries, places the live image, and +# writes the pxe: block of /etc/vetting/vetting.yaml. Does NOT create +# the PXE bridge — that's a hypervisor-level step, see +# docs/operations.md. +# +# Idempotent: safe to re-run with the same args. A second run whose +# --interface or --dhcp-range differs from the existing pxe: block is +# refused unless --force is passed; --force overwrites the block. +# +# Usage: +# sudo ./pxe-setup.sh \ +# --interface eth1 \ +# --dhcp-range 10.77.0.100,10.77.0.200,12h \ +# --orchestrator-url http://10.77.0.2:8080 +# +# Optional: +# --tftp-root DIR default /var/lib/vetting/tftp +# --live-dir DIR default /var/lib/vetting/live +# --config PATH default /etc/vetting/vetting.yaml +# --bundle-dir DIR default: this script's dir (release tarball root) +# --force overwrite a customised pxe: block +set -euo pipefail + +INTERFACE="" +DHCP_RANGE="" +ORCH_URL="" +TFTP_ROOT="/var/lib/vetting/tftp" +LIVE_DIR="/var/lib/vetting/live" +CONFIG="/etc/vetting/vetting.yaml" +BUNDLE_DIR="" +FORCE=0 +SERVICE_USER="vetting" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +usage() { + sed -n '2,24p' "${BASH_SOURCE[0]}" +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --interface) INTERFACE="$2"; shift 2 ;; + --dhcp-range) DHCP_RANGE="$2"; shift 2 ;; + --orchestrator-url) ORCH_URL="$2"; shift 2 ;; + --tftp-root) TFTP_ROOT="$2"; shift 2 ;; + --live-dir) LIVE_DIR="$2"; 
shift 2 ;; + --config) CONFIG="$2"; shift 2 ;; + --bundle-dir) BUNDLE_DIR="$2"; shift 2 ;; + --force) FORCE=1; shift ;; + -h|--help) usage; exit 0 ;; + *) echo "unknown arg: $1" >&2; usage; exit 2 ;; + esac +done + +if [[ $EUID -ne 0 ]]; then + echo "pxe-setup.sh must be run as root (try: sudo $0 ...)" >&2 + exit 1 +fi + +[[ -z "${INTERFACE}" ]] && { echo "ERROR: --interface is required" >&2; exit 2; } +[[ -z "${DHCP_RANGE}" ]] && { echo "ERROR: --dhcp-range is required" >&2; exit 2; } +[[ -z "${ORCH_URL}" ]] && { echo "ERROR: --orchestrator-url is required" >&2; exit 2; } + +# --- sanity checks ----------------------------------------------------- + +if ! ip link show "${INTERFACE}" >/dev/null 2>&1; then + echo "ERROR: interface ${INTERFACE} not found on host. Check \`ip link\` — the" >&2 + echo " interface must exist *before* the orchestrator starts dnsmasq." >&2 + exit 1 +fi + +# "start_ip,end_ip,lease" — dnsmasq will still validate, but catch the +# obvious shape errors before we write anything to disk. +if [[ ! "${DHCP_RANGE}" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3},([0-9]{1,3}\.){3}[0-9]{1,3},[^[:space:]]+$ ]]; then + echo "ERROR: --dhcp-range must be start_ip,end_ip,lease (e.g. 10.77.0.100,10.77.0.200,12h)" >&2 + exit 2 +fi + +if [[ ! -f "${CONFIG}" ]]; then + echo "ERROR: ${CONFIG} not found — run deploy/install.sh first." >&2 + exit 1 +fi + +if ! id -u "${SERVICE_USER}" >/dev/null 2>&1; then + echo "ERROR: ${SERVICE_USER} user not found — run deploy/install.sh first." >&2 + exit 1 +fi + +# Resolve the bundle dir. When pxe-setup.sh is run from a release +# tarball it sits alongside ipxe-shas.txt and a live-image/ subdir; when +# run from the repo tree it's deploy/pxe-setup.sh and the live image is +# under live-image/build/. Detect both. +if [[ -z "${BUNDLE_DIR}" ]]; then + if [[ -f "${SCRIPT_DIR}/ipxe-shas.txt" ]]; then + BUNDLE_DIR="${SCRIPT_DIR}" + else + BUNDLE_DIR="${SCRIPT_DIR}" + fi +fi +SHAS_FILE="${BUNDLE_DIR}/ipxe-shas.txt" +if [[ ! 
-f "${SHAS_FILE}" ]]; then + echo "ERROR: ${SHAS_FILE} not found — bundle is incomplete." >&2 + exit 1 +fi + +# --- iPXE binaries: stage, verify, install ---------------------------- +# +# Stage into a temp dir so a corrupt download never clobbers a known- +# good file in tftp_root. sha256sum -c must pass before we `install` — +# install(1) unlink-replaces, which avoids ETXTBSY and makes the whole +# operation atomic per file. + +echo "==> ensuring ${TFTP_ROOT} exists" +install -d -m 0755 -o "${SERVICE_USER}" -g "${SERVICE_USER}" "${TFTP_ROOT}" + +STAGE="$(mktemp -d)" +trap 'rm -rf "${STAGE}"' EXIT + +need_fetch=0 +for name in ipxe.efi undionly.kpxe; do + if [[ ! -f "${TFTP_ROOT}/${name}" ]]; then + need_fetch=1 + break + fi +done + +# Even if both files exist, re-verify against pinned SHAs. If they match +# we skip the fetch entirely; if not, re-download. +if (( ! need_fetch )); then + if ! ( cd "${TFTP_ROOT}" && sha256sum -c --status "${SHAS_FILE}" ); then + echo "==> ${TFTP_ROOT} iPXE binaries don't match pinned SHAs — re-fetching" + need_fetch=1 + else + echo "==> iPXE binaries already match pins — skipping fetch" + fi +fi + +if (( need_fetch )); then + echo "==> fetching iPXE binaries from boot.ipxe.org" + curl -fsSLo "${STAGE}/ipxe.efi" "https://boot.ipxe.org/x86_64-efi/ipxe.efi" + curl -fsSLo "${STAGE}/undionly.kpxe" "https://boot.ipxe.org/undionly.kpxe" + + echo "==> verifying SHA256 against ${SHAS_FILE}" + if ! ( cd "${STAGE}" && sha256sum -c "${SHAS_FILE}" ); then + echo "ERROR: iPXE SHA256 mismatch. Upstream binaries changed, or a MITM." >&2 + echo " To accept the new binaries, regenerate ${SHAS_FILE} after" >&2 + echo " independently verifying the new checksums, then re-run." 
>&2 + exit 1 + fi + + install -m 0644 -o "${SERVICE_USER}" -g "${SERVICE_USER}" \ + "${STAGE}/ipxe.efi" "${TFTP_ROOT}/ipxe.efi" + install -m 0644 -o "${SERVICE_USER}" -g "${SERVICE_USER}" \ + "${STAGE}/undionly.kpxe" "${TFTP_ROOT}/undionly.kpxe" +fi + +# --- live image: copy from bundle into live_dir ----------------------- + +# Accept two layouts: release tarball (${BUNDLE_DIR}/live-image/) or +# repo tree (${BUNDLE_DIR}/../live-image/build/). +LIVE_SRC="" +for cand in \ + "${BUNDLE_DIR}/live-image" \ + "${BUNDLE_DIR}/../live-image/build"; do + if [[ -f "${cand}/vmlinuz" && -f "${cand}/initrd.img" ]]; then + LIVE_SRC="${cand}" + break + fi +done + +if [[ -z "${LIVE_SRC}" ]]; then + echo "WARN: no live image found under ${BUNDLE_DIR}/live-image or" >&2 + echo " ${BUNDLE_DIR}/../live-image/build — skipping live_dir staging." >&2 + echo " Build with 'wsl make live-image' or use a release tarball," >&2 + echo " then copy vmlinuz + initrd.img into ${LIVE_DIR} manually." >&2 +else + echo "==> staging live image from ${LIVE_SRC} into ${LIVE_DIR}" + install -d -m 0755 -o "${SERVICE_USER}" -g "${SERVICE_USER}" "${LIVE_DIR}" + install -m 0644 -o "${SERVICE_USER}" -g "${SERVICE_USER}" \ + "${LIVE_SRC}/vmlinuz" "${LIVE_DIR}/vmlinuz" + install -m 0644 -o "${SERVICE_USER}" -g "${SERVICE_USER}" \ + "${LIVE_SRC}/initrd.img" "${LIVE_DIR}/initrd.img" +fi + +# --- patch the pxe: block in vetting.yaml ----------------------------- +# +# Replace the contents of the pxe: block in-place. Uses awk to walk +# line-by-line: when we hit `pxe:`, skip everything up to the next +# top-level key (a line starting with a non-whitespace letter + ":") +# or EOF, and emit our freshly-rendered block instead. Everything +# outside the pxe: block is passed through unchanged, so hand-tuned +# server:/database:/notifiers: blocks survive intact. 
+ +existing_iface="$(awk ' + /^pxe:/ { in_pxe=1; next } + in_pxe && /^[A-Za-z_][A-Za-z0-9_]*:/ { in_pxe=0 } + in_pxe && /^[[:space:]]+interface:/ { + sub(/^[[:space:]]+interface:[[:space:]]*/, "") + gsub(/^"|"$/, "") + print; exit + } +' "${CONFIG}")" +existing_range="$(awk ' + /^pxe:/ { in_pxe=1; next } + in_pxe && /^[A-Za-z_][A-Za-z0-9_]*:/ { in_pxe=0 } + in_pxe && /^[[:space:]]+dhcp_range:/ { + sub(/^[[:space:]]+dhcp_range:[[:space:]]*/, "") + gsub(/^"|"$/, "") + print; exit + } +' "${CONFIG}")" + +if [[ -n "${existing_iface}" && "${existing_iface}" != "${INTERFACE}" && ${FORCE} -eq 0 ]]; then + echo "ERROR: pxe.interface in ${CONFIG} is already set to ${existing_iface}, which" >&2 + echo " differs from --interface ${INTERFACE}. Pass --force to overwrite." >&2 + exit 1 +fi +if [[ -n "${existing_range}" && "${existing_range}" != "${DHCP_RANGE}" && ${FORCE} -eq 0 ]]; then + echo "ERROR: pxe.dhcp_range in ${CONFIG} is already ${existing_range}, which" >&2 + echo " differs from --dhcp-range ${DHCP_RANGE}. Pass --force to overwrite." >&2 + exit 1 +fi + +new_block=$(cat < "${tmp_yaml}" + +# Preserve owner + mode from the original. +orig_mode="$(stat -c '%a' "${CONFIG}")" +orig_owner="$(stat -c '%U:%G' "${CONFIG}")" +install -m "${orig_mode}" -o "${orig_owner%:*}" -g "${orig_owner#*:}" \ + "${tmp_yaml}" "${CONFIG}" +rm -f "${tmp_yaml}" + +echo +echo "==> rendered pxe: block in ${CONFIG}:" +echo "${new_block}" | sed 's/^/ /' +echo +echo "Next: systemctl restart vetting && journalctl -fu vetting" +echo "The orchestrator will refuse to start with clear errors if anything" +echo "is still missing; you should see dnsmasq come up cleanly." diff --git a/docs/operations.md b/docs/operations.md index 0dc01c8..fa4e604 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -11,66 +11,104 @@ Target: a Debian/Ubuntu LXC on the Proxmox host that holds the cluster you're vetting for. The LXC must be on the same L2 segment as the repaired nodes so DHCP and WoL work. -1. 
On your workstation, cross-build the binary: +### One-shot release bundle (recommended) - ``` - make orchestrator-linux - ``` +On your dev workstation (Linux, or WSL on Windows): - This produces `bin/vetting-linux-amd64`. +``` +make release +``` -2. Copy the repo tree (or just `bin/`, `deploy/`) into the LXC, then - from inside the LXC: +Produces `bin/vetting-bundle-<git-sha>.tar.gz` containing the orchestrator +binary, agent binary, live image (`vmlinuz` + `initrd.img`), install +scripts, `vetting.service`, the production yaml, and the pinned iPXE +SHA256 file. - ``` - sudo ./deploy/install.sh - ``` +Ship it to the LXC: - The installer: - - `apt install`s `dnsmasq`, `iperf3`, `ca-certificates` - - creates the `vetting` system user (home = `/var/lib/vetting`) - - installs the binary into `/usr/local/bin/vetting` - - drops `vetting.example.yaml` into `/etc/vetting/vetting.yaml` - (only if there's no existing config — existing configs are - preserved) - - drops `/etc/systemd/system/vetting.service` - - disables the distro-default dnsmasq (the orchestrator supervises - its own) +``` +scp bin/vetting-bundle-<git-sha>.tar.gz lxc:/tmp/ +ssh lxc 'cd /tmp && tar xzf vetting-bundle-*.tar.gz' +ssh lxc 'cd /tmp/vetting-bundle-<git-sha> && sudo ./install.sh' +``` - The installer does **not** enable the service. You'll want to edit - the config first. +`install.sh` does the base install (user, binaries, config, systemd +unit). If you don't need PXE (e.g. host-mode reporter only, no +automated live-boots), you can stop here — edit +`/etc/vetting/vetting.yaml` to tune `server.bind` / `public_url`, +then `sudo systemctl enable --now vetting`. -3. Edit `/etc/vetting/vetting.yaml`: +### PXE enablement - - `server.bind` — defaults to `127.0.0.1:8080`. Switch to - `0.0.0.0:8080` (or bind to a specific LAN IP) once you're ready - to expose it. There is no built-in auth — see *Exposing outside - the LAN* below. - - `server.public_url` — the URL your browser hits the LXC on - (e.g. `http://vetting.lan:8080`). 
Used as the click-through link - in notifications. +PXE is gated behind a second script so non-PXE installs stay simple. -4. (Optional) Configure notifiers in the same file — see the - commented-out example block for ntfy / Discord / SMTP. +**Prerequisite: dedicated PXE bridge on the Proxmox hypervisor.** The +LXC can't create bridges on its host, so do this once on the Proxmox +node (not inside the LXC): -5. Enable and start: +``` +sudo ip link add br-vetting type bridge +sudo ip addr add 10.77.0.1/24 dev br-vetting +sudo ip link set br-vetting up +``` - ``` - sudo systemctl enable --now vetting - sudo journalctl -fu vetting - ``` +Attach a veth from the LXC onto `br-vetting` (e.g. `eth1` inside the +LXC at `10.77.0.2/24`). Repaired nodes PXE-boot from a NIC cabled or +bridged onto `br-vetting` only — keep this network isolated from your +household DHCP, or both DHCP servers will fight. + +On the LXC, inside the extracted bundle: + +``` +sudo ./pxe-setup.sh \ + --interface eth1 \ + --dhcp-range 10.77.0.100,10.77.0.200,12h \ + --orchestrator-url http://10.77.0.2:8080 +``` + +The script: + +- Fetches `ipxe.efi` + `undionly.kpxe` from boot.ipxe.org and verifies + SHA256 against `ipxe-shas.txt` (fail-closed on mismatch). +- Places `vmlinuz` + `initrd.img` into `/var/lib/vetting/live/`. +- Rewrites the `pxe:` block of `/etc/vetting/vetting.yaml` to enable + PXE with the flags you passed. + +It does **not** restart the service — review the rendered config, +then: + +``` +sudo systemctl restart vetting +sudo journalctl -fu vetting +``` + +The orchestrator validates PXE preconditions at startup (interface +exists, iPXE binaries are on disk, `dhcp_range` parses) and exits +non-zero with a clear error if anything's wrong, instead of failing +silently when a host first PXE-boots. + +`pxe-setup.sh` is idempotent — safe to re-run. Pass `--force` to +overwrite a hand-edited `pxe:` block. 
+ +### Manual install (no release tarball) + +For dev-loop iteration on the LXC itself: + +1. On your workstation: `make orchestrator-linux && make agent-linux` +2. Copy the repo tree (or just `bin/` + `deploy/`) onto the LXC +3. `sudo ./deploy/install.sh` → base install +4. For PXE: `wsl make live-image` on your workstation, then + `ssh lxc mkdir -p /tmp/live-image && scp deploy/ipxe-shas.txt lxc:/tmp/ && scp live-image/build/vmlinuz live-image/build/initrd.img lxc:/tmp/live-image/` + (`--bundle-dir` must hold `ipxe-shas.txt` plus a `live-image/` subdirectory), then run `pxe-setup.sh --bundle-dir /tmp` + with the usual flags — or copy `vmlinuz` + `initrd.img` into `live-image/build/` next to `deploy/` on the LXC and run `deploy/pxe-setup.sh` without `--bundle-dir`. ## First vetting run Against a QEMU VM first, before you point it at real hardware: -1. On the Proxmox host (or wherever your LXC lives): - - ``` - sudo ip link add br-vetting type bridge - sudo ip addr add 10.77.0.1/24 dev br-vetting - sudo ip link set br-vetting up - ``` +1. Make sure the `br-vetting` bridge exists on the hypervisor (see + above). From inside the LXC, confirm it's reachable on your + PXE-side interface. 2. In the UI at `http://:8080`, register a host: - Name: `qemu-test` @@ -82,7 +120,7 @@ Against a QEMU VM first, before you point it at real hardware: cpu: { logical_cores: 4 } ``` -3. Click **Start Vetting**. The UI tile will sit at `Queued → WaitingWoL`. +3. Click **Start Vetting**. The UI tile will sit at `Queued → WaitingReboot`. 4. 
Launch the QEMU VM on the bridge so it PXE-boots from dnsmasq: diff --git a/internal/pxe/dnsmasq.go b/internal/pxe/dnsmasq.go index 2876f60..d0f396f 100644 --- a/internal/pxe/dnsmasq.go +++ b/internal/pxe/dnsmasq.go @@ -2,12 +2,16 @@ package pxe import ( "context" + "errors" "fmt" "io" "log" + "net" + "net/url" "os" "os/exec" "path/filepath" + "regexp" "runtime" "strings" "sync" @@ -25,6 +29,7 @@ type SupervisorConfig struct { OrchestratorURL string // baked into iPXE scripts RuntimeDir string // writable dir for dnsmasq.conf and leases TFTPRoot string // holds ipxe.efi, undionly.kpxe + LiveDir string // holds vmlinuz, initrd.img (served via HTTP, not dnsmasq; "" disables validation) DNSMasqBin string // path to dnsmasq binary (default: "dnsmasq") } @@ -45,6 +50,65 @@ func NewSupervisor(cfg SupervisorConfig) *Supervisor { return &Supervisor{cfg: cfg} } +// dhcpRangeRE matches "start_ip,end_ip,lease" — the three-field form +// dnsmasq expects. Lease can be "12h", "infinite", etc.; any non-empty +// token is accepted here and dnsmasq will reject nonsense at startup. +var dhcpRangeRE = regexp.MustCompile(`^(\d{1,3}\.){3}\d{1,3},(\d{1,3}\.){3}\d{1,3},\S+$`) + +// Validate checks the preconditions required for dnsmasq to actually +// serve PXE boots: the interface must exist, the iPXE payloads must +// be on disk, the DHCP range + orchestrator URL must parse. Returns +// nil when Enabled=false — tests and dev mode skip all of this. +// +// Without Validate(), dnsmasq starts cleanly on typo'd configs and +// the only symptom is a silent TFTP 404 when a real host PXE-boots. 
+func (s *Supervisor) Validate() error { + if !s.cfg.Enabled { + return nil + } + var errs []error + + if s.cfg.Interface == "" { + errs = append(errs, fmt.Errorf("pxe.interface is required")) + } else if _, err := net.InterfaceByName(s.cfg.Interface); err != nil { + errs = append(errs, fmt.Errorf("pxe.interface %q not found on host — check `ip link` or fix pxe.interface in vetting.yaml", s.cfg.Interface)) + } + + if s.cfg.TFTPRoot == "" { + errs = append(errs, fmt.Errorf("pxe.tftp_root is required")) + } else { + for _, name := range []string{"ipxe.efi", "undionly.kpxe"} { + p := filepath.Join(s.cfg.TFTPRoot, name) + if _, err := os.Stat(p); err != nil { + errs = append(errs, fmt.Errorf("missing %s — run deploy/pxe-setup.sh to fetch iPXE binaries", p)) + } + } + } + + if s.cfg.LiveDir != "" { + for _, name := range []string{"vmlinuz", "initrd.img"} { + p := filepath.Join(s.cfg.LiveDir, name) + if _, err := os.Stat(p); err != nil { + errs = append(errs, fmt.Errorf("missing %s — build the live image (`make live-image`) and copy into pxe.live_dir, or use the release tarball", p)) + } + } + } + + if s.cfg.DHCPRange == "" { + errs = append(errs, fmt.Errorf("pxe.dhcp_range is required (e.g. \"10.77.0.100,10.77.0.200,12h\")")) + } else if !dhcpRangeRE.MatchString(s.cfg.DHCPRange) { + errs = append(errs, fmt.Errorf("pxe.dhcp_range %q must be \"start_ip,end_ip,lease\"", s.cfg.DHCPRange)) + } + + if s.cfg.OrchestratorURL == "" { + errs = append(errs, fmt.Errorf("pxe.orchestrator_url is required")) + } else if u, err := url.Parse(s.cfg.OrchestratorURL); err != nil || (u.Scheme != "http" && u.Scheme != "https") || u.Host == "" { + errs = append(errs, fmt.Errorf("pxe.orchestrator_url %q must be an http(s) URL with a host", s.cfg.OrchestratorURL)) + } + + return errors.Join(errs...) +} + // Start launches dnsmasq in the background. If cfg.Enabled is false // Start is a no-op (useful for dev on Windows where dnsmasq isn't // available). 
@@ -56,6 +120,9 @@ func (s *Supervisor) Start(ctx context.Context, hosts []model.Host) error { if runtime.GOOS == "windows" { return fmt.Errorf("dnsmasq supervision is not supported on Windows — run orchestrator on Linux") } + if err := s.Validate(); err != nil { + return fmt.Errorf("pxe preconditions failed: %w", err) + } if err := os.MkdirAll(s.cfg.RuntimeDir, 0o755); err != nil { return fmt.Errorf("mkdir runtime: %w", err) } diff --git a/internal/pxe/dnsmasq_test.go b/internal/pxe/dnsmasq_test.go new file mode 100644 index 0000000..3f23014 --- /dev/null +++ b/internal/pxe/dnsmasq_test.go @@ -0,0 +1,191 @@ +package pxe + +import ( + "net" + "os" + "path/filepath" + "strings" + "testing" +) + +// existingInterface returns any real interface on the host so the +// Validate tests can exercise the happy path without hardcoding +// "lo" (which exists on Linux but might be gated elsewhere). +func existingInterface(t *testing.T) string { + t.Helper() + ifaces, err := net.Interfaces() + if err != nil || len(ifaces) == 0 { + t.Skipf("no network interfaces: %v", err) + } + return ifaces[0].Name +} + +// seedTFTP drops zero-byte ipxe.efi + undionly.kpxe into dir so the +// stat check passes. Callers can omit a name to simulate "missing". 
+func seedTFTP(t *testing.T, dir string, names ...string) { + t.Helper() + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatalf("mkdir tftp: %v", err) + } + for _, name := range names { + if err := os.WriteFile(filepath.Join(dir, name), nil, 0o644); err != nil { + t.Fatalf("seed %s: %v", name, err) + } + } +} + +func goodCfg(t *testing.T, tftpRoot string) SupervisorConfig { + t.Helper() + return SupervisorConfig{ + Enabled: true, + Interface: existingInterface(t), + DHCPRange: "10.77.0.100,10.77.0.200,12h", + OrchestratorURL: "http://10.77.0.1:8080", + TFTPRoot: tftpRoot, + } +} + +func TestValidate_DisabledSkipsChecks(t *testing.T) { + s := NewSupervisor(SupervisorConfig{Enabled: false}) + if err := s.Validate(); err != nil { + t.Fatalf("disabled supervisor should skip validation, got: %v", err) + } +} + +func TestValidate_HappyPath(t *testing.T) { + tftp := t.TempDir() + seedTFTP(t, tftp, "ipxe.efi", "undionly.kpxe") + s := NewSupervisor(goodCfg(t, tftp)) + if err := s.Validate(); err != nil { + t.Fatalf("happy-path validate: %v", err) + } +} + +func TestValidate_MissingIPXEBinary(t *testing.T) { + tftp := t.TempDir() + // Only seed one of the two required files. 
+ seedTFTP(t, tftp, "undionly.kpxe") + s := NewSupervisor(goodCfg(t, tftp)) + err := s.Validate() + if err == nil { + t.Fatalf("expected error for missing ipxe.efi") + } + if !strings.Contains(err.Error(), "ipxe.efi") { + t.Fatalf("error should name the missing file, got: %v", err) + } + if !strings.Contains(err.Error(), "pxe-setup.sh") { + t.Fatalf("error should point operator at pxe-setup.sh, got: %v", err) + } +} + +func TestValidate_MissingUndionly(t *testing.T) { + tftp := t.TempDir() + seedTFTP(t, tftp, "ipxe.efi") + s := NewSupervisor(goodCfg(t, tftp)) + err := s.Validate() + if err == nil || !strings.Contains(err.Error(), "undionly.kpxe") { + t.Fatalf("expected undionly.kpxe error, got: %v", err) + } +} + +func TestValidate_MissingInterface(t *testing.T) { + tftp := t.TempDir() + seedTFTP(t, tftp, "ipxe.efi", "undionly.kpxe") + cfg := goodCfg(t, tftp) + cfg.Interface = "definitely-not-a-real-iface-9999" + s := NewSupervisor(cfg) + err := s.Validate() + if err == nil || !strings.Contains(err.Error(), "pxe.interface") { + t.Fatalf("expected interface error, got: %v", err) + } +} + +func TestValidate_MissingLiveImage(t *testing.T) { + tftp := t.TempDir() + seedTFTP(t, tftp, "ipxe.efi", "undionly.kpxe") + cfg := goodCfg(t, tftp) + cfg.LiveDir = t.TempDir() // empty dir; vmlinuz + initrd.img missing + s := NewSupervisor(cfg) + err := s.Validate() + if err == nil { + t.Fatalf("expected live image error") + } + for _, want := range []string{"vmlinuz", "initrd.img"} { + if !strings.Contains(err.Error(), want) { + t.Fatalf("error should name %s, got: %v", want, err) + } + } +} + +func TestValidate_LiveDirEmptySkipsLiveChecks(t *testing.T) { + tftp := t.TempDir() + seedTFTP(t, tftp, "ipxe.efi", "undionly.kpxe") + cfg := goodCfg(t, tftp) + cfg.LiveDir = "" // explicit opt-out; HTTP /live just 404s + s := NewSupervisor(cfg) + if err := s.Validate(); err != nil { + t.Fatalf("empty LiveDir should not trigger live checks, got: %v", err) + } +} + +func 
TestValidate_MalformedDHCPRange(t *testing.T) { + tftp := t.TempDir() + seedTFTP(t, tftp, "ipxe.efi", "undionly.kpxe") + cases := []struct { + name string + dhcp string + }{ + {"single field", "10.77.0.100"}, + {"two fields", "10.77.0.100,10.77.0.200"}, + {"non-ip start", "hello,10.77.0.200,12h"}, + {"empty lease", "10.77.0.100,10.77.0.200,"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + cfg := goodCfg(t, tftp) + cfg.DHCPRange = tc.dhcp + s := NewSupervisor(cfg) + err := s.Validate() + if err == nil || !strings.Contains(err.Error(), "dhcp_range") { + t.Fatalf("expected dhcp_range error for %q, got: %v", tc.dhcp, err) + } + }) + } +} + +func TestValidate_BadOrchestratorURL(t *testing.T) { + tftp := t.TempDir() + seedTFTP(t, tftp, "ipxe.efi", "undionly.kpxe") + cases := []string{"", "not a url", "ftp://10.0.0.1", "http://"} + for _, u := range cases { + t.Run(u, func(t *testing.T) { + cfg := goodCfg(t, tftp) + cfg.OrchestratorURL = u + s := NewSupervisor(cfg) + err := s.Validate() + if err == nil || !strings.Contains(err.Error(), "orchestrator_url") { + t.Fatalf("expected orchestrator_url error for %q, got: %v", u, err) + } + }) + } +} + +func TestValidate_AggregatesErrors(t *testing.T) { + // Multiple problems at once: Validate must report them all in + // one pass so the operator sees the full picture instead of + // whack-a-mole-ing one error per restart. + cfg := SupervisorConfig{ + Enabled: true, + // Everything else zero. 
+ } + s := NewSupervisor(cfg) + err := s.Validate() + if err == nil { + t.Fatalf("expected aggregated error") + } + for _, want := range []string{"pxe.interface", "pxe.tftp_root", "pxe.dhcp_range", "pxe.orchestrator_url"} { + if !strings.Contains(err.Error(), want) { + t.Fatalf("expected %q in aggregated error, got: %v", want, err) + } + } +} diff --git a/live-image/README.md b/live-image/README.md index e6985e3..44ec644 100644 --- a/live-image/README.md +++ b/live-image/README.md @@ -4,9 +4,16 @@ Debian-based Linux live image that PXE-booted hosts drop into. Runs the `vetting-agent` binary under systemd and reaches back to the orchestrator over HTTP+SSE. -## Building +## Preferred build path: `make release` -Must be built on Linux (or WSL). On Windows: +Run `make release` from the repo root (Linux/WSL) — it builds the live +image *and* bundles it with the orchestrator binary, install scripts, +and pinned iPXE SHAs into a single `vetting-bundle-<git-sha>.tar.gz`. See +[../docs/operations.md](../docs/operations.md) for the install flow. + +## Manual build (dev loop) + +On Windows: ```sh wsl make -C live-image all @@ -19,15 +26,15 @@ make -C live-image all ``` This produces `live-image/build/vmlinuz` and `live-image/build/initrd.img`. -Copy (or symlink) them into the directory configured as `pxe.live_dir` in -`deploy/vetting.yaml`; the orchestrator serves them at `/live/*`. +`deploy/pxe-setup.sh` picks them up automatically when run from the repo +tree — no manual copy needed. ## iPXE binaries -The dnsmasq supervisor expects `ipxe.efi` and `undionly.kpxe` to live in -`pxe.tftp_root`. Fetch the latest release binaries from -https://boot.ipxe.org and drop them in that directory. The Makefile does -not download them automatically so their SHA256 can be operator-verified. +The dnsmasq supervisor expects `ipxe.efi` and `undionly.kpxe` in +`pxe.tftp_root`. `deploy/pxe-setup.sh` fetches them from boot.ipxe.org +and verifies against pinned SHA256s in `deploy/ipxe-shas.txt`. 
Bumping +the pins requires a deliberate repo commit. ## WSL prerequisites (Windows dev)