From 506c856046dbd04788079455304cd79846adcd8a Mon Sep 17 00:00:00 2001 From: josh Date: Sat, 18 Apr 2026 12:02:49 -0400 Subject: [PATCH] pxe: switch dnsmasq to proxy-DHCP mode on the LAN Previously the orchestrator ran a full DHCP server on a dedicated br-vetting bridge (10.77.0.0/24), which required a hypervisor-level bridge + physical cabling onto that bridge for every repaired host. Real-world bite: the LXC's br-vetting had no L2 path to the target host's PXE NIC, so DHCPDISCOVERs never reached eth1 and PXE silently timed out. dnsmasq's proxy-DHCP mode is the idiomatic answer: it coexists with the LAN's existing DHCP server (UniFi, etc.), never assigns an IP itself, and only supplements the PXE options. No dedicated bridge, no VLAN, no cabling changes — dnsmasq binds to the LAN interface and layers option 66/67 + the PXE BINL on top of the real DHCP exchange. The MAC allowlist still gates replies, so random LAN clients booting from network get nothing. Template switches dhcp-range= to dhcp-range=,proxy and replaces dhcp-boot= for first-boot ROM clients with pxe-service= directives (the correct proxy-mode chainload form). Validation drops the dhcp_range regex for a net.ParseCIDR check on pxe.subnet. Config, production/example yaml, and pxe-setup.sh swap --dhcp-range for --subnet. 
Co-Authored-By: Claude Opus 4.7 --- cmd/vetting/main.go | 2 +- deploy/pxe-setup.sh | 47 ++++++++++++++++++---------------- deploy/vetting.example.yaml | 6 ++--- deploy/vetting.production.yaml | 6 ++--- internal/config/config.go | 2 +- internal/pxe/dnsmasq.go | 42 ++++++++++++------------------ internal/pxe/dnsmasq_test.go | 26 +++++++++---------- 7 files changed, 63 insertions(+), 68 deletions(-) diff --git a/cmd/vetting/main.go b/cmd/vetting/main.go index 00036ca..5633fac 100644 --- a/cmd/vetting/main.go +++ b/cmd/vetting/main.go @@ -151,7 +151,7 @@ func main() { supervisor = pxe.NewSupervisor(pxe.SupervisorConfig{ Enabled: true, Interface: cfg.PXE.Interface, - DHCPRange: cfg.PXE.DHCPRange, + Subnet: cfg.PXE.Subnet, OrchestratorURL: cfg.PXE.OrchestratorURL, RuntimeDir: filepath.Join(stateRoot, "pxe"), TFTPRoot: tftpRoot, diff --git a/deploy/pxe-setup.sh b/deploy/pxe-setup.sh index d5853cf..95b378b 100755 --- a/deploy/pxe-setup.sh +++ b/deploy/pxe-setup.sh @@ -3,9 +3,12 @@ # # Run AFTER deploy/install.sh on the LXC (or wherever the orchestrator # lives). Fetches pinned iPXE binaries, places the live image, and -# writes the pxe: block of /etc/vetting/vetting.yaml. Does NOT create -# the PXE bridge — that's a hypervisor-level step, see -# docs/operations.md. +# writes the pxe: block of /etc/vetting/vetting.yaml. +# +# dnsmasq runs in proxy-DHCP mode: it coexists with whatever DHCP +# server already serves your LAN (UniFi, pfSense, Asus, etc.) and +# only supplements the PXE options. No dedicated bridge, no VLAN, +# no cabling changes. # # Idempotent: safe to re-run with the same args. 
A second run with # different args overwrites the pxe: block; pass --force to override @@ -13,9 +16,9 @@ # # Usage: # sudo ./pxe-setup.sh \ -# --interface eth1 \ -# --dhcp-range 10.77.0.100,10.77.0.200,12h \ -# --orchestrator-url http://10.77.0.2:8080 +# --interface eth0 \ +# --subnet 192.168.1.0/24 \ +# --orchestrator-url http://192.168.1.135:8080 # # Optional: # --tftp-root DIR default /var/lib/vetting/tftp @@ -26,7 +29,7 @@ set -euo pipefail INTERFACE="" -DHCP_RANGE="" +SUBNET="" ORCH_URL="" TFTP_ROOT="/var/lib/vetting/tftp" LIVE_DIR="/var/lib/vetting/live" @@ -38,13 +41,13 @@ SERVICE_USER="vetting" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" usage() { - sed -n '2,24p' "${BASH_SOURCE[0]}" + sed -n '2,28p' "${BASH_SOURCE[0]}" } while [[ $# -gt 0 ]]; do case "$1" in --interface) INTERFACE="$2"; shift 2 ;; - --dhcp-range) DHCP_RANGE="$2"; shift 2 ;; + --subnet) SUBNET="$2"; shift 2 ;; --orchestrator-url) ORCH_URL="$2"; shift 2 ;; --tftp-root) TFTP_ROOT="$2"; shift 2 ;; --live-dir) LIVE_DIR="$2"; shift 2 ;; @@ -61,9 +64,9 @@ if [[ $EUID -ne 0 ]]; then exit 1 fi -[[ -z "${INTERFACE}" ]] && { echo "ERROR: --interface is required" >&2; exit 2; } -[[ -z "${DHCP_RANGE}" ]] && { echo "ERROR: --dhcp-range is required" >&2; exit 2; } -[[ -z "${ORCH_URL}" ]] && { echo "ERROR: --orchestrator-url is required" >&2; exit 2; } +[[ -z "${INTERFACE}" ]] && { echo "ERROR: --interface is required" >&2; exit 2; } +[[ -z "${SUBNET}" ]] && { echo "ERROR: --subnet is required (e.g. 192.168.1.0/24)" >&2; exit 2; } +[[ -z "${ORCH_URL}" ]] && { echo "ERROR: --orchestrator-url is required" >&2; exit 2; } # --- sanity checks ----------------------------------------------------- @@ -73,10 +76,10 @@ if ! ip link show "${INTERFACE}" >/dev/null 2>&1; then exit 1 fi -# "start_ip,end_ip,lease" — dnsmasq will still validate, but catch the -# obvious shape errors before we write anything to disk. -if [[ ! 
"${DHCP_RANGE}" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3},([0-9]{1,3}\.){3}[0-9]{1,3},[^[:space:]]+$ ]]; then - echo "ERROR: --dhcp-range must be start_ip,end_ip,lease (e.g. 10.77.0.100,10.77.0.200,12h)" >&2 +# CIDR shape check — dnsmasq will re-validate, but catch the obvious +# errors before we write anything to disk. +if [[ ! "${SUBNET}" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}/[0-9]{1,2}$ ]]; then + echo "ERROR: --subnet must be CIDR form (e.g. 192.168.1.0/24), got '${SUBNET}'" >&2 exit 2 fi @@ -226,17 +229,17 @@ extract_yaml_value() { ' "${path}" } -existing_iface="$(extract_yaml_value interface "${CONFIG}")" -existing_range="$(extract_yaml_value dhcp_range "${CONFIG}")" +existing_iface="$(extract_yaml_value interface "${CONFIG}")" +existing_subnet="$(extract_yaml_value subnet "${CONFIG}")" if [[ -n "${existing_iface}" && "${existing_iface}" != "${INTERFACE}" && ${FORCE} -eq 0 ]]; then echo "ERROR: pxe.interface in ${CONFIG} is already set to ${existing_iface}, which" >&2 echo " differs from --interface ${INTERFACE}. Pass --force to overwrite." >&2 exit 1 fi -if [[ -n "${existing_range}" && "${existing_range}" != "${DHCP_RANGE}" && ${FORCE} -eq 0 ]]; then - echo "ERROR: pxe.dhcp_range in ${CONFIG} is already ${existing_range}, which" >&2 - echo " differs from --dhcp-range ${DHCP_RANGE}. Pass --force to overwrite." >&2 +if [[ -n "${existing_subnet}" && "${existing_subnet}" != "${SUBNET}" && ${FORCE} -eq 0 ]]; then + echo "ERROR: pxe.subnet in ${CONFIG} is already ${existing_subnet}, which" >&2 + echo " differs from --subnet ${SUBNET}. Pass --force to overwrite." >&2 exit 1 fi @@ -244,7 +247,7 @@ new_block=$(cat < iPXE. In proxy-DHCP mode, chainloading uses +# pxe-service= (not dhcp-boot=) because the real LAN DHCP has already +# assigned the IP; we only supplement the boot menu. 
+pxe-service=tag:!ipxe,x86PC,"iPXE (BIOS)",undionly.kpxe +pxe-service=tag:!ipxe,X86-64_EFI,"iPXE (UEFI)",ipxe.efi log-facility=- ` diff --git a/internal/pxe/dnsmasq_test.go b/internal/pxe/dnsmasq_test.go index 3f23014..29f2179 100644 --- a/internal/pxe/dnsmasq_test.go +++ b/internal/pxe/dnsmasq_test.go @@ -39,8 +39,8 @@ func goodCfg(t *testing.T, tftpRoot string) SupervisorConfig { return SupervisorConfig{ Enabled: true, Interface: existingInterface(t), - DHCPRange: "10.77.0.100,10.77.0.200,12h", - OrchestratorURL: "http://10.77.0.1:8080", + Subnet: "192.168.1.0/24", + OrchestratorURL: "http://192.168.1.2:8080", TFTPRoot: tftpRoot, } } @@ -128,26 +128,26 @@ func TestValidate_LiveDirEmptySkipsLiveChecks(t *testing.T) { } } -func TestValidate_MalformedDHCPRange(t *testing.T) { +func TestValidate_MalformedSubnet(t *testing.T) { tftp := t.TempDir() seedTFTP(t, tftp, "ipxe.efi", "undionly.kpxe") cases := []struct { - name string - dhcp string + name string + subnet string }{ - {"single field", "10.77.0.100"}, - {"two fields", "10.77.0.100,10.77.0.200"}, - {"non-ip start", "hello,10.77.0.200,12h"}, - {"empty lease", "10.77.0.100,10.77.0.200,"}, + {"no mask", "192.168.1.0"}, + {"bad ip", "hello/24"}, + {"bad mask", "192.168.1.0/99"}, + {"leftover dhcp_range form", "192.168.1.100,192.168.1.200,12h"}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { cfg := goodCfg(t, tftp) - cfg.DHCPRange = tc.dhcp + cfg.Subnet = tc.subnet s := NewSupervisor(cfg) err := s.Validate() - if err == nil || !strings.Contains(err.Error(), "dhcp_range") { - t.Fatalf("expected dhcp_range error for %q, got: %v", tc.dhcp, err) + if err == nil || !strings.Contains(err.Error(), "pxe.subnet") { + t.Fatalf("expected pxe.subnet error for %q, got: %v", tc.subnet, err) } }) } @@ -183,7 +183,7 @@ func TestValidate_AggregatesErrors(t *testing.T) { if err == nil { t.Fatalf("expected aggregated error") } - for _, want := range []string{"pxe.interface", "pxe.tftp_root", "pxe.dhcp_range", 
"pxe.orchestrator_url"} { + for _, want := range []string{"pxe.interface", "pxe.tftp_root", "pxe.subnet", "pxe.orchestrator_url"} { if !strings.Contains(err.Error(), want) { t.Fatalf("expected %q in aggregated error, got: %v", want, err) }