Automate PXE setup: release bundle + pxe-setup.sh + startup validation
CI / Lint + build + test (push) Has been cancelled
CI / Lint + build + test (push) Has been cancelled
Collapses the LXC side of PXE enablement from a six-step manual dance (build, fetch iPXE, scp, bridge, hand-edit yaml) into:

    make release                  # dev box (Linux/WSL)
    scp bundle.tar.gz lxc:/tmp/
    sudo ./install.sh             # base install, unchanged
    sudo ./pxe-setup.sh --interface ... --dhcp-range ... --orchestrator-url ...

pxe-setup.sh fetches iPXE from boot.ipxe.org, verifies it against the pinned SHA256s in deploy/ipxe-shas.txt (fail-closed), places vmlinuz/initrd.img from the bundle, and rewrites only the pxe: block of vetting.yaml. It is idempotent; --force gates overwriting a hand-edited block.

Adds Supervisor.Validate() — called before dnsmasq spawn — so typo'd configs fail at orchestrator startup with clear errors naming the missing file or yaml key, instead of silently serving broken TFTP until a real host tries to PXE-boot. Nine tests cover missing files, bogus interface, malformed dhcp_range, bad orchestrator_url, and aggregate reporting.

Hypervisor bridge creation stays documented (LXC can't do it), but everything downstream of the bridge is now scripted.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -2,12 +2,16 @@ package pxe
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -25,6 +29,7 @@ type SupervisorConfig struct {
|
||||
OrchestratorURL string // baked into iPXE scripts
|
||||
RuntimeDir string // writable dir for dnsmasq.conf and leases
|
||||
TFTPRoot string // holds ipxe.efi, undionly.kpxe
|
||||
LiveDir string // holds vmlinuz, initrd.img (served via HTTP, not dnsmasq; "" disables validation)
|
||||
DNSMasqBin string // path to dnsmasq binary (default: "dnsmasq")
|
||||
}
|
||||
|
||||
@@ -45,6 +50,65 @@ func NewSupervisor(cfg SupervisorConfig) *Supervisor {
|
||||
return &Supervisor{cfg: cfg}
|
||||
}
|
||||
|
||||
// dhcpRangeRE matches "start_ip,end_ip,lease" — the three-field form
|
||||
// dnsmasq expects. Lease can be "12h", "infinite", etc.; any non-empty
|
||||
// token is accepted here and dnsmasq will reject nonsense at startup.
|
||||
var dhcpRangeRE = regexp.MustCompile(`^(\d{1,3}\.){3}\d{1,3},(\d{1,3}\.){3}\d{1,3},\S+$`)
|
||||
|
||||
// Validate checks the preconditions required for dnsmasq to actually
|
||||
// serve PXE boots: the interface must exist, the iPXE payloads must
|
||||
// be on disk, the DHCP range + orchestrator URL must parse. Returns
|
||||
// nil when Enabled=false — tests and dev mode skip all of this.
|
||||
//
|
||||
// Without Validate(), dnsmasq starts cleanly on typo'd configs and
|
||||
// the only symptom is a silent TFTP 404 when a real host PXE-boots.
|
||||
func (s *Supervisor) Validate() error {
|
||||
if !s.cfg.Enabled {
|
||||
return nil
|
||||
}
|
||||
var errs []error
|
||||
|
||||
if s.cfg.Interface == "" {
|
||||
errs = append(errs, fmt.Errorf("pxe.interface is required"))
|
||||
} else if _, err := net.InterfaceByName(s.cfg.Interface); err != nil {
|
||||
errs = append(errs, fmt.Errorf("pxe.interface %q not found on host — check `ip link` or fix pxe.interface in vetting.yaml", s.cfg.Interface))
|
||||
}
|
||||
|
||||
if s.cfg.TFTPRoot == "" {
|
||||
errs = append(errs, fmt.Errorf("pxe.tftp_root is required"))
|
||||
} else {
|
||||
for _, name := range []string{"ipxe.efi", "undionly.kpxe"} {
|
||||
p := filepath.Join(s.cfg.TFTPRoot, name)
|
||||
if _, err := os.Stat(p); err != nil {
|
||||
errs = append(errs, fmt.Errorf("missing %s — run deploy/pxe-setup.sh to fetch iPXE binaries", p))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if s.cfg.LiveDir != "" {
|
||||
for _, name := range []string{"vmlinuz", "initrd.img"} {
|
||||
p := filepath.Join(s.cfg.LiveDir, name)
|
||||
if _, err := os.Stat(p); err != nil {
|
||||
errs = append(errs, fmt.Errorf("missing %s — build the live image (`make live-image`) and copy into pxe.live_dir, or use the release tarball", p))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if s.cfg.DHCPRange == "" {
|
||||
errs = append(errs, fmt.Errorf("pxe.dhcp_range is required (e.g. \"10.77.0.100,10.77.0.200,12h\")"))
|
||||
} else if !dhcpRangeRE.MatchString(s.cfg.DHCPRange) {
|
||||
errs = append(errs, fmt.Errorf("pxe.dhcp_range %q must be \"start_ip,end_ip,lease\"", s.cfg.DHCPRange))
|
||||
}
|
||||
|
||||
if s.cfg.OrchestratorURL == "" {
|
||||
errs = append(errs, fmt.Errorf("pxe.orchestrator_url is required"))
|
||||
} else if u, err := url.Parse(s.cfg.OrchestratorURL); err != nil || (u.Scheme != "http" && u.Scheme != "https") || u.Host == "" {
|
||||
errs = append(errs, fmt.Errorf("pxe.orchestrator_url %q must be an http(s) URL with a host", s.cfg.OrchestratorURL))
|
||||
}
|
||||
|
||||
return errors.Join(errs...)
|
||||
}
|
||||
|
||||
// Start launches dnsmasq in the background. If cfg.Enabled is false
|
||||
// Start is a no-op (useful for dev on Windows where dnsmasq isn't
|
||||
// available).
|
||||
@@ -56,6 +120,9 @@ func (s *Supervisor) Start(ctx context.Context, hosts []model.Host) error {
|
||||
if runtime.GOOS == "windows" {
|
||||
return fmt.Errorf("dnsmasq supervision is not supported on Windows — run orchestrator on Linux")
|
||||
}
|
||||
if err := s.Validate(); err != nil {
|
||||
return fmt.Errorf("pxe preconditions failed: %w", err)
|
||||
}
|
||||
if err := os.MkdirAll(s.cfg.RuntimeDir, 0o755); err != nil {
|
||||
return fmt.Errorf("mkdir runtime: %w", err)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user