package pxe

import (
	"context"
	"errors"
	"fmt"
	"io"
	"log"
	"net"
	"net/url"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"text/template"
	"time"

	"vetting/internal/model"
)

// SupervisorConfig controls how dnsmasq is launched and configured.
type SupervisorConfig struct {
	Enabled         bool   // false turns Validate, Start, Reload and Shutdown into no-ops
	Interface       string // network interface dnsmasq binds to, e.g. "eth0"
	Subnet          string // LAN CIDR, e.g. "192.168.1.0/24"; scopes the proxy-DHCP response
	OrchestratorURL string // http(s) base URL baked into iPXE scripts
	RuntimeDir      string // writable dir for dnsmasq.conf, the lease file and the pid file
	TFTPRoot        string // holds ipxe.efi, undionly.kpxe
	LiveDir         string // holds vmlinuz, initrd.img (served via HTTP, not dnsmasq; "" disables validation)
	DNSMasqBin      string // path to dnsmasq binary (default: "dnsmasq")
}

// Supervisor owns a dnsmasq subprocess, rewrites its config when the
// host registry changes, and sends SIGHUP to reload. The MAC allowlist
// is the safety barrier: only registered MACs see a DHCP reply.
type Supervisor struct {
	cfg SupervisorConfig

	mu     sync.Mutex         // guards cmd and cancel
	cmd    *exec.Cmd          // the launched dnsmasq process; nil until Start succeeds
	cancel context.CancelFunc // cancels the context the subprocess was started with
}

// NewSupervisor returns a Supervisor for cfg. An empty DNSMasqBin is
// replaced by "dnsmasq". Nothing is validated or launched here; see
// Validate and Start.
func NewSupervisor(cfg SupervisorConfig) *Supervisor {
	if cfg.DNSMasqBin == "" {
		cfg.DNSMasqBin = "dnsmasq"
	}
	return &Supervisor{cfg: cfg}
}

// Validate checks the preconditions required for dnsmasq to actually
// serve PXE boots: the interface must exist, the iPXE payloads must
// be on disk (and the live image files too, when LiveDir is set), and
// the subnet CIDR + orchestrator URL must parse. Every problem found
// is reported in a single joined error. Returns nil when
// Enabled=false — tests and dev mode skip all of this.
//
// Without Validate(), dnsmasq starts cleanly on typo'd configs and
// the only symptom is a silent TFTP 404 when a real host PXE-boots.
func (s *Supervisor) Validate() error { if !s.cfg.Enabled { return nil } var errs []error if s.cfg.Interface == "" { errs = append(errs, fmt.Errorf("pxe.interface is required")) } else if _, err := net.InterfaceByName(s.cfg.Interface); err != nil { errs = append(errs, fmt.Errorf("pxe.interface %q not found on host — check `ip link` or fix pxe.interface in vetting.yaml", s.cfg.Interface)) } if s.cfg.TFTPRoot == "" { errs = append(errs, fmt.Errorf("pxe.tftp_root is required")) } else { for _, name := range []string{"ipxe.efi", "undionly.kpxe"} { p := filepath.Join(s.cfg.TFTPRoot, name) if _, err := os.Stat(p); err != nil { errs = append(errs, fmt.Errorf("missing %s — run deploy/pxe-setup.sh to fetch iPXE binaries", p)) } } } if s.cfg.LiveDir != "" { for _, name := range []string{"vmlinuz", "initrd.img"} { p := filepath.Join(s.cfg.LiveDir, name) if _, err := os.Stat(p); err != nil { errs = append(errs, fmt.Errorf("missing %s — build the live image (`make live-image`) and copy into pxe.live_dir, or use the release tarball", p)) } } } if s.cfg.Subnet == "" { errs = append(errs, fmt.Errorf("pxe.subnet is required (e.g. \"192.168.1.0/24\") — the LAN CIDR dnsmasq proxy-DHCP scopes to")) } else if _, _, err := net.ParseCIDR(s.cfg.Subnet); err != nil { errs = append(errs, fmt.Errorf("pxe.subnet %q is not a valid CIDR: %v", s.cfg.Subnet, err)) } if s.cfg.OrchestratorURL == "" { errs = append(errs, fmt.Errorf("pxe.orchestrator_url is required")) } else if u, err := url.Parse(s.cfg.OrchestratorURL); err != nil || (u.Scheme != "http" && u.Scheme != "https") || u.Host == "" { errs = append(errs, fmt.Errorf("pxe.orchestrator_url %q must be an http(s) URL with a host", s.cfg.OrchestratorURL)) } return errors.Join(errs...) } // Start launches dnsmasq in the background. If cfg.Enabled is false // Start is a no-op (useful for dev on Windows where dnsmasq isn't // available). 
func (s *Supervisor) Start(ctx context.Context, hosts []model.Host) error { if !s.cfg.Enabled { log.Printf("pxe: disabled in config — skipping dnsmasq") return nil } if runtime.GOOS == "windows" { return fmt.Errorf("dnsmasq supervision is not supported on Windows — run orchestrator on Linux") } if err := s.Validate(); err != nil { return fmt.Errorf("pxe preconditions failed: %w", err) } if err := os.MkdirAll(s.cfg.RuntimeDir, 0o755); err != nil { return fmt.Errorf("mkdir runtime: %w", err) } if err := s.writeConf(hosts); err != nil { return err } subCtx, cancel := context.WithCancel(ctx) s.mu.Lock() s.cancel = cancel s.mu.Unlock() confPath := filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf") cmd := exec.CommandContext(subCtx, s.cfg.DNSMasqBin, "--conf-file="+confPath, "--no-daemon", "--log-queries", "--log-dhcp", ) cmd.Stdout = logWriter{prefix: "dnsmasq"} cmd.Stderr = logWriter{prefix: "dnsmasq"} if err := cmd.Start(); err != nil { cancel() return fmt.Errorf("start dnsmasq: %w", err) } s.mu.Lock() s.cmd = cmd s.mu.Unlock() go func() { if err := cmd.Wait(); err != nil && subCtx.Err() == nil { log.Printf("dnsmasq exited: %v", err) } }() return nil } // Reload rewrites the conf with the latest host registry and sends // SIGHUP. It will restart the subprocess if SIGHUP is unsupported // (e.g. when running behind an OS that doesn't support it). func (s *Supervisor) Reload(hosts []model.Host) error { if !s.cfg.Enabled { return nil } if err := s.writeConf(hosts); err != nil { return err } s.mu.Lock() cmd := s.cmd s.mu.Unlock() if cmd == nil || cmd.Process == nil { return nil } if err := sighup(cmd.Process); err != nil { return fmt.Errorf("sighup dnsmasq: %w", err) } return nil } // Shutdown stops dnsmasq within the timeout. 
func (s *Supervisor) Shutdown(timeout time.Duration) error { if !s.cfg.Enabled { return nil } s.mu.Lock() cancel := s.cancel cmd := s.cmd s.mu.Unlock() if cancel != nil { cancel() } if cmd != nil && cmd.Process != nil { done := make(chan struct{}) go func() { _, _ = cmd.Process.Wait() close(done) }() select { case <-done: case <-time.After(timeout): _ = cmd.Process.Kill() } } return nil } func (s *Supervisor) writeConf(hosts []model.Host) error { tmpl, err := template.New("dnsmasq").Parse(dnsmasqTemplate) if err != nil { return err } conf := filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf") tmp := conf + ".new" f, err := os.Create(tmp) if err != nil { return fmt.Errorf("create conf: %w", err) } data := struct { Cfg SupervisorConfig Hosts []model.Host }{s.cfg, hosts} if err := tmpl.Execute(f, data); err != nil { _ = f.Close() return fmt.Errorf("render conf: %w", err) } if err := f.Sync(); err != nil { _ = f.Close() return err } if err := f.Close(); err != nil { return err } if err := os.Rename(tmp, conf); err != nil { return fmt.Errorf("rename conf: %w", err) } return nil } // Exposed for the UI handlers to show operators what config is live. func (s *Supervisor) ConfPath() string { return filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf") } type logWriter struct{ prefix string } func (w logWriter) Write(p []byte) (int, error) { for _, line := range strings.Split(strings.TrimRight(string(p), "\n"), "\n") { if line == "" { continue } log.Printf("[%s] %s", w.prefix, line) } return len(p), nil } // Allow package consumers to swap io.Writer for logs in tests. var _ io.Writer = logWriter{} const dnsmasqTemplate = `# Generated by Vetting — do not hand-edit. interface={{ .Cfg.Interface }} bind-interfaces port=0 domain-needed bogus-priv no-resolv # Proxy DHCP: coexist with the LAN's real DHCP server. We never hand # out an IP — we only answer the PXE options (option 66/67 and the # PXE BINL on port 4011) when a registered MAC boots from the network. 
dhcp-range={{ .Cfg.Subnet }},proxy # MAC allowlist: dnsmasq only answers DHCP for MACs with a dhcp-host= below. dhcp-ignore=tag:!known {{- range .Hosts }} dhcp-host={{ .MAC }},set:known {{- end }} # Keep runtime state inside RuntimeDir so the systemd sandbox # (ReadWritePaths=/var/lib/vetting ...) doesn't block writes to the # distro defaults (/var/lib/misc/dnsmasq.leases, /run/dnsmasq.pid). dhcp-leasefile={{ .Cfg.RuntimeDir }}/dhcp.leases pid-file={{ .Cfg.RuntimeDir }}/dnsmasq.pid # TFTP for first-boot BIOS/UEFI clients; already-iPXE clients skip it. enable-tftp tftp-root={{ .Cfg.TFTPRoot }} # Already-iPXE clients: chainload the per-MAC HTTP script directly. dhcp-match=set:ipxe,175 dhcp-boot=tag:ipxe,{{ .Cfg.OrchestratorURL }}/ipxe/${mac} # First-boot PXE ROM -> iPXE. In proxy-DHCP mode, chainloading uses # pxe-service= (not dhcp-boot=) because the real LAN DHCP has already # assigned the IP; we only supplement the boot menu. pxe-service=tag:!ipxe,x86PC,"iPXE (BIOS)",undionly.kpxe pxe-service=tag:!ipxe,X86-64_EFI,"iPXE (UEFI)",ipxe.efi log-facility=- `