Files
Vetting/internal/pxe/dnsmasq.go
T
josh 2c440fce8a
CI / Lint + build + test (push) Successful in 1m38s
Release / release (push) Successful in 2m25s
pxe: move dhcp-host allowlist into a SIGHUP-reloadable file
dnsmasq's SIGHUP re-reads /etc/ethers and any --dhcp-hostsfile= paths,
but NOT dhcp-host= lines from the main conf. Reload() was faithfully
rewriting dnsmasq.conf with the new MAC, sending SIGHUP, and then
dnsmasq kept serving its startup view — so a freshly-registered host
still showed up as "proxy-ignored, tags: eth0" with no "known" tag.

Split the allowlist into ${RuntimeDir}/dhcp-hosts, referenced from the
main conf via dhcp-hostsfile=. writeConf() is static-ish now; Reload
just rewrites the hosts file and SIGHUPs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 12:41:27 -04:00

335 lines
9.8 KiB
Go

package pxe
import (
"context"
"errors"
"fmt"
"io"
"log"
"net"
"net/url"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"sync"
"text/template"
"time"
"vetting/internal/model"
)
// SupervisorConfig controls how dnsmasq is launched and configured.
// Start runs Validate against Interface, TFTPRoot, LiveDir, Subnet and
// OrchestratorURL before anything is written or launched.
type SupervisorConfig struct {
Enabled bool // when false, Validate, Start, Reload and Shutdown return nil without doing any work
Interface string // e.g. "eth0"; must exist on the host (checked with net.InterfaceByName)
Subnet string // LAN CIDR, e.g. "192.168.1.0/24"; scopes the proxy-DHCP response. Split into network + netmask for dhcp-range=
OrchestratorURL string // baked into iPXE scripts; must be an http(s) URL with a host. Also the prefix of the dhcp-boot= URL in dnsmasq.conf
RuntimeDir string // writable dir for dnsmasq.conf, the dhcp-hosts allowlist, the lease file and the pid file
TFTPRoot string // holds ipxe.efi, undionly.kpxe
LiveDir string // holds vmlinuz, initrd.img (served via HTTP, not dnsmasq; "" disables validation)
DNSMasqBin string // path to dnsmasq binary (default: "dnsmasq", applied by NewSupervisor)
}
// Supervisor owns a dnsmasq subprocess. Start writes dnsmasq.conf and
// the dhcp-hosts allowlist and launches the process; when the host
// registry changes, Reload rewrites only the dhcp-hosts file and sends
// SIGHUP so dnsmasq re-reads it. The MAC allowlist is the safety
// barrier: only registered MACs see a DHCP reply.
type Supervisor struct {
cfg SupervisorConfig // configuration captured by NewSupervisor
mu sync.Mutex // guards cmd and cancel
cmd *exec.Cmd // the launched dnsmasq process; nil until Start has launched it
cancel context.CancelFunc // cancels the context the dnsmasq process was started under
}
// NewSupervisor returns a Supervisor configured from cfg. An empty
// DNSMasqBin is replaced with "dnsmasq". No process is launched and
// nothing is written to disk until Start is called.
func NewSupervisor(cfg SupervisorConfig) *Supervisor {
	sup := &Supervisor{cfg: cfg}
	if sup.cfg.DNSMasqBin == "" {
		sup.cfg.DNSMasqBin = "dnsmasq"
	}
	return sup
}
// Validate checks the preconditions dnsmasq needs in order to serve
// PXE boots and reports every violation at once (joined with
// errors.Join), so an operator can fix them in a single pass:
//
//   - Interface is set and exists on this host
//   - TFTPRoot is set and contains ipxe.efi and undionly.kpxe
//   - LiveDir, when non-empty, contains vmlinuz and initrd.img
//   - Subnet is set and parses as a CIDR
//   - OrchestratorURL is set and is an http(s) URL with a host
//
// It returns nil when Enabled is false (tests and dev mode skip all of
// this) and nil when every check passes.
//
// The checks exist because dnsmasq starts cleanly on typo'd configs;
// without them the first symptom is a failed TFTP fetch when a real
// host PXE-boots.
func (s *Supervisor) Validate() error {
	cfg := s.cfg
	if !cfg.Enabled {
		return nil
	}

	var problems []error
	report := func(format string, args ...any) {
		problems = append(problems, fmt.Errorf(format, args...))
	}
	// requireFiles reports one problem per name that cannot be stat'ed
	// under dir; format receives the full path as its only argument.
	requireFiles := func(dir, format string, names ...string) {
		for _, name := range names {
			full := filepath.Join(dir, name)
			if _, err := os.Stat(full); err != nil {
				report(format, full)
			}
		}
	}

	// Network interface.
	switch {
	case cfg.Interface == "":
		report("pxe.interface is required")
	default:
		if _, err := net.InterfaceByName(cfg.Interface); err != nil {
			report("pxe.interface %q not found on host — check `ip link` or fix pxe.interface in vetting.yaml", cfg.Interface)
		}
	}

	// iPXE payloads served over TFTP.
	switch {
	case cfg.TFTPRoot == "":
		report("pxe.tftp_root is required")
	default:
		requireFiles(cfg.TFTPRoot,
			"missing %s — run deploy/pxe-setup.sh to fetch iPXE binaries",
			"ipxe.efi", "undionly.kpxe")
	}

	// Live image; optional, an empty LiveDir skips the check.
	if cfg.LiveDir != "" {
		requireFiles(cfg.LiveDir,
			"missing %s — build the live image (`make live-image`) and copy into pxe.live_dir, or use the release tarball",
			"vmlinuz", "initrd.img")
	}

	// Proxy-DHCP subnet.
	switch {
	case cfg.Subnet == "":
		report("pxe.subnet is required (e.g. \"192.168.1.0/24\") — the LAN CIDR dnsmasq proxy-DHCP scopes to")
	default:
		if _, _, err := net.ParseCIDR(cfg.Subnet); err != nil {
			report("pxe.subnet %q is not a valid CIDR: %v", cfg.Subnet, err)
		}
	}

	// Orchestrator URL.
	switch {
	case cfg.OrchestratorURL == "":
		report("pxe.orchestrator_url is required")
	default:
		u, err := url.Parse(cfg.OrchestratorURL)
		usable := err == nil && (u.Scheme == "http" || u.Scheme == "https") && u.Host != ""
		if !usable {
			report("pxe.orchestrator_url %q must be an http(s) URL with a host", cfg.OrchestratorURL)
		}
	}

	return errors.Join(problems...)
}
// Start launches dnsmasq in the background and returns once the
// process has been started; it does not wait for it to exit.
//
// Steps, in order: run Validate, create RuntimeDir, write the
// dhcp-hosts allowlist for hosts, write dnsmasq.conf, then exec
// dnsmasq in the foreground (--no-daemon) under a context derived from
// ctx, with its stdout and stderr forwarded to the standard logger.
// A goroutine waits on the process and logs an unexpected exit, i.e.
// one that happens while the derived context is still live.
//
// If cfg.Enabled is false Start is a no-op (useful for dev on Windows
// where dnsmasq isn't available). With Enabled set, Start refuses to
// run on Windows.
func (s *Supervisor) Start(ctx context.Context, hosts []model.Host) error {
	switch {
	case !s.cfg.Enabled:
		log.Printf("pxe: disabled in config — skipping dnsmasq")
		return nil
	case runtime.GOOS == "windows":
		return fmt.Errorf("dnsmasq supervision is not supported on Windows — run orchestrator on Linux")
	}

	if err := s.Validate(); err != nil {
		return fmt.Errorf("pxe preconditions failed: %w", err)
	}
	if err := os.MkdirAll(s.cfg.RuntimeDir, 0o755); err != nil {
		return fmt.Errorf("mkdir runtime: %w", err)
	}
	// Allowlist first: dnsmasq.conf refers to it via dhcp-hostsfile=.
	if err := s.writeHosts(hosts); err != nil {
		return err
	}
	if err := s.writeConf(); err != nil {
		return err
	}

	procCtx, stop := context.WithCancel(ctx)
	s.mu.Lock()
	s.cancel = stop
	s.mu.Unlock()

	args := []string{
		"--conf-file=" + filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf"),
		"--no-daemon",
		"--log-queries",
		"--log-dhcp",
	}
	proc := exec.CommandContext(procCtx, s.cfg.DNSMasqBin, args...)
	sink := logWriter{prefix: "dnsmasq"}
	proc.Stdout = sink
	proc.Stderr = sink

	if err := proc.Start(); err != nil {
		stop()
		return fmt.Errorf("start dnsmasq: %w", err)
	}

	s.mu.Lock()
	s.cmd = proc
	s.mu.Unlock()

	go func() {
		err := proc.Wait()
		// An exit caused by our own cancellation is expected; stay quiet.
		if err == nil || procCtx.Err() != nil {
			return
		}
		log.Printf("dnsmasq exited: %v", err)
	}()
	return nil
}
// Reload replaces the dhcp-hosts allowlist with the given host
// registry and then signals dnsmasq (via sighup) to pick it up. The
// main dnsmasq.conf is not touched — it only references the hosts
// file by path.
//
// Reload is a no-op when the supervisor is disabled. If no dnsmasq
// process has been launched yet, the file is still rewritten and nil
// is returned without signalling anything.
func (s *Supervisor) Reload(hosts []model.Host) error {
	if !s.cfg.Enabled {
		return nil
	}
	if err := s.writeHosts(hosts); err != nil {
		return err
	}

	// Snapshot the process handle; the signal is sent outside the lock.
	s.mu.Lock()
	running := s.cmd
	s.mu.Unlock()

	if running != nil && running.Process != nil {
		if err := sighup(running.Process); err != nil {
			return fmt.Errorf("sighup dnsmasq: %w", err)
		}
	}
	return nil
}
// Shutdown stops dnsmasq, waiting at most timeout for it to exit.
//
// It cancels the context the process was started under, then waits on
// the process; if the process has not been reaped when timeout
// elapses it is killed explicitly. Errors from waiting and killing are
// deliberately ignored — this is best-effort teardown — so Shutdown
// always returns nil. It is a no-op when the supervisor is disabled or
// when no process was ever launched.
func (s *Supervisor) Shutdown(timeout time.Duration) error {
	if !s.cfg.Enabled {
		return nil
	}

	s.mu.Lock()
	stop, running := s.cancel, s.cmd
	s.mu.Unlock()

	if stop != nil {
		stop()
	}
	if running == nil || running.Process == nil {
		return nil
	}

	exited := make(chan struct{})
	go func() {
		defer close(exited)
		_, _ = running.Process.Wait()
	}()

	deadline := time.NewTimer(timeout)
	defer deadline.Stop()
	select {
	case <-exited:
	case <-deadline.C:
		_ = running.Process.Kill()
	}
	return nil
}
// writeConf renders dnsmasqTemplate into ${RuntimeDir}/dnsmasq.conf.
//
// The file is replaced atomically: the content is written to
// dnsmasq.conf.new, fsynced, closed and then renamed over the final
// path, so a reader never observes a half-written config. On any
// failure the temp file is removed and the previous dnsmasq.conf (if
// any) is left untouched.
//
// Returns an error if the template does not parse, if Subnet is not a
// valid IPv4 CIDR, or if any filesystem step fails.
func (s *Supervisor) writeConf() error {
	// Do everything that can fail without touching the disk first, so a
	// bad template or subnet never leaves a stray temp file behind.
	tmpl, err := template.New("dnsmasq").Parse(dnsmasqTemplate)
	if err != nil {
		return fmt.Errorf("parse conf template: %w", err)
	}
	_, ipnet, err := net.ParseCIDR(s.cfg.Subnet)
	if err != nil {
		return fmt.Errorf("parse subnet %q: %w", s.cfg.Subnet, err)
	}
	// dnsmasq's proxy dhcp-range takes a dotted network + netmask and is
	// documented as IPv4-only; an IPv6 CIDR would render a line dnsmasq
	// rejects at startup, so fail here with a clear message instead.
	if len(ipnet.Mask) != net.IPv4len || ipnet.IP.To4() == nil {
		return fmt.Errorf("parse subnet %q: proxy-DHCP requires an IPv4 CIDR", s.cfg.Subnet)
	}
	data := struct {
		Cfg     SupervisorConfig
		Network string
		Netmask string
	}{s.cfg, ipnet.IP.String(), net.IP(ipnet.Mask).String()}

	conf := filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf")
	tmp := conf + ".new"
	f, err := os.Create(tmp)
	if err != nil {
		return fmt.Errorf("create conf: %w", err)
	}
	// discard abandons the partially written temp file.
	discard := func() {
		_ = f.Close()
		_ = os.Remove(tmp)
	}

	if err := tmpl.Execute(f, data); err != nil {
		discard()
		return fmt.Errorf("render conf: %w", err)
	}
	if err := f.Sync(); err != nil {
		discard()
		return fmt.Errorf("sync conf: %w", err)
	}
	if err := f.Close(); err != nil {
		_ = os.Remove(tmp)
		return fmt.Errorf("close conf: %w", err)
	}
	if err := os.Rename(tmp, conf); err != nil {
		_ = os.Remove(tmp)
		return fmt.Errorf("rename conf: %w", err)
	}
	return nil
}
// writeHosts renders the dhcp-hostsfile referenced by dnsmasq.conf.
// Each registered host contributes one line:
//
//	<mac>,set:known
//
// dnsmasq re-reads this file on SIGHUP — that's the whole point of
// keeping it separate from the main conf.
//
// The file is replaced atomically. Content goes to a uniquely named
// temp file in RuntimeDir, is fsynced, and is then renamed over
// ${RuntimeDir}/dhcp-hosts. The unique name matters because callers
// invoke writeHosts without holding s.mu: with a shared temp path two
// overlapping calls would truncate each other's data and the second
// rename would fail. On any failure the temp file is removed and the
// previous allowlist stays in place.
//
// An empty or nil hosts slice produces an empty file.
//
// NOTE(review): h.MAC is written verbatim; this assumes the registry
// has already validated it (no commas or newlines) — confirm at the
// point where hosts are registered.
func (s *Supervisor) writeHosts(hosts []model.Host) error {
	if err := os.MkdirAll(s.cfg.RuntimeDir, 0o755); err != nil {
		return fmt.Errorf("mkdir runtime: %w", err)
	}
	path := filepath.Join(s.cfg.RuntimeDir, "dhcp-hosts")

	var b strings.Builder
	for _, h := range hosts {
		fmt.Fprintf(&b, "%s,set:known\n", h.MAC)
	}

	f, err := os.CreateTemp(s.cfg.RuntimeDir, "dhcp-hosts.*.new")
	if err != nil {
		return fmt.Errorf("write dhcp-hosts: %w", err)
	}
	tmp := f.Name()
	// abandon discards the temp file and wraps err with the failed step.
	abandon := func(step string, err error) error {
		_ = f.Close()
		_ = os.Remove(tmp)
		return fmt.Errorf("%s dhcp-hosts: %w", step, err)
	}

	if _, err := f.WriteString(b.String()); err != nil {
		return abandon("write", err)
	}
	// CreateTemp creates the file 0600. Restore the 0644 the allowlist
	// has always been written with, so the dnsmasq process can still
	// read it on SIGHUP if it runs as a different user than we do.
	if err := f.Chmod(0o644); err != nil {
		return abandon("chmod", err)
	}
	if err := f.Sync(); err != nil {
		return abandon("sync", err)
	}
	if err := f.Close(); err != nil {
		_ = os.Remove(tmp)
		return fmt.Errorf("close dhcp-hosts: %w", err)
	}
	if err := os.Rename(tmp, path); err != nil {
		_ = os.Remove(tmp)
		return fmt.Errorf("rename dhcp-hosts: %w", err)
	}
	return nil
}
// ConfPath returns the path of the generated dnsmasq.conf inside
// RuntimeDir. It is exported so the UI handlers can show operators
// which config is live; it does not check that the file exists.
func (s *Supervisor) ConfPath() string {
	const confName = "dnsmasq.conf"
	return filepath.Join(s.cfg.RuntimeDir, confName)
}
// logWriter is an io.Writer that forwards whatever is written to it to
// the standard logger, one log entry per non-empty line, each tagged
// with "[prefix]".
type logWriter struct{ prefix string }

// Write logs every non-empty newline-separated line of p and always
// reports the whole buffer as consumed with a nil error. Each call is
// handled on its own: a line split across two Write calls is logged as
// two entries.
func (w logWriter) Write(p []byte) (int, error) {
	rest := string(p)
	for rest != "" {
		var line string
		line, rest, _ = strings.Cut(rest, "\n")
		if line != "" {
			log.Printf("[%s] %s", w.prefix, line)
		}
	}
	return len(p), nil
}

// Compile-time assertion that logWriter satisfies io.Writer.
var _ io.Writer = logWriter{}
// dnsmasqTemplate is the text/template source for the generated
// dnsmasq.conf. writeConf executes it with a value exposing .Cfg (the
// SupervisorConfig), .Network and .Netmask (the network address and
// netmask derived from Cfg.Subnet). Everything between the backquotes,
// including the '#' comment lines, is emitted into the config file, so
// edits here change what dnsmasq reads.
const dnsmasqTemplate = `# Generated by Vetting — do not hand-edit.
interface={{ .Cfg.Interface }}
bind-interfaces
port=0
domain-needed
bogus-priv
no-resolv
# Proxy DHCP: coexist with the LAN's real DHCP server. We never hand
# out an IP — we only answer the PXE options (option 66/67 and the
# PXE BINL on port 4011) when a registered MAC boots from the network.
# dnsmasq's proxy syntax takes a bare network address + netmask, not a
# CIDR — we split Subnet upstream in writeConf().
dhcp-range={{ .Network }},proxy,{{ .Netmask }}
# MAC allowlist: dnsmasq only answers DHCP for MACs tagged "known".
# The per-MAC dhcp-host= entries live in a separate file so SIGHUP
# can reload them — dnsmasq does NOT re-read dhcp-host= from the
# main conf on SIGHUP, only from dhcp-hostsfile=.
dhcp-ignore=tag:!known
dhcp-hostsfile={{ .Cfg.RuntimeDir }}/dhcp-hosts
# Keep runtime state inside RuntimeDir so the systemd sandbox
# (ReadWritePaths=/var/lib/vetting ...) doesn't block writes to the
# distro defaults (/var/lib/misc/dnsmasq.leases, /run/dnsmasq.pid).
dhcp-leasefile={{ .Cfg.RuntimeDir }}/dhcp.leases
pid-file={{ .Cfg.RuntimeDir }}/dnsmasq.pid
# TFTP for first-boot BIOS/UEFI clients; already-iPXE clients skip it.
enable-tftp
tftp-root={{ .Cfg.TFTPRoot }}
# Already-iPXE clients: chainload the per-MAC HTTP script directly.
dhcp-match=set:ipxe,175
dhcp-boot=tag:ipxe,{{ .Cfg.OrchestratorURL }}/ipxe/${mac}
# First-boot PXE ROM -> iPXE. In proxy-DHCP mode, chainloading uses
# pxe-service= (not dhcp-boot=) because the real LAN DHCP has already
# assigned the IP; we only supplement the boot menu.
pxe-service=tag:!ipxe,x86PC,"iPXE (BIOS)",undionly.kpxe
pxe-service=tag:!ipxe,X86-64_EFI,"iPXE (UEFI)",ipxe.efi
log-facility=-
`