package pxe

import (
	"context"
	"errors"
	"fmt"
	"io"
	"log"
	"net"
	"net/url"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"runtime"
	"strings"
	"sync"
	"text/template"
	"time"

	"vetting/internal/model"
)

// SupervisorConfig holds the settings used to launch and configure dnsmasq.
type SupervisorConfig struct {
	// Enabled gates everything: when false, the Supervisor methods are no-ops.
	Enabled bool
	// Interface is the network interface dnsmasq binds to, e.g. "eth0".
	Interface string
	// DHCPRange is the dnsmasq dhcp-range value, e.g. "10.77.0.100,10.77.0.200,12h".
	DHCPRange string
	// OrchestratorURL is baked into iPXE scripts.
	OrchestratorURL string
	// RuntimeDir is a writable directory for dnsmasq.conf and leases.
	RuntimeDir string
	// TFTPRoot holds ipxe.efi and undionly.kpxe.
	TFTPRoot string
	// LiveDir holds vmlinuz and initrd.img (served via HTTP, not dnsmasq).
	// An empty value disables validation of those files.
	LiveDir string
	// DNSMasqBin is the path to the dnsmasq binary (default: "dnsmasq").
	DNSMasqBin string
}

// Supervisor owns a dnsmasq subprocess: it rewrites the dnsmasq config
// when the host registry changes and sends SIGHUP to reload. The MAC
// allowlist is the safety barrier — only registered MACs see a DHCP reply.
type Supervisor struct {
	cfg SupervisorConfig

	// mu guards cmd and cancel.
	mu     sync.Mutex
	cmd    *exec.Cmd
	cancel context.CancelFunc
}

// NewSupervisor returns a Supervisor for cfg. An empty DNSMasqBin is
// replaced by "dnsmasq". Nothing is started until Start is called.
func NewSupervisor(cfg SupervisorConfig) *Supervisor {
	sup := &Supervisor{cfg: cfg}
	if sup.cfg.DNSMasqBin == "" {
		sup.cfg.DNSMasqBin = "dnsmasq"
	}
	return sup
}

// dhcpRangeRE matches the three-field "start_ip,end_ip,lease" form that
// dnsmasq expects. It checks shape only: each octet is one to three
// digits with no 0-255 range check, and the lease ("12h", "infinite", …)
// is any non-empty token without whitespace.
var dhcpRangeRE = regexp.MustCompile(`^(\d{1,3}\.){3}\d{1,3},(\d{1,3}\.){3}\d{1,3},\S+$`)

// Validate checks the preconditions dnsmasq needs to actually serve PXE
// boots: the interface must exist, the iPXE payloads must be on disk,
// and the DHCP range and orchestrator URL must parse. It returns nil
// when Enabled is false, so tests and dev mode skip all of this.
//
// Without Validate, dnsmasq starts cleanly on typo'd configs and the
// only symptom is a silent TFTP 404 when a real host PXE-boots.
func (s *Supervisor) Validate() error { if !s.cfg.Enabled { return nil } var errs []error if s.cfg.Interface == "" { errs = append(errs, fmt.Errorf("pxe.interface is required")) } else if _, err := net.InterfaceByName(s.cfg.Interface); err != nil { errs = append(errs, fmt.Errorf("pxe.interface %q not found on host — check `ip link` or fix pxe.interface in vetting.yaml", s.cfg.Interface)) } if s.cfg.TFTPRoot == "" { errs = append(errs, fmt.Errorf("pxe.tftp_root is required")) } else { for _, name := range []string{"ipxe.efi", "undionly.kpxe"} { p := filepath.Join(s.cfg.TFTPRoot, name) if _, err := os.Stat(p); err != nil { errs = append(errs, fmt.Errorf("missing %s — run deploy/pxe-setup.sh to fetch iPXE binaries", p)) } } } if s.cfg.LiveDir != "" { for _, name := range []string{"vmlinuz", "initrd.img"} { p := filepath.Join(s.cfg.LiveDir, name) if _, err := os.Stat(p); err != nil { errs = append(errs, fmt.Errorf("missing %s — build the live image (`make live-image`) and copy into pxe.live_dir, or use the release tarball", p)) } } } if s.cfg.DHCPRange == "" { errs = append(errs, fmt.Errorf("pxe.dhcp_range is required (e.g. \"10.77.0.100,10.77.0.200,12h\")")) } else if !dhcpRangeRE.MatchString(s.cfg.DHCPRange) { errs = append(errs, fmt.Errorf("pxe.dhcp_range %q must be \"start_ip,end_ip,lease\"", s.cfg.DHCPRange)) } if s.cfg.OrchestratorURL == "" { errs = append(errs, fmt.Errorf("pxe.orchestrator_url is required")) } else if u, err := url.Parse(s.cfg.OrchestratorURL); err != nil || (u.Scheme != "http" && u.Scheme != "https") || u.Host == "" { errs = append(errs, fmt.Errorf("pxe.orchestrator_url %q must be an http(s) URL with a host", s.cfg.OrchestratorURL)) } return errors.Join(errs...) } // Start launches dnsmasq in the background. If cfg.Enabled is false // Start is a no-op (useful for dev on Windows where dnsmasq isn't // available). 
func (s *Supervisor) Start(ctx context.Context, hosts []model.Host) error { if !s.cfg.Enabled { log.Printf("pxe: disabled in config — skipping dnsmasq") return nil } if runtime.GOOS == "windows" { return fmt.Errorf("dnsmasq supervision is not supported on Windows — run orchestrator on Linux") } if err := s.Validate(); err != nil { return fmt.Errorf("pxe preconditions failed: %w", err) } if err := os.MkdirAll(s.cfg.RuntimeDir, 0o755); err != nil { return fmt.Errorf("mkdir runtime: %w", err) } if err := s.writeConf(hosts); err != nil { return err } subCtx, cancel := context.WithCancel(ctx) s.mu.Lock() s.cancel = cancel s.mu.Unlock() confPath := filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf") cmd := exec.CommandContext(subCtx, s.cfg.DNSMasqBin, "--conf-file="+confPath, "--no-daemon", "--log-queries", "--log-dhcp", ) cmd.Stdout = logWriter{prefix: "dnsmasq"} cmd.Stderr = logWriter{prefix: "dnsmasq"} if err := cmd.Start(); err != nil { cancel() return fmt.Errorf("start dnsmasq: %w", err) } s.mu.Lock() s.cmd = cmd s.mu.Unlock() go func() { if err := cmd.Wait(); err != nil && subCtx.Err() == nil { log.Printf("dnsmasq exited: %v", err) } }() return nil } // Reload rewrites the conf with the latest host registry and sends // SIGHUP. It will restart the subprocess if SIGHUP is unsupported // (e.g. when running behind an OS that doesn't support it). func (s *Supervisor) Reload(hosts []model.Host) error { if !s.cfg.Enabled { return nil } if err := s.writeConf(hosts); err != nil { return err } s.mu.Lock() cmd := s.cmd s.mu.Unlock() if cmd == nil || cmd.Process == nil { return nil } if err := sighup(cmd.Process); err != nil { return fmt.Errorf("sighup dnsmasq: %w", err) } return nil } // Shutdown stops dnsmasq within the timeout. 
func (s *Supervisor) Shutdown(timeout time.Duration) error { if !s.cfg.Enabled { return nil } s.mu.Lock() cancel := s.cancel cmd := s.cmd s.mu.Unlock() if cancel != nil { cancel() } if cmd != nil && cmd.Process != nil { done := make(chan struct{}) go func() { _, _ = cmd.Process.Wait() close(done) }() select { case <-done: case <-time.After(timeout): _ = cmd.Process.Kill() } } return nil } func (s *Supervisor) writeConf(hosts []model.Host) error { tmpl, err := template.New("dnsmasq").Parse(dnsmasqTemplate) if err != nil { return err } conf := filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf") tmp := conf + ".new" f, err := os.Create(tmp) if err != nil { return fmt.Errorf("create conf: %w", err) } data := struct { Cfg SupervisorConfig Hosts []model.Host }{s.cfg, hosts} if err := tmpl.Execute(f, data); err != nil { _ = f.Close() return fmt.Errorf("render conf: %w", err) } if err := f.Sync(); err != nil { _ = f.Close() return err } if err := f.Close(); err != nil { return err } if err := os.Rename(tmp, conf); err != nil { return fmt.Errorf("rename conf: %w", err) } return nil } // Exposed for the UI handlers to show operators what config is live. func (s *Supervisor) ConfPath() string { return filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf") } type logWriter struct{ prefix string } func (w logWriter) Write(p []byte) (int, error) { for _, line := range strings.Split(strings.TrimRight(string(p), "\n"), "\n") { if line == "" { continue } log.Printf("[%s] %s", w.prefix, line) } return len(p), nil } // Allow package consumers to swap io.Writer for logs in tests. var _ io.Writer = logWriter{} const dnsmasqTemplate = `# Generated by Vetting — do not hand-edit. interface={{ .Cfg.Interface }} bind-interfaces port=0 domain-needed bogus-priv no-resolv # MAC allowlist: dnsmasq only answers DHCP for MACs with a dhcp-host= below. 
dhcp-ignore=tag:!known {{- range .Hosts }} dhcp-host={{ .MAC }},set:known {{- end }} # DHCP range (broader subnet coverage is fine; allowlist above gates replies). dhcp-range={{ .Cfg.DHCPRange }} # TFTP + HTTP boot (iPXE chainload). enable-tftp tftp-root={{ .Cfg.TFTPRoot }} # BIOS (undionly.kpxe) and UEFI (ipxe.efi) clients both get iPXE first, # which then re-requests a per-MAC script from the orchestrator. dhcp-match=set:bios,option:client-arch,0 dhcp-match=set:efi64,option:client-arch,7 dhcp-match=set:efi64,option:client-arch,9 # If the client is iPXE itself, send it the per-MAC HTTP script. dhcp-match=set:ipxe,175 dhcp-boot=tag:ipxe,{{ .Cfg.OrchestratorURL }}/ipxe/${mac} # Otherwise (first boot from ROM) chainload iPXE from TFTP. dhcp-boot=tag:!ipxe,tag:bios,undionly.kpxe dhcp-boot=tag:!ipxe,tag:efi64,ipxe.efi log-facility=- `