Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
@@ -0,0 +1,231 @@
|
||||
package pxe
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
"sync"
|
||||
"text/template"
|
||||
"time"
|
||||
|
||||
"vetting/internal/model"
|
||||
)
|
||||
|
||||
// SupervisorConfig holds the settings used to render dnsmasq.conf and to
// launch the dnsmasq subprocess.
type SupervisorConfig struct {
	// Enabled gates the supervisor; Start, Reload and Shutdown do nothing
	// when it is false.
	Enabled bool
	// Interface is the network interface dnsmasq serves on, e.g. "eth0".
	Interface string
	// DHCPRange is written verbatim after "dhcp-range=",
	// e.g. "10.77.0.100,10.77.0.200,12h".
	DHCPRange string
	// OrchestratorURL is the base URL baked into the iPXE boot filename.
	OrchestratorURL string
	// RuntimeDir is a writable directory for dnsmasq.conf and leases.
	RuntimeDir string
	// TFTPRoot is the TFTP root; it holds ipxe.efi and undionly.kpxe.
	TFTPRoot string
	// DNSMasqBin is the path to the dnsmasq binary (default: "dnsmasq").
	DNSMasqBin string
}
|
||||
|
||||
// Supervisor owns a dnsmasq subprocess, rewrites its config when the
|
||||
// host registry changes, and sends SIGHUP to reload. The MAC allowlist
|
||||
// is the safety barrier: only registered MACs see a DHCP reply.
|
||||
type Supervisor struct {
|
||||
cfg SupervisorConfig
|
||||
mu sync.Mutex
|
||||
cmd *exec.Cmd
|
||||
cancel context.CancelFunc
|
||||
}
|
||||
|
||||
func NewSupervisor(cfg SupervisorConfig) *Supervisor {
|
||||
if cfg.DNSMasqBin == "" {
|
||||
cfg.DNSMasqBin = "dnsmasq"
|
||||
}
|
||||
return &Supervisor{cfg: cfg}
|
||||
}
|
||||
|
||||
// Start launches dnsmasq in the background. If cfg.Enabled is false
|
||||
// Start is a no-op (useful for dev on Windows where dnsmasq isn't
|
||||
// available).
|
||||
func (s *Supervisor) Start(ctx context.Context, hosts []model.Host) error {
|
||||
if !s.cfg.Enabled {
|
||||
log.Printf("pxe: disabled in config — skipping dnsmasq")
|
||||
return nil
|
||||
}
|
||||
if runtime.GOOS == "windows" {
|
||||
return fmt.Errorf("dnsmasq supervision is not supported on Windows — run orchestrator on Linux")
|
||||
}
|
||||
if err := os.MkdirAll(s.cfg.RuntimeDir, 0o755); err != nil {
|
||||
return fmt.Errorf("mkdir runtime: %w", err)
|
||||
}
|
||||
if err := s.writeConf(hosts); err != nil {
|
||||
return err
|
||||
}
|
||||
subCtx, cancel := context.WithCancel(ctx)
|
||||
s.mu.Lock()
|
||||
s.cancel = cancel
|
||||
s.mu.Unlock()
|
||||
|
||||
confPath := filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf")
|
||||
cmd := exec.CommandContext(subCtx, s.cfg.DNSMasqBin,
|
||||
"--conf-file="+confPath,
|
||||
"--no-daemon",
|
||||
"--log-queries",
|
||||
"--log-dhcp",
|
||||
)
|
||||
cmd.Stdout = logWriter{prefix: "dnsmasq"}
|
||||
cmd.Stderr = logWriter{prefix: "dnsmasq"}
|
||||
if err := cmd.Start(); err != nil {
|
||||
cancel()
|
||||
return fmt.Errorf("start dnsmasq: %w", err)
|
||||
}
|
||||
s.mu.Lock()
|
||||
s.cmd = cmd
|
||||
s.mu.Unlock()
|
||||
go func() {
|
||||
if err := cmd.Wait(); err != nil && subCtx.Err() == nil {
|
||||
log.Printf("dnsmasq exited: %v", err)
|
||||
}
|
||||
}()
|
||||
return nil
|
||||
}
|
||||
|
||||
// Reload rewrites the conf with the latest host registry and sends
|
||||
// SIGHUP. It will restart the subprocess if SIGHUP is unsupported
|
||||
// (e.g. when running behind an OS that doesn't support it).
|
||||
func (s *Supervisor) Reload(hosts []model.Host) error {
|
||||
if !s.cfg.Enabled {
|
||||
return nil
|
||||
}
|
||||
if err := s.writeConf(hosts); err != nil {
|
||||
return err
|
||||
}
|
||||
s.mu.Lock()
|
||||
cmd := s.cmd
|
||||
s.mu.Unlock()
|
||||
if cmd == nil || cmd.Process == nil {
|
||||
return nil
|
||||
}
|
||||
if err := sighup(cmd.Process); err != nil {
|
||||
return fmt.Errorf("sighup dnsmasq: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Shutdown stops dnsmasq within the timeout.
|
||||
func (s *Supervisor) Shutdown(timeout time.Duration) error {
|
||||
if !s.cfg.Enabled {
|
||||
return nil
|
||||
}
|
||||
s.mu.Lock()
|
||||
cancel := s.cancel
|
||||
cmd := s.cmd
|
||||
s.mu.Unlock()
|
||||
if cancel != nil {
|
||||
cancel()
|
||||
}
|
||||
if cmd != nil && cmd.Process != nil {
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
_, _ = cmd.Process.Wait()
|
||||
close(done)
|
||||
}()
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(timeout):
|
||||
_ = cmd.Process.Kill()
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Supervisor) writeConf(hosts []model.Host) error {
|
||||
tmpl, err := template.New("dnsmasq").Parse(dnsmasqTemplate)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
conf := filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf")
|
||||
tmp := conf + ".new"
|
||||
f, err := os.Create(tmp)
|
||||
if err != nil {
|
||||
return fmt.Errorf("create conf: %w", err)
|
||||
}
|
||||
data := struct {
|
||||
Cfg SupervisorConfig
|
||||
Hosts []model.Host
|
||||
}{s.cfg, hosts}
|
||||
if err := tmpl.Execute(f, data); err != nil {
|
||||
_ = f.Close()
|
||||
return fmt.Errorf("render conf: %w", err)
|
||||
}
|
||||
if err := f.Sync(); err != nil {
|
||||
_ = f.Close()
|
||||
return err
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.Rename(tmp, conf); err != nil {
|
||||
return fmt.Errorf("rename conf: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Exposed for the UI handlers to show operators what config is live.
|
||||
func (s *Supervisor) ConfPath() string {
|
||||
return filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf")
|
||||
}
|
||||
|
||||
// logWriter is an io.Writer that forwards what is written to it to the
// standard logger, one log entry per non-empty line, tagged with prefix.
type logWriter struct{ prefix string }

// Write splits p on newlines and logs each non-empty line as
// "[prefix] line". It always reports len(p) bytes written and a nil error.
func (w logWriter) Write(p []byte) (int, error) {
	text := strings.TrimRight(string(p), "\n")
	for _, entry := range strings.Split(text, "\n") {
		if entry != "" {
			log.Printf("[%s] %s", w.prefix, entry)
		}
	}
	return len(p), nil
}

// Compile-time check that logWriter satisfies io.Writer.
var _ io.Writer = logWriter{}
|
||||
|
||||
// dnsmasqTemplate is the text/template source for dnsmasq.conf. It is
// executed with a value exposing .Cfg (a SupervisorConfig) and .Hosts (the
// registered hosts; each contributes one dhcp-host= line from its .MAC).
const dnsmasqTemplate = `# Generated by Vetting — do not hand-edit.
interface={{ .Cfg.Interface }}
bind-interfaces
port=0
domain-needed
bogus-priv
no-resolv

# MAC allowlist: dnsmasq only answers DHCP for MACs with a dhcp-host= below.
dhcp-ignore=tag:!known
{{- range .Hosts }}
dhcp-host={{ .MAC }},set:known
{{- end }}

# DHCP range (broader subnet coverage is fine; allowlist above gates replies).
dhcp-range={{ .Cfg.DHCPRange }}

# TFTP + HTTP boot (iPXE chainload).
enable-tftp
tftp-root={{ .Cfg.TFTPRoot }}

# BIOS (undionly.kpxe) and UEFI (ipxe.efi) clients both get iPXE first,
# which then re-requests a per-MAC script from the orchestrator.
dhcp-match=set:bios,option:client-arch,0
dhcp-match=set:efi64,option:client-arch,7
dhcp-match=set:efi64,option:client-arch,9

# If the client is iPXE itself, send it the per-MAC HTTP script.
dhcp-match=set:ipxe,175
dhcp-boot=tag:ipxe,{{ .Cfg.OrchestratorURL }}/ipxe/${mac}

# Otherwise (first boot from ROM) chainload iPXE from TFTP.
dhcp-boot=tag:!ipxe,tag:bios,undionly.kpxe
dhcp-boot=tag:!ipxe,tag:efi64,ipxe.efi

log-facility=-
`
|
||||
@@ -0,0 +1,88 @@
|
||||
package pxe
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"vetting/internal/model"
|
||||
)
|
||||
|
||||
// IPXEParams carries every value an iPXE boot script needs.
//
// For Phase 2 the boot target is always "linux": a Memtest chain-load is
// not required because Memtest86+ was replaced with stress-ng under Linux
// (see plan §3.2).
type IPXEParams struct {
	// OrchestratorURL is the orchestrator base URL, e.g. http://10.0.0.5:8080.
	OrchestratorURL string
	// LiveKernelURL is the kernel URL, e.g. http://10.0.0.5:8080/live/vmlinuz.
	LiveKernelURL string
	// LiveInitrdURL is the initrd URL, e.g. http://10.0.0.5:8080/live/initrd.img.
	LiveInitrdURL string
	// TLSCertFPR is optional; empty means the pin is skipped.
	TLSCertFPR string
	// RunID identifies the run and is passed on the kernel command line.
	RunID int64
	// MAC is the MAC address of the host being booted.
	MAC string
	// Token is the plaintext token; it is hashed on the server side.
	Token string
}
|
||||
|
||||
// BuildScript returns an iPXE script tailored for this run.
|
||||
// iPXE scripts are plain text beginning with "#!ipxe".
|
||||
func BuildScript(p IPXEParams) string {
|
||||
cmdline := []string{
|
||||
"initrd=initrd.img",
|
||||
fmt.Sprintf("vetting.orchestrator=%s", p.OrchestratorURL),
|
||||
fmt.Sprintf("vetting.run_id=%d", p.RunID),
|
||||
fmt.Sprintf("vetting.mac=%s", p.MAC),
|
||||
fmt.Sprintf("vetting.token=%s", p.Token),
|
||||
}
|
||||
if p.TLSCertFPR != "" {
|
||||
cmdline = append(cmdline, fmt.Sprintf("vetting.cert_fpr=%s", p.TLSCertFPR))
|
||||
}
|
||||
// Reduce kernel log noise during the test run; keep loglevel high enough
|
||||
// for boot failures to still show up on the console.
|
||||
cmdline = append(cmdline,
|
||||
"console=tty0",
|
||||
"console=ttyS0,115200n8",
|
||||
"ip=dhcp",
|
||||
"quiet",
|
||||
)
|
||||
|
||||
var b strings.Builder
|
||||
fmt.Fprintln(&b, "#!ipxe")
|
||||
fmt.Fprintf(&b, "echo Vetting run %d — booting live image for %s\n", p.RunID, p.MAC)
|
||||
fmt.Fprintf(&b, "kernel %s %s\n", p.LiveKernelURL, strings.Join(cmdline, " "))
|
||||
fmt.Fprintf(&b, "initrd %s\n", p.LiveInitrdURL)
|
||||
fmt.Fprintln(&b, "boot")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// NotRegisteredScript returns the iPXE script served for unknown MACs: it
// prints a message naming mac and drops to the iPXE shell. The dnsmasq MAC
// allowlist should keep this from ever being reached; it exists as
// belt-and-braces.
func NotRegisteredScript(mac string) string {
	const script = "#!ipxe\n" +
		"echo MAC %s not registered for vetting — halting.\n" +
		"shell\n"
	return fmt.Sprintf(script, mac)
}
|
||||
|
||||
// NoActiveRunScript returns the iPXE script served when a registered MAC
// PXE-boots without a currently active run. The script announces the
// situation, sleeps 10 seconds, and powers the host off so it does not loop
// forever.
func NoActiveRunScript(mac string) string {
	const script = "#!ipxe\n" +
		"echo MAC %s has no active run — powering off in 10s.\n" +
		"sleep 10\n" +
		"poweroff\n"
	return fmt.Sprintf(script, mac)
}
|
||||
|
||||
// BuildLiveURLs derives the live-image kernel and initrd URLs from base.
// Trailing slashes on base are dropped before "/live/vmlinuz" and
// "/live/initrd.img" are appended. Used by handlers; exposed for tests.
func BuildLiveURLs(base string) (kernel, initrd string) {
	liveDir := strings.TrimRight(base, "/") + "/live/"
	kernel = liveDir + "vmlinuz"
	initrd = liveDir + "initrd.img"
	return kernel, initrd
}
|
||||
|
||||
// WriteNotFound is a small convenience so handlers can return a shell
|
||||
// script error directly to iPXE without cluttering handlers with a
|
||||
// mime-type dance.
|
||||
func WriteNotFound(w io.Writer, mac string) {
|
||||
_, _ = w.Write([]byte(NotRegisteredScript(mac)))
|
||||
}
|
||||
|
||||
// ScriptMarker is the "#!ipxe" header that iPXE uses to recognize a
// response as a script.
const ScriptMarker = "#!ipxe"
|
||||
|
||||
// State returns the compact single-word status used for logging.
|
||||
// Takes a Run's state because iPXE handler already looked it up.
|
||||
func State(run model.Run) string { return string(run.State) }
|
||||
@@ -0,0 +1,61 @@
|
||||
package pxe
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestBuildScriptIncludesAllCmdlineParams(t *testing.T) {
|
||||
s := BuildScript(IPXEParams{
|
||||
OrchestratorURL: "http://10.0.0.5:8080",
|
||||
LiveKernelURL: "http://10.0.0.5:8080/live/vmlinuz",
|
||||
LiveInitrdURL: "http://10.0.0.5:8080/live/initrd.img",
|
||||
RunID: 42,
|
||||
MAC: "aa:bb:cc:dd:ee:ff",
|
||||
Token: "deadbeefcafe",
|
||||
})
|
||||
if !strings.HasPrefix(s, "#!ipxe") {
|
||||
t.Fatalf("expected #!ipxe header, got %q", s[:10])
|
||||
}
|
||||
for _, want := range []string{
|
||||
"vetting.orchestrator=http://10.0.0.5:8080",
|
||||
"vetting.run_id=42",
|
||||
"vetting.mac=aa:bb:cc:dd:ee:ff",
|
||||
"vetting.token=deadbeefcafe",
|
||||
"kernel http://10.0.0.5:8080/live/vmlinuz",
|
||||
"initrd http://10.0.0.5:8080/live/initrd.img",
|
||||
"ip=dhcp",
|
||||
"boot",
|
||||
} {
|
||||
if !strings.Contains(s, want) {
|
||||
t.Errorf("script missing %q\n%s", want, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildScriptOmitsCertFPRWhenEmpty(t *testing.T) {
|
||||
s := BuildScript(IPXEParams{
|
||||
OrchestratorURL: "http://x", LiveKernelURL: "http://x/k", LiveInitrdURL: "http://x/i",
|
||||
RunID: 1, MAC: "aa:bb:cc:dd:ee:ff", Token: "t",
|
||||
})
|
||||
if strings.Contains(s, "vetting.cert_fpr") {
|
||||
t.Fatalf("cert_fpr should be absent when empty:\n%s", s)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNotRegisteredScriptMentionsMAC(t *testing.T) {
|
||||
s := NotRegisteredScript("aa:bb:cc:dd:ee:ff")
|
||||
if !strings.Contains(s, "aa:bb:cc:dd:ee:ff") {
|
||||
t.Fatalf("not-registered script should echo the MAC: %s", s)
|
||||
}
|
||||
if !strings.HasPrefix(s, "#!ipxe") {
|
||||
t.Fatalf("missing #!ipxe header: %s", s)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildLiveURLs(t *testing.T) {
|
||||
k, i := BuildLiveURLs("http://h:8080/")
|
||||
if k != "http://h:8080/live/vmlinuz" || i != "http://h:8080/live/initrd.img" {
|
||||
t.Fatalf("BuildLiveURLs: %s, %s", k, i)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
//go:build !windows
|
||||
|
||||
package pxe
|
||||
|
||||
import (
|
||||
"os"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// sighup sends SIGHUP to p and returns the error from delivering the
// signal, if any. This is the variant built on non-Windows platforms.
func sighup(p *os.Process) error {
	var hup os.Signal = syscall.SIGHUP
	return p.Signal(hup)
}
|
||||
@@ -0,0 +1,12 @@
|
||||
//go:build windows
|
||||
|
||||
package pxe
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
)
|
||||
|
||||
// sighup is the Windows stand-in for the signal helper: it ignores the
// process and always returns an error, because SIGHUP cannot be delivered
// there.
func sighup(_ *os.Process) error {
	unsupported := fmt.Errorf("SIGHUP not supported on Windows")
	return unsupported
}
|
||||
Reference in New Issue
Block a user