Files
josh bcbbc35489
CI / Lint + build + test (push) Successful in 1m37s
Release / release (push) Has been cancelled
docs+e2e: document proxy-DHCP topology; default e2e bridge to LAN
Rewrites the PXE section of the ops runbook around the new proxy-DHCP
model (no dedicated bridge, coexists with UniFi/pfSense/etc.) and
swaps the e2e test's default bridge + orchestrator URL to match. The
e2e file now calls out the LAN-DHCP precondition in its header so
future-me (or CI) doesn't hang at PXE wondering why nothing answers.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 12:07:05 -04:00

233 lines
7.6 KiB
Go

//go:build e2e
// Package e2e exercises the orchestrator end-to-end against a real QEMU
// VM PXE-booting from the orchestrator-supervised dnsmasq into the
// mkosi-built live image.
//
// This test is gated behind the `e2e` build tag because:
// - it requires root (for bridge + qemu-system-x86_64 network setup),
// - it needs a pre-built live image at live-image/out/{vmlinuz,initrd.img},
// - it only runs on Linux (mkosi + qemu-kvm).
//
// Run with:
//
// sudo go test -tags=e2e -run TestQEMUFullRun ./test/e2e/...
//
// Network precondition: dnsmasq runs in proxy-DHCP mode on the LAN.
// The QEMU VM attaches to the LAN bridge (default `vmbr0`) and gets
// its IP from the LAN's real DHCP server (e.g. UniFi) while the
// orchestrator's dnsmasq layers on the PXE options. There must be a
// reachable DHCP server on that bridge — tests will hang at PXE
// otherwise. Override the bridge with VETTING_E2E_BRIDGE.
//
// See docs/operations.md for the manual QEMU invocation equivalent.
package e2e
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"testing"
"time"
)
// Tunables — overridable via env for CI, defaults match the manual
// setup documented in docs/operations.md.
var (
bridgeName = envOr("VETTING_E2E_BRIDGE", "vmbr0")
liveKernel = envOr("VETTING_E2E_KERNEL", "live-image/out/vmlinuz")
liveInitrd = envOr("VETTING_E2E_INITRD", "live-image/out/initrd.img")
testMAC = envOr("VETTING_E2E_MAC", "52:54:00:12:34:56")
publicURL = envOr("VETTING_E2E_URL", "http://127.0.0.1:8080")
// Overall budget for the run to reach Completed. Stage timeouts in
// the config should be tuned down for E2E to well under this.
runBudget = 10 * time.Minute
)
func envOr(k, d string) string {
if v := os.Getenv(k); v != "" {
return v
}
return d
}
// TestQEMUFullRun boots a QEMU VM against a running orchestrator and
// waits for the Run state to reach Completed.
//
// Preconditions (test skips unless all are true):
// - Linux host
// - Running as root (bridge networking + qemu-kvm)
// - `qemu-system-x86_64` on PATH
// - Live image built (kernel + initrd exist)
// - An orchestrator is already running at $VETTING_E2E_URL with a
// host registered for $VETTING_E2E_MAC and a run already queued
// (start the run via the UI before invoking this test, or via the
// orchestrator's /hosts/{id}/start endpoint).
//
// The test exercises the real PXE path. It does NOT embed its own
// orchestrator because dnsmasq needs CAP_NET_ADMIN and the test binary
// should stay focused on the "did the run complete?" assertion.
func TestQEMUFullRun(t *testing.T) {
if runtime.GOOS != "linux" {
t.Skip("E2E test requires Linux")
}
if os.Geteuid() != 0 {
t.Skip("E2E test requires root (sudo go test -tags=e2e ...)")
}
if _, err := exec.LookPath("qemu-system-x86_64"); err != nil {
t.Skip("qemu-system-x86_64 not on PATH")
}
if _, err := os.Stat(liveKernel); err != nil {
t.Skipf("live kernel missing at %s (run `make live-image`)", liveKernel)
}
if _, err := os.Stat(liveInitrd); err != nil {
t.Skipf("live initrd missing at %s", liveInitrd)
}
if err := pingOrchestrator(publicURL); err != nil {
t.Skipf("orchestrator not reachable at %s: %v", publicURL, err)
}
runID, err := findQueuedRunForMAC(publicURL, testMAC)
if err != nil {
t.Fatalf("no queued run for %s: %v (register the host and click Start Vetting first)", testMAC, err)
}
t.Logf("driving run %d for MAC %s", runID, testMAC)
disk, cleanup := makeThrowawayDisk(t)
defer cleanup()
qemuCtx, cancel := context.WithTimeout(context.Background(), runBudget)
defer cancel()
cmd := exec.CommandContext(qemuCtx, "qemu-system-x86_64",
"-enable-kvm", "-cpu", "host", "-smp", "4", "-m", "4096",
"-netdev", "bridge,id=n0,br="+bridgeName,
"-device", "virtio-net-pci,netdev=n0,mac="+testMAC,
"-drive", "file="+disk+",format=raw,if=virtio",
"-boot", "n", "-serial", "file:"+filepath.Join(os.TempDir(), fmt.Sprintf("vetting-e2e-%d.serial", runID)),
"-display", "none",
)
cmd.Stdout = testLogger{t}
cmd.Stderr = testLogger{t}
if err := cmd.Start(); err != nil {
t.Fatalf("start qemu: %v", err)
}
defer func() {
_ = cmd.Process.Kill()
_ = cmd.Wait()
}()
// Poll the orchestrator until the run reaches a terminal state.
poll := time.NewTicker(5 * time.Second)
defer poll.Stop()
for {
select {
case <-qemuCtx.Done():
t.Fatalf("run %d did not complete within %s", runID, runBudget)
case <-poll.C:
state, err := getRunState(publicURL, runID)
if err != nil {
t.Logf("poll state: %v (will retry)", err)
continue
}
t.Logf("run %d state = %s", runID, state)
switch state {
case "Completed":
return // green path
case "FailedHolding", "Failed", "Released":
t.Fatalf("run %d ended in non-success state %q", runID, state)
}
}
}
}
// ---- helpers ------------------------------------------------------------
func pingOrchestrator(url string) error {
req, err := http.NewRequest(http.MethodGet, url+"/login", nil)
if err != nil {
return err
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode >= 500 {
return fmt.Errorf("status %d", resp.StatusCode)
}
return nil
}
// findQueuedRunForMAC hits a hypothetical /api/v1/runs?mac=... debug
// endpoint. Since Phase 6 doesn't add that endpoint (orchestrator stays
// browser-session-gated for UI routes), we fall back to requiring the
// caller to set VETTING_E2E_RUN_ID if the orchestrator hasn't been
// extended with a debug listing. This is a pragmatic hack — the E2E
// harness is developer-facing and the alternative would be scraping
// HTML.
func findQueuedRunForMAC(baseURL, mac string) (int64, error) {
if s := os.Getenv("VETTING_E2E_RUN_ID"); s != "" {
var id int64
_, err := fmt.Sscanf(s, "%d", &id)
return id, err
}
return 0, fmt.Errorf("set VETTING_E2E_RUN_ID (no debug API for MAC lookup yet)")
}
// getRunState reads the run's current state via the report route's
// fall-through: /reports/{id} returns 404 until Completed, which gives
// us a cheap terminal-check without a JSON API. For intermediate
// states we need a debug endpoint — deliberately left as a TODO so
// the test doesn't depend on an API surface that isn't stable.
func getRunState(baseURL string, runID int64) (string, error) {
// Proxy: if /reports/{id} returns 200, the run is Completed.
resp, err := http.Get(fmt.Sprintf("%s/reports/%d", baseURL, runID))
if err != nil {
return "", err
}
defer resp.Body.Close()
_, _ = io.Copy(io.Discard, resp.Body)
switch resp.StatusCode {
case 200:
return "Completed", nil
case 401, 403:
// Session-gated; caller must export VETTING_E2E_COOKIE to bypass.
return "", fmt.Errorf("auth required; set VETTING_E2E_COOKIE")
case 404:
return "InProgress", nil
default:
return "", fmt.Errorf("unexpected %d", resp.StatusCode)
}
}
func makeThrowawayDisk(t *testing.T) (string, func()) {
t.Helper()
path := filepath.Join(t.TempDir(), "test-disk.img")
cmd := exec.Command("qemu-img", "create", "-f", "raw", path, "4G")
if out, err := cmd.CombinedOutput(); err != nil {
t.Fatalf("qemu-img create: %v\n%s", err, strings.TrimSpace(string(out)))
}
return path, func() { _ = os.Remove(path) }
}
// testLogger lets exec.Cmd write into the test's log stream so QEMU's
// stderr shows up with the test name, not as an orphaned blob.
type testLogger struct{ t *testing.T }
func (w testLogger) Write(p []byte) (int, error) {
w.t.Logf("qemu: %s", strings.TrimRight(string(p), "\r\n"))
return len(p), nil
}
// Compile-time reminder: json is imported so future expansions can
// parse the orchestrator's response bodies when a debug API lands.
var _ = json.Marshal