Initial commit: full Phases 1-6 implementation

Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
@@ -0,0 +1,124 @@
+package orchestrator
+
+import (
+	"context"
+	"log"
+	"time"
+
+	"vetting/internal/model"
+	"vetting/internal/store"
+)
+
+// Dispatcher picks Queued runs off the DB and drives them through
+// WaitingWoL (sending a WoL packet). Concurrency is capped at Max.
+//
+// For Phase 2 the dispatcher's job ends at WaitingWoL; further
+// transitions are driven by iPXE and agent callbacks. Phase 4+ will
+// return here and shepherd each run through stage execution.
+type Dispatcher struct {
+	// Max is the in-flight limit: no new run is dispatched while at
+	// least Max runs are in a post-Queued, non-terminal state.
+	Max int
+	// Runs lists the active runs examined on every tick.
+	Runs *store.Runs
+	// Hosts resolves a run's HostID to the host's WoL parameters.
+	Hosts *store.Hosts
+	// Runner performs the Queued -> WaitingWoL state transition.
+	Runner *Runner
+
+	// active is a slot semaphore, buffered to Max by NewDispatcher.
+	active chan struct{}
+	// stop is closed by Stop to end the polling loop.
+	stop chan struct{}
+}
+
+// NewDispatcher builds a Dispatcher with its internal channels ready.
+// A max below 1 is raised to 1 so at least one run can be dispatched.
+func NewDispatcher(max int, runs *store.Runs, hosts *store.Hosts, runner *Runner) *Dispatcher {
+	limit := max
+	if limit < 1 {
+		limit = 1
+	}
+	d := &Dispatcher{Max: limit, Runs: runs, Hosts: hosts, Runner: runner}
+	d.active = make(chan struct{}, limit)
+	d.stop = make(chan struct{})
+	return d
+}
+
+// Start launches the polling loop in its own goroutine and returns
+// immediately. The loop ends when ctx is cancelled or Stop is called.
+func (d *Dispatcher) Start(ctx context.Context) {
+	go d.loop(ctx)
+}
+
+// Stop ends the polling loop by closing the stop channel.
+//
+// Repeated sequential calls are safe: closing an already-closed channel
+// panics, so the close is skipped when the channel is already closed.
+// The check and the close are not atomic, so Stop must still not be
+// called from several goroutines at the same time.
+func (d *Dispatcher) Stop() {
+	select {
+	case <-d.stop:
+		// Already stopped; nothing to do.
+	default:
+		close(d.stop)
+	}
+}
+
+func (d *Dispatcher) loop(ctx context.Context) {
+	t := time.NewTicker(2 * time.Second)
+	defer t.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-d.stop:
+			return
+		case <-t.C:
+			d.pickNext(ctx)
+		}
+	}
+}
+
+// pickNext performs a single scheduling pass. It dispatches at most
+// one run: the first Queued run in the order returned by Runs.Active,
+// and only while fewer than Max runs are already in flight.
+//
+// A dispatch transitions the run with TriggerDispatched and then sends
+// the WoL packet. Every failure is logged and ends the pass; a WoL
+// failure leaves the run in its post-transition state for the operator
+// to retry or investigate.
+//
+// The semaphore slot is held only for the duration of this call. The
+// actual concurrency limit comes from counting in-flight runs in the
+// DB; keeping the slot reserved for the life of a run is Phase 4+ work.
+func (d *Dispatcher) pickNext(ctx context.Context) {
+	// Non-blocking acquire: give up immediately when no slot is free.
+	select {
+	case d.active <- struct{}{}:
+	default:
+		return // at capacity
+	}
+	defer func() { <-d.active }()
+
+	active, err := d.Runs.Active(ctx)
+	if err != nil {
+		log.Printf("dispatcher: list active: %v", err)
+		return
+	}
+
+	var (
+		candidate *model.Run // first Queued run seen, if any
+		busy      int        // runs already past Queued and not finished
+	)
+	for idx := range active {
+		run := &active[idx]
+		switch run.State {
+		case model.StateQueued:
+			if candidate == nil {
+				candidate = run
+			}
+		case model.StateWaitingWoL, model.StateBooting, model.StateInventoryCheck,
+			model.StateSpecValidate, model.StateSMART, model.StateCPUStress,
+			model.StateStorage, model.StateNetwork, model.StateGPU,
+			model.StatePSU, model.StateReporting:
+			busy++
+		}
+	}
+	if candidate == nil || busy >= d.Max {
+		return
+	}
+
+	host, err := d.Hosts.Get(ctx, candidate.HostID)
+	if err != nil {
+		log.Printf("dispatcher: get host %d: %v", candidate.HostID, err)
+		return
+	}
+
+	_, err = d.Runner.Transition(ctx, candidate.ID, TriggerDispatched)
+	if err != nil {
+		log.Printf("dispatcher: transition run %d: %v", candidate.ID, err)
+		return
+	}
+
+	err = SendWoL(host.MAC, host.WoLBroadcastIP, host.WoLPort)
+	if err != nil {
+		// Stay in WaitingWoL; operator can retry or investigate.
+		log.Printf("dispatcher: WoL run %d host %s: %v", candidate.ID, host.Name, err)
+		return
+	}
+	log.Printf("dispatcher: WoL sent for run %d (host=%s mac=%s)", candidate.ID, host.Name, host.MAC)
+}
@@ -0,0 +1,92 @@
+package orchestrator
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log"
+	"os"
+	"os/exec"
+	"strconv"
+	"sync"
+	"time"
+)
+
+// IperfSupervisor runs a single `iperf3 -s` process under the
+// orchestrator so the Network stage has a stable server to dial. Each
+// run's Network test is sequential (stages are always serial), so one
+// server process handles every host under test.
+//
+// Missing iperf3 binary is logged once and the supervisor becomes a
+// no-op — the agent's Network stage will then fail to connect and skip
+// cleanly via the stage's own error path.
+type IperfSupervisor struct {
+	Port int // default 5201
+
+	mu      sync.Mutex
+	cmd     *exec.Cmd
+	done    chan struct{} // closed by wait once cmd has been reaped
+	started bool
+	fatal   error
+}
+
+// NewIperfSupervisor returns a supervisor for the given port. A port
+// of zero or less selects the iperf3 default, 5201.
+func NewIperfSupervisor(port int) *IperfSupervisor {
+	if port <= 0 {
+		port = 5201
+	}
+	return &IperfSupervisor{Port: port}
+}
+
+// Start launches `iperf3 -s -p <Port>` bound to ctx. It is a no-op
+// when a process is already running. When iperf3 is not in PATH the
+// condition is logged and nil is returned; any other launch failure is
+// returned to the caller.
+func (s *IperfSupervisor) Start(ctx context.Context) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.started {
+		return nil
+	}
+	if _, err := exec.LookPath("iperf3"); err != nil {
+		s.fatal = fmt.Errorf("iperf3 not in PATH: %w", err)
+		log.Printf("iperf supervisor: %v (Network stage will fail to connect)", s.fatal)
+		return nil
+	}
+	cmd := exec.CommandContext(ctx, "iperf3", "-s", "-p", strconv.Itoa(s.Port))
+	if err := cmd.Start(); err != nil {
+		s.fatal = err
+		return err
+	}
+	done := make(chan struct{})
+	s.cmd = cmd
+	s.done = done
+	s.started = true
+	log.Printf("iperf supervisor: iperf3 -s -p %d (pid=%d)", s.Port, cmd.Process.Pid)
+	// wait is the only place that calls cmd.Wait; everyone else
+	// observes process exit through the done channel.
+	go s.wait(cmd, done)
+	return nil
+}
+
+// Shutdown politely stops the iperf3 subprocess. Called from main on
+// SIGINT. The process is interrupted and given up to timeout to exit;
+// after that it is killed and an error is returned. Shutdown returns
+// nil when no process was ever started or it has already exited.
+func (s *IperfSupervisor) Shutdown(timeout time.Duration) error {
+	s.mu.Lock()
+	cmd, done := s.cmd, s.done
+	s.mu.Unlock()
+	if cmd == nil || cmd.Process == nil || done == nil {
+		return nil
+	}
+	// os.Interrupt is cross-platform; on Linux it maps to SIGINT which
+	// iperf3 handles gracefully. On Windows (dev only) it's a no-op and
+	// we'll fall through to Kill after the timeout.
+	_ = cmd.Process.Signal(os.Interrupt)
+
+	// Do not call cmd.Wait here: the wait goroutine already owns it,
+	// and a second concurrent Wait on the same Cmd is a data race.
+	timer := time.NewTimer(timeout)
+	defer timer.Stop()
+	select {
+	case <-done:
+		return nil
+	case <-timer.C:
+		_ = cmd.Process.Kill()
+		return errors.New("iperf3 did not exit in time; killed")
+	}
+}
+
+// wait reaps cmd, marks the supervisor as not started so Start can
+// launch a fresh process, and closes done to release any Shutdown
+// waiting on this process.
+func (s *IperfSupervisor) wait(cmd *exec.Cmd, done chan struct{}) {
+	_ = cmd.Wait()
+	s.mu.Lock()
+	s.started = false
+	s.mu.Unlock()
+	close(done)
+}
@@ -0,0 +1,118 @@
+package orchestrator
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"time"
+
+	"vetting/internal/events"
+	"vetting/internal/model"
+	"vetting/internal/store"
+)
+
+// Runner is the authoritative mutator for run state. All state
+// transitions go through (*Runner).Transition so the DB update and
+// the event publication happen together.
+type Runner struct {
+	Runs     *store.Runs  // run rows: lookup, state, override flags, failed stage
+	Hosts    *store.Hosts // host lookup for tile rendering
+	Stages   *store.Stages // stage rows; used to mark a stage as started
+	EventHub *events.Hub  // receives one "tile-<hostID>" event per tile refresh
+}
+
+// Transition applies trigger to the run identified by runID. It loads
+// the run, asks Next for the target state, persists that state, logs
+// the change and publishes a tile refresh for the run's host.
+//
+// It returns the new state. On failure the returned state is empty and
+// the error comes from the lookup, from Next (trigger not valid in the
+// current state), or from persisting the new state.
+func (r *Runner) Transition(ctx context.Context, runID int64, trigger Trigger) (model.RunState, error) {
+	current, err := r.Runs.Get(ctx, runID)
+	if err != nil {
+		return "", fmt.Errorf("get run: %w", err)
+	}
+
+	target, err := Next(current.State, trigger)
+	if err != nil {
+		return "", err
+	}
+
+	err = r.Runs.SetState(ctx, runID, target)
+	if err != nil {
+		return "", fmt.Errorf("persist transition: %w", err)
+	}
+
+	log.Printf("run %d: %s -> %s (%s)", runID, current.State, target, trigger)
+	r.publishTileUpdate(ctx, current.HostID)
+	return target, nil
+}
+
+// StartStage marks a stage row running and publishes a tile refresh.
+//
+// Only a failure to mark the stage is returned. If the run cannot be
+// loaded afterwards the stage row is already updated, so the error is
+// logged and the tile refresh is skipped instead of failing the call.
+func (r *Runner) StartStage(ctx context.Context, runID int64, name string) error {
+	if err := r.Stages.StartByName(ctx, runID, name); err != nil {
+		return err
+	}
+	run, err := r.Runs.Get(ctx, runID)
+	if err != nil {
+		log.Printf("StartStage: get run %d: %v", runID, err)
+		return nil
+	}
+	r.publishTileUpdate(ctx, run.HostID)
+	return nil
+}
+
+// publishTileUpdate re-renders the tile for hostID from the host row
+// and its latest run, then publishes it on the event hub under the
+// name "tile-<hostID>". Lookup failures are logged and nothing is
+// published.
+func (r *Runner) publishTileUpdate(ctx context.Context, hostID int64) {
+	host, hostErr := r.Hosts.Get(ctx, hostID)
+	if hostErr != nil {
+		log.Printf("publishTileUpdate: get host %d: %v", hostID, hostErr)
+		return
+	}
+
+	latest, runErr := r.Runs.LatestForHost(ctx, hostID)
+	if runErr != nil {
+		log.Printf("publishTileUpdate: latest run: %v", runErr)
+		return
+	}
+
+	evt := events.Event{
+		Name:    fmt.Sprintf("tile-%d", hostID),
+		Payload: renderTileSSE(ctx, *host, latest),
+	}
+	r.EventHub.Publish(evt)
+}
+
+// TileRenderer produces the HTML fragment for one host tile. It is a
+// package-level hook, registered at startup, so this package does not
+// have to import the template or store-enrichment code. The closure is
+// expected to perform any extra DB lookups itself (spec-diff count,
+// hold-key path, …) before handing the data to the template package.
+var TileRenderer func(ctx context.Context, host model.Host, latest *model.Run) string
+
+// renderTileSSE returns the tile fragment for host. It delegates to
+// TileRenderer when one is registered and otherwise falls back to a
+// minimal placeholder <article> carrying the host ID.
+func renderTileSSE(ctx context.Context, host model.Host, latest *model.Run) string {
+	if render := TileRenderer; render != nil {
+		return render(ctx, host, latest)
+	}
+	return fmt.Sprintf(`<article id="host-%d">state change</article>`, host.ID)
+}
+
+// TouchHeartbeat is the hook for agent heartbeats so the orchestrator
+// can record last-seen. It is currently a no-op: nothing is logged or
+// stored. Phase 3+ is meant to update a last_seen_at column.
+func (r *Runner) TouchHeartbeat(runID int64) {
+	_ = runID
+	_ = time.Now() // placeholder; the only use of the time import visible in this file
+}
+
+// Override puts a held run back into the stage that failed, once the
+// operator has acknowledged the failure (e.g. a wipe-probe override).
+//
+// The target state is derived from the run's failed_stage via
+// NextForOverride, so the run must currently be in FailedHolding and
+// have a known failed_stage. The steps run in this order: store
+// flagsJSON, set the new state, clear failed_stage, publish a tile
+// refresh. A failure to clear failed_stage is only logged; every other
+// failure aborts and is returned with an empty state.
+func (r *Runner) Override(ctx context.Context, runID int64, flagsJSON string) (model.RunState, error) {
+	held, err := r.Runs.Get(ctx, runID)
+	if err != nil {
+		return "", fmt.Errorf("get run: %w", err)
+	}
+	stage := held.FailedStage
+	if stage == "" {
+		return "", fmt.Errorf("override: run has no failed_stage")
+	}
+
+	target, err := NextForOverride(held.State, stage)
+	if err != nil {
+		return "", err
+	}
+
+	err = r.Runs.SetOverrideFlags(ctx, runID, flagsJSON)
+	if err != nil {
+		return "", fmt.Errorf("persist override flags: %w", err)
+	}
+	err = r.Runs.SetState(ctx, runID, target)
+	if err != nil {
+		return "", fmt.Errorf("override transition: %w", err)
+	}
+	// Best effort: the run has already moved on, so only log.
+	if clearErr := r.Runs.ClearFailedStage(ctx, runID); clearErr != nil {
+		log.Printf("override: clear failed_stage: %v", clearErr)
+	}
+
+	log.Printf("run %d: %s -> %s (OperatorOverride stage=%s flags=%s)", runID, held.State, target, stage, flagsJSON)
+	r.publishTileUpdate(ctx, held.HostID)
+	return target, nil
+}
@@ -0,0 +1,129 @@
+package orchestrator
+
+import (
+	"fmt"
+
+	"vetting/internal/model"
+)
+
+// Trigger is an event that drives a state transition. Most triggers
+// are resolved through the transition table by Next; StageCompleted is
+// computed from the stage order and OperatorOverride by NextForOverride.
+type Trigger string
+
+const (
+	TriggerStartRequested   Trigger = "StartRequested"   // user clicks Start Vetting
+	TriggerDispatched       Trigger = "Dispatched"       // dispatcher picked this run
+	TriggerPXEObserved      Trigger = "PXEObserved"      // iPXE fetched cmdline for MAC
+	TriggerAgentClaimed     Trigger = "AgentClaimed"     // agent POSTed /claim with valid token
+	TriggerStageFailed      Trigger = "StageFailed"      // a stage reported failure
+	TriggerStageCompleted   Trigger = "StageCompleted"   // a stage reported success; advance to the next stage
+	TriggerAllStagesPassed  Trigger = "AllStagesPassed"  // final stage passed
+	TriggerOperatorReleased Trigger = "OperatorReleased" // user clicked Release on a held run
+	TriggerOperatorOverride Trigger = "OperatorOverride" // user overrode a held stage; re-enter it
+)
+
+// stageStates maps the canonical stage name (from DefaultStageOrder)
+// to the matching RunState. The two are not always spelled the same:
+// for historical reasons the first stage row is named "Inventory"
+// while its run state is InventoryCheck.
+var stageStates = map[string]model.RunState{
+	"Inventory":    model.StateInventoryCheck,
+	"SpecValidate": model.StateSpecValidate,
+	"SMART":        model.StateSMART,
+	"CPUStress":    model.StateCPUStress,
+	"Storage":      model.StateStorage,
+	"Network":      model.StateNetwork,
+	"GPU":          model.StateGPU,
+	"PSU":          model.StatePSU,
+	"Reporting":    model.StateReporting,
+}
+
+// stageOrder is the sequence of RunStates a run walks through, first
+// stage to last; StageCompleted on the last entry leads to Completed.
+// Must be kept in sync with store.DefaultStageOrder by hand.
+var stageOrder = []model.RunState{
+	model.StateInventoryCheck,
+	model.StateSpecValidate,
+	model.StateSMART,
+	model.StateCPUStress,
+	model.StateStorage,
+	model.StateNetwork,
+	model.StateGPU,
+	model.StatePSU,
+	model.StateReporting,
+}
+
+// transition is one row of the state machine: the states a trigger may
+// fire from and the single state it leads to.
+type transition struct {
+	from []model.RunState
+	to   model.RunState
+}
+
+// table lists every trigger whose target depends only on the current
+// state. TriggerStageCompleted and TriggerOperatorOverride have no row
+// here; Next and NextForOverride compute their targets.
+var table = map[Trigger]transition{
+	TriggerStartRequested: {
+		from: []model.RunState{model.StateRegistered},
+		to:   model.StateQueued,
+	},
+	TriggerDispatched: {
+		from: []model.RunState{model.StateQueued},
+		to:   model.StateWaitingWoL,
+	},
+	TriggerPXEObserved: {
+		from: []model.RunState{model.StateWaitingWoL, model.StateBooting},
+		to:   model.StateBooting,
+	},
+	TriggerAgentClaimed: {
+		from: []model.RunState{model.StateBooting, model.StateWaitingWoL},
+		to:   model.StateInventoryCheck,
+	},
+	TriggerStageFailed: {
+		from: allActiveStates(),
+		to:   model.StateFailedHolding,
+	},
+	TriggerAllStagesPassed: {
+		from: []model.RunState{model.StateReporting},
+		to:   model.StateCompleted,
+	},
+	TriggerOperatorReleased: {
+		from: []model.RunState{model.StateFailedHolding},
+		to:   model.StateReleased,
+	},
+}
+
+// Next returns the state that trigger t leads to from current.
+//
+// TriggerStageCompleted is resolved by walking stageOrder. Every other
+// trigger is looked up in table: an unknown trigger, or one that is
+// not allowed from current, yields an empty state and an error.
+func Next(current model.RunState, t Trigger) (model.RunState, error) {
+	if t == TriggerStageCompleted {
+		return nextStageState(current)
+	}
+
+	row, known := table[t]
+	if !known {
+		return "", fmt.Errorf("unknown trigger %q", t)
+	}
+
+	allowed := false
+	for i := range row.from {
+		if row.from[i] == current {
+			allowed = true
+			break
+		}
+	}
+	if !allowed {
+		return "", fmt.Errorf("trigger %q not allowed from %q", t, current)
+	}
+	return row.to, nil
+}
+
+// NextForOverride returns the state to jump to when the operator
+// overrides a held stage. It lives outside the generic table because
+// the target is chosen by failedStage rather than by the current
+// state, which has to be FailedHolding. Any other current state, or a
+// failedStage with no entry in stageStates, is an error.
+func NextForOverride(current model.RunState, failedStage string) (model.RunState, error) {
+	if current != model.StateFailedHolding {
+		return "", fmt.Errorf("override not allowed from %q", current)
+	}
+	if target, known := stageStates[failedStage]; known {
+		return target, nil
+	}
+	return "", fmt.Errorf("override: unknown failed stage %q", failedStage)
+}
+
+// StateForStage looks up the RunState for a stage name. The boolean is
+// false when the name is not a known stage. Handlers use it to guard
+// against stale or out-of-order agent reports.
+func StateForStage(name string) (model.RunState, bool) {
+	state, known := stageStates[name]
+	return state, known
+}
+
+// nextStageState returns the state following current in stageOrder,
+// or Completed when current is the last stage. A current state that is
+// not a stage state is an error.
+func nextStageState(current model.RunState) (model.RunState, error) {
+	pos := -1
+	for i := range stageOrder {
+		if stageOrder[i] == current {
+			pos = i
+			break
+		}
+	}
+
+	switch {
+	case pos < 0:
+		return "", fmt.Errorf("StageCompleted not valid from %q", current)
+	case pos == len(stageOrder)-1:
+		return model.StateCompleted, nil
+	default:
+		return stageOrder[pos+1], nil
+	}
+}
+
+// allActiveStates returns a fresh slice of every state from which a
+// StageFailed trigger is accepted: Queued, the pre-stage boot states,
+// and all stage states.
+func allActiveStates() []model.RunState {
+	states := []model.RunState{
+		model.StateQueued,
+		model.StateWaitingWoL,
+		model.StateBooting,
+		model.StateInventoryCheck,
+		model.StateSpecValidate,
+		model.StateSMART,
+		model.StateCPUStress,
+		model.StateStorage,
+		model.StateNetwork,
+		model.StateGPU,
+		model.StatePSU,
+		model.StateReporting,
+	}
+	return states
+}
@@ -0,0 +1,67 @@
+package orchestrator_test
+
+import (
+	"testing"
+
+	"vetting/internal/model"
+	"vetting/internal/orchestrator"
+)
+
+func TestNextForOverride(t *testing.T) {
+	tests := []struct {
+		name        string
+		from        model.RunState
+		failedStage string
+		want        model.RunState
+		wantErr     bool
+	}{
+		{"storage override", model.StateFailedHolding, "Storage", model.StateStorage, false},
+		{"smart override", model.StateFailedHolding, "SMART", model.StateSMART, false},
+		{"inventory override", model.StateFailedHolding, "Inventory", model.StateInventoryCheck, false},
+		{"unknown stage", model.StateFailedHolding, "NotAStage", "", true},
+		{"not holding", model.StateStorage, "Storage", "", true},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := orchestrator.NextForOverride(tc.from, tc.failedStage)
+			if tc.wantErr {
+				if err == nil {
+					t.Fatalf("expected error, got %q", got)
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if got != tc.want {
+				t.Fatalf("got %q, want %q", got, tc.want)
+			}
+		})
+	}
+}
+
+// TestNextStageWalk fires StageCompleted from every stage in canonical
+// order and expects the following stage each time, with Reporting
+// leading to Completed.
+func TestNextStageWalk(t *testing.T) {
+	chain := []model.RunState{
+		model.StateInventoryCheck,
+		model.StateSpecValidate,
+		model.StateSMART,
+		model.StateCPUStress,
+		model.StateStorage,
+		model.StateNetwork,
+		model.StateGPU,
+		model.StatePSU,
+		model.StateReporting,
+		model.StateCompleted,
+	}
+
+	// Each element except the last is a starting point; its successor
+	// in the slice is the expected result.
+	for i, from := range chain[:len(chain)-1] {
+		want := chain[i+1]
+		got, err := orchestrator.Next(from, orchestrator.TriggerStageCompleted)
+		if err != nil {
+			t.Fatalf("Next(%q): %v", from, err)
+		}
+		if got != want {
+			t.Fatalf("Next(%q) = %q, want %q", from, got, want)
+		}
+	}
+}
@@ -0,0 +1,26 @@
+package orchestrator
+
+import (
+	"crypto/rand"
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+)
+
+// IssueRunToken returns (plaintext, hashHex). The plaintext is passed
+// to the host via the iPXE kernel cmdline; the hash is persisted in the
+// runs table for later constant-time comparison.
+func IssueRunToken() (string, string, error) {
+	b := make([]byte, 32)
+	if _, err := rand.Read(b); err != nil {
+		return "", "", fmt.Errorf("random: %w", err)
+	}
+	plain := hex.EncodeToString(b)
+	sum := sha256.Sum256([]byte(plain))
+	return plain, hex.EncodeToString(sum[:]), nil
+}
+
+// HashRunToken returns the hex-encoded SHA-256 digest of plain. It is
+// the same hash IssueRunToken computes for a freshly issued token.
+func HashRunToken(plain string) string {
+	h := sha256.New()
+	h.Write([]byte(plain)) // hash.Hash writes never return an error
+	return hex.EncodeToString(h.Sum(nil))
+}
@@ -0,0 +1,38 @@
+package orchestrator
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestIssueRunTokenRoundTrip checks the shape of an issued token (64
+// hex chars for both plaintext and hash), that HashRunToken reproduces
+// the issued hash, and that two consecutive tokens differ.
+func TestIssueRunTokenRoundTrip(t *testing.T) {
+	plain, hash, err := IssueRunToken()
+	if err != nil {
+		t.Fatalf("IssueRunToken: %v", err)
+	}
+	if len(plain) != 64 {
+		t.Fatalf("plaintext should be 64 hex chars, got %d", len(plain))
+	}
+	if len(hash) != 64 {
+		t.Fatalf("hash should be 64 hex chars, got %d", len(hash))
+	}
+	if HashRunToken(plain) != hash {
+		t.Fatalf("HashRunToken(plain) != hash")
+	}
+	// Ensure high entropy: two consecutive issues differ. The error
+	// must be checked, otherwise a failed call returns "" and would
+	// trivially pass the distinctness check.
+	plain2, _, err := IssueRunToken()
+	if err != nil {
+		t.Fatalf("IssueRunToken (second call): %v", err)
+	}
+	if plain == plain2 {
+		t.Fatalf("expected distinct tokens on consecutive calls")
+	}
+}
+
+// TestHashRunTokenDeterministic checks that hashing the same input
+// twice gives the same digest and that a different input does not.
+func TestHashRunTokenDeterministic(t *testing.T) {
+	first, second := HashRunToken("abc"), HashRunToken("abc")
+	if first != second {
+		t.Fatalf("hash not deterministic")
+	}
+
+	other := HashRunToken("abd")
+	if strings.EqualFold(first, other) {
+		t.Fatalf("hash should differ for distinct inputs")
+	}
+}
@@ -0,0 +1,57 @@
+package orchestrator
+
+import (
+	"encoding/hex"
+	"fmt"
+	"net"
+	"strconv"
+	"strings"
+)
+
+// SendWoL sends a Wake-on-LAN magic packet to broadcastIP:port for the
+// given MAC (aa:bb:cc:dd:ee:ff). The packet is 6 bytes of 0xFF followed
+// by the MAC repeated 16 times.
+func SendWoL(mac, broadcastIP string, port int) error {
+	macBytes, err := parseMAC(mac)
+	if err != nil {
+		return err
+	}
+	packet := make([]byte, 6+16*6)
+	for i := 0; i < 6; i++ {
+		packet[i] = 0xff
+	}
+	for i := 0; i < 16; i++ {
+		copy(packet[6+i*6:], macBytes)
+	}
+
+	conn, err := net.Dial("udp", net.JoinHostPort(broadcastIP, strconv.Itoa(port)))
+	if err != nil {
+		return fmt.Errorf("dial wol: %w", err)
+	}
+	defer conn.Close()
+
+	if _, err := conn.Write(packet); err != nil {
+		return fmt.Errorf("write wol: %w", err)
+	}
+	return nil
+}
+
+// parseMAC converts a colon-separated MAC address into its six raw
+// bytes. Surrounding whitespace is ignored and hex digits may be upper
+// or lower case. Exactly six groups of exactly two hex digits are
+// required; anything else (other separators, short or long groups, a
+// wrong group count) is rejected with an error.
+func parseMAC(s string) ([]byte, error) {
+	s = strings.ToLower(strings.TrimSpace(s))
+
+	octets := strings.Split(s, ":")
+	if len(octets) != 6 {
+		return nil, fmt.Errorf("invalid MAC %q", s)
+	}
+
+	raw := make([]byte, len(octets))
+	for idx := range octets {
+		octet := octets[idx]
+		if len(octet) != 2 {
+			return nil, fmt.Errorf("invalid MAC octet %q", octet)
+		}
+		// Two hex digits decode straight into the single target byte.
+		if _, err := hex.Decode(raw[idx:idx+1], []byte(octet)); err != nil {
+			return nil, fmt.Errorf("invalid MAC %q: %w", s, err)
+		}
+	}
+	return raw, nil
+}
@@ -0,0 +1,37 @@
+package orchestrator
+
+import (
+	"bytes"
+	"testing"
+)
+
+func TestParseMAC(t *testing.T) {
+	got, err := parseMAC("aa:bb:cc:dd:ee:ff")
+	if err != nil {
+		t.Fatalf("parseMAC: %v", err)
+	}
+	want := []byte{0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}
+	if !bytes.Equal(got, want) {
+		t.Fatalf("parseMAC: %x != %x", got, want)
+	}
+}
+
+func TestParseMACUpper(t *testing.T) {
+	// Must be case-insensitive so users can paste either form.
+	got, err := parseMAC("AA:BB:CC:DD:EE:FF")
+	if err != nil {
+		t.Fatalf("parseMAC upper: %v", err)
+	}
+	want := []byte{0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}
+	if !bytes.Equal(got, want) {
+		t.Fatalf("parseMAC upper: %x != %x", got, want)
+	}
+}
+
+// TestParseMACInvalid checks that malformed inputs are rejected: the
+// empty string, too few or too many groups, non-hex digits, and dash
+// separators.
+func TestParseMACInvalid(t *testing.T) {
+	rejected := []string{
+		"",
+		"aa:bb:cc",
+		"zz:yy:xx:ww:vv:uu",
+		"aa-bb-cc-dd-ee-ff",
+		"aa:bb:cc:dd:ee:ff:00",
+	}
+	for i := range rejected {
+		_, err := parseMAC(rejected[i])
+		if err == nil {
+			t.Errorf("expected error for %q", rejected[i])
+		}
+	}
+}