Initial commit: full Phases 1-6 implementation

Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
@@ -0,0 +1,124 @@
+package orchestrator
+
+import (
+	"context"
+	"log"
+	"time"
+
+	"vetting/internal/model"
+	"vetting/internal/store"
+)
+
+// Dispatcher picks Queued runs off the DB and drives them through
+// WaitingWoL (sending a WoL packet). Concurrency is capped at Max.
+//
+// For Phase 2 the dispatcher's job ends at WaitingWoL; further
+// transitions are driven by iPXE and agent callbacks. Phase 4+ will
+// return here and shepherd each run through stage execution.
+type Dispatcher struct {
+	Max    int
+	Runs   *store.Runs
+	Hosts  *store.Hosts
+	Runner *Runner
+
+	active chan struct{}
+	stop   chan struct{}
+}
+
+func NewDispatcher(max int, runs *store.Runs, hosts *store.Hosts, runner *Runner) *Dispatcher {
+	if max < 1 {
+		max = 1
+	}
+	return &Dispatcher{
+		Max:    max,
+		Runs:   runs,
+		Hosts:  hosts,
+		Runner: runner,
+		active: make(chan struct{}, max),
+		stop:   make(chan struct{}),
+	}
+}
+
+func (d *Dispatcher) Start(ctx context.Context) {
+	go d.loop(ctx)
+}
+
+func (d *Dispatcher) Stop() {
+	close(d.stop)
+}
+
+func (d *Dispatcher) loop(ctx context.Context) {
+	t := time.NewTicker(2 * time.Second)
+	defer t.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-d.stop:
+			return
+		case <-t.C:
+			d.pickNext(ctx)
+		}
+	}
+}
+
+func (d *Dispatcher) pickNext(ctx context.Context) {
+	select {
+	case d.active <- struct{}{}:
+	default:
+		return // at capacity
+	}
+	released := false
+	defer func() {
+		if !released {
+			<-d.active
+		}
+	}()
+
+	runs, err := d.Runs.Active(ctx)
+	if err != nil {
+		log.Printf("dispatcher: list active: %v", err)
+		return
+	}
+
+	var queued *model.Run
+	inFlight := 0
+	for i := range runs {
+		switch runs[i].State {
+		case model.StateQueued:
+			if queued == nil {
+				queued = &runs[i]
+			}
+		case model.StateWaitingWoL, model.StateBooting, model.StateInventoryCheck,
+			model.StateSpecValidate, model.StateSMART, model.StateCPUStress,
+			model.StateStorage, model.StateNetwork, model.StateGPU,
+			model.StatePSU, model.StateReporting:
+			inFlight++
+		}
+	}
+
+	if inFlight >= d.Max || queued == nil {
+		return
+	}
+
+	host, err := d.Hosts.Get(ctx, queued.HostID)
+	if err != nil {
+		log.Printf("dispatcher: get host %d: %v", queued.HostID, err)
+		return
+	}
+	if _, err := d.Runner.Transition(ctx, queued.ID, TriggerDispatched); err != nil {
+		log.Printf("dispatcher: transition run %d: %v", queued.ID, err)
+		return
+	}
+	if err := SendWoL(host.MAC, host.WoLBroadcastIP, host.WoLPort); err != nil {
+		log.Printf("dispatcher: WoL run %d host %s: %v", queued.ID, host.Name, err)
+		// Stay in WaitingWoL; operator can retry or investigate.
+		return
+	}
+	log.Printf("dispatcher: WoL sent for run %d (host=%s mac=%s)", queued.ID, host.Name, host.MAC)
+
+	// Slot stays reserved until the run leaves active (Phase 4+).
+	// Phase 2 lets the loop observe inFlight via DB state.
+	released = true
+	<-d.active
+}