Initial commit: full Phases 1-6 implementation
CI / Lint + build + test (push) Has been cancelled

Post-repair hardware validation pipeline for Proxmox cluster hosts.
Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq
PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
+124
View File
@@ -0,0 +1,124 @@
package orchestrator
import (
"context"
"log"
"time"
"vetting/internal/model"
"vetting/internal/store"
)
// Dispatcher picks Queued runs off the DB and moves them to
// WaitingWoL, sending a Wake-on-LAN packet to the run's host.
// Concurrency is capped at Max.
//
// For Phase 2 the dispatcher's job ends at WaitingWoL; further
// transitions are driven by iPXE and agent callbacks. Phase 4+ will
// return here and shepherd each run through stage execution.
type Dispatcher struct {
// Max is the upper bound on runs that may be in flight at once.
Max int
// Runs supplies the list of active runs that each pass inspects.
Runs *store.Runs
// Hosts resolves a run's host (name, MAC, WoL broadcast IP and port).
Hosts *store.Hosts
// Runner applies the Dispatched trigger to the chosen run.
Runner *Runner
// active is a slot channel; a pass takes one slot non-blockingly and
// gives it back before returning.
active chan struct{}
// stop is closed by Stop to end the polling loop.
stop chan struct{}
}
// NewDispatcher builds a Dispatcher that allows at most max runs in
// flight. A max below 1 is raised to 1 so the slot channel always has
// capacity for at least one pass.
func NewDispatcher(max int, runs *store.Runs, hosts *store.Hosts, runner *Runner) *Dispatcher {
	limit := max
	if limit < 1 {
		limit = 1
	}
	d := &Dispatcher{
		Max:    limit,
		Runs:   runs,
		Hosts:  hosts,
		Runner: runner,
	}
	d.active = make(chan struct{}, limit)
	d.stop = make(chan struct{})
	return d
}
// Start launches the polling loop in its own goroutine and returns
// immediately. The loop exits when ctx is done or Stop is called.
func (d *Dispatcher) Start(ctx context.Context) {
go d.loop(ctx)
}
// Stop ends the polling loop by closing the stop channel.
//
// Calling Stop again after an earlier call has returned is a no-op;
// previously the second close panicked. Calls are not synchronised
// against each other, so Stop should still be invoked from a single
// goroutine (e.g. the shutdown path).
func (d *Dispatcher) Stop() {
	select {
	case <-d.stop:
		// Already closed by an earlier Stop; nothing left to do.
	default:
		close(d.stop)
	}
}
// loop runs one dispatch pass every two seconds until either ctx is
// done or the stop channel is closed.
func (d *Dispatcher) loop(ctx context.Context) {
	const pollInterval = 2 * time.Second

	ticker := time.NewTicker(pollInterval)
	defer ticker.Stop()

	cancelled := ctx.Done()
	for {
		select {
		case <-ticker.C:
			d.pickNext(ctx)
		case <-cancelled:
			return
		case <-d.stop:
			return
		}
	}
}
// pickNext performs a single dispatch pass: it looks for the first
// Queued run and, if fewer than Max runs are already in flight, moves it
// to WaitingWoL and sends the Wake-on-LAN packet for its host.
//
// At most one run is dispatched per pass. Failures are logged and the
// pass simply ends; the next tick tries again.
func (d *Dispatcher) pickNext(ctx context.Context) {
	// Non-blocking acquire: if every slot is taken, skip this tick.
	select {
	case d.active <- struct{}{}:
	default:
		return // at capacity
	}
	// The slot is held only for the duration of this pass. In-flight
	// accounting across passes comes from the run states in the DB,
	// not from this channel.
	defer func() { <-d.active }()

	active, err := d.Runs.Active(ctx)
	if err != nil {
		log.Printf("dispatcher: list active: %v", err)
		return
	}

	var (
		candidate *model.Run
		busy      int
	)
	for i := range active {
		r := &active[i]
		switch r.State {
		case model.StateQueued:
			// Keep only the first queued run in list order.
			if candidate == nil {
				candidate = r
			}
		case model.StateWaitingWoL, model.StateBooting, model.StateInventoryCheck,
			model.StateSpecValidate, model.StateSMART, model.StateCPUStress,
			model.StateStorage, model.StateNetwork, model.StateGPU,
			model.StatePSU, model.StateReporting:
			busy++
		}
	}
	if candidate == nil || busy >= d.Max {
		return
	}

	host, err := d.Hosts.Get(ctx, candidate.HostID)
	if err != nil {
		log.Printf("dispatcher: get host %d: %v", candidate.HostID, err)
		return
	}

	// Transition first, then wake the host: a failed WoL leaves the run
	// in WaitingWoL rather than silently back in the queue.
	if _, err := d.Runner.Transition(ctx, candidate.ID, TriggerDispatched); err != nil {
		log.Printf("dispatcher: transition run %d: %v", candidate.ID, err)
		return
	}

	if err := SendWoL(host.MAC, host.WoLBroadcastIP, host.WoLPort); err != nil {
		log.Printf("dispatcher: WoL run %d host %s: %v", candidate.ID, host.Name, err)
		// Stay in WaitingWoL; operator can retry or investigate.
		return
	}

	log.Printf("dispatcher: WoL sent for run %d (host=%s mac=%s)", candidate.ID, host.Name, host.MAC)
}
+92
View File
@@ -0,0 +1,92 @@
package orchestrator
import (
"context"
"errors"
"fmt"
"log"
"os"
"os/exec"
"strconv"
"sync"
"time"
)
// IperfSupervisor runs a single `iperf3 -s` process under the
// orchestrator so the Network stage has a stable server to dial. Each
// run's Network test is sequential (stages are always serial), so one
// server process handles every host under test.
//
// Missing iperf3 binary is logged once and the supervisor becomes a
// no-op — the agent's Network stage will then fail to connect and skip
// cleanly via the stage's own error path.
type IperfSupervisor struct {
	Port int // TCP port passed to `iperf3 -p`; default 5201

	mu      sync.Mutex
	cmd     *exec.Cmd
	done    chan struct{} // closed by wait once cmd has been reaped
	started bool
	fatal   error // last reason the server could not be started
}

// NewIperfSupervisor returns a supervisor for the given port. A port of
// zero or less selects iperf3's conventional default, 5201.
func NewIperfSupervisor(port int) *IperfSupervisor {
	if port <= 0 {
		port = 5201
	}
	return &IperfSupervisor{Port: port}
}

// Start launches `iperf3 -s -p <Port>` unless a server started by this
// supervisor is already running. The process is bound to ctx via
// exec.CommandContext, so cancelling ctx kills it.
//
// A missing iperf3 binary is not treated as an error: it is logged,
// remembered in s.fatal, and Start returns nil. A failure to start a
// binary that does exist is returned.
func (s *IperfSupervisor) Start(ctx context.Context) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		return nil
	}
	if _, err := exec.LookPath("iperf3"); err != nil {
		s.fatal = fmt.Errorf("iperf3 not in PATH: %w", err)
		log.Printf("iperf supervisor: %v (Network stage will fail to connect)", s.fatal)
		return nil
	}
	cmd := exec.CommandContext(ctx, "iperf3", "-s", "-p", strconv.Itoa(s.Port))
	if err := cmd.Start(); err != nil {
		s.fatal = err
		return err
	}
	s.cmd = cmd
	s.done = make(chan struct{})
	s.started = true
	log.Printf("iperf supervisor: iperf3 -s -p %d (pid=%d)", s.Port, cmd.Process.Pid)
	// wait blocks on s.mu until this function returns, so it always
	// observes the cmd/done pair assigned above.
	go s.wait()
	return nil
}

// Shutdown politely stops the iperf3 subprocess. It sends an interrupt,
// waits up to timeout for the process to be reaped, and kills it if it
// has not exited by then (returning an error in that case). It returns
// nil when no process was ever started.
//
// Shutdown never calls cmd.Wait itself: the wait goroutine is the only
// waiter, and Shutdown observes its completion through the done
// channel. Calling Wait from two goroutines on the same exec.Cmd is a
// data race.
func (s *IperfSupervisor) Shutdown(timeout time.Duration) error {
	s.mu.Lock()
	cmd, done := s.cmd, s.done
	s.mu.Unlock()
	if cmd == nil || cmd.Process == nil || done == nil {
		return nil
	}
	// os.Interrupt is cross-platform; on Linux it maps to SIGINT which
	// iperf3 handles gracefully. On Windows (dev only) it's a no-op and
	// we'll fall through to Kill after the timeout.
	_ = cmd.Process.Signal(os.Interrupt)

	timer := time.NewTimer(timeout)
	defer timer.Stop()
	select {
	case <-done:
		return nil
	case <-timer.C:
		_ = cmd.Process.Kill()
		return errors.New("iperf3 did not exit in time; killed")
	}
}

// wait reaps the subprocess, marks the supervisor as not started so a
// later Start may launch a fresh server, and closes the done channel to
// release any Shutdown call that is waiting for the exit.
func (s *IperfSupervisor) wait() {
	s.mu.Lock()
	cmd, done := s.cmd, s.done
	s.mu.Unlock()

	// The exit status is irrelevant here; only the fact of exit matters.
	_ = cmd.Wait()

	s.mu.Lock()
	s.started = false
	s.mu.Unlock()
	close(done)
}
+118
View File
@@ -0,0 +1,118 @@
package orchestrator
import (
"context"
"fmt"
"log"
"time"
"vetting/internal/events"
"vetting/internal/model"
"vetting/internal/store"
)
// Runner is the authoritative mutator for run state. All state
// transitions go through (*Runner).Transition so the DB update and
// the event publication happen together.
type Runner struct {
// Runs reads and persists run rows (state, override flags, failed stage).
Runs *store.Runs
// Hosts resolves the host a run belongs to when rendering its tile.
Hosts *store.Hosts
// Stages marks stage rows as started.
Stages *store.Stages
// EventHub receives the per-host "tile-<hostID>" events.
EventHub *events.Hub
}
// Transition applies trigger to the run identified by runID: it loads
// the run, asks Next for the target state, persists that state, logs
// the change and publishes a tile refresh for the run's host.
//
// It returns the new state, or an error if the run cannot be loaded,
// the trigger is not valid from the current state, or the write fails.
func (r *Runner) Transition(ctx context.Context, runID int64, trigger Trigger) (model.RunState, error) {
	current, err := r.Runs.Get(ctx, runID)
	if err != nil {
		return "", fmt.Errorf("get run: %w", err)
	}

	target, err := Next(current.State, trigger)
	if err != nil {
		return "", err
	}

	if persistErr := r.Runs.SetState(ctx, runID, target); persistErr != nil {
		return "", fmt.Errorf("persist transition: %w", persistErr)
	}

	log.Printf("run %d: %s -> %s (%s)", runID, current.State, target, trigger)
	r.publishTileUpdate(ctx, current.HostID)
	return target, nil
}
// StartStage marks the named stage row of run runID as running and then
// publishes a tile refresh for the run's host.
//
// Only a failure to mark the stage is returned. The tile refresh is
// best-effort: if the run cannot be re-read the problem is logged
// (instead of being silently dropped) and StartStage still returns nil,
// because the stage update itself succeeded.
func (r *Runner) StartStage(ctx context.Context, runID int64, name string) error {
	if err := r.Stages.StartByName(ctx, runID, name); err != nil {
		return err
	}
	run, err := r.Runs.Get(ctx, runID)
	if err != nil {
		log.Printf("StartStage: get run %d: %v", runID, err)
		return nil
	}
	r.publishTileUpdate(ctx, run.HostID)
	return nil
}
// publishTileUpdate renders the tile for hostID from the host row and
// its latest run, then publishes it on the event hub under the name
// "tile-<hostID>". Lookup failures are logged and nothing is published.
func (r *Runner) publishTileUpdate(ctx context.Context, hostID int64) {
	host, hostErr := r.Hosts.Get(ctx, hostID)
	if hostErr != nil {
		log.Printf("publishTileUpdate: get host %d: %v", hostID, hostErr)
		return
	}

	latest, runErr := r.Runs.LatestForHost(ctx, hostID)
	if runErr != nil {
		log.Printf("publishTileUpdate: latest run: %v", runErr)
		return
	}

	evt := events.Event{
		Name:    fmt.Sprintf("tile-%d", hostID),
		Payload: renderTileSSE(ctx, *host, latest),
	}
	r.EventHub.Publish(evt)
}
// TileRenderer renders a single tile fragment for a host and its latest
// run. It is a package-level hook, expected to be registered at startup,
// so the orchestrator package stays free of template / store-enrichment
// imports. The closure is expected to do any DB lookups itself (spec-
// diff count, hold-key path, …) before handing the data to the
// template package.
//
// While it is nil, renderTileSSE falls back to a minimal placeholder
// <article> element.
var TileRenderer func(ctx context.Context, host model.Host, latest *model.Run) string
// renderTileSSE produces the payload for a tile event. It delegates to
// TileRenderer when one is registered and otherwise emits a placeholder
// <article> carrying only the host ID.
func renderTileSSE(ctx context.Context, host model.Host, latest *model.Run) string {
	if TileRenderer != nil {
		return TileRenderer(ctx, host, latest)
	}
	// No renderer registered: fall back to a minimal fragment.
	return fmt.Sprintf(`<article id="host-%d">state change</article>`, host.ID)
}
// TouchHeartbeat is the hook for agent heartbeats, meant to let the
// orchestrator record when a run was last seen. It is currently a
// placeholder with no observable effect: it neither logs nor stores
// anything. Phase 3+ is expected to update a last_seen_at column here.
func (r *Runner) TouchHeartbeat(runID int64) {
// Both values are deliberately discarded until last-seen tracking exists.
_ = runID
// NOTE(review): presumably kept so the "time" import stays referenced — confirm.
_ = time.Now()
}
// Override re-enters a held stage after the operator has acknowledged
// the failure condition (e.g. wipe-probe override). It moves the run
// from FailedHolding to the state matching its failed_stage, stores the
// override flags, clears the failed marker, and publishes a tile
// refresh so the UI drops the hold banner.
//
// Steps run in order: persist flags, persist state, clear failed_stage.
// A failure in either of the first two aborts with an error; a failure
// to clear failed_stage is only logged.
func (r *Runner) Override(ctx context.Context, runID int64, flagsJSON string) (model.RunState, error) {
	held, err := r.Runs.Get(ctx, runID)
	if err != nil {
		return "", fmt.Errorf("get run: %w", err)
	}
	if held.FailedStage == "" {
		return "", fmt.Errorf("override: run has no failed_stage")
	}

	target, err := NextForOverride(held.State, held.FailedStage)
	if err != nil {
		return "", err
	}

	if flagsErr := r.Runs.SetOverrideFlags(ctx, runID, flagsJSON); flagsErr != nil {
		return "", fmt.Errorf("persist override flags: %w", flagsErr)
	}
	if stateErr := r.Runs.SetState(ctx, runID, target); stateErr != nil {
		return "", fmt.Errorf("override transition: %w", stateErr)
	}
	// The run has already moved on; a stale failed_stage marker is not
	// worth failing the override for.
	if clearErr := r.Runs.ClearFailedStage(ctx, runID); clearErr != nil {
		log.Printf("override: clear failed_stage: %v", clearErr)
	}

	log.Printf("run %d: %s -> %s (OperatorOverride stage=%s flags=%s)", runID, held.State, target, held.FailedStage, flagsJSON)
	r.publishTileUpdate(ctx, held.HostID)
	return target, nil
}
+129
View File
@@ -0,0 +1,129 @@
package orchestrator
import (
"fmt"
"vetting/internal/model"
)
// Trigger is an event that drives a state transition.
type Trigger string
// Known triggers. Most are resolved through the transition table by
// Next. Two are not table entries: TriggerStageCompleted is resolved by
// walking stageOrder, and TriggerOperatorOverride is resolved by
// NextForOverride because its target depends on the failed stage.
const (
TriggerStartRequested Trigger = "StartRequested" // user clicks Start Vetting
TriggerDispatched Trigger = "Dispatched" // dispatcher picked this run
TriggerPXEObserved Trigger = "PXEObserved" // iPXE fetched cmdline for MAC
TriggerAgentClaimed Trigger = "AgentClaimed" // agent POSTed /claim with valid token
TriggerStageFailed Trigger = "StageFailed" // a stage reported failure
TriggerStageCompleted Trigger = "StageCompleted" // a stage reported success → advance
TriggerAllStagesPassed Trigger = "AllStagesPassed" // final stage passed
TriggerOperatorReleased Trigger = "OperatorReleased" // user clicked Release on a held run
TriggerOperatorOverride Trigger = "OperatorOverride" // user overrode a held stage; re-enter it
)
// stageStates maps the canonical stage name (from DefaultStageOrder)
// to the matching RunState. The names do not always line up, for
// historical reasons: the first stage row is named "Inventory" while
// its run state is "InventoryCheck". Later stages share a name with
// their state.
//
// Lookups by NextForOverride and StateForStage use these keys verbatim,
// so they are case-sensitive.
var stageStates = map[string]model.RunState{
"Inventory": model.StateInventoryCheck,
"SpecValidate": model.StateSpecValidate,
"SMART": model.StateSMART,
"CPUStress": model.StateCPUStress,
"Storage": model.StateStorage,
"Network": model.StateNetwork,
"GPU": model.StateGPU,
"PSU": model.StatePSU,
"Reporting": model.StateReporting,
}
// stageOrder is the sequence of RunStates the run walks through from
// first stage to Completed. Kept in sync with store.DefaultStageOrder.
//
// nextStageState advances one position along this slice per
// StageCompleted trigger; completing the last entry yields Completed.
var stageOrder = []model.RunState{
model.StateInventoryCheck,
model.StateSpecValidate,
model.StateSMART,
model.StateCPUStress,
model.StateStorage,
model.StateNetwork,
model.StateGPU,
model.StatePSU,
model.StateReporting,
}
// transition describes one table-driven rule: the trigger is accepted
// from any state listed in from and always lands on to.
type transition struct {
from []model.RunState
to model.RunState
}
// table holds the fixed-target transitions, keyed by trigger.
// TriggerStageCompleted and TriggerOperatorOverride are intentionally
// absent; their targets are computed by nextStageState and
// NextForOverride respectively.
var table = map[Trigger]transition{
TriggerStartRequested: {from: []model.RunState{model.StateRegistered}, to: model.StateQueued},
TriggerDispatched: {from: []model.RunState{model.StateQueued}, to: model.StateWaitingWoL},
// PXEObserved from Booting maps back to Booting, so a repeated observation is accepted.
TriggerPXEObserved: {from: []model.RunState{model.StateWaitingWoL, model.StateBooting}, to: model.StateBooting},
// AgentClaimed is also accepted straight from WaitingWoL, i.e. without a prior PXEObserved.
TriggerAgentClaimed: {from: []model.RunState{model.StateBooting, model.StateWaitingWoL}, to: model.StateInventoryCheck},
TriggerStageFailed: {from: allActiveStates(), to: model.StateFailedHolding},
TriggerAllStagesPassed: {from: []model.RunState{model.StateReporting}, to: model.StateCompleted},
TriggerOperatorReleased: {from: []model.RunState{model.StateFailedHolding}, to: model.StateReleased},
}
// Next computes the target state for trigger t applied in state current.
//
// TriggerStageCompleted is resolved by stepping through stageOrder;
// every other trigger is looked up in the transition table. An error is
// returned for a trigger that is not in the table or that is not
// permitted from current.
func Next(current model.RunState, t Trigger) (model.RunState, error) {
	if t == TriggerStageCompleted {
		return nextStageState(current)
	}

	rule, known := table[t]
	if !known {
		return "", fmt.Errorf("unknown trigger %q", t)
	}

	for i := range rule.from {
		if rule.from[i] == current {
			return rule.to, nil
		}
	}
	return "", fmt.Errorf("trigger %q not allowed from %q", t, current)
}
// NextForOverride returns the state to jump to when the operator
// overrides a held stage. It lives outside the generic table because
// the target is chosen by failedStage rather than by the current state,
// which must be FailedHolding.
//
// It returns an error if current is not FailedHolding or if failedStage
// is not a known stage name.
func NextForOverride(current model.RunState, failedStage string) (model.RunState, error) {
	if current != model.StateFailedHolding {
		return "", fmt.Errorf("override not allowed from %q", current)
	}
	if target, found := stageStates[failedStage]; found {
		return target, nil
	}
	return "", fmt.Errorf("override: unknown failed stage %q", failedStage)
}
// StateForStage looks up the RunState that corresponds to a stage name.
// The boolean is false when name is not a known stage. Handlers that
// receive a stage name can use it to guard against stale or
// out-of-order agent reports.
func StateForStage(name string) (model.RunState, bool) {
	state, known := stageStates[name]
	return state, known
}
// nextStageState returns the state that follows current in stageOrder.
// Completing the final stage yields StateCompleted. A current state
// that is not a stage state is an error.
func nextStageState(current model.RunState) (model.RunState, error) {
	last := len(stageOrder) - 1
	for idx := range stageOrder {
		if stageOrder[idx] != current {
			continue
		}
		if idx == last {
			return model.StateCompleted, nil
		}
		return stageOrder[idx+1], nil
	}
	return "", fmt.Errorf("StageCompleted not valid from %q", current)
}
// allActiveStates lists every state from which a StageFailed trigger is
// accepted: the pre-boot states (Queued, WaitingWoL, Booting) followed
// by each stage state. A fresh slice is returned on every call.
func allActiveStates() []model.RunState {
	states := []model.RunState{
		// Pre-boot.
		model.StateQueued,
		model.StateWaitingWoL,
		model.StateBooting,
		// Stage execution.
		model.StateInventoryCheck,
		model.StateSpecValidate,
		model.StateSMART,
		model.StateCPUStress,
		model.StateStorage,
		model.StateNetwork,
		model.StateGPU,
		model.StatePSU,
		model.StateReporting,
	}
	return states
}
@@ -0,0 +1,67 @@
package orchestrator_test
import (
"testing"
"vetting/internal/model"
"vetting/internal/orchestrator"
)
// TestNextForOverride checks that an override from FailedHolding lands
// on the state matching the failed stage, and that unknown stages or a
// non-holding current state are rejected.
func TestNextForOverride(t *testing.T) {
	type overrideCase struct {
		name        string
		from        model.RunState
		failedStage string
		want        model.RunState
		wantErr     bool
	}
	cases := []overrideCase{
		{name: "storage override", from: model.StateFailedHolding, failedStage: "Storage", want: model.StateStorage},
		{name: "smart override", from: model.StateFailedHolding, failedStage: "SMART", want: model.StateSMART},
		{name: "inventory override", from: model.StateFailedHolding, failedStage: "Inventory", want: model.StateInventoryCheck},
		{name: "unknown stage", from: model.StateFailedHolding, failedStage: "NotAStage", wantErr: true},
		{name: "not holding", from: model.StateStorage, failedStage: "Storage", wantErr: true},
	}

	for _, c := range cases {
		c := c
		t.Run(c.name, func(t *testing.T) {
			got, err := orchestrator.NextForOverride(c.from, c.failedStage)
			switch {
			case c.wantErr && err == nil:
				t.Fatalf("expected error, got %q", got)
			case c.wantErr:
				return
			case err != nil:
				t.Fatalf("unexpected error: %v", err)
			case got != c.want:
				t.Fatalf("got %q, want %q", got, c.want)
			}
		})
	}
}
// TestNextStageWalk verifies that StageCompleted advances each stage
// state to its successor in the canonical order, and that Reporting
// advances to Completed.
func TestNextStageWalk(t *testing.T) {
	chain := []model.RunState{
		model.StateInventoryCheck,
		model.StateSpecValidate,
		model.StateSMART,
		model.StateCPUStress,
		model.StateStorage,
		model.StateNetwork,
		model.StateGPU,
		model.StatePSU,
		model.StateReporting,
		model.StateCompleted,
	}

	// Pair every state except the last with the state that follows it.
	for i, from := range chain[:len(chain)-1] {
		want := chain[i+1]
		got, err := orchestrator.Next(from, orchestrator.TriggerStageCompleted)
		if err != nil {
			t.Fatalf("Next(%q): %v", from, err)
		}
		if got != want {
			t.Fatalf("Next(%q) = %q, want %q", from, got, want)
		}
	}
}
+26
View File
@@ -0,0 +1,26 @@
package orchestrator
import (
"crypto/rand"
"crypto/sha256"
"encoding/hex"
"fmt"
)
// IssueRunToken returns (plaintext, hashHex, error).
//
// The plaintext is 32 bytes from crypto/rand, hex-encoded to 64
// characters; it is what gets handed to the host via the iPXE kernel
// cmdline. hashHex is the hex-encoded SHA-256 of that plaintext string
// (not of the raw bytes) and is the value meant to be persisted for
// later comparison. An error is returned only if the random source
// fails.
func IssueRunToken() (string, string, error) {
	var raw [32]byte
	if _, err := rand.Read(raw[:]); err != nil {
		return "", "", fmt.Errorf("random: %w", err)
	}

	plain := hex.EncodeToString(raw[:])
	digest := sha256.Sum256([]byte(plain))
	hashHex := hex.EncodeToString(digest[:])
	return plain, hashHex, nil
}
// HashRunToken returns the lowercase hex-encoded SHA-256 digest of
// plain. It is deterministic, so hashing a presented token reproduces
// the hash returned by IssueRunToken for the same plaintext.
func HashRunToken(plain string) string {
	h := sha256.New()
	h.Write([]byte(plain)) // hash.Hash.Write never returns an error
	return hex.EncodeToString(h.Sum(nil))
}
+38
View File
@@ -0,0 +1,38 @@
package orchestrator
import (
"strings"
"testing"
)
// TestIssueRunTokenRoundTrip checks the shape of an issued token (64
// hex characters for both plaintext and hash), that HashRunToken
// reproduces the issued hash, and that two consecutive tokens differ.
func TestIssueRunTokenRoundTrip(t *testing.T) {
	plain, hash, err := IssueRunToken()
	if err != nil {
		t.Fatalf("IssueRunToken: %v", err)
	}
	if len(plain) != 64 {
		t.Fatalf("plaintext should be 64 hex chars, got %d", len(plain))
	}
	if len(hash) != 64 {
		t.Fatalf("hash should be 64 hex chars, got %d", len(hash))
	}
	if HashRunToken(plain) != hash {
		t.Fatalf("HashRunToken(plain) != hash")
	}

	// Ensure high entropy: two consecutive issues differ. The error is
	// checked so a failing random source is reported as such instead of
	// surfacing as a confusing comparison result.
	plain2, _, err := IssueRunToken()
	if err != nil {
		t.Fatalf("IssueRunToken (second call): %v", err)
	}
	if plain == plain2 {
		t.Fatalf("expected distinct tokens on consecutive calls")
	}
}
// TestHashRunTokenDeterministic checks that hashing the same input
// twice gives the same digest and that a different input gives a
// different one (compared case-insensitively).
func TestHashRunTokenDeterministic(t *testing.T) {
	first, second := HashRunToken("abc"), HashRunToken("abc")
	if first != second {
		t.Fatalf("hash not deterministic")
	}

	other := HashRunToken("abd")
	if strings.EqualFold(first, other) {
		t.Fatalf("hash should differ for distinct inputs")
	}
}
+57
View File
@@ -0,0 +1,57 @@
package orchestrator
import (
"encoding/hex"
"fmt"
"net"
"strconv"
"strings"
)
// SendWoL sends a Wake-on-LAN magic packet over UDP to broadcastIP:port
// for the given MAC (aa:bb:cc:dd:ee:ff). The 102-byte payload is six
// 0xFF bytes followed by the MAC repeated sixteen times.
//
// It returns an error if the MAC cannot be parsed, the UDP socket
// cannot be set up, or the write fails.
func SendWoL(mac, broadcastIP string, port int) error {
	hw, err := parseMAC(mac)
	if err != nil {
		return err
	}

	const (
		syncLen = 6  // leading 0xFF bytes
		macLen  = 6  // bytes per MAC address
		repeats = 16 // copies of the MAC after the sync bytes
	)
	packet := make([]byte, 0, syncLen+repeats*macLen)
	for i := 0; i < syncLen; i++ {
		packet = append(packet, 0xff)
	}
	for i := 0; i < repeats; i++ {
		packet = append(packet, hw...)
	}

	target := net.JoinHostPort(broadcastIP, strconv.Itoa(port))
	conn, err := net.Dial("udp", target)
	if err != nil {
		return fmt.Errorf("dial wol: %w", err)
	}
	defer conn.Close()

	if _, err = conn.Write(packet); err != nil {
		return fmt.Errorf("write wol: %w", err)
	}
	return nil
}
// parseMAC converts a colon-separated MAC address such as
// "aa:bb:cc:dd:ee:ff" into its six raw bytes. Surrounding whitespace is
// trimmed and hex digits may be upper or lower case.
//
// Exactly six two-character hex octets are required; any other
// separator, octet count or octet width is rejected with an error.
func parseMAC(s string) ([]byte, error) {
	s = strings.ToLower(strings.TrimSpace(s))

	octets := strings.Split(s, ":")
	if len(octets) != 6 {
		return nil, fmt.Errorf("invalid MAC %q", s)
	}

	mac := make([]byte, 0, len(octets))
	for _, octet := range octets {
		if len(octet) != 2 {
			return nil, fmt.Errorf("invalid MAC octet %q", octet)
		}
		decoded, err := hex.DecodeString(octet)
		if err != nil {
			return nil, fmt.Errorf("invalid MAC %q: %w", s, err)
		}
		// Two hex characters always decode to exactly one byte.
		mac = append(mac, decoded...)
	}
	return mac, nil
}
+37
View File
@@ -0,0 +1,37 @@
package orchestrator
import (
"bytes"
"testing"
)
// TestParseMAC checks that a lowercase colon-separated MAC decodes to
// the expected six bytes.
func TestParseMAC(t *testing.T) {
	expected := []byte{0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}

	parsed, err := parseMAC("aa:bb:cc:dd:ee:ff")
	if err != nil {
		t.Fatalf("parseMAC: %v", err)
	}
	if !bytes.Equal(parsed, expected) {
		t.Fatalf("parseMAC: %x != %x", parsed, expected)
	}
}
// TestParseMACUpper checks that parsing is case-insensitive, so users
// can paste a MAC in either upper or lower case.
func TestParseMACUpper(t *testing.T) {
	expected := []byte{0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}

	parsed, err := parseMAC("AA:BB:CC:DD:EE:FF")
	if err != nil {
		t.Fatalf("parseMAC upper: %v", err)
	}
	if !bytes.Equal(parsed, expected) {
		t.Fatalf("parseMAC upper: %x != %x", parsed, expected)
	}
}
// TestParseMACInvalid checks that malformed inputs are rejected: empty,
// too few octets, non-hex digits, dash separators, and too many octets.
func TestParseMACInvalid(t *testing.T) {
	rejected := []string{
		"",
		"aa:bb:cc",
		"zz:yy:xx:ww:vv:uu",
		"aa-bb-cc-dd-ee-ff",
		"aa:bb:cc:dd:ee:ff:00",
	}
	for i := range rejected {
		_, err := parseMAC(rejected[i])
		if err == nil {
			t.Errorf("expected error for %q", rejected[i])
		}
	}
}