Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
@@ -0,0 +1,124 @@
|
||||
package orchestrator
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"vetting/internal/model"
|
||||
"vetting/internal/store"
|
||||
)
|
||||
|
||||
// Dispatcher picks Queued runs off the DB and drives them through
|
||||
// WaitingWoL (sending a WoL packet). Concurrency is capped at Max.
|
||||
//
|
||||
// For Phase 2 the dispatcher's job ends at WaitingWoL; further
|
||||
// transitions are driven by iPXE and agent callbacks. Phase 4+ will
|
||||
// return here and shepherd each run through stage execution.
|
||||
type Dispatcher struct {
|
||||
Max int
|
||||
Runs *store.Runs
|
||||
Hosts *store.Hosts
|
||||
Runner *Runner
|
||||
|
||||
active chan struct{}
|
||||
stop chan struct{}
|
||||
}
|
||||
|
||||
func NewDispatcher(max int, runs *store.Runs, hosts *store.Hosts, runner *Runner) *Dispatcher {
|
||||
if max < 1 {
|
||||
max = 1
|
||||
}
|
||||
return &Dispatcher{
|
||||
Max: max,
|
||||
Runs: runs,
|
||||
Hosts: hosts,
|
||||
Runner: runner,
|
||||
active: make(chan struct{}, max),
|
||||
stop: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Dispatcher) Start(ctx context.Context) {
|
||||
go d.loop(ctx)
|
||||
}
|
||||
|
||||
func (d *Dispatcher) Stop() {
|
||||
close(d.stop)
|
||||
}
|
||||
|
||||
func (d *Dispatcher) loop(ctx context.Context) {
|
||||
t := time.NewTicker(2 * time.Second)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-d.stop:
|
||||
return
|
||||
case <-t.C:
|
||||
d.pickNext(ctx)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Dispatcher) pickNext(ctx context.Context) {
|
||||
select {
|
||||
case d.active <- struct{}{}:
|
||||
default:
|
||||
return // at capacity
|
||||
}
|
||||
released := false
|
||||
defer func() {
|
||||
if !released {
|
||||
<-d.active
|
||||
}
|
||||
}()
|
||||
|
||||
runs, err := d.Runs.Active(ctx)
|
||||
if err != nil {
|
||||
log.Printf("dispatcher: list active: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
var queued *model.Run
|
||||
inFlight := 0
|
||||
for i := range runs {
|
||||
switch runs[i].State {
|
||||
case model.StateQueued:
|
||||
if queued == nil {
|
||||
queued = &runs[i]
|
||||
}
|
||||
case model.StateWaitingWoL, model.StateBooting, model.StateInventoryCheck,
|
||||
model.StateSpecValidate, model.StateSMART, model.StateCPUStress,
|
||||
model.StateStorage, model.StateNetwork, model.StateGPU,
|
||||
model.StatePSU, model.StateReporting:
|
||||
inFlight++
|
||||
}
|
||||
}
|
||||
|
||||
if inFlight >= d.Max || queued == nil {
|
||||
return
|
||||
}
|
||||
|
||||
host, err := d.Hosts.Get(ctx, queued.HostID)
|
||||
if err != nil {
|
||||
log.Printf("dispatcher: get host %d: %v", queued.HostID, err)
|
||||
return
|
||||
}
|
||||
if _, err := d.Runner.Transition(ctx, queued.ID, TriggerDispatched); err != nil {
|
||||
log.Printf("dispatcher: transition run %d: %v", queued.ID, err)
|
||||
return
|
||||
}
|
||||
if err := SendWoL(host.MAC, host.WoLBroadcastIP, host.WoLPort); err != nil {
|
||||
log.Printf("dispatcher: WoL run %d host %s: %v", queued.ID, host.Name, err)
|
||||
// Stay in WaitingWoL; operator can retry or investigate.
|
||||
return
|
||||
}
|
||||
log.Printf("dispatcher: WoL sent for run %d (host=%s mac=%s)", queued.ID, host.Name, host.MAC)
|
||||
|
||||
// Slot stays reserved until the run leaves active (Phase 4+).
|
||||
// Phase 2 lets the loop observe inFlight via DB state.
|
||||
released = true
|
||||
<-d.active
|
||||
}
|
||||
Reference in New Issue
Block a user