Initial implementation: host lifecycle + PXE + admin dashboard

Go service for Proxmox homelab cluster provisioning. Handles PXE boot,
Proxmox autoinstall (answer file generation), cluster join via SSH,
and Infrastructure API registration.

- Host state machine (registered → pxe_ready → installing → ready)
- dnsmasq supervisor with MAC-based allowlist
- iPXE script and Proxmox answer file generation
- First-boot phone-home → cluster join → infra registration
- Operation locking with expiry (409 on conflict)
- SSE event hub for real-time dashboard updates
- Admin dashboard (host grid, detail, registration form)
- Config-driven server types with hot-reload
- Docker deployment (multi-stage fat image)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-03 20:55:14 -04:00
commit bda568b25c
39 changed files with 3067 additions and 0 deletions
+111
View File
@@ -0,0 +1,111 @@
package orchestrator
import (
"context"
"log"
"provisioning/internal/config"
"provisioning/internal/infra"
"provisioning/internal/model"
"provisioning/internal/statemachine"
"provisioning/internal/store"
)
type HostOrchestrator struct {
Runner *Runner
Hosts *store.Hosts
Ops *store.Operations
Locks *store.Locks
Cluster *ClusterJoiner
InfraClient *infra.Client
Config *config.Config
ServerTypes *config.ServerTypeRegistry
}
func (o *HostOrchestrator) HandlePhoneHome(ctx context.Context, hostID int64, ip string, hardwareID string) {
if err := o.Hosts.UpdateIP(ctx, hostID, ip, hardwareID); err != nil {
log.Printf("host %d: failed to update IP: %v", hostID, err)
o.Runner.FailHost(ctx, hostID, "failed to update IP: "+err.Error())
return
}
if _, err := o.Runner.Transition(ctx, hostID, statemachine.TriggerPhoneHome); err != nil {
log.Printf("host %d: phone-home transition failed: %v", hostID, err)
return
}
go o.postPhoneHome(hostID, ip, hardwareID)
}
func (o *HostOrchestrator) postPhoneHome(hostID int64, ip string, hardwareID string) {
ctx := context.Background()
host, err := o.Hosts.Get(ctx, hostID)
if err != nil {
log.Printf("host %d: failed to get host for cluster join: %v", hostID, err)
o.Runner.FailHost(ctx, hostID, "get host: "+err.Error())
return
}
if _, err := o.Runner.Transition(ctx, hostID, statemachine.TriggerClusterJoinStart); err != nil {
log.Printf("host %d: cluster join start transition failed: %v", hostID, err)
return
}
if err := o.Cluster.Join(ctx, ip); err != nil {
log.Printf("host %d: cluster join failed: %v", hostID, err)
o.Runner.FailHost(ctx, hostID, "cluster join: "+err.Error())
return
}
if err := o.registerInfra(ctx, host, ip, hardwareID); err != nil {
log.Printf("host %d: infra registration failed: %v", hostID, err)
o.Runner.FailHost(ctx, hostID, "infra registration: "+err.Error())
return
}
if _, err := o.Runner.Transition(ctx, hostID, statemachine.TriggerJoinComplete); err != nil {
log.Printf("host %d: join complete transition failed: %v", hostID, err)
return
}
op, err := o.Ops.GetActive(ctx, hostID)
if err == nil {
_ = o.Ops.Complete(ctx, op.ID)
}
_ = o.Locks.Release(ctx, hostID)
log.Printf("host %d (%s): provisioning complete", hostID, host.Hostname)
}
func (o *HostOrchestrator) registerInfra(ctx context.Context, host *model.Host, ip string, hardwareID string) error {
if o.InfraClient == nil {
return nil
}
st, _ := o.ServerTypes.Get(host.ServerType)
serverTypeID := o.Config.Infrastructure.ServerTypeMap[host.ServerType]
infraID, err := o.InfraClient.CreateHost(ctx, infra.CreateHostRequest{
HardwareID: hardwareID,
Hostname: host.Hostname,
AssetID: host.Hostname,
RoomID: o.Config.Infrastructure.RoomID,
ServerTypeID: serverTypeID,
})
if err != nil {
return err
}
if err := o.Hosts.UpdateInfraID(ctx, host.ID, infraID); err != nil {
return err
}
_ = o.InfraClient.CreateInterface(ctx, infra.CreateInterfaceRequest{
HostID: int(infraID),
Name: st.ManagementNIC,
MACAddress: host.MAC,
IPAddress: ip,
})
return nil
}