Initial implementation: host lifecycle + PXE + admin dashboard
Go service for Proxmox homelab cluster provisioning. Handles PXE boot, Proxmox autoinstall (answer file generation), cluster join via SSH, and Infrastructure API registration.

- Host state machine (registered → pxe_ready → installing → ready)
- dnsmasq supervisor with MAC-based allowlist
- iPXE script and Proxmox answer file generation
- First-boot phone-home → cluster join → infra registration
- Operation locking with expiry (409 on conflict)
- SSE event hub for real-time dashboard updates
- Admin dashboard (host grid, detail, registration form)
- Config-driven server types with hot-reload
- Docker deployment (multi-stage fat image)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
package orchestrator
|
||||
|
||||
import (
	"context"
	"fmt"
	"log"
	"net"
	"os"
	"time"

	"golang.org/x/crypto/ssh"
)
|
||||
|
||||
// ClusterJoiner holds the settings used to join a newly installed host to the
// cluster by running `pvecm add` on that host over SSH.
type ClusterJoiner struct {
	// ExistingNode is the argument handed to `pvecm add` on the joining host.
	ExistingNode string

	// ClusterName is not read by the methods of this type.
	ClusterName string

	// JoinFingerprint is not read by the methods of this type.
	JoinFingerprint string

	// SSHKeyPath is the path of the private key file used to authenticate
	// the root SSH login on the joining host.
	SSHKeyPath string
}
|
||||
|
||||
func (c *ClusterJoiner) Join(ctx context.Context, hostIP string) error {
|
||||
client, err := c.connect(hostIP)
|
||||
if err != nil {
|
||||
return fmt.Errorf("ssh connect to %s: %w", hostIP, err)
|
||||
}
|
||||
defer client.Close()
|
||||
|
||||
cmd := fmt.Sprintf("pvecm add %s --force", c.ExistingNode)
|
||||
log.Printf("cluster: running on %s: %s", hostIP, cmd)
|
||||
|
||||
session, err := client.NewSession()
|
||||
if err != nil {
|
||||
return fmt.Errorf("ssh session: %w", err)
|
||||
}
|
||||
defer session.Close()
|
||||
|
||||
output, err := session.CombinedOutput(cmd)
|
||||
if err != nil {
|
||||
return fmt.Errorf("pvecm add failed: %w\noutput: %s", err, string(output))
|
||||
}
|
||||
log.Printf("cluster: %s joined successfully", hostIP)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *ClusterJoiner) connect(hostIP string) (*ssh.Client, error) {
|
||||
keyData, err := os.ReadFile(c.SSHKeyPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read ssh key: %w", err)
|
||||
}
|
||||
signer, err := ssh.ParsePrivateKey(keyData)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse ssh key: %w", err)
|
||||
}
|
||||
config := &ssh.ClientConfig{
|
||||
User: "root",
|
||||
Auth: []ssh.AuthMethod{ssh.PublicKeys(signer)},
|
||||
HostKeyCallback: ssh.InsecureIgnoreHostKey(),
|
||||
Timeout: 30 * time.Second,
|
||||
}
|
||||
return ssh.Dial("tcp", hostIP+":22", config)
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
package orchestrator
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
|
||||
"provisioning/internal/config"
|
||||
"provisioning/internal/infra"
|
||||
"provisioning/internal/model"
|
||||
"provisioning/internal/statemachine"
|
||||
"provisioning/internal/store"
|
||||
)
|
||||
|
||||
type HostOrchestrator struct {
|
||||
Runner *Runner
|
||||
Hosts *store.Hosts
|
||||
Ops *store.Operations
|
||||
Locks *store.Locks
|
||||
Cluster *ClusterJoiner
|
||||
InfraClient *infra.Client
|
||||
Config *config.Config
|
||||
ServerTypes *config.ServerTypeRegistry
|
||||
}
|
||||
|
||||
func (o *HostOrchestrator) HandlePhoneHome(ctx context.Context, hostID int64, ip string, hardwareID string) {
|
||||
if err := o.Hosts.UpdateIP(ctx, hostID, ip, hardwareID); err != nil {
|
||||
log.Printf("host %d: failed to update IP: %v", hostID, err)
|
||||
o.Runner.FailHost(ctx, hostID, "failed to update IP: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
if _, err := o.Runner.Transition(ctx, hostID, statemachine.TriggerPhoneHome); err != nil {
|
||||
log.Printf("host %d: phone-home transition failed: %v", hostID, err)
|
||||
return
|
||||
}
|
||||
|
||||
go o.postPhoneHome(hostID, ip, hardwareID)
|
||||
}
|
||||
|
||||
func (o *HostOrchestrator) postPhoneHome(hostID int64, ip string, hardwareID string) {
|
||||
ctx := context.Background()
|
||||
|
||||
host, err := o.Hosts.Get(ctx, hostID)
|
||||
if err != nil {
|
||||
log.Printf("host %d: failed to get host for cluster join: %v", hostID, err)
|
||||
o.Runner.FailHost(ctx, hostID, "get host: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
if _, err := o.Runner.Transition(ctx, hostID, statemachine.TriggerClusterJoinStart); err != nil {
|
||||
log.Printf("host %d: cluster join start transition failed: %v", hostID, err)
|
||||
return
|
||||
}
|
||||
|
||||
if err := o.Cluster.Join(ctx, ip); err != nil {
|
||||
log.Printf("host %d: cluster join failed: %v", hostID, err)
|
||||
o.Runner.FailHost(ctx, hostID, "cluster join: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
if err := o.registerInfra(ctx, host, ip, hardwareID); err != nil {
|
||||
log.Printf("host %d: infra registration failed: %v", hostID, err)
|
||||
o.Runner.FailHost(ctx, hostID, "infra registration: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
if _, err := o.Runner.Transition(ctx, hostID, statemachine.TriggerJoinComplete); err != nil {
|
||||
log.Printf("host %d: join complete transition failed: %v", hostID, err)
|
||||
return
|
||||
}
|
||||
|
||||
op, err := o.Ops.GetActive(ctx, hostID)
|
||||
if err == nil {
|
||||
_ = o.Ops.Complete(ctx, op.ID)
|
||||
}
|
||||
_ = o.Locks.Release(ctx, hostID)
|
||||
log.Printf("host %d (%s): provisioning complete", hostID, host.Hostname)
|
||||
}
|
||||
|
||||
func (o *HostOrchestrator) registerInfra(ctx context.Context, host *model.Host, ip string, hardwareID string) error {
|
||||
if o.InfraClient == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
st, _ := o.ServerTypes.Get(host.ServerType)
|
||||
serverTypeID := o.Config.Infrastructure.ServerTypeMap[host.ServerType]
|
||||
|
||||
infraID, err := o.InfraClient.CreateHost(ctx, infra.CreateHostRequest{
|
||||
HardwareID: hardwareID,
|
||||
Hostname: host.Hostname,
|
||||
AssetID: host.Hostname,
|
||||
RoomID: o.Config.Infrastructure.RoomID,
|
||||
ServerTypeID: serverTypeID,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := o.Hosts.UpdateInfraID(ctx, host.ID, infraID); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_ = o.InfraClient.CreateInterface(ctx, infra.CreateInterfaceRequest{
|
||||
HostID: int(infraID),
|
||||
Name: st.ManagementNIC,
|
||||
MACAddress: host.MAC,
|
||||
IPAddress: ip,
|
||||
})
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,51 @@
|
||||
package orchestrator
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"provisioning/internal/events"
|
||||
"provisioning/internal/model"
|
||||
"provisioning/internal/statemachine"
|
||||
"provisioning/internal/store"
|
||||
)
|
||||
|
||||
type Runner struct {
|
||||
Hosts *store.Hosts
|
||||
Ops *store.Operations
|
||||
Locks *store.Locks
|
||||
Hub *events.Hub
|
||||
}
|
||||
|
||||
func (r *Runner) Transition(ctx context.Context, hostID int64, trigger statemachine.Trigger) (model.HostState, error) {
|
||||
host, err := r.Hosts.Get(ctx, hostID)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("transition: get host: %w", err)
|
||||
}
|
||||
next, err := statemachine.Next(host.State, trigger)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("transition: %w", err)
|
||||
}
|
||||
if err := r.Hosts.UpdateState(ctx, hostID, next); err != nil {
|
||||
return "", fmt.Errorf("transition: update state: %w", err)
|
||||
}
|
||||
log.Printf("host %d (%s): %s -> %s [%s]", hostID, host.Hostname, host.State, next, trigger)
|
||||
r.Hub.Publish(events.Event{
|
||||
Name: "host.state_changed",
|
||||
Payload: fmt.Sprintf(`{"host_id":%d,"old_state":"%s","new_state":"%s"}`, hostID, host.State, next),
|
||||
})
|
||||
return next, nil
|
||||
}
|
||||
|
||||
func (r *Runner) FailHost(ctx context.Context, hostID int64, reason string) {
|
||||
if _, err := r.Transition(ctx, hostID, statemachine.TriggerFailed); err != nil {
|
||||
log.Printf("host %d: failed to transition to failed state: %v", hostID, err)
|
||||
return
|
||||
}
|
||||
op, err := r.Ops.GetActive(ctx, hostID)
|
||||
if err == nil {
|
||||
_ = r.Ops.Fail(ctx, op.ID, reason)
|
||||
}
|
||||
_ = r.Locks.Release(ctx, hostID)
|
||||
}
|
||||
Reference in New Issue
Block a user