a6603b463f
Hosts stuck in states like pxe_ready had zero visibility into why. This adds a persistent activity log that records every meaningful step (state transitions, PXE events, cluster join stages, failures) and surfaces it on the host detail page with live SSE updates. Includes a stuck-detection warning banner when a host sits in pxe_ready for >10 minutes with no iPXE request. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
136 lines
4.3 KiB
Go
136 lines
4.3 KiB
Go
package orchestrator
|
|
|
|
import (
|
|
"context"
|
|
"log"
|
|
|
|
"provisioning/internal/config"
|
|
"provisioning/internal/infra"
|
|
"provisioning/internal/model"
|
|
"provisioning/internal/statemachine"
|
|
"provisioning/internal/store"
|
|
)
|
|
|
|
type HostOrchestrator struct {
|
|
Runner *Runner
|
|
Hosts *store.Hosts
|
|
Ops *store.Operations
|
|
Locks *store.Locks
|
|
Cluster *ClusterJoiner
|
|
InfraClient *infra.Client
|
|
Config *config.Config
|
|
ServerTypes *config.ServerTypeRegistry
|
|
}
|
|
|
|
// PrepareRebuild generates an ephemeral SSH key pair and stores it on the host.
|
|
// The public key will be injected into the Proxmox answer file.
|
|
func (o *HostOrchestrator) PrepareRebuild(ctx context.Context, hostID int64) error {
|
|
kp, err := GenerateEphemeralKey()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return o.Hosts.SetEphemeralKey(ctx, hostID, kp.PrivateKey, kp.PublicKey)
|
|
}
|
|
|
|
func (o *HostOrchestrator) HandlePhoneHome(ctx context.Context, hostID int64, ip string, hardwareID string) {
|
|
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Phone-home: updating IP to "+ip)
|
|
if err := o.Hosts.UpdateIP(ctx, hostID, ip, hardwareID); err != nil {
|
|
log.Printf("host %d: failed to update IP: %v", hostID, err)
|
|
o.Runner.FailHost(ctx, hostID, "failed to update IP: "+err.Error())
|
|
return
|
|
}
|
|
|
|
if _, err := o.Runner.Transition(ctx, hostID, statemachine.TriggerPhoneHome); err != nil {
|
|
log.Printf("host %d: phone-home transition failed: %v", hostID, err)
|
|
return
|
|
}
|
|
|
|
go o.postPhoneHome(hostID, ip, hardwareID)
|
|
}
|
|
|
|
func (o *HostOrchestrator) postPhoneHome(hostID int64, ip string, hardwareID string) {
|
|
ctx := context.Background()
|
|
|
|
host, err := o.Hosts.Get(ctx, hostID)
|
|
if err != nil {
|
|
log.Printf("host %d: failed to get host for cluster join: %v", hostID, err)
|
|
o.Runner.FailHost(ctx, hostID, "get host: "+err.Error())
|
|
return
|
|
}
|
|
|
|
privateKey, publicKey, err := o.Hosts.GetEphemeralKey(ctx, hostID)
|
|
if err != nil || privateKey == "" {
|
|
log.Printf("host %d: no ephemeral key available: %v", hostID, err)
|
|
o.Runner.FailHost(ctx, hostID, "no ephemeral SSH key")
|
|
return
|
|
}
|
|
|
|
if _, err := o.Runner.Transition(ctx, hostID, statemachine.TriggerClusterJoinStart); err != nil {
|
|
log.Printf("host %d: cluster join start transition failed: %v", hostID, err)
|
|
return
|
|
}
|
|
|
|
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Starting cluster join via "+o.Cluster.ExistingNode)
|
|
if err := o.Cluster.Join(ctx, ip, privateKey, publicKey); err != nil {
|
|
log.Printf("host %d: cluster join failed: %v", hostID, err)
|
|
o.Runner.FailHost(ctx, hostID, "cluster join: "+err.Error())
|
|
return
|
|
}
|
|
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Cluster join complete")
|
|
|
|
_ = o.Hosts.ClearEphemeralKey(ctx, hostID)
|
|
|
|
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Registering with infrastructure API")
|
|
if err := o.registerInfra(ctx, host, ip, hardwareID); err != nil {
|
|
log.Printf("host %d: infra registration failed: %v", hostID, err)
|
|
o.Runner.FailHost(ctx, hostID, "infra registration: "+err.Error())
|
|
return
|
|
}
|
|
|
|
if _, err := o.Runner.Transition(ctx, hostID, statemachine.TriggerJoinComplete); err != nil {
|
|
log.Printf("host %d: join complete transition failed: %v", hostID, err)
|
|
return
|
|
}
|
|
|
|
op, err := o.Ops.GetActive(ctx, hostID)
|
|
if err == nil {
|
|
_ = o.Ops.Complete(ctx, op.ID)
|
|
}
|
|
_ = o.Locks.Release(ctx, hostID)
|
|
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Provisioning complete")
|
|
log.Printf("host %d (%s): provisioning complete", hostID, host.Hostname)
|
|
}
|
|
|
|
func (o *HostOrchestrator) registerInfra(ctx context.Context, host *model.Host, ip string, hardwareID string) error {
|
|
if o.InfraClient == nil {
|
|
return nil
|
|
}
|
|
|
|
st, _ := o.ServerTypes.Get(host.ServerType)
|
|
serverTypeID := o.Config.Infrastructure.ServerTypeMap[host.ServerType]
|
|
|
|
infraID, err := o.InfraClient.CreateHost(ctx, infra.CreateHostRequest{
|
|
HardwareID: hardwareID,
|
|
Hostname: host.Hostname,
|
|
AssetID: host.Hostname,
|
|
RoomID: o.Config.Infrastructure.RoomID,
|
|
ServerTypeID: serverTypeID,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := o.Hosts.UpdateInfraID(ctx, host.ID, infraID); err != nil {
|
|
return err
|
|
}
|
|
|
|
_ = o.InfraClient.CreateInterface(ctx, infra.CreateInterfaceRequest{
|
|
HostID: int(infraID),
|
|
Name: st.ManagementNIC,
|
|
MACAddress: host.MAC,
|
|
IPAddress: ip,
|
|
})
|
|
|
|
return nil
|
|
}
|