Add activity log system for provisioning lifecycle visibility
build-and-push / test (push) Failing after 32s
build-and-push / build-and-push (push) Has been skipped

Hosts stuck in states like pxe_ready had zero visibility into why.
This adds a persistent activity log that records every meaningful
step (state transitions, PXE events, cluster join stages, failures)
and surfaces it on the host detail page with live SSE updates.
Includes a stuck-detection warning banner when a host sits in
pxe_ready for >10 minutes with no iPXE request.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-13 23:30:21 -04:00
parent c3a1cf99f9
commit a6603b463f
12 changed files with 209 additions and 12 deletions
+5 -1
View File
@@ -33,6 +33,7 @@ func (o *HostOrchestrator) PrepareRebuild(ctx context.Context, hostID int64) err
}
func (o *HostOrchestrator) HandlePhoneHome(ctx context.Context, hostID int64, ip string, hardwareID string) {
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Phone-home: updating IP to "+ip)
if err := o.Hosts.UpdateIP(ctx, hostID, ip, hardwareID); err != nil {
log.Printf("host %d: failed to update IP: %v", hostID, err)
o.Runner.FailHost(ctx, hostID, "failed to update IP: "+err.Error())
@@ -69,15 +70,17 @@ func (o *HostOrchestrator) postPhoneHome(hostID int64, ip string, hardwareID str
return
}
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Starting cluster join via "+o.Cluster.ExistingNode)
if err := o.Cluster.Join(ctx, ip, privateKey, publicKey); err != nil {
log.Printf("host %d: cluster join failed: %v", hostID, err)
o.Runner.FailHost(ctx, hostID, "cluster join: "+err.Error())
return
}
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Cluster join complete")
// Key has been removed from the remote host; clear it from the DB
_ = o.Hosts.ClearEphemeralKey(ctx, hostID)
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Registering with infrastructure API")
if err := o.registerInfra(ctx, host, ip, hardwareID); err != nil {
log.Printf("host %d: infra registration failed: %v", hostID, err)
o.Runner.FailHost(ctx, hostID, "infra registration: "+err.Error())
@@ -94,6 +97,7 @@ func (o *HostOrchestrator) postPhoneHome(hostID int64, ip string, hardwareID str
_ = o.Ops.Complete(ctx, op.ID)
}
_ = o.Locks.Release(ctx, hostID)
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Provisioning complete")
log.Printf("host %d (%s): provisioning complete", hostID, host.Hostname)
}