Add activity log system for provisioning lifecycle visibility
Hosts stuck in states like pxe_ready had zero visibility into why. This adds a persistent activity log that records every meaningful step (state transitions, PXE events, cluster join stages, failures) and surfaces it on the host detail page with live SSE updates. Includes a stuck-detection warning banner when a host sits in pxe_ready for >10 minutes with no iPXE request. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -33,6 +33,7 @@ func (o *HostOrchestrator) PrepareRebuild(ctx context.Context, hostID int64) err
|
||||
}
|
||||
|
||||
func (o *HostOrchestrator) HandlePhoneHome(ctx context.Context, hostID int64, ip string, hardwareID string) {
|
||||
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Phone-home: updating IP to "+ip)
|
||||
if err := o.Hosts.UpdateIP(ctx, hostID, ip, hardwareID); err != nil {
|
||||
log.Printf("host %d: failed to update IP: %v", hostID, err)
|
||||
o.Runner.FailHost(ctx, hostID, "failed to update IP: "+err.Error())
|
||||
@@ -69,15 +70,17 @@ func (o *HostOrchestrator) postPhoneHome(hostID int64, ip string, hardwareID str
|
||||
return
|
||||
}
|
||||
|
||||
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Starting cluster join via "+o.Cluster.ExistingNode)
|
||||
if err := o.Cluster.Join(ctx, ip, privateKey, publicKey); err != nil {
|
||||
log.Printf("host %d: cluster join failed: %v", hostID, err)
|
||||
o.Runner.FailHost(ctx, hostID, "cluster join: "+err.Error())
|
||||
return
|
||||
}
|
||||
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Cluster join complete")
|
||||
|
||||
// Key has been removed from the remote host; clear it from the DB
|
||||
_ = o.Hosts.ClearEphemeralKey(ctx, hostID)
|
||||
|
||||
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Registering with infrastructure API")
|
||||
if err := o.registerInfra(ctx, host, ip, hardwareID); err != nil {
|
||||
log.Printf("host %d: infra registration failed: %v", hostID, err)
|
||||
o.Runner.FailHost(ctx, hostID, "infra registration: "+err.Error())
|
||||
@@ -94,6 +97,7 @@ func (o *HostOrchestrator) postPhoneHome(hostID int64, ip string, hardwareID str
|
||||
_ = o.Ops.Complete(ctx, op.ID)
|
||||
}
|
||||
_ = o.Locks.Release(ctx, hostID)
|
||||
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Provisioning complete")
|
||||
log.Printf("host %d (%s): provisioning complete", hostID, host.Hostname)
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"log"
"time"

"provisioning/internal/events"
"provisioning/internal/model"
|
||||
@@ -12,10 +13,11 @@ import (
|
||||
)
|
||||
|
||||
type Runner struct {
|
||||
Hosts *store.Hosts
|
||||
Ops *store.Operations
|
||||
Locks *store.Locks
|
||||
Hub *events.Hub
|
||||
Hosts *store.Hosts
|
||||
Ops *store.Operations
|
||||
Locks *store.Locks
|
||||
Hub *events.Hub
|
||||
Activity *store.Activity
|
||||
}
|
||||
|
||||
func (r *Runner) Transition(ctx context.Context, hostID int64, trigger statemachine.Trigger) (model.HostState, error) {
|
||||
@@ -35,10 +37,12 @@ func (r *Runner) Transition(ctx context.Context, hostID int64, trigger statemach
|
||||
Name: "host.state_changed",
|
||||
Payload: fmt.Sprintf(`{"host_id":%d,"old_state":"%s","new_state":"%s"}`, hostID, host.State, next),
|
||||
})
|
||||
r.LogActivity(ctx, hostID, model.LogInfo, "state", fmt.Sprintf("%s → %s", host.State, next))
|
||||
return next, nil
|
||||
}
|
||||
|
||||
func (r *Runner) FailHost(ctx context.Context, hostID int64, reason string) {
|
||||
r.LogActivity(ctx, hostID, model.LogError, "orchestrator", "host failed: "+reason)
|
||||
if _, err := r.Transition(ctx, hostID, statemachine.TriggerFailed); err != nil {
|
||||
log.Printf("host %d: failed to transition to failed state: %v", hostID, err)
|
||||
return
|
||||
@@ -49,3 +53,16 @@ func (r *Runner) FailHost(ctx context.Context, hostID int64, reason string) {
|
||||
}
|
||||
_ = r.Locks.Release(ctx, hostID)
|
||||
}
|
||||
|
||||
func (r *Runner) LogActivity(ctx context.Context, hostID int64, level model.LogLevel, source, message string) {
|
||||
var opID int64
|
||||
if op, err := r.Ops.GetActive(ctx, hostID); err == nil {
|
||||
opID = op.ID
|
||||
}
|
||||
id, _ := r.Activity.Log(ctx, hostID, opID, level, source, message)
|
||||
r.Hub.Publish(events.Event{
|
||||
Name: "activity.logged",
|
||||
Payload: fmt.Sprintf(`{"id":%d,"host_id":%d,"level":"%s","source":"%s","message":"%s","created_at":"%s"}`,
|
||||
id, hostID, level, source, message, time.Now().UTC().Format(time.RFC3339)),
|
||||
})
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user