a6603b463f
Hosts stuck in states like pxe_ready had zero visibility into why. This adds a persistent activity log that records every meaningful step (state transitions, PXE events, cluster join stages, failures) and surfaces it on the host detail page with live SSE updates. Includes a stuck-detection warning banner when a host sits in pxe_ready for >10 minutes with no iPXE request. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
69 lines
2.1 KiB
Go
69 lines
2.1 KiB
Go
package orchestrator
|
|
|
|
import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"time"

	"provisioning/internal/events"
	"provisioning/internal/model"
	"provisioning/internal/statemachine"
	"provisioning/internal/store"
)
|
|
|
|
type Runner struct {
|
|
Hosts *store.Hosts
|
|
Ops *store.Operations
|
|
Locks *store.Locks
|
|
Hub *events.Hub
|
|
Activity *store.Activity
|
|
}
|
|
|
|
func (r *Runner) Transition(ctx context.Context, hostID int64, trigger statemachine.Trigger) (model.HostState, error) {
|
|
host, err := r.Hosts.Get(ctx, hostID)
|
|
if err != nil {
|
|
return "", fmt.Errorf("transition: get host: %w", err)
|
|
}
|
|
next, err := statemachine.Next(host.State, trigger)
|
|
if err != nil {
|
|
return "", fmt.Errorf("transition: %w", err)
|
|
}
|
|
if err := r.Hosts.UpdateState(ctx, hostID, next); err != nil {
|
|
return "", fmt.Errorf("transition: update state: %w", err)
|
|
}
|
|
log.Printf("host %d (%s): %s -> %s [%s]", hostID, host.Hostname, host.State, next, trigger)
|
|
r.Hub.Publish(events.Event{
|
|
Name: "host.state_changed",
|
|
Payload: fmt.Sprintf(`{"host_id":%d,"old_state":"%s","new_state":"%s"}`, hostID, host.State, next),
|
|
})
|
|
r.LogActivity(ctx, hostID, model.LogInfo, "state", fmt.Sprintf("%s → %s", host.State, next))
|
|
return next, nil
|
|
}
|
|
|
|
func (r *Runner) FailHost(ctx context.Context, hostID int64, reason string) {
|
|
r.LogActivity(ctx, hostID, model.LogError, "orchestrator", "host failed: "+reason)
|
|
if _, err := r.Transition(ctx, hostID, statemachine.TriggerFailed); err != nil {
|
|
log.Printf("host %d: failed to transition to failed state: %v", hostID, err)
|
|
return
|
|
}
|
|
op, err := r.Ops.GetActive(ctx, hostID)
|
|
if err == nil {
|
|
_ = r.Ops.Fail(ctx, op.ID, reason)
|
|
}
|
|
_ = r.Locks.Release(ctx, hostID)
|
|
}
|
|
|
|
func (r *Runner) LogActivity(ctx context.Context, hostID int64, level model.LogLevel, source, message string) {
|
|
var opID int64
|
|
if op, err := r.Ops.GetActive(ctx, hostID); err == nil {
|
|
opID = op.ID
|
|
}
|
|
id, _ := r.Activity.Log(ctx, hostID, opID, level, source, message)
|
|
r.Hub.Publish(events.Event{
|
|
Name: "activity.logged",
|
|
Payload: fmt.Sprintf(`{"id":%d,"host_id":%d,"level":"%s","source":"%s","message":"%s","created_at":"%s"}`,
|
|
id, hostID, level, source, message, time.Now().UTC().Format(time.RFC3339)),
|
|
})
|
|
}
|