diff --git a/cmd/provisioning/main.go b/cmd/provisioning/main.go index 7c2cad4..449d6be 100644 --- a/cmd/provisioning/main.go +++ b/cmd/provisioning/main.go @@ -54,16 +54,18 @@ func main() { ops := &store.Operations{DB: database} locks := &store.Locks{DB: database, TTLMinutes: cfg.Locks.TTLMinutes} images := &store.Images{DB: database} + activity := &store.Activity{DB: database} imageSvc := &image.Service{Store: images, ImageDir: cfg.Images.Dir} hub := events.NewHub() runner := &orchestrator.Runner{ - Hosts: hosts, - Ops: ops, - Locks: locks, - Hub: hub, + Hosts: hosts, + Ops: ops, + Locks: locks, + Hub: hub, + Activity: activity, } pxeSupervisor := pxe.NewSupervisor(pxe.SupervisorConfig{ @@ -126,6 +128,7 @@ func main() { Ops: ops, Locks: locks, Images: images, + Activity: activity, ImageSvc: imageSvc, Runner: runner, Orchestrator: hostOrch, diff --git a/internal/api/boot.go b/internal/api/boot.go index a721509..366d25c 100644 --- a/internal/api/boot.go +++ b/internal/api/boot.go @@ -3,6 +3,7 @@ package api import ( "encoding/json" "errors" + "fmt" "log" "net/http" "strings" @@ -46,6 +47,7 @@ func (a *BootAPI) IPXEScript(w http.ResponseWriter, r *http.Request) { if host.State == model.StatePXEReady { a.Runner.Transition(r.Context(), host.ID, statemachine.TriggerPXEScriptServed) + a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", "iPXE script served — kernel + initrd delivered") } w.Header().Set("Content-Type", "text/plain") @@ -80,6 +82,7 @@ func (a *BootAPI) AnswerFile(w http.ResponseWriter, r *http.Request) { if host.State == model.StatePXEBooted { a.Runner.Transition(r.Context(), host.ID, statemachine.TriggerAnswerServed) + a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", "Answer file served — installation starting") } _, pubKey, _ := a.Hosts.GetEphemeralKey(r.Context(), host.ID) @@ -106,6 +109,7 @@ func (a *BootAPI) InstallComplete(w http.ResponseWriter, r *http.Request) { } if host.State == model.StateInstalling { + 
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", "Install-complete webhook received") if _, err := a.Runner.Transition(r.Context(), host.ID, statemachine.TriggerInstallWebhook); err != nil { log.Printf("host %d: install-complete transition failed: %v", host.ID, err) } @@ -159,6 +163,7 @@ func (a *BootAPI) PhoneHome(w http.ResponseWriter, r *http.Request) { } log.Printf("host %d (%s): phone-home from %s, hwid=%s", host.ID, host.Hostname, req.IP, req.HardwareID) + a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", fmt.Sprintf("Phone-home received from %s", req.IP)) a.Orchestrator.HandlePhoneHome(r.Context(), host.ID, req.IP, req.HardwareID) writeJSON(w, http.StatusOK, map[string]any{"ok": true}) diff --git a/internal/api/hosts.go b/internal/api/hosts.go index 73d2594..726dcb9 100644 --- a/internal/api/hosts.go +++ b/internal/api/hosts.go @@ -124,6 +124,7 @@ func (a *HostAPI) Rebuild(w http.ResponseWriter, r *http.Request) { writeJSONErr(w, http.StatusInternalServerError, "failed to acquire lock") return } + a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "api", "Rebuild triggered via API") if err := a.Orchestrator.PrepareRebuild(r.Context(), host.ID); err != nil { _ = a.Locks.Release(r.Context(), host.ID) diff --git a/internal/api/render.go b/internal/api/render.go index d07b7f5..514e07b 100644 --- a/internal/api/render.go +++ b/internal/api/render.go @@ -5,6 +5,7 @@ import ( "html" "net/http" "strings" + "time" "provisioning/internal/model" ) @@ -83,7 +84,7 @@ func hostFormPage(types []string, errMsg string, prefill *model.Host) string { `, errHTML, hostname, mac, opts.String(), notes)) } -func hostDetailPage(h *model.Host, ops []model.Operation) string { +func hostDetailPage(h *model.Host, ops []model.Operation, activity []model.ActivityEntry) string { stateClass := stateColor(h.State) led := ledClass(h.State) canRebuild := h.State == model.StateRegistered || h.State == model.StateReady || h.State == model.StateFailed @@ 
-113,12 +114,29 @@ func hostDetailPage(h *model.Host, ops []model.Operation) string { ip = "—" } + var stuckWarning string + if h.State == model.StatePXEReady && time.Since(h.UpdatedAt) > 10*time.Minute { + mins := int(time.Since(h.UpdatedAt).Minutes()) + stuckWarning = fmt.Sprintf(`
Host has been in PXE_READY for %d minutes with no iPXE request. This usually means the host failed to PXE boot — check secure boot settings, network connectivity, and BIOS boot order.
`, mins) + } + + var activityHTML strings.Builder + for _, e := range activity { + activityHTML.WriteString(fmt.Sprintf( + `
%s%s%s
`, + e.Level, e.CreatedAt.Format("15:04"), html.EscapeString(e.Source), html.EscapeString(e.Message))) + } + if len(activity) == 0 { + activityHTML.WriteString(`

No activity recorded yet.

`) + } + return layout(h.Hostname, fmt.Sprintf(`

%s

%s
+ %s
@@ -135,7 +153,15 @@ func hostDetailPage(h *model.Host, ops []model.Operation) string { %s
MAC%s
- `, led, html.EscapeString(h.Hostname), stateClass, h.State, h.MAC, h.ServerType, ip, html.EscapeString(h.Notes), actions.String(), opsHTML.String())) +

Activity Log

+
+
%s
+
+ `, led, html.EscapeString(h.Hostname), stateClass, h.State, + stuckWarning, + h.MAC, h.ServerType, ip, html.EscapeString(h.Notes), actions.String(), + opsHTML.String(), + h.ID, activityHTML.String())) } func imagesPage(images []model.Image) string { diff --git a/internal/api/ui.go b/internal/api/ui.go index c2052b0..fd36e8b 100644 --- a/internal/api/ui.go +++ b/internal/api/ui.go @@ -24,6 +24,7 @@ type UI struct { Ops *store.Operations Locks *store.Locks Images *store.Images + Activity *store.Activity ImageSvc *image.Service Runner *orchestrator.Runner Orchestrator *orchestrator.HostOrchestrator @@ -106,7 +107,8 @@ func (u *UI) HostDetail(w http.ResponseWriter, r *http.Request) { return } ops, _ := u.Ops.ListByHost(r.Context(), host.ID) - renderHTML(w, hostDetailPage(host, ops)) + activity, _ := u.Activity.ListByHost(r.Context(), host.ID, 50) + renderHTML(w, hostDetailPage(host, ops, activity)) } func (u *UI) TriggerRebuild(w http.ResponseWriter, r *http.Request) { @@ -131,6 +133,7 @@ func (u *UI) TriggerRebuild(w http.ResponseWriter, r *http.Request) { Kind: model.OpRebuildProxmox, }) _ = u.Locks.Acquire(r.Context(), host.ID, opID) + u.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "ui", "Rebuild triggered by user") if err := u.Orchestrator.PrepareRebuild(r.Context(), host.ID); err != nil { _ = u.Locks.Release(r.Context(), host.ID) diff --git a/internal/db/migrations/0003_activity_log.sql b/internal/db/migrations/0003_activity_log.sql new file mode 100644 index 0000000..5e43cbb --- /dev/null +++ b/internal/db/migrations/0003_activity_log.sql @@ -0,0 +1,10 @@ +CREATE TABLE activity_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + host_id INTEGER NOT NULL REFERENCES hosts(id) ON DELETE CASCADE, + operation_id INTEGER REFERENCES operations(id) ON DELETE SET NULL, + level TEXT NOT NULL DEFAULT 'info', + message TEXT NOT NULL, + source TEXT NOT NULL DEFAULT '', + created_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ','now')) +); +CREATE INDEX 
idx_activity_host ON activity_log(host_id, created_at DESC); diff --git a/internal/model/model.go b/internal/model/model.go index f2b4fe9..2a85175 100644 --- a/internal/model/model.go +++ b/internal/model/model.go @@ -66,6 +66,24 @@ type Image struct { CreatedAt time.Time } +type LogLevel string + +const ( + LogInfo LogLevel = "info" + LogWarn LogLevel = "warn" + LogError LogLevel = "error" +) + +type ActivityEntry struct { + ID int64 + HostID int64 + OperationID int64 + Level LogLevel + Message string + Source string + CreatedAt time.Time +} + type ServerType struct { Key string DisplayName string `yaml:"display_name"` diff --git a/internal/orchestrator/host.go b/internal/orchestrator/host.go index 571c4ce..66f77c0 100644 --- a/internal/orchestrator/host.go +++ b/internal/orchestrator/host.go @@ -33,6 +33,7 @@ func (o *HostOrchestrator) PrepareRebuild(ctx context.Context, hostID int64) err } func (o *HostOrchestrator) HandlePhoneHome(ctx context.Context, hostID int64, ip string, hardwareID string) { + o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Phone-home: updating IP to "+ip) if err := o.Hosts.UpdateIP(ctx, hostID, ip, hardwareID); err != nil { log.Printf("host %d: failed to update IP: %v", hostID, err) o.Runner.FailHost(ctx, hostID, "failed to update IP: "+err.Error()) @@ -69,15 +70,17 @@ func (o *HostOrchestrator) postPhoneHome(hostID int64, ip string, hardwareID str return } + o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Starting cluster join via "+o.Cluster.ExistingNode) if err := o.Cluster.Join(ctx, ip, privateKey, publicKey); err != nil { log.Printf("host %d: cluster join failed: %v", hostID, err) o.Runner.FailHost(ctx, hostID, "cluster join: "+err.Error()) return } + o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Cluster join complete") - // Key has been removed from the remote host; clear it from the DB _ = o.Hosts.ClearEphemeralKey(ctx, hostID) + o.Runner.LogActivity(ctx, hostID, 
model.LogInfo, "orchestrator", "Registering with infrastructure API")
 	if err := o.registerInfra(ctx, host, ip, hardwareID); err != nil {
 		log.Printf("host %d: infra registration failed: %v", hostID, err)
 		o.Runner.FailHost(ctx, hostID, "infra registration: "+err.Error())
@@ -94,6 +97,7 @@ func (o *HostOrchestrator) postPhoneHome(hostID int64, ip string, hardwareID str
 		_ = o.Ops.Complete(ctx, op.ID)
 	}
 	_ = o.Locks.Release(ctx, hostID)
+	o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Provisioning complete")
 
 	log.Printf("host %d (%s): provisioning complete", hostID, host.Hostname)
 }
diff --git a/internal/orchestrator/runner.go b/internal/orchestrator/runner.go
index d3e1783..c987e4e 100644
--- a/internal/orchestrator/runner.go
+++ b/internal/orchestrator/runner.go
@@ -4,6 +4,8 @@ import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"log"
+	"time"
 
 	"provisioning/internal/events"
 	"provisioning/internal/model"
@@ -12,10 +14,11 @@ import (
 )
 
 type Runner struct {
-	Hosts *store.Hosts
-	Ops   *store.Operations
-	Locks *store.Locks
-	Hub   *events.Hub
+	Hosts    *store.Hosts
+	Ops      *store.Operations
+	Locks    *store.Locks
+	Hub      *events.Hub
+	Activity *store.Activity
 }
 
 func (r *Runner) Transition(ctx context.Context, hostID int64, trigger statemachine.Trigger) (model.HostState, error) {
@@ -35,10 +38,12 @@ func (r *Runner) Transition(ctx context.Context, hostID int64, trigger statemach
 		Name:    "host.state_changed",
 		Payload: fmt.Sprintf(`{"host_id":%d,"old_state":"%s","new_state":"%s"}`, hostID, host.State, next),
 	})
+	r.LogActivity(ctx, hostID, model.LogInfo, "state", fmt.Sprintf("%s → %s", host.State, next))
 	return next, nil
 }
 
 func (r *Runner) FailHost(ctx context.Context, hostID int64, reason string) {
+	r.LogActivity(ctx, hostID, model.LogError, "orchestrator", "host failed: "+reason)
 	if _, err := r.Transition(ctx, hostID, statemachine.TriggerFailed); err != nil {
 		log.Printf("host %d: failed to transition to failed state: %v", hostID, err)
 		return
@@ -49,3 +54,49 @@ func (r *Runner) FailHost(ctx context.Context, hostID int64, reason string) {
 	}
 	_ = r.Locks.Release(ctx, hostID)
 }
+
+// LogActivity records an activity entry for hostID and publishes it on the
+// hub as an "activity.logged" event.
+//
+// The entry is tagged with the host's active operation when Ops.GetActive
+// succeeds; otherwise opID is left at 0. Logging is best effort: failures go
+// to the process log and are never returned to the caller.
+func (r *Runner) LogActivity(ctx context.Context, hostID int64, level model.LogLevel, source, message string) {
+	var opID int64
+	if op, err := r.Ops.GetActive(ctx, hostID); err == nil {
+		opID = op.ID
+	}
+
+	var id int64
+	// Activity is a newly added field; tolerate Runner values built without it.
+	if r.Activity != nil {
+		var err error
+		if id, err = r.Activity.Log(ctx, hostID, opID, level, source, message); err != nil {
+			log.Printf("host %d: failed to record activity: %v", hostID, err)
+		}
+	}
+
+	// Encode with encoding/json instead of Sprintf: messages embed error text
+	// and client-supplied values, so quotes or backslashes would otherwise
+	// produce a payload that subscribers cannot parse.
+	payload, err := json.Marshal(struct {
+		ID        int64          `json:"id"`
+		HostID    int64          `json:"host_id"`
+		Level     model.LogLevel `json:"level"`
+		Source    string         `json:"source"`
+		Message   string         `json:"message"`
+		CreatedAt string         `json:"created_at"`
+	}{
+		ID:        id,
+		HostID:    hostID,
+		Level:     level,
+		Source:    source,
+		Message:   message,
+		CreatedAt: time.Now().UTC().Format(time.RFC3339),
+	})
+	if err != nil {
+		log.Printf("host %d: failed to encode activity event: %v", hostID, err)
+		return
+	}
+	r.Hub.Publish(events.Event{Name: "activity.logged", Payload: string(payload)})
+}
diff --git a/internal/store/activity.go b/internal/store/activity.go
new file mode 100644
index 0000000..b75a74d
--- /dev/null
+++ b/internal/store/activity.go
@@ -0,0 +1,59 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"time"
+
+	"provisioning/internal/model"
+)
+
+// Activity persists per-host activity log entries in the activity_log table.
+type Activity struct {
+	DB *sql.DB
+}
+
+// Log inserts one activity entry for hostID and returns the new row ID.
+// opID goes through nullInt64 (defined elsewhere in this package; presumably 0 becomes NULL).
+func (s *Activity) Log(ctx context.Context, hostID, opID int64, level model.LogLevel, source, message string) (int64, error) {
+	res, err := s.DB.ExecContext(ctx, `
+		INSERT INTO activity_log(host_id, operation_id, level, message, source)
+		VALUES(?,?,?,?,?)
+	`, hostID, nullInt64(opID), level, message, source)
+	if err != nil {
+		return 0, fmt.Errorf("insert activity: %w", err)
+	}
+	return res.LastInsertId()
+}
+
+// ListByHost returns up to limit entries for hostID, newest first. A
+// non-positive limit falls back to 50.
+func (s *Activity) ListByHost(ctx context.Context, hostID int64, limit int) ([]model.ActivityEntry, error) {
+	if limit <= 0 {
+		limit = 50
+	}
+	// created_at has one-second resolution, so id breaks ties to keep entries
+	// written within the same second in a stable newest-first order.
+	rows, err := s.DB.QueryContext(ctx, `
+		SELECT id, host_id, COALESCE(operation_id, 0), level, message, source, created_at
+		FROM activity_log WHERE host_id = ? ORDER BY created_at DESC, id DESC LIMIT ?
+	`, hostID, limit)
+	if err != nil {
+		return nil, fmt.Errorf("list activity: %w", err)
+	}
+	defer rows.Close()
+	var out []model.ActivityEntry
+	for rows.Next() {
+		var e model.ActivityEntry
+		var createdAt string
+		if err := rows.Scan(&e.ID, &e.HostID, &e.OperationID, &e.Level, &e.Message, &e.Source, &createdAt); err != nil {
+			return nil, fmt.Errorf("scan activity: %w", err)
+		}
+		// Best effort: an unparseable timestamp leaves CreatedAt zero rather
+		// than hiding the whole log from the caller.
+		e.CreatedAt, _ = time.Parse(time.RFC3339, createdAt)
+		out = append(out, e)
+	}
+	return out, rows.Err()
+}
diff --git a/internal/web/static/app.css b/internal/web/static/app.css
index e26ed06..3cf7b77 100644
--- a/internal/web/static/app.css
+++ b/internal/web/static/app.css
@@ -391,6 +391,48 @@ main {
 .progress-text { font-size: 0.825rem; color: var(--text); font-weight: 500; }
 .progress-detail { font-size: 0.775rem; color: var(--text-secondary); }
 
+/* === STUCK WARNING === */
+.stuck-warning {
+  background: var(--amber-bg);
+  color: var(--amber);
+  border: 1px solid var(--amber-border);
+  padding: 0.75rem 1rem;
+  border-radius: var(--radius-sm);
+  margin-bottom: 1rem;
+  font-size: 0.825rem;
+  font-weight: 500;
+  line-height: 1.5;
+}
+
+/* === ACTIVITY LOG === */
+.activity-log { max-height: 400px; overflow-y: auto; }
+.log-entry {
+  display: flex;
+  align-items: baseline;
+  gap: 0.75rem;
+  padding: 0.4rem 0.75rem;
+  border-bottom: 1px solid var(--border);
+  font-size: 0.825rem;
+}
+.log-entry:last-child { border-bottom: none; }
+.log-time {
+  color: var(--text-tertiary);
+  font-family: var(--font-mono);
+  font-size: 0.75rem;
+  min-width: 3.5rem;
+  flex-shrink: 0;
+}
+.log-source {
+  color: var(--text-tertiary);
+  font-family: var(--font-mono);
+  font-size: 0.75rem;
+  min-width: 5rem;
+  flex-shrink: 0;
+}
+.log-msg { color: var(--text); }
+.log-warn .log-msg { color: var(--amber); }
+.log-error .log-msg { color: var(--red); font-weight: 500; }
+
 /* === UTILITY === */
 .inline { display: inline; }
 
diff --git a/internal/web/static/app.js b/internal/web/static/app.js
index 
c417ff5..739131c 100644
--- a/internal/web/static/app.js
+++ b/internal/web/static/app.js
@@ -10,6 +10,31 @@
     es.addEventListener('host.state_changed', function() {
       window.location.reload();
     });
+    es.addEventListener('activity.logged', function(e) {
+      var data;
+      try { data = JSON.parse(e.data); } catch(_) { return; }
+      var logDiv = document.getElementById('activity-log');
+      if (!logDiv) return;
+      // Only show entries for the host this page is displaying.
+      var hostId = logDiv.getAttribute('data-host-id');
+      if (String(data.host_id) !== hostId) return;
+      var empty = logDiv.querySelector('.empty');
+      if (empty) empty.remove();
+      var entry = document.createElement('div');
+      entry.className = 'log-entry log-' + data.level;
+      // UTC, to match the HH:MM the server renders for existing entries.
+      var t = new Date(data.created_at);
+      var ts = t.getUTCHours().toString().padStart(2,'0') + ':' + t.getUTCMinutes().toString().padStart(2,'0');
+      // Messages can contain client-supplied text (e.g. the phone-home IP), so
+      // set them via textContent; never interpolate them into innerHTML.
+      [['log-time', ts], ['log-source', data.source], ['log-msg', data.message]].forEach(function(cell) {
+        var span = document.createElement('span');
+        span.className = cell[0]; // class names follow the .log-* rules in app.css
+        span.textContent = cell[1];
+        entry.appendChild(span);
+      });
+      logDiv.insertBefore(entry, logDiv.firstChild);
+    });
     es.onerror = function() {
       dot.className = 'led led-red';
       es.close();