Add activity log system for provisioning lifecycle visibility
build-and-push / test (push) Failing after 32s
build-and-push / build-and-push (push) Has been skipped

Hosts stuck in states like pxe_ready had zero visibility into why.
This adds a persistent activity log that records every meaningful
step (state transitions, PXE events, cluster join stages, failures)
and surfaces it on the host detail page with live SSE updates.
Includes a stuck-detection warning banner when a host sits in
pxe_ready for >10 minutes with no iPXE request.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-13 23:30:21 -04:00
parent c3a1cf99f9
commit a6603b463f
12 changed files with 209 additions and 12 deletions
+5
View File
@@ -3,6 +3,7 @@ package api
import (
"encoding/json"
"errors"
"fmt"
"log"
"net/http"
"strings"
@@ -46,6 +47,7 @@ func (a *BootAPI) IPXEScript(w http.ResponseWriter, r *http.Request) {
if host.State == model.StatePXEReady {
a.Runner.Transition(r.Context(), host.ID, statemachine.TriggerPXEScriptServed)
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", "iPXE script served — kernel + initrd delivered")
}
w.Header().Set("Content-Type", "text/plain")
@@ -80,6 +82,7 @@ func (a *BootAPI) AnswerFile(w http.ResponseWriter, r *http.Request) {
if host.State == model.StatePXEBooted {
a.Runner.Transition(r.Context(), host.ID, statemachine.TriggerAnswerServed)
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", "Answer file served — installation starting")
}
_, pubKey, _ := a.Hosts.GetEphemeralKey(r.Context(), host.ID)
@@ -106,6 +109,7 @@ func (a *BootAPI) InstallComplete(w http.ResponseWriter, r *http.Request) {
}
if host.State == model.StateInstalling {
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", "Install-complete webhook received")
if _, err := a.Runner.Transition(r.Context(), host.ID, statemachine.TriggerInstallWebhook); err != nil {
log.Printf("host %d: install-complete transition failed: %v", host.ID, err)
}
@@ -159,6 +163,7 @@ func (a *BootAPI) PhoneHome(w http.ResponseWriter, r *http.Request) {
}
log.Printf("host %d (%s): phone-home from %s, hwid=%s", host.ID, host.Hostname, req.IP, req.HardwareID)
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", fmt.Sprintf("Phone-home received from %s", req.IP))
a.Orchestrator.HandlePhoneHome(r.Context(), host.ID, req.IP, req.HardwareID)
writeJSON(w, http.StatusOK, map[string]any{"ok": true})
+1
View File
@@ -124,6 +124,7 @@ func (a *HostAPI) Rebuild(w http.ResponseWriter, r *http.Request) {
writeJSONErr(w, http.StatusInternalServerError, "failed to acquire lock")
return
}
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "api", "Rebuild triggered via API")
if err := a.Orchestrator.PrepareRebuild(r.Context(), host.ID); err != nil {
_ = a.Locks.Release(r.Context(), host.ID)
+28 -2
View File
@@ -5,6 +5,7 @@ import (
"html"
"net/http"
"strings"
"time"
"provisioning/internal/model"
)
@@ -83,7 +84,7 @@ func hostFormPage(types []string, errMsg string, prefill *model.Host) string {
`, errHTML, hostname, mac, opts.String(), notes))
}
func hostDetailPage(h *model.Host, ops []model.Operation) string {
func hostDetailPage(h *model.Host, ops []model.Operation, activity []model.ActivityEntry) string {
stateClass := stateColor(h.State)
led := ledClass(h.State)
canRebuild := h.State == model.StateRegistered || h.State == model.StateReady || h.State == model.StateFailed
@@ -113,12 +114,29 @@ func hostDetailPage(h *model.Host, ops []model.Operation) string {
ip = "—"
}
var stuckWarning string
if h.State == model.StatePXEReady && time.Since(h.UpdatedAt) > 10*time.Minute {
mins := int(time.Since(h.UpdatedAt).Minutes())
stuckWarning = fmt.Sprintf(`<div class="stuck-warning">Host has been in PXE_READY for %d minutes with no iPXE request. This usually means the host failed to PXE boot — check secure boot settings, network connectivity, and BIOS boot order.</div>`, mins)
}
var activityHTML strings.Builder
for _, e := range activity {
activityHTML.WriteString(fmt.Sprintf(
`<div class="log-entry log-%s"><span class="log-time">%s</span><span class="log-source">%s</span><span class="log-msg">%s</span></div>`,
e.Level, e.CreatedAt.Format("15:04"), html.EscapeString(e.Source), html.EscapeString(e.Message)))
}
if len(activity) == 0 {
activityHTML.WriteString(`<p class="empty">No activity recorded yet.</p>`)
}
return layout(h.Hostname, fmt.Sprintf(`
<div class="host-header">
<span class="led led-lg %s"></span>
<h2 style="margin-bottom:0">%s</h2>
<span class="badge %s">%s</span>
</div>
%s
<div class="panel">
<table class="detail-table">
<tr><th>MAC</th><td>%s</td></tr>
@@ -135,7 +153,15 @@ func hostDetailPage(h *model.Host, ops []model.Operation) string {
<tbody>%s</tbody>
</table>
</div>
`, led, html.EscapeString(h.Hostname), stateClass, h.State, h.MAC, h.ServerType, ip, html.EscapeString(h.Notes), actions.String(), opsHTML.String()))
<h3>Activity Log</h3>
<div class="panel">
<div class="activity-log" id="activity-log" data-host-id="%d">%s</div>
</div>
`, led, html.EscapeString(h.Hostname), stateClass, h.State,
stuckWarning,
h.MAC, h.ServerType, ip, html.EscapeString(h.Notes), actions.String(),
opsHTML.String(),
h.ID, activityHTML.String()))
}
func imagesPage(images []model.Image) string {
+4 -1
View File
@@ -24,6 +24,7 @@ type UI struct {
Ops *store.Operations
Locks *store.Locks
Images *store.Images
Activity *store.Activity
ImageSvc *image.Service
Runner *orchestrator.Runner
Orchestrator *orchestrator.HostOrchestrator
@@ -106,7 +107,8 @@ func (u *UI) HostDetail(w http.ResponseWriter, r *http.Request) {
return
}
ops, _ := u.Ops.ListByHost(r.Context(), host.ID)
renderHTML(w, hostDetailPage(host, ops))
activity, _ := u.Activity.ListByHost(r.Context(), host.ID, 50)
renderHTML(w, hostDetailPage(host, ops, activity))
}
func (u *UI) TriggerRebuild(w http.ResponseWriter, r *http.Request) {
@@ -131,6 +133,7 @@ func (u *UI) TriggerRebuild(w http.ResponseWriter, r *http.Request) {
Kind: model.OpRebuildProxmox,
})
_ = u.Locks.Acquire(r.Context(), host.ID, opID)
u.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "ui", "Rebuild triggered by user")
if err := u.Orchestrator.PrepareRebuild(r.Context(), host.ID); err != nil {
_ = u.Locks.Release(r.Context(), host.ID)
@@ -0,0 +1,10 @@
CREATE TABLE activity_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
host_id INTEGER NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
operation_id INTEGER REFERENCES operations(id) ON DELETE SET NULL,
level TEXT NOT NULL DEFAULT 'info',
message TEXT NOT NULL,
source TEXT NOT NULL DEFAULT '',
created_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ','now'))
);
CREATE INDEX idx_activity_host ON activity_log(host_id, created_at DESC);
+18
View File
@@ -66,6 +66,24 @@ type Image struct {
CreatedAt time.Time
}
type LogLevel string
const (
LogInfo LogLevel = "info"
LogWarn LogLevel = "warn"
LogError LogLevel = "error"
)
type ActivityEntry struct {
ID int64
HostID int64
OperationID int64
Level LogLevel
Message string
Source string
CreatedAt time.Time
}
type ServerType struct {
Key string
DisplayName string `yaml:"display_name"`
+5 -1
View File
@@ -33,6 +33,7 @@ func (o *HostOrchestrator) PrepareRebuild(ctx context.Context, hostID int64) err
}
func (o *HostOrchestrator) HandlePhoneHome(ctx context.Context, hostID int64, ip string, hardwareID string) {
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Phone-home: updating IP to "+ip)
if err := o.Hosts.UpdateIP(ctx, hostID, ip, hardwareID); err != nil {
log.Printf("host %d: failed to update IP: %v", hostID, err)
o.Runner.FailHost(ctx, hostID, "failed to update IP: "+err.Error())
@@ -69,15 +70,17 @@ func (o *HostOrchestrator) postPhoneHome(hostID int64, ip string, hardwareID str
return
}
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Starting cluster join via "+o.Cluster.ExistingNode)
if err := o.Cluster.Join(ctx, ip, privateKey, publicKey); err != nil {
log.Printf("host %d: cluster join failed: %v", hostID, err)
o.Runner.FailHost(ctx, hostID, "cluster join: "+err.Error())
return
}
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Cluster join complete")
// Key has been removed from the remote host; clear it from the DB
_ = o.Hosts.ClearEphemeralKey(ctx, hostID)
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Registering with infrastructure API")
if err := o.registerInfra(ctx, host, ip, hardwareID); err != nil {
log.Printf("host %d: infra registration failed: %v", hostID, err)
o.Runner.FailHost(ctx, hostID, "infra registration: "+err.Error())
@@ -94,6 +97,7 @@ func (o *HostOrchestrator) postPhoneHome(hostID int64, ip string, hardwareID str
_ = o.Ops.Complete(ctx, op.ID)
}
_ = o.Locks.Release(ctx, hostID)
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Provisioning complete")
log.Printf("host %d (%s): provisioning complete", hostID, host.Hostname)
}
+21 -4
View File
@@ -4,6 +4,7 @@ import (
"context"
"fmt"
"log"
"time"
"provisioning/internal/events"
"provisioning/internal/model"
@@ -12,10 +13,11 @@ import (
)
type Runner struct {
Hosts *store.Hosts
Ops *store.Operations
Locks *store.Locks
Hub *events.Hub
Hosts *store.Hosts
Ops *store.Operations
Locks *store.Locks
Hub *events.Hub
Activity *store.Activity
}
func (r *Runner) Transition(ctx context.Context, hostID int64, trigger statemachine.Trigger) (model.HostState, error) {
@@ -35,10 +37,12 @@ func (r *Runner) Transition(ctx context.Context, hostID int64, trigger statemach
Name: "host.state_changed",
Payload: fmt.Sprintf(`{"host_id":%d,"old_state":"%s","new_state":"%s"}`, hostID, host.State, next),
})
r.LogActivity(ctx, hostID, model.LogInfo, "state", fmt.Sprintf("%s → %s", host.State, next))
return next, nil
}
func (r *Runner) FailHost(ctx context.Context, hostID int64, reason string) {
r.LogActivity(ctx, hostID, model.LogError, "orchestrator", "host failed: "+reason)
if _, err := r.Transition(ctx, hostID, statemachine.TriggerFailed); err != nil {
log.Printf("host %d: failed to transition to failed state: %v", hostID, err)
return
@@ -49,3 +53,16 @@ func (r *Runner) FailHost(ctx context.Context, hostID int64, reason string) {
}
_ = r.Locks.Release(ctx, hostID)
}
func (r *Runner) LogActivity(ctx context.Context, hostID int64, level model.LogLevel, source, message string) {
var opID int64
if op, err := r.Ops.GetActive(ctx, hostID); err == nil {
opID = op.ID
}
id, _ := r.Activity.Log(ctx, hostID, opID, level, source, message)
r.Hub.Publish(events.Event{
Name: "activity.logged",
Payload: fmt.Sprintf(`{"id":%d,"host_id":%d,"level":"%s","source":"%s","message":"%s","created_at":"%s"}`,
id, hostID, level, source, message, time.Now().UTC().Format(time.RFC3339)),
})
}
+50
View File
@@ -0,0 +1,50 @@
package store
import (
"context"
"database/sql"
"fmt"
"time"
"provisioning/internal/model"
)
type Activity struct {
DB *sql.DB
}
func (s *Activity) Log(ctx context.Context, hostID, opID int64, level model.LogLevel, source, message string) (int64, error) {
res, err := s.DB.ExecContext(ctx, `
INSERT INTO activity_log(host_id, operation_id, level, message, source)
VALUES(?,?,?,?,?)
`, hostID, nullInt64(opID), level, message, source)
if err != nil {
return 0, fmt.Errorf("insert activity: %w", err)
}
return res.LastInsertId()
}
func (s *Activity) ListByHost(ctx context.Context, hostID int64, limit int) ([]model.ActivityEntry, error) {
if limit <= 0 {
limit = 50
}
rows, err := s.DB.QueryContext(ctx, `
SELECT id, host_id, COALESCE(operation_id, 0), level, message, source, created_at
FROM activity_log WHERE host_id = ? ORDER BY created_at DESC LIMIT ?
`, hostID, limit)
if err != nil {
return nil, fmt.Errorf("list activity: %w", err)
}
defer rows.Close()
var out []model.ActivityEntry
for rows.Next() {
var e model.ActivityEntry
var createdAt string
if err := rows.Scan(&e.ID, &e.HostID, &e.OperationID, &e.Level, &e.Message, &e.Source, &createdAt); err != nil {
return nil, fmt.Errorf("scan activity: %w", err)
}
e.CreatedAt, _ = time.Parse(time.RFC3339, createdAt)
out = append(out, e)
}
return out, rows.Err()
}
+42
View File
@@ -391,6 +391,48 @@ main {
.progress-text { font-size: 0.825rem; color: var(--text); font-weight: 500; }
.progress-detail { font-size: 0.775rem; color: var(--text-secondary); }
/* === STUCK WARNING === */
.stuck-warning {
background: var(--amber-bg);
color: var(--amber);
border: 1px solid var(--amber-border);
padding: 0.75rem 1rem;
border-radius: var(--radius-sm);
margin-bottom: 1rem;
font-size: 0.825rem;
font-weight: 500;
line-height: 1.5;
}
/* === ACTIVITY LOG === */
.activity-log { max-height: 400px; overflow-y: auto; }
.log-entry {
display: flex;
align-items: baseline;
gap: 0.75rem;
padding: 0.4rem 0.75rem;
border-bottom: 1px solid var(--border);
font-size: 0.825rem;
}
.log-entry:last-child { border-bottom: none; }
.log-time {
color: var(--text-tertiary);
font-family: var(--font-mono);
font-size: 0.75rem;
min-width: 3.5rem;
flex-shrink: 0;
}
.log-source {
color: var(--text-tertiary);
font-family: var(--font-mono);
font-size: 0.75rem;
min-width: 5rem;
flex-shrink: 0;
}
.log-msg { color: var(--text); }
.log-warn .log-msg { color: var(--amber); }
.log-error .log-msg { color: var(--red); font-weight: 500; }
/* === UTILITY === */
.inline { display: inline; }
+18
View File
@@ -10,6 +10,24 @@
es.addEventListener('host.state_changed', function() {
window.location.reload();
});
es.addEventListener('activity.logged', function(e) {
var data;
try { data = JSON.parse(e.data); } catch(_) { return; }
var logDiv = document.getElementById('activity-log');
if (!logDiv) return;
var hostId = logDiv.getAttribute('data-host-id');
if (String(data.host_id) !== hostId) return;
var empty = logDiv.querySelector('.empty');
if (empty) empty.remove();
var entry = document.createElement('div');
entry.className = 'log-entry log-' + data.level;
var t = new Date(data.created_at);
var ts = t.getHours().toString().padStart(2,'0') + ':' + t.getMinutes().toString().padStart(2,'0');
entry.innerHTML = '<span class="log-time">' + ts + '</span>' +
'<span class="log-source">' + data.source + '</span>' +
'<span class="log-msg">' + data.message + '</span>';
logDiv.insertBefore(entry, logDiv.firstChild);
});
es.onerror = function() {
dot.className = 'led led-red';
es.close();