Add activity log system for provisioning lifecycle visibility
Hosts stuck in states like pxe_ready had zero visibility into why. This adds a persistent activity log that records every meaningful step (state transitions, PXE events, cluster join stages, failures) and surfaces it on the host detail page with live SSE updates. Includes a stuck-detection warning banner when a host sits in pxe_ready for >10 minutes with no iPXE request. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,6 +3,7 @@ package api
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
@@ -46,6 +47,7 @@ func (a *BootAPI) IPXEScript(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
if host.State == model.StatePXEReady {
|
||||
a.Runner.Transition(r.Context(), host.ID, statemachine.TriggerPXEScriptServed)
|
||||
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", "iPXE script served — kernel + initrd delivered")
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "text/plain")
|
||||
@@ -80,6 +82,7 @@ func (a *BootAPI) AnswerFile(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
if host.State == model.StatePXEBooted {
|
||||
a.Runner.Transition(r.Context(), host.ID, statemachine.TriggerAnswerServed)
|
||||
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", "Answer file served — installation starting")
|
||||
}
|
||||
|
||||
_, pubKey, _ := a.Hosts.GetEphemeralKey(r.Context(), host.ID)
|
||||
@@ -106,6 +109,7 @@ func (a *BootAPI) InstallComplete(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
if host.State == model.StateInstalling {
|
||||
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", "Install-complete webhook received")
|
||||
if _, err := a.Runner.Transition(r.Context(), host.ID, statemachine.TriggerInstallWebhook); err != nil {
|
||||
log.Printf("host %d: install-complete transition failed: %v", host.ID, err)
|
||||
}
|
||||
@@ -159,6 +163,7 @@ func (a *BootAPI) PhoneHome(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
log.Printf("host %d (%s): phone-home from %s, hwid=%s", host.ID, host.Hostname, req.IP, req.HardwareID)
|
||||
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", fmt.Sprintf("Phone-home received from %s", req.IP))
|
||||
a.Orchestrator.HandlePhoneHome(r.Context(), host.ID, req.IP, req.HardwareID)
|
||||
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true})
|
||||
|
||||
@@ -124,6 +124,7 @@ func (a *HostAPI) Rebuild(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSONErr(w, http.StatusInternalServerError, "failed to acquire lock")
|
||||
return
|
||||
}
|
||||
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "api", "Rebuild triggered via API")
|
||||
|
||||
if err := a.Orchestrator.PrepareRebuild(r.Context(), host.ID); err != nil {
|
||||
_ = a.Locks.Release(r.Context(), host.ID)
|
||||
|
||||
+28
-2
@@ -5,6 +5,7 @@ import (
|
||||
"html"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"provisioning/internal/model"
|
||||
)
|
||||
@@ -83,7 +84,7 @@ func hostFormPage(types []string, errMsg string, prefill *model.Host) string {
|
||||
`, errHTML, hostname, mac, opts.String(), notes))
|
||||
}
|
||||
|
||||
func hostDetailPage(h *model.Host, ops []model.Operation) string {
|
||||
func hostDetailPage(h *model.Host, ops []model.Operation, activity []model.ActivityEntry) string {
|
||||
stateClass := stateColor(h.State)
|
||||
led := ledClass(h.State)
|
||||
canRebuild := h.State == model.StateRegistered || h.State == model.StateReady || h.State == model.StateFailed
|
||||
@@ -113,12 +114,29 @@ func hostDetailPage(h *model.Host, ops []model.Operation) string {
|
||||
ip = "—"
|
||||
}
|
||||
|
||||
var stuckWarning string
|
||||
if h.State == model.StatePXEReady && time.Since(h.UpdatedAt) > 10*time.Minute {
|
||||
mins := int(time.Since(h.UpdatedAt).Minutes())
|
||||
stuckWarning = fmt.Sprintf(`<div class="stuck-warning">Host has been in PXE_READY for %d minutes with no iPXE request. This usually means the host failed to PXE boot — check secure boot settings, network connectivity, and BIOS boot order.</div>`, mins)
|
||||
}
|
||||
|
||||
var activityHTML strings.Builder
|
||||
for _, e := range activity {
|
||||
activityHTML.WriteString(fmt.Sprintf(
|
||||
`<div class="log-entry log-%s"><span class="log-time">%s</span><span class="log-source">%s</span><span class="log-msg">%s</span></div>`,
|
||||
e.Level, e.CreatedAt.Format("15:04"), html.EscapeString(e.Source), html.EscapeString(e.Message)))
|
||||
}
|
||||
if len(activity) == 0 {
|
||||
activityHTML.WriteString(`<p class="empty">No activity recorded yet.</p>`)
|
||||
}
|
||||
|
||||
return layout(h.Hostname, fmt.Sprintf(`
|
||||
<div class="host-header">
|
||||
<span class="led led-lg %s"></span>
|
||||
<h2 style="margin-bottom:0">%s</h2>
|
||||
<span class="badge %s">%s</span>
|
||||
</div>
|
||||
%s
|
||||
<div class="panel">
|
||||
<table class="detail-table">
|
||||
<tr><th>MAC</th><td>%s</td></tr>
|
||||
@@ -135,7 +153,15 @@ func hostDetailPage(h *model.Host, ops []model.Operation) string {
|
||||
<tbody>%s</tbody>
|
||||
</table>
|
||||
</div>
|
||||
`, led, html.EscapeString(h.Hostname), stateClass, h.State, h.MAC, h.ServerType, ip, html.EscapeString(h.Notes), actions.String(), opsHTML.String()))
|
||||
<h3>Activity Log</h3>
|
||||
<div class="panel">
|
||||
<div class="activity-log" id="activity-log" data-host-id="%d">%s</div>
|
||||
</div>
|
||||
`, led, html.EscapeString(h.Hostname), stateClass, h.State,
|
||||
stuckWarning,
|
||||
h.MAC, h.ServerType, ip, html.EscapeString(h.Notes), actions.String(),
|
||||
opsHTML.String(),
|
||||
h.ID, activityHTML.String()))
|
||||
}
|
||||
|
||||
func imagesPage(images []model.Image) string {
|
||||
|
||||
+4
-1
@@ -24,6 +24,7 @@ type UI struct {
|
||||
Ops *store.Operations
|
||||
Locks *store.Locks
|
||||
Images *store.Images
|
||||
Activity *store.Activity
|
||||
ImageSvc *image.Service
|
||||
Runner *orchestrator.Runner
|
||||
Orchestrator *orchestrator.HostOrchestrator
|
||||
@@ -106,7 +107,8 @@ func (u *UI) HostDetail(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
ops, _ := u.Ops.ListByHost(r.Context(), host.ID)
|
||||
renderHTML(w, hostDetailPage(host, ops))
|
||||
activity, _ := u.Activity.ListByHost(r.Context(), host.ID, 50)
|
||||
renderHTML(w, hostDetailPage(host, ops, activity))
|
||||
}
|
||||
|
||||
func (u *UI) TriggerRebuild(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -131,6 +133,7 @@ func (u *UI) TriggerRebuild(w http.ResponseWriter, r *http.Request) {
|
||||
Kind: model.OpRebuildProxmox,
|
||||
})
|
||||
_ = u.Locks.Acquire(r.Context(), host.ID, opID)
|
||||
u.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "ui", "Rebuild triggered by user")
|
||||
|
||||
if err := u.Orchestrator.PrepareRebuild(r.Context(), host.ID); err != nil {
|
||||
_ = u.Locks.Release(r.Context(), host.ID)
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
CREATE TABLE activity_log (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
host_id INTEGER NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
|
||||
operation_id INTEGER REFERENCES operations(id) ON DELETE SET NULL,
|
||||
level TEXT NOT NULL DEFAULT 'info',
|
||||
message TEXT NOT NULL,
|
||||
source TEXT NOT NULL DEFAULT '',
|
||||
created_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ','now'))
|
||||
);
|
||||
CREATE INDEX idx_activity_host ON activity_log(host_id, created_at DESC);
|
||||
@@ -66,6 +66,24 @@ type Image struct {
|
||||
CreatedAt time.Time
|
||||
}
|
||||
|
||||
type LogLevel string
|
||||
|
||||
const (
|
||||
LogInfo LogLevel = "info"
|
||||
LogWarn LogLevel = "warn"
|
||||
LogError LogLevel = "error"
|
||||
)
|
||||
|
||||
type ActivityEntry struct {
|
||||
ID int64
|
||||
HostID int64
|
||||
OperationID int64
|
||||
Level LogLevel
|
||||
Message string
|
||||
Source string
|
||||
CreatedAt time.Time
|
||||
}
|
||||
|
||||
type ServerType struct {
|
||||
Key string
|
||||
DisplayName string `yaml:"display_name"`
|
||||
|
||||
@@ -33,6 +33,7 @@ func (o *HostOrchestrator) PrepareRebuild(ctx context.Context, hostID int64) err
|
||||
}
|
||||
|
||||
func (o *HostOrchestrator) HandlePhoneHome(ctx context.Context, hostID int64, ip string, hardwareID string) {
|
||||
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Phone-home: updating IP to "+ip)
|
||||
if err := o.Hosts.UpdateIP(ctx, hostID, ip, hardwareID); err != nil {
|
||||
log.Printf("host %d: failed to update IP: %v", hostID, err)
|
||||
o.Runner.FailHost(ctx, hostID, "failed to update IP: "+err.Error())
|
||||
@@ -69,15 +70,17 @@ func (o *HostOrchestrator) postPhoneHome(hostID int64, ip string, hardwareID str
|
||||
return
|
||||
}
|
||||
|
||||
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Starting cluster join via "+o.Cluster.ExistingNode)
|
||||
if err := o.Cluster.Join(ctx, ip, privateKey, publicKey); err != nil {
|
||||
log.Printf("host %d: cluster join failed: %v", hostID, err)
|
||||
o.Runner.FailHost(ctx, hostID, "cluster join: "+err.Error())
|
||||
return
|
||||
}
|
||||
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Cluster join complete")
|
||||
|
||||
// Key has been removed from the remote host; clear it from the DB
|
||||
_ = o.Hosts.ClearEphemeralKey(ctx, hostID)
|
||||
|
||||
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Registering with infrastructure API")
|
||||
if err := o.registerInfra(ctx, host, ip, hardwareID); err != nil {
|
||||
log.Printf("host %d: infra registration failed: %v", hostID, err)
|
||||
o.Runner.FailHost(ctx, hostID, "infra registration: "+err.Error())
|
||||
@@ -94,6 +97,7 @@ func (o *HostOrchestrator) postPhoneHome(hostID int64, ip string, hardwareID str
|
||||
_ = o.Ops.Complete(ctx, op.ID)
|
||||
}
|
||||
_ = o.Locks.Release(ctx, hostID)
|
||||
o.Runner.LogActivity(ctx, hostID, model.LogInfo, "orchestrator", "Provisioning complete")
|
||||
log.Printf("host %d (%s): provisioning complete", hostID, host.Hostname)
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"provisioning/internal/events"
|
||||
"provisioning/internal/model"
|
||||
@@ -12,10 +13,11 @@ import (
|
||||
)
|
||||
|
||||
type Runner struct {
|
||||
Hosts *store.Hosts
|
||||
Ops *store.Operations
|
||||
Locks *store.Locks
|
||||
Hub *events.Hub
|
||||
Hosts *store.Hosts
|
||||
Ops *store.Operations
|
||||
Locks *store.Locks
|
||||
Hub *events.Hub
|
||||
Activity *store.Activity
|
||||
}
|
||||
|
||||
func (r *Runner) Transition(ctx context.Context, hostID int64, trigger statemachine.Trigger) (model.HostState, error) {
|
||||
@@ -35,10 +37,12 @@ func (r *Runner) Transition(ctx context.Context, hostID int64, trigger statemach
|
||||
Name: "host.state_changed",
|
||||
Payload: fmt.Sprintf(`{"host_id":%d,"old_state":"%s","new_state":"%s"}`, hostID, host.State, next),
|
||||
})
|
||||
r.LogActivity(ctx, hostID, model.LogInfo, "state", fmt.Sprintf("%s → %s", host.State, next))
|
||||
return next, nil
|
||||
}
|
||||
|
||||
func (r *Runner) FailHost(ctx context.Context, hostID int64, reason string) {
|
||||
r.LogActivity(ctx, hostID, model.LogError, "orchestrator", "host failed: "+reason)
|
||||
if _, err := r.Transition(ctx, hostID, statemachine.TriggerFailed); err != nil {
|
||||
log.Printf("host %d: failed to transition to failed state: %v", hostID, err)
|
||||
return
|
||||
@@ -49,3 +53,16 @@ func (r *Runner) FailHost(ctx context.Context, hostID int64, reason string) {
|
||||
}
|
||||
_ = r.Locks.Release(ctx, hostID)
|
||||
}
|
||||
|
||||
func (r *Runner) LogActivity(ctx context.Context, hostID int64, level model.LogLevel, source, message string) {
|
||||
var opID int64
|
||||
if op, err := r.Ops.GetActive(ctx, hostID); err == nil {
|
||||
opID = op.ID
|
||||
}
|
||||
id, _ := r.Activity.Log(ctx, hostID, opID, level, source, message)
|
||||
r.Hub.Publish(events.Event{
|
||||
Name: "activity.logged",
|
||||
Payload: fmt.Sprintf(`{"id":%d,"host_id":%d,"level":"%s","source":"%s","message":"%s","created_at":"%s"}`,
|
||||
id, hostID, level, source, message, time.Now().UTC().Format(time.RFC3339)),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -0,0 +1,50 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"provisioning/internal/model"
|
||||
)
|
||||
|
||||
type Activity struct {
|
||||
DB *sql.DB
|
||||
}
|
||||
|
||||
func (s *Activity) Log(ctx context.Context, hostID, opID int64, level model.LogLevel, source, message string) (int64, error) {
|
||||
res, err := s.DB.ExecContext(ctx, `
|
||||
INSERT INTO activity_log(host_id, operation_id, level, message, source)
|
||||
VALUES(?,?,?,?,?)
|
||||
`, hostID, nullInt64(opID), level, message, source)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("insert activity: %w", err)
|
||||
}
|
||||
return res.LastInsertId()
|
||||
}
|
||||
|
||||
func (s *Activity) ListByHost(ctx context.Context, hostID int64, limit int) ([]model.ActivityEntry, error) {
|
||||
if limit <= 0 {
|
||||
limit = 50
|
||||
}
|
||||
rows, err := s.DB.QueryContext(ctx, `
|
||||
SELECT id, host_id, COALESCE(operation_id, 0), level, message, source, created_at
|
||||
FROM activity_log WHERE host_id = ? ORDER BY created_at DESC LIMIT ?
|
||||
`, hostID, limit)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list activity: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
var out []model.ActivityEntry
|
||||
for rows.Next() {
|
||||
var e model.ActivityEntry
|
||||
var createdAt string
|
||||
if err := rows.Scan(&e.ID, &e.HostID, &e.OperationID, &e.Level, &e.Message, &e.Source, &createdAt); err != nil {
|
||||
return nil, fmt.Errorf("scan activity: %w", err)
|
||||
}
|
||||
e.CreatedAt, _ = time.Parse(time.RFC3339, createdAt)
|
||||
out = append(out, e)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
@@ -391,6 +391,48 @@ main {
|
||||
.progress-text { font-size: 0.825rem; color: var(--text); font-weight: 500; }
|
||||
.progress-detail { font-size: 0.775rem; color: var(--text-secondary); }
|
||||
|
||||
/* === STUCK WARNING === */
|
||||
.stuck-warning {
|
||||
background: var(--amber-bg);
|
||||
color: var(--amber);
|
||||
border: 1px solid var(--amber-border);
|
||||
padding: 0.75rem 1rem;
|
||||
border-radius: var(--radius-sm);
|
||||
margin-bottom: 1rem;
|
||||
font-size: 0.825rem;
|
||||
font-weight: 500;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
/* === ACTIVITY LOG === */
|
||||
.activity-log { max-height: 400px; overflow-y: auto; }
|
||||
.log-entry {
|
||||
display: flex;
|
||||
align-items: baseline;
|
||||
gap: 0.75rem;
|
||||
padding: 0.4rem 0.75rem;
|
||||
border-bottom: 1px solid var(--border);
|
||||
font-size: 0.825rem;
|
||||
}
|
||||
.log-entry:last-child { border-bottom: none; }
|
||||
.log-time {
|
||||
color: var(--text-tertiary);
|
||||
font-family: var(--font-mono);
|
||||
font-size: 0.75rem;
|
||||
min-width: 3.5rem;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.log-source {
|
||||
color: var(--text-tertiary);
|
||||
font-family: var(--font-mono);
|
||||
font-size: 0.75rem;
|
||||
min-width: 5rem;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.log-msg { color: var(--text); }
|
||||
.log-warn .log-msg { color: var(--amber); }
|
||||
.log-error .log-msg { color: var(--red); font-weight: 500; }
|
||||
|
||||
/* === UTILITY === */
|
||||
.inline { display: inline; }
|
||||
|
||||
|
||||
@@ -10,6 +10,24 @@
|
||||
es.addEventListener('host.state_changed', function() {
|
||||
window.location.reload();
|
||||
});
|
||||
es.addEventListener('activity.logged', function(e) {
|
||||
var data;
|
||||
try { data = JSON.parse(e.data); } catch(_) { return; }
|
||||
var logDiv = document.getElementById('activity-log');
|
||||
if (!logDiv) return;
|
||||
var hostId = logDiv.getAttribute('data-host-id');
|
||||
if (String(data.host_id) !== hostId) return;
|
||||
var empty = logDiv.querySelector('.empty');
|
||||
if (empty) empty.remove();
|
||||
var entry = document.createElement('div');
|
||||
entry.className = 'log-entry log-' + data.level;
|
||||
var t = new Date(data.created_at);
|
||||
var ts = t.getHours().toString().padStart(2,'0') + ':' + t.getMinutes().toString().padStart(2,'0');
|
||||
entry.innerHTML = '<span class="log-time">' + ts + '</span>' +
|
||||
'<span class="log-source">' + data.source + '</span>' +
|
||||
'<span class="log-msg">' + data.message + '</span>';
|
||||
logDiv.insertBefore(entry, logDiv.firstChild);
|
||||
});
|
||||
es.onerror = function() {
|
||||
dot.className = 'led led-red';
|
||||
es.close();
|
||||
|
||||
Reference in New Issue
Block a user