Add activity log system for provisioning lifecycle visibility
When a host got stuck in a state such as pxe_ready, operators had no visibility into why. This commit adds a persistent activity log that records every meaningful step (state transitions, PXE events, cluster-join stages, failures) and surfaces it on the host detail page with live SSE updates. It also adds a stuck-detection warning banner that is shown when a host has been sitting in pxe_ready for more than 10 minutes with no iPXE request. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,6 +3,7 @@ package api
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
@@ -46,6 +47,7 @@ func (a *BootAPI) IPXEScript(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
if host.State == model.StatePXEReady {
|
||||
a.Runner.Transition(r.Context(), host.ID, statemachine.TriggerPXEScriptServed)
|
||||
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", "iPXE script served — kernel + initrd delivered")
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "text/plain")
|
||||
@@ -80,6 +82,7 @@ func (a *BootAPI) AnswerFile(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
if host.State == model.StatePXEBooted {
|
||||
a.Runner.Transition(r.Context(), host.ID, statemachine.TriggerAnswerServed)
|
||||
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", "Answer file served — installation starting")
|
||||
}
|
||||
|
||||
_, pubKey, _ := a.Hosts.GetEphemeralKey(r.Context(), host.ID)
|
||||
@@ -106,6 +109,7 @@ func (a *BootAPI) InstallComplete(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
if host.State == model.StateInstalling {
|
||||
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", "Install-complete webhook received")
|
||||
if _, err := a.Runner.Transition(r.Context(), host.ID, statemachine.TriggerInstallWebhook); err != nil {
|
||||
log.Printf("host %d: install-complete transition failed: %v", host.ID, err)
|
||||
}
|
||||
@@ -159,6 +163,7 @@ func (a *BootAPI) PhoneHome(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
log.Printf("host %d (%s): phone-home from %s, hwid=%s", host.ID, host.Hostname, req.IP, req.HardwareID)
|
||||
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "pxe", fmt.Sprintf("Phone-home received from %s", req.IP))
|
||||
a.Orchestrator.HandlePhoneHome(r.Context(), host.ID, req.IP, req.HardwareID)
|
||||
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true})
|
||||
|
||||
@@ -124,6 +124,7 @@ func (a *HostAPI) Rebuild(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSONErr(w, http.StatusInternalServerError, "failed to acquire lock")
|
||||
return
|
||||
}
|
||||
a.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "api", "Rebuild triggered via API")
|
||||
|
||||
if err := a.Orchestrator.PrepareRebuild(r.Context(), host.ID); err != nil {
|
||||
_ = a.Locks.Release(r.Context(), host.ID)
|
||||
|
||||
+28
-2
@@ -5,6 +5,7 @@ import (
|
||||
"html"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"provisioning/internal/model"
|
||||
)
|
||||
@@ -83,7 +84,7 @@ func hostFormPage(types []string, errMsg string, prefill *model.Host) string {
|
||||
`, errHTML, hostname, mac, opts.String(), notes))
|
||||
}
|
||||
|
||||
func hostDetailPage(h *model.Host, ops []model.Operation) string {
|
||||
func hostDetailPage(h *model.Host, ops []model.Operation, activity []model.ActivityEntry) string {
|
||||
stateClass := stateColor(h.State)
|
||||
led := ledClass(h.State)
|
||||
canRebuild := h.State == model.StateRegistered || h.State == model.StateReady || h.State == model.StateFailed
|
||||
@@ -113,12 +114,29 @@ func hostDetailPage(h *model.Host, ops []model.Operation) string {
|
||||
ip = "—"
|
||||
}
|
||||
|
||||
var stuckWarning string
|
||||
if h.State == model.StatePXEReady && time.Since(h.UpdatedAt) > 10*time.Minute {
|
||||
mins := int(time.Since(h.UpdatedAt).Minutes())
|
||||
stuckWarning = fmt.Sprintf(`<div class="stuck-warning">Host has been in PXE_READY for %d minutes with no iPXE request. This usually means the host failed to PXE boot — check secure boot settings, network connectivity, and BIOS boot order.</div>`, mins)
|
||||
}
|
||||
|
||||
var activityHTML strings.Builder
|
||||
for _, e := range activity {
|
||||
activityHTML.WriteString(fmt.Sprintf(
|
||||
`<div class="log-entry log-%s"><span class="log-time">%s</span><span class="log-source">%s</span><span class="log-msg">%s</span></div>`,
|
||||
e.Level, e.CreatedAt.Format("15:04"), html.EscapeString(e.Source), html.EscapeString(e.Message)))
|
||||
}
|
||||
if len(activity) == 0 {
|
||||
activityHTML.WriteString(`<p class="empty">No activity recorded yet.</p>`)
|
||||
}
|
||||
|
||||
return layout(h.Hostname, fmt.Sprintf(`
|
||||
<div class="host-header">
|
||||
<span class="led led-lg %s"></span>
|
||||
<h2 style="margin-bottom:0">%s</h2>
|
||||
<span class="badge %s">%s</span>
|
||||
</div>
|
||||
%s
|
||||
<div class="panel">
|
||||
<table class="detail-table">
|
||||
<tr><th>MAC</th><td>%s</td></tr>
|
||||
@@ -135,7 +153,15 @@ func hostDetailPage(h *model.Host, ops []model.Operation) string {
|
||||
<tbody>%s</tbody>
|
||||
</table>
|
||||
</div>
|
||||
`, led, html.EscapeString(h.Hostname), stateClass, h.State, h.MAC, h.ServerType, ip, html.EscapeString(h.Notes), actions.String(), opsHTML.String()))
|
||||
<h3>Activity Log</h3>
|
||||
<div class="panel">
|
||||
<div class="activity-log" id="activity-log" data-host-id="%d">%s</div>
|
||||
</div>
|
||||
`, led, html.EscapeString(h.Hostname), stateClass, h.State,
|
||||
stuckWarning,
|
||||
h.MAC, h.ServerType, ip, html.EscapeString(h.Notes), actions.String(),
|
||||
opsHTML.String(),
|
||||
h.ID, activityHTML.String()))
|
||||
}
|
||||
|
||||
func imagesPage(images []model.Image) string {
|
||||
|
||||
+4
-1
@@ -24,6 +24,7 @@ type UI struct {
|
||||
Ops *store.Operations
|
||||
Locks *store.Locks
|
||||
Images *store.Images
|
||||
Activity *store.Activity
|
||||
ImageSvc *image.Service
|
||||
Runner *orchestrator.Runner
|
||||
Orchestrator *orchestrator.HostOrchestrator
|
||||
@@ -106,7 +107,8 @@ func (u *UI) HostDetail(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
ops, _ := u.Ops.ListByHost(r.Context(), host.ID)
|
||||
renderHTML(w, hostDetailPage(host, ops))
|
||||
activity, _ := u.Activity.ListByHost(r.Context(), host.ID, 50)
|
||||
renderHTML(w, hostDetailPage(host, ops, activity))
|
||||
}
|
||||
|
||||
func (u *UI) TriggerRebuild(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -131,6 +133,7 @@ func (u *UI) TriggerRebuild(w http.ResponseWriter, r *http.Request) {
|
||||
Kind: model.OpRebuildProxmox,
|
||||
})
|
||||
_ = u.Locks.Acquire(r.Context(), host.ID, opID)
|
||||
u.Runner.LogActivity(r.Context(), host.ID, model.LogInfo, "ui", "Rebuild triggered by user")
|
||||
|
||||
if err := u.Orchestrator.PrepareRebuild(r.Context(), host.ID); err != nil {
|
||||
_ = u.Locks.Release(r.Context(), host.ID)
|
||||
|
||||
Reference in New Issue
Block a user