Files
Provisioning/internal/api/boot.go
T
josh a6603b463f
build-and-push / test (push) Failing after 32s
build-and-push / build-and-push (push) Has been skipped
Add activity log system for provisioning lifecycle visibility
Hosts stuck in states like pxe_ready had zero visibility into why.
This adds a persistent activity log that records every meaningful
step (state transitions, PXE events, cluster join stages, failures)
and surfaces it on the host detail page with live SSE updates.
Includes a stuck-detection warning banner when a host sits in
pxe_ready for >10 minutes with no iPXE request.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-13 23:30:21 -04:00

177 lines
4.7 KiB
Go

package api
import (
"encoding/json"
"errors"
"fmt"
"log"
"net/http"
"strings"
"provisioning/internal/config"
"provisioning/internal/model"
"provisioning/internal/orchestrator"
"provisioning/internal/pxe"
"provisioning/internal/statemachine"
"provisioning/internal/store"
"github.com/go-chi/chi/v5"
)
// BootAPI groups the dependencies used by the HTTP handlers that drive a
// host through its network-boot lifecycle: iPXE script, installer answer
// file, install-complete webhook, first-boot script and phone-home.
type BootAPI struct {
	// Hosts looks up host records by MAC address or ID and returns the
	// ephemeral key material stored for a host.
	Hosts *store.Hosts

	// Images supplies the default boot image embedded in the iPXE script.
	Images *store.Images

	// Runner fires state-machine transitions and records activity-log
	// entries for a host.
	Runner *orchestrator.Runner

	// Orchestrator receives phone-home notifications.
	Orchestrator *orchestrator.HostOrchestrator

	// Config provides the server's public URL and is handed to the
	// answer-file and first-boot script generators.
	Config *config.Config

	// ServerTypes resolves a host's server type name to its definition.
	ServerTypes *config.ServerTypeRegistry
}
// IPXEScript serves the iPXE boot script for the host whose MAC address is
// given in the "mac" URL parameter.
//
// Responses:
//   - 404 with the body "#!ipxe\nexit" when no host matches the MAC.
//   - 500 when the host lookup fails for any other reason.
//   - 503 with a script that echoes a message and opens the iPXE shell when
//     the default image cannot be loaded.
//   - 200 with the generated script otherwise.
//
// When the host is in StatePXEReady, the TriggerPXEScriptServed transition is
// fired and an activity entry is recorded before the script is written.
func (a *BootAPI) IPXEScript(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	mac := normalizeMAC(chi.URLParam(r, "mac"))

	host, err := a.Hosts.GetByMAC(ctx, mac)
	if err != nil {
		if errors.Is(err, store.ErrNotFound) {
			http.Error(w, "#!ipxe\nexit", http.StatusNotFound)
			return
		}
		// The MAC comes from the request URL, so quote it in the log.
		log.Printf("ipxe script: looking up host by mac %q: %v", mac, err)
		http.Error(w, "internal error", http.StatusInternalServerError)
		return
	}

	img, err := a.Images.GetDefault(ctx)
	if err != nil {
		// The client is always told "No default image configured"; log the
		// underlying error so other failures can be told apart.
		log.Printf("host %d: loading default image: %v", host.ID, err)
		http.Error(w, "#!ipxe\necho No default image configured\nshell", http.StatusServiceUnavailable)
		return
	}

	if host.State == model.StatePXEReady {
		// A failed transition must not stop the script from being served,
		// but it must be visible: log it as InstallComplete does.
		if _, err := a.Runner.Transition(ctx, host.ID, statemachine.TriggerPXEScriptServed); err != nil {
			log.Printf("host %d: pxe-script-served transition failed: %v", host.ID, err)
		}
		a.Runner.LogActivity(ctx, host.ID, model.LogInfo, "pxe", "iPXE script served — kernel + initrd delivered")
	}

	w.Header().Set("Content-Type", "text/plain")
	w.Write([]byte(pxe.BuildIPXEScript(a.Config.Server.PublicURL, img, mac)))
}
// AnswerFile serves the installer answer file (TOML) for the host identified
// by the "mac" field of the JSON request body.
//
// Responses:
//   - 400 when the body is not valid JSON.
//   - 403 when no host matches the MAC.
//   - 500 when the host lookup fails for another reason, the host's server
//     type is not registered, or no ephemeral public key is available.
//   - 200 with the generated answer file otherwise.
//
// When the host is in StatePXEBooted, the TriggerAnswerServed transition is
// fired and an activity entry is recorded. This happens only after every
// precondition for generating the answer file has been checked, so a request
// that ends in an error response does not advance the host's state.
func (a *BootAPI) AnswerFile(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	var sysInfo struct {
		MAC string `json:"mac"`
	}
	if err := json.NewDecoder(r.Body).Decode(&sysInfo); err != nil {
		http.Error(w, "invalid json", http.StatusBadRequest)
		return
	}

	mac := normalizeMAC(sysInfo.MAC)
	host, err := a.Hosts.GetByMAC(ctx, mac)
	if err != nil {
		if errors.Is(err, store.ErrNotFound) {
			http.Error(w, "unknown host", http.StatusForbidden)
			return
		}
		// The MAC comes from the request body, so quote it in the log.
		log.Printf("answer file: looking up host by mac %q: %v", mac, err)
		http.Error(w, "internal error", http.StatusInternalServerError)
		return
	}

	st, ok := a.ServerTypes.Get(host.ServerType)
	if !ok {
		http.Error(w, "unknown server type", http.StatusInternalServerError)
		return
	}

	// Resolve the key before touching host state: the answer file cannot be
	// generated without it, and the host must not be marked as having been
	// served an answer file when the request is about to fail.
	// NOTE(review): this assumes the TriggerAnswerServed transition does not
	// itself create the ephemeral key — confirm against the Runner.
	_, pubKey, keyErr := a.Hosts.GetEphemeralKey(ctx, host.ID)
	if pubKey == "" {
		// keyErr is the lookup's third return value (presumably an error);
		// it is printed with %v so the cause of the missing key is recorded.
		log.Printf("host %d: no ephemeral key for answer file: %v", host.ID, keyErr)
		http.Error(w, "no ephemeral key for host", http.StatusInternalServerError)
		return
	}

	if host.State == model.StatePXEBooted {
		// A failed transition does not block the installer, but it is
		// logged, as InstallComplete does for its own transition.
		if _, err := a.Runner.Transition(ctx, host.ID, statemachine.TriggerAnswerServed); err != nil {
			log.Printf("host %d: answer-served transition failed: %v", host.ID, err)
		}
		a.Runner.LogActivity(ctx, host.ID, model.LogInfo, "pxe", "Answer file served — installation starting")
	}

	answer := pxe.GenerateAnswerFile(host, st, a.Config, pubKey)
	w.Header().Set("Content-Type", "application/toml")
	w.Write([]byte(answer))
}
// InstallComplete handles the webhook signalling that a host has finished
// installing. The host ID is taken from the URL via idFromURL.
//
// A host that cannot be loaded yields a 404 JSON error. Otherwise the
// response is 200. If the host is in StateInstalling, an activity entry is
// recorded and the TriggerInstallWebhook transition is fired first; a failed
// transition is logged and does not change the response.
func (a *BootAPI) InstallComplete(w http.ResponseWriter, r *http.Request) {
	hostID, valid := idFromURL(w, r)
	if !valid {
		// Presumably idFromURL has already written the error response.
		return
	}

	ctx := r.Context()
	host, lookupErr := a.Hosts.Get(ctx, hostID)
	if lookupErr != nil {
		writeJSONErr(w, http.StatusNotFound, "host not found")
		return
	}

	// Hosts in any other state are acknowledged without side effects.
	if host.State != model.StateInstalling {
		w.WriteHeader(http.StatusOK)
		return
	}

	a.Runner.LogActivity(ctx, host.ID, model.LogInfo, "pxe", "Install-complete webhook received")
	_, transitionErr := a.Runner.Transition(ctx, host.ID, statemachine.TriggerInstallWebhook)
	if transitionErr != nil {
		log.Printf("host %d: install-complete transition failed: %v", host.ID, transitionErr)
	}
	w.WriteHeader(http.StatusOK)
}
// FirstBootScript serves the first-boot shell script for the host whose ID
// is taken from the URL via idFromURL.
//
// It responds 404 when the host cannot be loaded, 500 when the host's server
// type is not registered, and otherwise writes the generated script with
// Content-Type text/x-shellscript.
func (a *BootAPI) FirstBootScript(w http.ResponseWriter, r *http.Request) {
	hostID, valid := idFromURL(w, r)
	if !valid {
		// Presumably idFromURL has already written the error response.
		return
	}

	host, lookupErr := a.Hosts.Get(r.Context(), hostID)
	if lookupErr != nil {
		http.Error(w, "host not found", http.StatusNotFound)
		return
	}

	serverType, known := a.ServerTypes.Get(host.ServerType)
	if !known {
		http.Error(w, "unknown server type", http.StatusInternalServerError)
		return
	}

	payload := []byte(pxe.GenerateFirstBootScript(host, serverType, a.Config))
	w.Header().Set("Content-Type", "text/x-shellscript")
	w.Write(payload)
}
// PhoneHome handles the phone-home request a host sends after booting. The
// host ID is taken from the URL via idFromURL; the JSON body carries the
// host's reported "ip" and "hardware_id".
//
// It responds 400 when the body is not valid JSON and 404 when the host
// cannot be loaded. Otherwise it logs the event, records an activity entry,
// passes the reported values to the orchestrator and responds 200 with
// {"ok": true}.
func (a *BootAPI) PhoneHome(w http.ResponseWriter, r *http.Request) {
	id, ok := idFromURL(w, r)
	if !ok {
		// Presumably idFromURL has already written the error response.
		return
	}

	var req struct {
		IP         string `json:"ip"`
		HardwareID string `json:"hardware_id"`
	}
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		writeJSONErr(w, http.StatusBadRequest, "invalid json")
		return
	}

	ctx := r.Context()
	host, err := a.Hosts.Get(ctx, id)
	if err != nil {
		writeJSONErr(w, http.StatusNotFound, "host not found")
		return
	}

	// req.IP and req.HardwareID are client-supplied. %q quotes and escapes
	// them so embedded newlines or control characters cannot forge extra
	// lines in the process log.
	log.Printf("host %d (%s): phone-home from %q, hwid=%q", host.ID, host.Hostname, req.IP, req.HardwareID)
	a.Runner.LogActivity(ctx, host.ID, model.LogInfo, "pxe", fmt.Sprintf("Phone-home received from %s", req.IP))
	a.Orchestrator.HandlePhoneHome(ctx, host.ID, req.IP, req.HardwareID)
	writeJSON(w, http.StatusOK, map[string]any{"ok": true})
}
// normalizeMAC returns m with surrounding whitespace removed, letters
// lower-cased and every "-" replaced by ":". It does not validate that the
// result is a well-formed MAC address.
func normalizeMAC(m string) string {
	trimmed := strings.TrimSpace(m)
	lowered := strings.ToLower(trimmed)
	return strings.ReplaceAll(lowered, "-", ":")
}