Files
Vetting/internal/api/ui_handlers.go
T
josh d0bfae14c8
CI / Lint + build + test (push) Has been cancelled
Heartbeat-first dispatch: retire WoL-as-default, add WaitingReboot
Every supported host runs vetting-reporter in-OS and heartbeats every
30s. WoL was never the thing that started vetting — the heartbeat
response's reboot_for_vetting command was. Firing WoL first only
crowded the run log with misleading diagnostics when the real failure
mode is "reporter isn't installed."

- StartRun 409s if the host hasn't heartbeated within 60s, pointing
  the operator at /register/quick.sh.
- Dispatcher re-checks LastSeenAt at dispatch time (run may sit in
  Queued long enough for the host to go offline); stale hosts mark
  the run Failed with failed_stage=dispatch instead of looping.
- New StateWaitingReboot + TriggerRebootCommanded capture the actual
  semantics. StateWaitingWoL kept as the hook point for a future
  manual-override button.
- Tile disables the Start button with a quick.sh tooltip when the
  host is offline, matching the server-side 409.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 01:10:34 -04:00

533 lines
17 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package api
import (
"context"
"encoding/json"
"errors"
"log"
"net/http"
"regexp"
"strconv"
"strings"
"text/template"
"time"
"github.com/go-chi/chi/v5"
"gopkg.in/yaml.v3"
"vetting/internal/events"
"vetting/internal/logs"
"vetting/internal/model"
"vetting/internal/orchestrator"
"vetting/internal/store"
"vetting/internal/web"
"vetting/internal/web/templates"
)
type UI struct {
Hosts *store.Hosts
Runs *store.Runs
Stages *store.Stages
SpecDiffs *store.SpecDiffs
Artifacts *store.Artifacts
EventHub *events.Hub
Logs *logs.Hub
Runner *orchestrator.Runner
Tiles *TileEnricher
PublicURL string // user-visible base URL baked into the quick-register one-liner
}
var macRe = regexp.MustCompile(`^[0-9a-f]{2}(:[0-9a-f]{2}){5}$`)
// quickRegisterTmpl is parsed once at startup — a malformed template
// should fail the binary at init, not on a visitor's first hit.
var quickRegisterTmpl = template.Must(
template.ParseFS(web.Register, "register/quick.sh.tmpl"),
)
// baseURL returns the orchestrator URL to bake into generated artefacts
// (the quick-register one-liner, its rendered script). Prefers the
// operator-configured public URL; falls back to the request's own host
// so a dev run on http://127.0.0.1:8080 still produces a working command.
func (u *UI) baseURL(r *http.Request) string {
if u.PublicURL != "" {
return strings.TrimRight(u.PublicURL, "/")
}
scheme := "http"
if r.TLS != nil {
scheme = "https"
}
return scheme + "://" + r.Host
}
func (u *UI) Dashboard(w http.ResponseWriter, r *http.Request) {
hosts, err := u.Hosts.List(r.Context())
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
tiles := make([]templates.TileData, 0, len(hosts))
for _, h := range hosts {
latest, err := u.Runs.LatestForHost(r.Context(), h.ID)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
tiles = append(tiles, u.Tiles.Build(r.Context(), h, latest))
}
_ = templates.Dashboard(tiles).Render(r.Context(), w)
}
// HostDetail renders the per-host page: breadcrumb, summary, pipeline
// timeline, hold card, action row, spec diffs, log pane, meta. Same
// enrichment path as Dashboard for tile data; additionally reads stage
// rows + spec diffs for the latest run to populate the timeline and
// diff list.
func (u *UI) HostDetail(w http.ResponseWriter, r *http.Request) {
idStr := chi.URLParam(r, "id")
id, err := strconv.ParseInt(idStr, 10, 64)
if err != nil {
http.Error(w, "bad host id", http.StatusBadRequest)
return
}
host, err := u.Hosts.Get(r.Context(), id)
if err != nil {
if errors.Is(err, store.ErrNotFound) {
http.NotFound(w, r)
return
}
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
latest, err := u.Runs.LatestForHost(r.Context(), id)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
var stages []model.Stage
var diffs []model.SpecDiff
if latest != nil {
if u.Stages != nil {
stages, _ = u.Stages.ListForRun(r.Context(), latest.ID)
}
if u.SpecDiffs != nil {
diffs, _ = u.SpecDiffs.ListForRun(r.Context(), latest.ID)
}
}
t := u.Tiles.Build(r.Context(), *host, latest)
replay := ""
if latest != nil && u.Logs != nil {
replay = u.Logs.Replay(latest.ID)
}
data := templates.HostDetailData{
Tile: t,
Stages: stages,
SpecDiffs: diffs,
LogReplay: replay,
}
_ = templates.HostDetail(data).Render(r.Context(), w)
}
// StartRun creates a new Run for the host, issues an agent token, and
// transitions Registered→Queued. The dispatcher goroutine picks it up
// on its next tick; the happy path is heartbeat-driven (the reporter's
// next heartbeat fetches reboot_for_vetting). Refuses the click outright
// if the host isn't currently heartbeating — there is no path from
// Queued to live-image without an in-OS reporter on the target.
func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
idStr := chi.URLParam(r, "id")
hostID, err := strconv.ParseInt(idStr, 10, 64)
if err != nil {
http.Error(w, "bad host id", http.StatusBadRequest)
return
}
host, err := u.Hosts.Get(r.Context(), hostID)
if err != nil {
if errors.Is(err, store.ErrNotFound) {
http.NotFound(w, r)
return
}
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
// Preflight: host must be heartbeating. The dispatcher re-checks at
// dispatch time (belt-and-braces for the gap between click and tick),
// but rejecting here gives the operator an immediate, actionable
// error instead of a mysterious Failed run 2s later.
if host.LastSeenAt == nil || time.Since(*host.LastSeenAt) > orchestrator.HostHeartbeatStaleAfter {
writeJSONError(w, http.StatusConflict,
"host is not heartbeating — install the reporter via /register/quick.sh on the target host, then retry")
return
}
// Guard: refuse to start a second run while one is still active.
if latest, err := u.Runs.LatestForHost(r.Context(), hostID); err == nil && latest != nil {
switch latest.State {
case model.StateCompleted, model.StateReleased, model.StateFailed, model.StateFailedHolding:
// ok to start fresh
default:
http.Error(w, "host already has an active run", http.StatusConflict)
return
}
}
_, hash, err := orchestrator.IssueRunToken()
if err != nil {
http.Error(w, "token: "+err.Error(), http.StatusInternalServerError)
return
}
runID, err := u.Runs.Create(r.Context(), hostID, hash)
if err != nil {
http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError)
return
}
log.Printf("ui: created run %d for host %d (state=Queued)", runID, hostID)
http.Redirect(w, r, "/", http.StatusSeeOther)
}
func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) {
_ = templates.Registration(templates.RegistrationForm{
QuickRegisterURL: u.baseURL(r),
}).Render(r.Context(), w)
}
// QuickRegisterScript renders the bash one-liner an operator pastes on
// the target host: hardware autodetect + POST to /api/v1/hosts. The
// orchestrator URL is substituted in so the script is self-contained.
func (u *UI) QuickRegisterScript(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/x-shellscript; charset=utf-8")
w.Header().Set("Cache-Control", "no-store")
if err := quickRegisterTmpl.Execute(w, struct{ OrchestratorURL string }{
OrchestratorURL: u.baseURL(r),
}); err != nil {
log.Printf("quick-register script render: %v", err)
}
}
func (u *UI) CreateHost(w http.ResponseWriter, r *http.Request) {
if err := r.ParseForm(); err != nil {
http.Error(w, "bad form", http.StatusBadRequest)
return
}
form := templates.RegistrationForm{
Name: strings.TrimSpace(r.PostForm.Get("name")),
MAC: strings.ToLower(strings.TrimSpace(r.PostForm.Get("mac"))),
WoLBroadcastIP: strings.TrimSpace(r.PostForm.Get("wol_broadcast_ip")),
WoLPort: r.PostForm.Get("wol_port"),
ExpectedSpecYAML: r.PostForm.Get("expected_spec_yaml"),
Notes: strings.TrimSpace(r.PostForm.Get("notes")),
QuickRegisterURL: u.baseURL(r),
}
if errMsg := validateHostForm(&form); errMsg != "" {
form.Error = errMsg
w.WriteHeader(http.StatusBadRequest)
_ = templates.Registration(form).Render(r.Context(), w)
return
}
wolPort, _ := strconv.Atoi(form.WoLPort)
if wolPort == 0 {
wolPort = 9
}
_, err := u.Hosts.Create(r.Context(), model.Host{
Name: form.Name,
MAC: form.MAC,
WoLBroadcastIP: form.WoLBroadcastIP,
WoLPort: wolPort,
ExpectedSpecYAML: form.ExpectedSpecYAML,
Notes: form.Notes,
})
if err != nil {
form.Error = friendlyDBError(err)
w.WriteHeader(http.StatusConflict)
_ = templates.Registration(form).Render(r.Context(), w)
return
}
http.Redirect(w, r, "/", http.StatusSeeOther)
}
// quickRegisterPayload is the POST body accepted by /api/v1/hosts —
// the shape the quick-register bash one-liner emits.
type quickRegisterPayload struct {
Name string `json:"name"`
MAC string `json:"mac"`
WoLBroadcastIP string `json:"wol_broadcast_ip"`
WoLPort int `json:"wol_port"`
ExpectedSpecYAML string `json:"expected_spec_yaml"`
Notes string `json:"notes"`
}
// CreateHostJSON is the API counterpart to CreateHost. Accepts the same
// fields as the form but in JSON, so a target host can POST its own
// registration payload over curl from the quick-register one-liner.
// Same validation as the form; no auth (LAN-only).
func (u *UI) CreateHostJSON(w http.ResponseWriter, r *http.Request) {
var p quickRegisterPayload
if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, 256*1024)).Decode(&p); err != nil {
writeJSONError(w, http.StatusBadRequest, "bad json: "+err.Error())
return
}
form := templates.RegistrationForm{
Name: strings.TrimSpace(p.Name),
MAC: strings.ToLower(strings.TrimSpace(p.MAC)),
WoLBroadcastIP: strings.TrimSpace(p.WoLBroadcastIP),
ExpectedSpecYAML: p.ExpectedSpecYAML,
Notes: strings.TrimSpace(p.Notes),
}
if p.WoLPort > 0 {
form.WoLPort = strconv.Itoa(p.WoLPort)
}
if errMsg := validateHostForm(&form); errMsg != "" {
writeJSONError(w, http.StatusBadRequest, errMsg)
return
}
wolPort := p.WoLPort
if wolPort == 0 {
wolPort = 9
}
id, err := u.Hosts.Create(r.Context(), model.Host{
Name: form.Name,
MAC: form.MAC,
WoLBroadcastIP: form.WoLBroadcastIP,
WoLPort: wolPort,
ExpectedSpecYAML: form.ExpectedSpecYAML,
Notes: form.Notes,
})
if err != nil {
writeJSONError(w, http.StatusConflict, friendlyDBError(err))
return
}
log.Printf("api: registered host %d (%s, %s)", id, form.Name, form.MAC)
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusCreated)
_ = json.NewEncoder(w).Encode(map[string]any{
"id": id,
"name": form.Name,
"mac": form.MAC,
})
}
// Heartbeat is called every ~30s by a host-mode vetting-agent running
// as a systemd service on the registered host. LAN-trusted, no auth —
// same threat model as the browser UI and quick-register. Stamps
// last_seen_at, flips the dashboard tile to "online", and — if the
// operator has clicked Start vetting since the last heartbeat — replies
// with cmd=reboot_for_vetting so the host boots into PXE without WoL.
func (u *UI) Heartbeat(w http.ResponseWriter, r *http.Request) {
mac := strings.ToLower(strings.TrimSpace(chi.URLParam(r, "mac")))
if !macRe.MatchString(mac) {
writeJSONError(w, http.StatusBadRequest,
"MAC address must be in the form aa:bb:cc:dd:ee:ff")
return
}
host, err := u.Hosts.GetByMAC(r.Context(), mac)
if err != nil {
if errors.Is(err, store.ErrNotFound) {
writeJSONError(w, http.StatusNotFound, "unknown host")
return
}
writeJSONError(w, http.StatusInternalServerError, err.Error())
return
}
if err := u.Hosts.UpdateLastSeen(r.Context(), mac, time.Now().UTC()); err != nil {
writeJSONError(w, http.StatusInternalServerError, err.Error())
return
}
if u.Runner != nil {
u.Runner.PublishTileUpdate(r.Context(), host.ID)
}
cmd, runID := u.pickHostCommand(r.Context(), host.ID)
resp := heartbeatResponse{OK: true, Cmd: cmd, RunID: runID}
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(resp)
}
// heartbeatResponse is the JSON the host-mode agent decodes on every
// heartbeat. `cmd` is "" (omitted) in the idle case so the wire shape
// stays `{"ok": true}` when nothing is happening.
type heartbeatResponse struct {
OK bool `json:"ok"`
Cmd string `json:"cmd,omitempty"`
RunID int64 `json:"run_id,omitempty"`
}
// pickHostCommand decides what the host-mode agent should do on the
// back of this heartbeat. Returns ("", 0) when there's nothing to do.
//
// - Queued run → Transition(RebootCommanded) and tell the agent to
// reboot. Beats the dispatcher's 2s poll to the punch, but either
// path ends at WaitingReboot.
// - WaitingReboot (or legacy WaitingWoL) run <10min old → also return
// reboot, covering "host crashed mid-reboot, systemd brought the
// reporter back".
// - anything else → idle.
func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64) {
if u.Runs == nil || u.Runner == nil {
return "", 0
}
run, err := u.Runs.LatestForHost(ctx, hostID)
if err != nil {
log.Printf("heartbeat: latest run for host %d: %v", hostID, err)
return "", 0
}
if run == nil {
return "", 0
}
switch run.State {
case model.StateQueued:
if _, err := u.Runner.Transition(ctx, run.ID, orchestrator.TriggerRebootCommanded); err != nil {
// Benign race with the dispatcher's own 2s poll — the
// state machine refuses the second transition; we just
// log and return idle so the agent doesn't reboot on a
// run that another path is already driving.
log.Printf("heartbeat: transition run %d: %v", run.ID, err)
return "", 0
}
log.Printf("heartbeat: dispatched run %d for host %d (reboot commanded)", run.ID, hostID)
return cmdRebootForVetting, run.ID
case model.StateWaitingReboot, model.StateWaitingWoL:
// Tolerate a crashed-mid-reboot retry: the reporter is the
// only thing that could be telling us about this host right
// now. Bound it so a perpetually-broken PXE doesn't
// reboot-loop the box.
if time.Since(run.StartedAt) < 10*time.Minute {
return cmdRebootForVetting, run.ID
}
return "", 0
}
return "", 0
}
const cmdRebootForVetting = "reboot_for_vetting"
func writeJSONError(w http.ResponseWriter, status int, msg string) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(status)
_ = json.NewEncoder(w).Encode(map[string]string{"error": msg})
}
// OverrideWipeStorage is the operator's explicit "yes, wipe the disk
// even though we found filesystem signatures" button. Only meaningful
// when the latest run is FailedHolding with failed_stage=Storage — the
// agent's next heartbeat will receive retry_stage with wipe=true and
// re-enter the Storage stage bypassing the wipe-probe guard.
func (u *UI) OverrideWipeStorage(w http.ResponseWriter, r *http.Request) {
idStr := chi.URLParam(r, "id")
hostID, err := strconv.ParseInt(idStr, 10, 64)
if err != nil {
http.Error(w, "bad host id", http.StatusBadRequest)
return
}
latest, err := u.Runs.LatestForHost(r.Context(), hostID)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
if latest == nil {
http.Error(w, "no run for host", http.StatusConflict)
return
}
if latest.State != model.StateFailedHolding || latest.FailedStage != "Storage" {
http.Error(w, "override only valid when holding on Storage", http.StatusConflict)
return
}
if _, err := u.Runner.Override(r.Context(), latest.ID, `{"wipe":true}`); err != nil {
http.Error(w, "override: "+err.Error(), http.StatusInternalServerError)
return
}
http.Redirect(w, r, "/", http.StatusSeeOther)
}
func (u *UI) DeleteHost(w http.ResponseWriter, r *http.Request) {
idStr := chi.URLParam(r, "id")
id, err := strconv.ParseInt(idStr, 10, 64)
if err != nil {
http.Error(w, "bad id", http.StatusBadRequest)
return
}
if err := u.Hosts.Delete(r.Context(), id); err != nil {
if errors.Is(err, store.ErrNotFound) {
http.NotFound(w, r)
return
}
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
http.Redirect(w, r, "/", http.StatusSeeOther)
}
func (u *UI) SSE(w http.ResponseWriter, r *http.Request) {
u.EventHub.ServeSSE(w, r)
}
// Report serves the HTML report artifact for a run. Looks up the
// report_html artifact row for the runID, validates the path lives
// under the artifacts dir (defence-in-depth against path traversal),
// and streams it back. 404 when the run hasn't produced one yet.
func (u *UI) Report(w http.ResponseWriter, r *http.Request) {
idStr := chi.URLParam(r, "runID")
runID, err := strconv.ParseInt(idStr, 10, 64)
if err != nil {
http.Error(w, "bad run id", http.StatusBadRequest)
return
}
arts, err := u.Artifacts.ListForRun(r.Context(), runID)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
var path string
for _, a := range arts {
if a.Kind == "report_html" {
path = a.Path
}
}
if path == "" {
http.NotFound(w, r)
return
}
w.Header().Set("Content-Type", "text/html; charset=utf-8")
http.ServeFile(w, r, path)
}
func validateHostForm(form *templates.RegistrationForm) string {
if form.Name == "" {
return "Name is required."
}
if !macRe.MatchString(form.MAC) {
return "MAC address must be in the form aa:bb:cc:dd:ee:ff."
}
if form.WoLBroadcastIP == "" {
return "WoL broadcast IP is required."
}
if form.ExpectedSpecYAML == "" {
return "Expected spec YAML is required."
}
var anything any
if err := yaml.Unmarshal([]byte(form.ExpectedSpecYAML), &anything); err != nil {
return "Expected spec YAML is not valid YAML: " + err.Error()
}
if form.WoLPort != "" {
port, err := strconv.Atoi(form.WoLPort)
if err != nil || port < 1 || port > 65535 {
return "WoL port must be 165535."
}
}
return ""
}
func friendlyDBError(err error) string {
s := err.Error()
switch {
case strings.Contains(s, "UNIQUE constraint failed: hosts.name"):
return "A host with that name already exists."
case strings.Contains(s, "UNIQUE constraint failed: hosts.mac"):
return "A host with that MAC already exists."
default:
return s
}
}