package api

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log"
	"net/http"
	"regexp"
	"strconv"
	"strings"
	"text/template"
	"time"

	"github.com/go-chi/chi/v5"
	"gopkg.in/yaml.v3"

	"vetting/internal/events"
	"vetting/internal/logs"
	"vetting/internal/model"
	"vetting/internal/orchestrator"
	"vetting/internal/store"
	"vetting/internal/web"
	"vetting/internal/web/templates"
)

// UI groups the stores, hubs and orchestrator handles that the HTTP
// handlers in this file read from and write to. Several handlers treat
// a nil Runs/Stages/SubSteps/SpecDiffs/Logs/Tiles/Runner/PXE field as
// "not wired" and skip the corresponding work.
type UI struct {
	Hosts     *store.Hosts
	Runs      *store.Runs
	Stages    *store.Stages
	SubSteps  *store.SubSteps
	SpecDiffs *store.SpecDiffs
	Artifacts *store.Artifacts
	EventHub  *events.Hub
	Logs      *logs.Hub
	Runner    *orchestrator.Runner
	Tiles     *TileEnricher
	PublicURL string // user-visible base URL baked into the quick-register one-liner

	// PXE is optional. When set, it is Reload()ed after every host
	// create/delete so that dnsmasq's dhcp-host= allowlist tracks the
	// registry. Otherwise a freshly registered host that PXE-boots is
	// "proxy-ignored" because its MAC isn't tagged `known`.
	PXE PXEReloader
}

// PXEReloader regenerates dnsmasq.conf from the supplied host list and
// SIGHUPs the subprocess. *pxe.Supervisor satisfies it.
type PXEReloader interface {
	Reload(hosts []model.Host) error
}

// reloadPXE fetches the complete host list and passes it to the
// configured reloader. It is a no-op when no reloader is wired.
// Failures are only logged and never returned: the HTTP request that
// changed the registry has already succeeded by the time this runs.
func (u *UI) reloadPXE(ctx context.Context) {
	reloader := u.PXE
	if reloader == nil {
		return
	}

	all, listErr := u.Hosts.List(ctx)
	if listErr != nil {
		log.Printf("pxe reload: list hosts: %v", listErr)
		return
	}

	reloadErr := reloader.Reload(all)
	if reloadErr != nil {
		log.Printf("pxe reload: %v", reloadErr)
	}
}

// macRe matches a lower-case, colon-separated, six-octet MAC address
// (aa:bb:cc:dd:ee:ff). Callers lower-case their input before matching.
var macRe = regexp.MustCompile(`^[0-9a-f]{2}(:[0-9a-f]{2}){5}$`)

// quickRegisterTmpl is parsed once at startup — a malformed template
// should fail the binary at init, not on a visitor's first hit.
var quickRegisterTmpl = template.Must(
	template.ParseFS(web.Register, "register/quick.sh.tmpl"),
)

// baseURL yields the orchestrator URL that gets embedded in generated
// artefacts (the quick-register one-liner and the script it fetches).
// The operator-configured PublicURL wins, with trailing slashes removed;
// when it is empty the URL is derived from the incoming request's scheme
// and Host so a dev run on http://127.0.0.1:8080 still produces a
// working command.
func (u *UI) baseURL(r *http.Request) string {
	if configured := u.PublicURL; configured != "" {
		return strings.TrimRight(configured, "/")
	}
	if r.TLS != nil {
		return "https://" + r.Host
	}
	return "http://" + r.Host
}

// Dashboard renders the landing page: one tile per registered host,
// each built from the host record and that host's latest run. Any store
// error aborts the whole page with a 500.
func (u *UI) Dashboard(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	hosts, err := u.Hosts.List(ctx)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	tiles := make([]templates.TileData, 0, len(hosts))
	for i := range hosts {
		host := hosts[i]
		latestRun, runErr := u.Runs.LatestForHost(ctx, host.ID)
		if runErr != nil {
			http.Error(w, runErr.Error(), http.StatusInternalServerError)
			return
		}
		tile := u.Tiles.Build(ctx, host, latestRun)
		tiles = append(tiles, tile)
	}

	// The render error is intentionally discarded.
	_ = templates.Dashboard(tiles).Render(ctx, w)
}

// HostPage renders /hosts/{id}: summary + actions + in-flight banner +
// runs table. Run-level detail (pipeline, logs, sub-steps, spec diffs,
// hold banner) lives on /runs/{runID}. The split keeps host-scoped and
// run-scoped work on distinct URLs so permalinks don't wander onto
// whichever run happens to be active.
func (u *UI) HostPage(w http.ResponseWriter, r *http.Request) { idStr := chi.URLParam(r, "id") id, err := strconv.ParseInt(idStr, 10, 64) if err != nil { http.Error(w, "bad host id", http.StatusBadRequest) return } data, err := u.LoadHostPageData(r.Context(), id) if err != nil { if errors.Is(err, store.ErrNotFound) { http.NotFound(w, r) return } http.Error(w, err.Error(), http.StatusInternalServerError) return } _ = templates.HostPage(data).Render(r.Context(), w) } // LoadHostPageData assembles the HostPageData payload for hostID — host // metadata, the full newest-first runs list, the currently non-terminal // run (if any) for the in-flight banner, and a per-run stages map so // the runs table can paint its compact stage-strips without re-querying // inside the template. Returns store.ErrNotFound when the host doesn't // exist; other store errors are surfaced. Stage lookups are fail-soft: // a transient DB error on one run's stages yields an empty strip for // that row rather than blanking the whole page. func (u *UI) LoadHostPageData(ctx context.Context, hostID int64) (templates.HostPageData, error) { host, err := u.Hosts.Get(ctx, hostID) if err != nil { return templates.HostPageData{}, err } var runs []model.Run if u.Runs != nil { runs, _ = u.Runs.ListForHostAll(ctx, hostID) } var active *model.Run for i := range runs { if !runs[i].State.IsTerminal() { active = &runs[i] break } } runStages := make(map[int64][]model.Stage, len(runs)) if u.Stages != nil { for _, r := range runs { if stages, err := u.Stages.ListForRun(ctx, r.ID); err == nil { runStages[r.ID] = stages } } } return templates.HostPageData{ Host: *host, LastSeenAt: host.LastSeenAt, Runs: runs, ActiveRun: active, RunStages: runStages, }, nil } // RunPage renders /runs/{runID}: breadcrumb, run header, hold banner, // pipeline, per-stage active-step panels, and spec diffs. 
Host metadata // is resolved from run.HostID for the breadcrumb and for action POST // targets (cancel/override still live under /hosts/{hostID}/...). func (u *UI) RunPage(w http.ResponseWriter, r *http.Request) { idStr := chi.URLParam(r, "runID") runID, err := strconv.ParseInt(idStr, 10, 64) if err != nil { http.Error(w, "bad run id", http.StatusBadRequest) return } data, err := u.LoadRunPageData(r.Context(), runID) if err != nil { if errors.Is(err, store.ErrNotFound) { http.NotFound(w, r) return } http.Error(w, err.Error(), http.StatusInternalServerError) return } _ = templates.RunPage(data).Render(r.Context(), w) } // LoadRunPageData assembles the RunPageData payload for runID. Resolves // the owning host, then reads stages, sub-steps, spec diffs, and log // replay. Returns store.ErrNotFound when the run or host is gone. The // orchestrator's PublishRunPage path uses the same loader so SSE fragments // render from identical inputs as the initial GET. func (u *UI) LoadRunPageData(ctx context.Context, runID int64) (templates.RunPageData, error) { if u.Runs == nil { return templates.RunPageData{}, store.ErrNotFound } run, err := u.Runs.Get(ctx, runID) if err != nil { return templates.RunPageData{}, err } if run == nil { return templates.RunPageData{}, store.ErrNotFound } host, err := u.Hosts.Get(ctx, run.HostID) if err != nil { return templates.RunPageData{}, err } var stages []model.Stage var subSteps []model.SubStep var diffs []model.SpecDiff if u.Stages != nil { stages, _ = u.Stages.ListForRun(ctx, runID) } if u.SubSteps != nil { subSteps, _ = u.SubSteps.ListForRun(ctx, runID) } if u.SpecDiffs != nil { diffs, _ = u.SpecDiffs.ListForRun(ctx, runID) } replayByStage := map[string]string{} if u.Logs != nil { replayByStage = u.Logs.ReplayByStage(runID) } // Critical-diff count + hold-key path reuse the tile enricher so the // run header shows the same numbers the dashboard tile + runs-table // row show. Fail-soft if tiles isn't wired (test setups can skip it). 
critical := 0 holdKeyPath := "" if u.Tiles != nil { t := u.Tiles.Build(ctx, *host, run) critical = t.SpecDiffCritical holdKeyPath = t.HoldKeyPath } return templates.RunPageData{ Host: *host, Run: *run, Stages: stages, SubSteps: subSteps, SpecDiffs: diffs, DefaultStepStage: pickDefaultStep(stages), LogReplayByStage: replayByStage, HoldKeyPath: holdKeyPath, SpecDiffCritical: critical, }, nil } // pickDefaultStep chooses which stage the detail page opens expanded by // default. Rule: running → first-failed → Reporting. The operator is // almost always most interested in the thing currently happening (or // the thing that just failed); Reporting is the sensible terminal fallback // because it's where the report link lives. func pickDefaultStep(stages []model.Stage) string { for _, s := range stages { if s.State == model.StageRunning { return s.Name } } for _, s := range stages { if s.State == model.StageFailed { return s.Name } } return "Reporting" } // StartRun creates a new Run for the host, issues an agent token, and // transitions Registered→Queued. The dispatcher goroutine picks it up // on its next tick; the happy path is heartbeat-driven (the reporter's // next heartbeat fetches reboot_for_vetting). Refuses the click outright // if the host isn't currently heartbeating — there is no path from // Queued to live-image without an in-OS reporter on the target. func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) { idStr := chi.URLParam(r, "id") hostID, err := strconv.ParseInt(idStr, 10, 64) if err != nil { http.Error(w, "bad host id", http.StatusBadRequest) return } host, err := u.Hosts.Get(r.Context(), hostID) if err != nil { if errors.Is(err, store.ErrNotFound) { http.NotFound(w, r) return } http.Error(w, err.Error(), http.StatusInternalServerError) return } // Preflight: host must be heartbeating. 
The dispatcher re-checks at // dispatch time (belt-and-braces for the gap between click and tick), // but rejecting here gives the operator an immediate, actionable // error instead of a mysterious Failed run 2s later. if host.LastSeenAt == nil || time.Since(*host.LastSeenAt) > orchestrator.HostHeartbeatStaleAfter { writeJSONError(w, http.StatusConflict, "host is not heartbeating — install the reporter via /register/quick.sh on the target host, then retry") return } // Guard: refuse to start a second run while one is still active. if latest, err := u.Runs.LatestForHost(r.Context(), hostID); err == nil && latest != nil { if !latest.State.IsTerminal() { http.Error(w, "host already has an active run", http.StatusConflict) return } } nonDestructive := r.PostFormValue("non_destructive") == "1" _, hash, err := orchestrator.IssueRunToken() if err != nil { http.Error(w, "token: "+err.Error(), http.StatusInternalServerError) return } runID, err := u.Runs.Create(r.Context(), hostID, hash, nonDestructive) if err != nil { http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError) return } log.Printf("ui: created run %d for host %d (state=Queued)", runID, hostID) // Send the operator straight to the new run — the button they clicked // was "Start vetting", the thing they want next is to watch it. http.Redirect(w, r, fmt.Sprintf("/runs/%d", runID), http.StatusSeeOther) } func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) { _ = templates.Registration(templates.RegistrationForm{ QuickRegisterURL: u.baseURL(r), }).Render(r.Context(), w) } // QuickRegisterScript renders the bash one-liner an operator pastes on // the target host: hardware autodetect + POST to /api/v1/hosts. The // orchestrator URL is substituted in so the script is self-contained. 
func (u *UI) QuickRegisterScript(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/x-shellscript; charset=utf-8") w.Header().Set("Cache-Control", "no-store") if err := quickRegisterTmpl.Execute(w, struct{ OrchestratorURL string }{ OrchestratorURL: u.baseURL(r), }); err != nil { log.Printf("quick-register script render: %v", err) } } func (u *UI) CreateHost(w http.ResponseWriter, r *http.Request) { if err := r.ParseForm(); err != nil { http.Error(w, "bad form", http.StatusBadRequest) return } form := templates.RegistrationForm{ Name: strings.TrimSpace(r.PostForm.Get("name")), MAC: strings.ToLower(strings.TrimSpace(r.PostForm.Get("mac"))), WoLBroadcastIP: strings.TrimSpace(r.PostForm.Get("wol_broadcast_ip")), WoLPort: r.PostForm.Get("wol_port"), ExpectedSpecYAML: r.PostForm.Get("expected_spec_yaml"), Notes: strings.TrimSpace(r.PostForm.Get("notes")), QuickRegisterURL: u.baseURL(r), } if errMsg := validateHostForm(&form); errMsg != "" { form.Error = errMsg w.WriteHeader(http.StatusBadRequest) _ = templates.Registration(form).Render(r.Context(), w) return } wolPort, _ := strconv.Atoi(form.WoLPort) if wolPort == 0 { wolPort = 9 } _, err := u.Hosts.Create(r.Context(), model.Host{ Name: form.Name, MAC: form.MAC, WoLBroadcastIP: form.WoLBroadcastIP, WoLPort: wolPort, ExpectedSpecYAML: form.ExpectedSpecYAML, Notes: form.Notes, }) if err != nil { form.Error = friendlyDBError(err) w.WriteHeader(http.StatusConflict) _ = templates.Registration(form).Render(r.Context(), w) return } u.reloadPXE(r.Context()) http.Redirect(w, r, "/", http.StatusSeeOther) } // quickRegisterPayload is the POST body accepted by /api/v1/hosts — // the shape the quick-register bash one-liner emits. 
type quickRegisterPayload struct { Name string `json:"name"` MAC string `json:"mac"` WoLBroadcastIP string `json:"wol_broadcast_ip"` WoLPort int `json:"wol_port"` ExpectedSpecYAML string `json:"expected_spec_yaml"` Notes string `json:"notes"` } // CreateHostJSON is the API counterpart to CreateHost. Accepts the same // fields as the form but in JSON, so a target host can POST its own // registration payload over curl from the quick-register one-liner. // Same validation as the form; no auth (LAN-only). func (u *UI) CreateHostJSON(w http.ResponseWriter, r *http.Request) { var p quickRegisterPayload if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, 256*1024)).Decode(&p); err != nil { writeJSONError(w, http.StatusBadRequest, "bad json: "+err.Error()) return } form := templates.RegistrationForm{ Name: strings.TrimSpace(p.Name), MAC: strings.ToLower(strings.TrimSpace(p.MAC)), WoLBroadcastIP: strings.TrimSpace(p.WoLBroadcastIP), ExpectedSpecYAML: p.ExpectedSpecYAML, Notes: strings.TrimSpace(p.Notes), } if p.WoLPort > 0 { form.WoLPort = strconv.Itoa(p.WoLPort) } if errMsg := validateHostForm(&form); errMsg != "" { writeJSONError(w, http.StatusBadRequest, errMsg) return } wolPort := p.WoLPort if wolPort == 0 { wolPort = 9 } id, err := u.Hosts.Create(r.Context(), model.Host{ Name: form.Name, MAC: form.MAC, WoLBroadcastIP: form.WoLBroadcastIP, WoLPort: wolPort, ExpectedSpecYAML: form.ExpectedSpecYAML, Notes: form.Notes, }) if err != nil { writeJSONError(w, http.StatusConflict, friendlyDBError(err)) return } log.Printf("api: registered host %d (%s, %s)", id, form.Name, form.MAC) u.reloadPXE(r.Context()) w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusCreated) _ = json.NewEncoder(w).Encode(map[string]any{ "id": id, "name": form.Name, "mac": form.MAC, }) } // Heartbeat is called every ~30s by a host-mode vetting-agent running // as a systemd service on the registered host. 
LAN-trusted, no auth — // same threat model as the browser UI and quick-register. Stamps // last_seen_at, flips the dashboard tile to "online", and — if the // operator has clicked Start vetting since the last heartbeat — replies // with cmd=reboot_for_vetting so the host boots into PXE without WoL. func (u *UI) Heartbeat(w http.ResponseWriter, r *http.Request) { mac := strings.ToLower(strings.TrimSpace(chi.URLParam(r, "mac"))) if !macRe.MatchString(mac) { writeJSONError(w, http.StatusBadRequest, "MAC address must be in the form aa:bb:cc:dd:ee:ff") return } host, err := u.Hosts.GetByMAC(r.Context(), mac) if err != nil { if errors.Is(err, store.ErrNotFound) { writeJSONError(w, http.StatusNotFound, "unknown host") return } writeJSONError(w, http.StatusInternalServerError, err.Error()) return } if err := u.Hosts.UpdateLastSeen(r.Context(), mac, time.Now().UTC()); err != nil { writeJSONError(w, http.StatusInternalServerError, err.Error()) return } if u.Runner != nil { u.Runner.PublishTileUpdate(r.Context(), host.ID) } cmd, runID := u.pickHostCommand(r.Context(), host.ID) resp := heartbeatResponse{OK: true, Cmd: cmd, RunID: runID} w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(resp) } // heartbeatResponse is the JSON the host-mode agent decodes on every // heartbeat. `cmd` is "" (omitted) in the idle case so the wire shape // stays `{"ok": true}` when nothing is happening. type heartbeatResponse struct { OK bool `json:"ok"` Cmd string `json:"cmd,omitempty"` RunID int64 `json:"run_id,omitempty"` } // pickHostCommand decides what the host-mode agent should do on the // back of this heartbeat. Returns ("", 0) when there's nothing to do. // // - Queued run → Transition(RebootCommanded) and tell the agent to // reboot. Beats the dispatcher's 2s poll to the punch, but either // path ends at WaitingReboot. 
// - WaitingReboot (or legacy WaitingWoL) run <10min old → also return // reboot, covering "host crashed mid-reboot, systemd brought the // reporter back". // - anything else → idle. func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64) { if u.Runs == nil || u.Runner == nil { return "", 0 } run, err := u.Runs.LatestForHost(ctx, hostID) if err != nil { log.Printf("heartbeat: latest run for host %d: %v", hostID, err) return "", 0 } if run == nil { return "", 0 } switch run.State { case model.StateQueued: if _, err := u.Runner.Transition(ctx, run.ID, orchestrator.TriggerRebootCommanded); err != nil { // Benign race with the dispatcher's own 2s poll — the // state machine refuses the second transition; we just // log and return idle so the agent doesn't reboot on a // run that another path is already driving. log.Printf("heartbeat: transition run %d: %v", run.ID, err) return "", 0 } log.Printf("heartbeat: dispatched run %d for host %d (reboot commanded)", run.ID, hostID) return cmdRebootForVetting, run.ID case model.StateWaitingReboot, model.StateWaitingWoL: // Tolerate a crashed-mid-reboot retry: the reporter is the // only thing that could be telling us about this host right // now. Bound it so a perpetually-broken PXE doesn't // reboot-loop the box. if time.Since(run.StartedAt) < 10*time.Minute { return cmdRebootForVetting, run.ID } return "", 0 } return "", 0 } const cmdRebootForVetting = "reboot_for_vetting" func writeJSONError(w http.ResponseWriter, status int, msg string) { w.Header().Set("Content-Type", "application/json") w.WriteHeader(status) _ = json.NewEncoder(w).Encode(map[string]string{"error": msg}) } // OverrideWipeStorage is the operator's explicit "yes, wipe the disk // even though we found filesystem signatures" button. 
Only meaningful // when the latest run is FailedHolding with failed_stage=Storage — the // agent's next heartbeat will receive retry_stage with wipe=true and // re-enter the Storage stage bypassing the wipe-probe guard. func (u *UI) OverrideWipeStorage(w http.ResponseWriter, r *http.Request) { idStr := chi.URLParam(r, "id") hostID, err := strconv.ParseInt(idStr, 10, 64) if err != nil { http.Error(w, "bad host id", http.StatusBadRequest) return } latest, err := u.Runs.LatestForHost(r.Context(), hostID) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } if latest == nil { http.Error(w, "no run for host", http.StatusConflict) return } if latest.State != model.StateFailedHolding || latest.FailedStage != "Storage" { http.Error(w, "override only valid when holding on Storage", http.StatusConflict) return } if _, err := u.Runner.Override(r.Context(), latest.ID, `{"wipe":true}`); err != nil { http.Error(w, "override: "+err.Error(), http.StatusInternalServerError) return } // Operator was on /runs/{latest.ID} when they clicked — land them // back there so they can see the override take effect. http.Redirect(w, r, fmt.Sprintf("/runs/%d", latest.ID), http.StatusSeeOther) } // CancelRun halts an in-flight run. Transitions the run to // StateCancelled; the next agent heartbeat receives cmd=cancel_stage // which cancels the stage ctx on the agent side. Destructive stages // mid-run can leave the host in an intermediate state — the confirm // dialog in the UI warns the operator. 
func (u *UI) CancelRun(w http.ResponseWriter, r *http.Request) { idStr := chi.URLParam(r, "id") hostID, err := strconv.ParseInt(idStr, 10, 64) if err != nil { http.Error(w, "bad host id", http.StatusBadRequest) return } latest, err := u.Runs.LatestForHost(r.Context(), hostID) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } if latest == nil || latest.State.IsTerminal() { http.Error(w, "no active run to cancel", http.StatusConflict) return } if _, err := u.Runner.Transition(r.Context(), latest.ID, orchestrator.TriggerOperatorCancelled); err != nil { http.Error(w, "cancel: "+err.Error(), http.StatusInternalServerError) return } log.Printf("ui: cancelled run %d for host %d", latest.ID, hostID) http.Redirect(w, r, fmt.Sprintf("/runs/%d", latest.ID), http.StatusSeeOther) } func (u *UI) DeleteHost(w http.ResponseWriter, r *http.Request) { idStr := chi.URLParam(r, "id") id, err := strconv.ParseInt(idStr, 10, 64) if err != nil { http.Error(w, "bad id", http.StatusBadRequest) return } if err := u.Hosts.Delete(r.Context(), id); err != nil { if errors.Is(err, store.ErrNotFound) { http.NotFound(w, r) return } http.Error(w, err.Error(), http.StatusInternalServerError) return } u.reloadPXE(r.Context()) http.Redirect(w, r, "/", http.StatusSeeOther) } func (u *UI) SSE(w http.ResponseWriter, r *http.Request) { u.EventHub.ServeSSE(w, r) } // Report serves the HTML report artifact for a run. Looks up the // report_html artifact row for the runID, validates the path lives // under the artifacts dir (defence-in-depth against path traversal), // and streams it back. 404 when the run hasn't produced one yet. 
func (u *UI) Report(w http.ResponseWriter, r *http.Request) { idStr := chi.URLParam(r, "runID") runID, err := strconv.ParseInt(idStr, 10, 64) if err != nil { http.Error(w, "bad run id", http.StatusBadRequest) return } arts, err := u.Artifacts.ListForRun(r.Context(), runID) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } var path string for _, a := range arts { if a.Kind == "report_html" { path = a.Path } } if path == "" { http.NotFound(w, r) return } w.Header().Set("Content-Type", "text/html; charset=utf-8") http.ServeFile(w, r, path) } func validateHostForm(form *templates.RegistrationForm) string { if form.Name == "" { return "Name is required." } if !macRe.MatchString(form.MAC) { return "MAC address must be in the form aa:bb:cc:dd:ee:ff." } if form.WoLBroadcastIP == "" { return "WoL broadcast IP is required." } if form.ExpectedSpecYAML == "" { return "Expected spec YAML is required." } var anything any if err := yaml.Unmarshal([]byte(form.ExpectedSpecYAML), &anything); err != nil { return "Expected spec YAML is not valid YAML: " + err.Error() } if form.WoLPort != "" { port, err := strconv.Atoi(form.WoLPort) if err != nil || port < 1 || port > 65535 { return "WoL port must be 1–65535." } } return "" } func friendlyDBError(err error) string { s := err.Error() switch { case strings.Contains(s, "UNIQUE constraint failed: hosts.name"): return "A host with that name already exists." case strings.Contains(s, "UNIQUE constraint failed: hosts.mac"): return "A host with that MAC already exists." default: return s } }