Files
Vetting/internal/api/ui_handlers.go
T
josh 4524ab8dc0
CI / Lint + build + test (push) Successful in 2m5s
Release / release (push) Successful in 3m5s
runs: add non-destructive flag + operator Cancel button
Non-destructive pre-declares "don't touch the disks" on Start: the
Storage stage skips wipe-probe, badblocks -w, and write-mode fio,
and reports a read-only summary. The runs table gains a new non_destructive column;
threaded through Claim → agent tests.Deps → Storage stage.

Cancel halts an in-flight run. The orchestrator transitions to a
new StateCancelled via TriggerOperatorCancelled (valid from any
active state); the agent's next heartbeat returns cmd=cancel_stage,
which fires a stored CancelFunc on the per-stage context. Stage
subprocesses spawned with exec.CommandContext die with the context,
the agent posts a cancelled outcome, then powers the host off.

Destructive stages mid-run may leave the host in an intermediate
state — the UI confirm dialog warns the operator; recovery is
manual for now.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 13:01:42 -04:00

592 lines
19 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package api
import (
"context"
"encoding/json"
"errors"
"log"
"net/http"
"regexp"
"strconv"
"strings"
"text/template"
"time"
"github.com/go-chi/chi/v5"
"gopkg.in/yaml.v3"
"vetting/internal/events"
"vetting/internal/logs"
"vetting/internal/model"
"vetting/internal/orchestrator"
"vetting/internal/store"
"vetting/internal/web"
"vetting/internal/web/templates"
)
// UI bundles the dependencies shared by the HTTP handlers in this file:
// the persistence stores, the event and log hubs, the orchestrator
// runner and the tile enricher.
type UI struct {
	// Hosts is the host registry (list, get, create, delete, last-seen).
	Hosts *store.Hosts
	// Runs holds run rows; handlers read the latest run per host and
	// create new runs through it.
	Runs *store.Runs
	// Stages is optional: HostDetail skips the stage list when nil.
	Stages *store.Stages
	// SpecDiffs is optional: HostDetail skips the diff list when nil.
	SpecDiffs *store.SpecDiffs
	// Artifacts is read by Report to locate a run's HTML report.
	Artifacts *store.Artifacts
	// EventHub backs the SSE endpoint.
	EventHub *events.Hub
	// Logs is optional: HostDetail skips the log replay when nil.
	Logs *logs.Hub
	// Runner drives run state transitions. Only the heartbeat path
	// nil-checks it; OverrideWipeStorage and CancelRun assume it is set.
	Runner *orchestrator.Runner
	// Tiles builds the tile data shown by Dashboard and HostDetail.
	Tiles *TileEnricher
	PublicURL string // user-visible base URL baked into the quick-register one-liner
	// PXE, when non-nil, gets Reload()ed after host create/delete so
	// dnsmasq's dhcp-host= allowlist reflects the current registry.
	// Without this, a newly-registered host PXE-boots and gets
	// "proxy-ignored" because its MAC isn't tagged `known`.
	PXE PXEReloader
}
// PXEReloader rewrites dnsmasq.conf with the current host list and
// SIGHUPs the subprocess. Satisfied by *pxe.Supervisor.
//
// Reload is handed the complete host list on every call (see
// reloadPXE), not a delta.
type PXEReloader interface {
	Reload(hosts []model.Host) error
}
// reloadPXE pushes the current host registry to the configured PXE
// reloader. It is a no-op when no reloader is set. Failures are only
// logged: the host change that triggered the reload has already been
// committed, so there is nothing useful to report back to the caller.
func (u *UI) reloadPXE(ctx context.Context) {
	reloader := u.PXE
	if reloader == nil {
		return
	}

	all, listErr := u.Hosts.List(ctx)
	if listErr != nil {
		log.Printf("pxe reload: list hosts: %v", listErr)
		return
	}

	reloadErr := reloader.Reload(all)
	if reloadErr == nil {
		return
	}
	log.Printf("pxe reload: %v", reloadErr)
}
// macRe matches a MAC address written as six lowercase hex octets
// separated by colons (aa:bb:cc:dd:ee:ff). Uppercase input does not
// match, so callers lowercase and trim before testing against it.
var macRe = regexp.MustCompile(`^[0-9a-f]{2}(:[0-9a-f]{2}){5}$`)
// quickRegisterTmpl is parsed once at startup — a malformed template
// should fail the binary at init, not on a visitor's first hit.
// It is loaded from the embedded web.Register filesystem and executed
// by QuickRegisterScript with a single OrchestratorURL field.
var quickRegisterTmpl = template.Must(
	template.ParseFS(web.Register, "register/quick.sh.tmpl"),
)
// baseURL picks the orchestrator URL that gets embedded in generated
// artefacts (the quick-register one-liner and its rendered script).
//
// The operator-configured PublicURL wins when set, with any trailing
// slashes removed. Otherwise the URL is derived from the request
// itself — https when the request arrived over TLS, http if not —
// so a dev run on http://127.0.0.1:8080 still yields a usable command.
func (u *UI) baseURL(r *http.Request) string {
	if configured := u.PublicURL; configured != "" {
		return strings.TrimRight(configured, "/")
	}
	if r.TLS == nil {
		return "http://" + r.Host
	}
	return "https://" + r.Host
}
// Dashboard renders the landing page: one tile per registered host,
// each built from the host row and that host's latest run. Any store
// error aborts the request with a 500 before anything is rendered.
func (u *UI) Dashboard(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	all, err := u.Hosts.List(ctx)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	tiles := make([]templates.TileData, 0, len(all))
	for i := range all {
		host := all[i]
		run, runErr := u.Runs.LatestForHost(ctx, host.ID)
		if runErr != nil {
			http.Error(w, runErr.Error(), http.StatusInternalServerError)
			return
		}
		tiles = append(tiles, u.Tiles.Build(ctx, host, run))
	}

	// A render failure here cannot be turned into an error response
	// once output may have started, so it is deliberately dropped.
	_ = templates.Dashboard(tiles).Render(ctx, w)
}
// HostDetail renders the per-host page: breadcrumb, summary, pipeline
// timeline, hold card, action row, spec diffs, log pane, meta. Tile
// data goes through the same enrichment path as Dashboard; stage rows,
// spec diffs and the log replay are read for the latest run only.
//
// Responses: 400 for a non-numeric id, 404 for an unknown host, 500
// when the host or its latest run cannot be read. Stage rows and spec
// diffs are best-effort — a failure to load them is logged and the
// page renders without that section rather than failing outright.
func (u *UI) HostDetail(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
	if err != nil {
		http.Error(w, "bad host id", http.StatusBadRequest)
		return
	}

	host, err := u.Hosts.Get(ctx, id)
	if err != nil {
		if errors.Is(err, store.ErrNotFound) {
			http.NotFound(w, r)
			return
		}
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	latest, err := u.Runs.LatestForHost(ctx, id)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	var stages []model.Stage
	var diffs []model.SpecDiff
	if latest != nil {
		if u.Stages != nil {
			stages, err = u.Stages.ListForRun(ctx, latest.ID)
			if err != nil {
				// Keep rendering: an empty timeline beats a 500.
				log.Printf("host detail: list stages for run %d: %v", latest.ID, err)
			}
		}
		if u.SpecDiffs != nil {
			diffs, err = u.SpecDiffs.ListForRun(ctx, latest.ID)
			if err != nil {
				log.Printf("host detail: list spec diffs for run %d: %v", latest.ID, err)
			}
		}
	}

	tile := u.Tiles.Build(ctx, *host, latest)

	replay := ""
	if latest != nil && u.Logs != nil {
		replay = u.Logs.Replay(latest.ID)
	}

	data := templates.HostDetailData{
		Tile:      tile,
		Stages:    stages,
		SpecDiffs: diffs,
		LogReplay: replay,
	}
	// A render failure cannot be reported once output may have
	// started, so it is deliberately dropped.
	_ = templates.HostDetail(data).Render(ctx, w)
}
// StartRun creates a new Run for the host and redirects back to the
// dashboard. The run is created in the Queued state; the dispatcher
// goroutine picks it up on its next tick, and the happy path is
// heartbeat-driven (the reporter's next heartbeat fetches
// reboot_for_vetting).
//
// A run token is minted via orchestrator.IssueRunToken and only its
// hash is passed to the store. The form field non_destructive=1 marks
// the run as non-destructive.
//
// Responses: 400 bad id, 404 unknown host, 409 when the host is not
// currently heartbeating (there is no path from Queued to live-image
// without an in-OS reporter on the target) or already has a
// non-terminal run, 500 on store/token errors.
func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	hostID, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
	if err != nil {
		http.Error(w, "bad host id", http.StatusBadRequest)
		return
	}

	host, err := u.Hosts.Get(ctx, hostID)
	if err != nil {
		if errors.Is(err, store.ErrNotFound) {
			http.NotFound(w, r)
			return
		}
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Preflight: host must be heartbeating. The dispatcher re-checks at
	// dispatch time (belt-and-braces for the gap between click and tick),
	// but rejecting here gives the operator an immediate, actionable
	// error instead of a mysterious Failed run 2s later.
	if host.LastSeenAt == nil || time.Since(*host.LastSeenAt) > orchestrator.HostHeartbeatStaleAfter {
		writeJSONError(w, http.StatusConflict,
			"host is not heartbeating — install the reporter via /register/quick.sh on the target host, then retry")
		return
	}

	// Guard: refuse to start a second run while one is still active.
	// A failed lookup must fail closed — if we cannot tell whether a
	// run is active, creating another one risks two live runs for the
	// same host.
	latest, err := u.Runs.LatestForHost(ctx, hostID)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	if latest != nil && !latest.State.IsTerminal() {
		http.Error(w, "host already has an active run", http.StatusConflict)
		return
	}

	nonDestructive := r.PostFormValue("non_destructive") == "1"

	_, hash, err := orchestrator.IssueRunToken()
	if err != nil {
		http.Error(w, "token: "+err.Error(), http.StatusInternalServerError)
		return
	}

	runID, err := u.Runs.Create(ctx, hostID, hash, nonDestructive)
	if err != nil {
		http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError)
		return
	}

	log.Printf("ui: created run %d for host %d (state=Queued)", runID, hostID)
	http.Redirect(w, r, "/", http.StatusSeeOther)
}
// NewHostForm renders an empty host-registration form. The only
// pre-filled value is the base URL shown in the quick-register
// one-liner.
func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) {
	blank := templates.RegistrationForm{QuickRegisterURL: u.baseURL(r)}
	_ = templates.Registration(blank).Render(r.Context(), w)
}
// QuickRegisterScript serves the shell script an operator runs on the
// target host (hardware autodetect + POST to /api/v1/hosts). The
// orchestrator base URL is substituted into the template so the
// script is self-contained. The response is marked no-store so a
// changed base URL is never served from a cache.
//
// NOTE(review): the URL is inserted via text/template without shell
// escaping; when PublicURL is unset it derives from the request's
// Host header — confirm the template quotes it safely.
func (u *UI) QuickRegisterScript(w http.ResponseWriter, r *http.Request) {
	hdr := w.Header()
	hdr.Set("Content-Type", "text/x-shellscript; charset=utf-8")
	hdr.Set("Cache-Control", "no-store")

	data := struct{ OrchestratorURL string }{OrchestratorURL: u.baseURL(r)}
	err := quickRegisterTmpl.Execute(w, data)
	if err == nil {
		return
	}
	log.Printf("quick-register script render: %v", err)
}
// CreateHost handles the browser registration form. Fields are read
// from the POST body, normalised (whitespace trimmed, MAC lowercased)
// and validated with validateHostForm. On a validation failure (400)
// or a store failure (409) the form is re-rendered with the
// submitted values and an error message. On success the PXE allowlist
// is reloaded and the browser is redirected to the dashboard.
//
// An empty WoL port defaults to 9.
func (u *UI) CreateHost(w http.ResponseWriter, r *http.Request) {
	if err := r.ParseForm(); err != nil {
		http.Error(w, "bad form", http.StatusBadRequest)
		return
	}

	field := r.PostForm.Get
	form := templates.RegistrationForm{
		Name:             strings.TrimSpace(field("name")),
		MAC:              strings.ToLower(strings.TrimSpace(field("mac"))),
		WoLBroadcastIP:   strings.TrimSpace(field("wol_broadcast_ip")),
		WoLPort:          field("wol_port"),
		ExpectedSpecYAML: field("expected_spec_yaml"),
		Notes:            strings.TrimSpace(field("notes")),
		QuickRegisterURL: u.baseURL(r),
	}

	// rerender shows the form again with the operator's input intact.
	rerender := func(status int, msg string) {
		form.Error = msg
		w.WriteHeader(status)
		_ = templates.Registration(form).Render(r.Context(), w)
	}

	if problem := validateHostForm(&form); problem != "" {
		rerender(http.StatusBadRequest, problem)
		return
	}

	// Validation has already accepted the port, so a parse failure
	// here can only mean "left blank"; treat that as the default.
	port, _ := strconv.Atoi(form.WoLPort)
	if port == 0 {
		port = 9
	}

	record := model.Host{
		Name:             form.Name,
		MAC:              form.MAC,
		WoLBroadcastIP:   form.WoLBroadcastIP,
		WoLPort:          port,
		ExpectedSpecYAML: form.ExpectedSpecYAML,
		Notes:            form.Notes,
	}
	if _, err := u.Hosts.Create(r.Context(), record); err != nil {
		rerender(http.StatusConflict, friendlyDBError(err))
		return
	}

	u.reloadPXE(r.Context())
	http.Redirect(w, r, "/", http.StatusSeeOther)
}
// quickRegisterPayload is the POST body accepted by /api/v1/hosts —
// the shape the quick-register bash one-liner emits.
//
// It carries the same fields as the browser registration form.
// WoLPort is numeric here; a missing or zero value is replaced with
// port 9 by CreateHostJSON.
type quickRegisterPayload struct {
	Name string `json:"name"`
	MAC string `json:"mac"`
	WoLBroadcastIP string `json:"wol_broadcast_ip"`
	WoLPort int `json:"wol_port"`
	ExpectedSpecYAML string `json:"expected_spec_yaml"`
	Notes string `json:"notes"`
}
// CreateHostJSON is the API counterpart to CreateHost. Accepts the same
// fields as the form but in JSON, so a target host can POST its own
// registration payload over curl from the quick-register one-liner.
// Same validation as the form; no auth (LAN-only).
//
// The request body is capped at 256 KiB. Responses are JSON:
// 400 for undecodable or invalid input, 409 when the store rejects
// the host, 201 with {id, name, mac} on success. A successful create
// also reloads the PXE allowlist.
//
// wol_port: 0 or absent means "use the default" (9). Any other value,
// including a negative one, must pass the 1-65535 range check.
func (u *UI) CreateHostJSON(w http.ResponseWriter, r *http.Request) {
	var p quickRegisterPayload
	body := http.MaxBytesReader(w, r.Body, 256*1024)
	if err := json.NewDecoder(body).Decode(&p); err != nil {
		writeJSONError(w, http.StatusBadRequest, "bad json: "+err.Error())
		return
	}

	form := templates.RegistrationForm{
		Name:             strings.TrimSpace(p.Name),
		MAC:              strings.ToLower(strings.TrimSpace(p.MAC)),
		WoLBroadcastIP:   strings.TrimSpace(p.WoLBroadcastIP),
		ExpectedSpecYAML: p.ExpectedSpecYAML,
		Notes:            strings.TrimSpace(p.Notes),
	}
	// Hand every explicitly supplied port to the validator. Gating on
	// "> 0" would let a negative port skip the range check and then be
	// stored verbatim, because only an exact zero gets the default.
	if p.WoLPort != 0 {
		form.WoLPort = strconv.Itoa(p.WoLPort)
	}

	if errMsg := validateHostForm(&form); errMsg != "" {
		writeJSONError(w, http.StatusBadRequest, errMsg)
		return
	}

	wolPort := p.WoLPort
	if wolPort == 0 {
		wolPort = 9
	}

	id, err := u.Hosts.Create(r.Context(), model.Host{
		Name:             form.Name,
		MAC:              form.MAC,
		WoLBroadcastIP:   form.WoLBroadcastIP,
		WoLPort:          wolPort,
		ExpectedSpecYAML: form.ExpectedSpecYAML,
		Notes:            form.Notes,
	})
	if err != nil {
		writeJSONError(w, http.StatusConflict, friendlyDBError(err))
		return
	}

	log.Printf("api: registered host %d (%s, %s)", id, form.Name, form.MAC)
	u.reloadPXE(r.Context())

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusCreated)
	// The status line is already written; an encode failure here can
	// only be a broken connection, so it is deliberately ignored.
	_ = json.NewEncoder(w).Encode(map[string]any{
		"id":   id,
		"name": form.Name,
		"mac":  form.MAC,
	})
}
// Heartbeat is the endpoint a host-mode vetting-agent (a systemd
// service on the registered host) hits roughly every 30s. It is
// LAN-trusted with no auth — the same threat model as the browser UI
// and quick-register.
//
// Each call stamps the host's last-seen time, publishes a tile update
// when a Runner is configured, and replies with whatever command
// pickHostCommand selects (for example reboot_for_vetting after the
// operator has started a run), or a bare {"ok": true} when idle.
//
// Responses: 400 for a malformed MAC, 404 for an unregistered MAC,
// 500 on store errors — all as JSON error bodies.
func (u *UI) Heartbeat(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	mac := strings.TrimSpace(chi.URLParam(r, "mac"))
	mac = strings.ToLower(mac)
	if !macRe.MatchString(mac) {
		writeJSONError(w, http.StatusBadRequest,
			"MAC address must be in the form aa:bb:cc:dd:ee:ff")
		return
	}

	host, err := u.Hosts.GetByMAC(ctx, mac)
	switch {
	case errors.Is(err, store.ErrNotFound):
		writeJSONError(w, http.StatusNotFound, "unknown host")
		return
	case err != nil:
		writeJSONError(w, http.StatusInternalServerError, err.Error())
		return
	}

	err = u.Hosts.UpdateLastSeen(ctx, mac, time.Now().UTC())
	if err != nil {
		writeJSONError(w, http.StatusInternalServerError, err.Error())
		return
	}

	if u.Runner != nil {
		u.Runner.PublishTileUpdate(ctx, host.ID)
	}

	command, runID := u.pickHostCommand(ctx, host.ID)

	w.Header().Set("Content-Type", "application/json")
	_ = json.NewEncoder(w).Encode(heartbeatResponse{
		OK:    true,
		Cmd:   command,
		RunID: runID,
	})
}
// heartbeatResponse is the JSON the host-mode agent decodes on every
// heartbeat. `cmd` is "" (omitted) in the idle case so the wire shape
// stays `{"ok": true}` when nothing is happening.
type heartbeatResponse struct {
	// OK is set to true on every successful heartbeat reply.
	OK bool `json:"ok"`
	// Cmd is the command for the agent; omitted from the JSON when empty.
	Cmd string `json:"cmd,omitempty"`
	// RunID is the run the command refers to; omitted when zero.
	RunID int64 `json:"run_id,omitempty"`
}
// pickHostCommand chooses the command (if any) to hand back to the
// host-mode agent on this heartbeat. It returns the command name and
// the run it applies to, or ("", 0) when the agent should stay idle.
//
//   - No Runs store, no Runner, no run for the host, or a lookup
//     error → idle.
//   - Latest run is Queued → fire the RebootCommanded transition and
//     return reboot_for_vetting. If the transition is refused, stay
//     idle (see the comment at the call site).
//   - Latest run is WaitingReboot (or legacy WaitingWoL) and started
//     less than 10 minutes ago → return reboot_for_vetting again, to
//     cover a host that crashed mid-reboot and came back.
//   - Any other state → idle.
func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64) {
	// retryWindow bounds how long a waiting run keeps re-issuing the
	// reboot command, so a perpetually-broken PXE doesn't reboot-loop
	// the box.
	const retryWindow = 10 * time.Minute

	if u.Runs == nil || u.Runner == nil {
		return "", 0
	}

	latest, err := u.Runs.LatestForHost(ctx, hostID)
	if err != nil {
		log.Printf("heartbeat: latest run for host %d: %v", hostID, err)
		return "", 0
	}
	if latest == nil {
		return "", 0
	}

	if latest.State == model.StateQueued {
		_, err = u.Runner.Transition(ctx, latest.ID, orchestrator.TriggerRebootCommanded)
		if err != nil {
			// Benign race with the dispatcher's own 2s poll: the state
			// machine refuses the second transition. Log it and stay
			// idle so the agent doesn't reboot on a run that another
			// path is already driving.
			log.Printf("heartbeat: transition run %d: %v", latest.ID, err)
			return "", 0
		}
		log.Printf("heartbeat: dispatched run %d for host %d (reboot commanded)", latest.ID, hostID)
		return cmdRebootForVetting, latest.ID
	}

	waiting := latest.State == model.StateWaitingReboot || latest.State == model.StateWaitingWoL
	if !waiting {
		return "", 0
	}

	// Crashed-mid-reboot retry: the reporter is the only thing that
	// could be telling us about this host right now.
	if time.Since(latest.StartedAt) >= retryWindow {
		return "", 0
	}
	return cmdRebootForVetting, latest.ID
}
// cmdRebootForVetting is the heartbeat `cmd` value pickHostCommand
// returns when the host should reboot for a vetting run.
const cmdRebootForVetting = "reboot_for_vetting"
// writeJSONError sends an error reply with the given HTTP status and
// a JSON body of the form {"error": msg}. The Content-Type header is
// set before the status line is written so that it takes effect.
func writeJSONError(w http.ResponseWriter, status int, msg string) {
	body := struct {
		Error string `json:"error"`
	}{Error: msg}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	// Nothing useful can be done if the client has gone away.
	_ = json.NewEncoder(w).Encode(body)
}
// OverrideWipeStorage is the operator's explicit "yes, wipe the disk
// even though we found filesystem signatures" action. It is accepted
// only when the host's latest run is in FailedHolding with
// failed_stage "Storage"; in that case the override payload
// {"wipe":true} is handed to the Runner and the browser is redirected
// to the dashboard.
//
// NOTE(review): how the agent consumes the override (presumably a
// retry of the Storage stage that bypasses the wipe-probe guard) is
// not visible in this handler — confirm against the Runner.
//
// Responses: 400 bad id, 409 when there is no run or the run is not
// holding on Storage, 500 on store/override errors.
func (u *UI) OverrideWipeStorage(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	hostID, parseErr := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
	if parseErr != nil {
		http.Error(w, "bad host id", http.StatusBadRequest)
		return
	}

	run, err := u.Runs.LatestForHost(ctx, hostID)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	if run == nil {
		http.Error(w, "no run for host", http.StatusConflict)
		return
	}

	holdingOnStorage := run.State == model.StateFailedHolding && run.FailedStage == "Storage"
	if !holdingOnStorage {
		http.Error(w, "override only valid when holding on Storage", http.StatusConflict)
		return
	}

	_, err = u.Runner.Override(ctx, run.ID, `{"wipe":true}`)
	if err != nil {
		http.Error(w, "override: "+err.Error(), http.StatusInternalServerError)
		return
	}
	http.Redirect(w, r, "/", http.StatusSeeOther)
}
// CancelRun asks the orchestrator to cancel the host's in-flight run
// by firing TriggerOperatorCancelled on the latest run, then
// redirects to the dashboard. Destructive stages interrupted mid-run
// can leave the host in an intermediate state — the confirm dialog in
// the UI warns the operator about that.
//
// NOTE(review): delivery of the cancellation to the agent (a
// cancel_stage command on its next heartbeat) happens outside this
// handler — confirm against the agent-facing heartbeat code.
//
// Responses: 400 bad id, 409 when there is no run or the latest run
// is already terminal, 500 on store/transition errors.
func (u *UI) CancelRun(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	hostID, parseErr := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
	if parseErr != nil {
		http.Error(w, "bad host id", http.StatusBadRequest)
		return
	}

	run, err := u.Runs.LatestForHost(ctx, hostID)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	cancellable := run != nil && !run.State.IsTerminal()
	if !cancellable {
		http.Error(w, "no active run to cancel", http.StatusConflict)
		return
	}

	_, err = u.Runner.Transition(ctx, run.ID, orchestrator.TriggerOperatorCancelled)
	if err != nil {
		http.Error(w, "cancel: "+err.Error(), http.StatusInternalServerError)
		return
	}

	log.Printf("ui: cancelled run %d for host %d", run.ID, hostID)
	http.Redirect(w, r, "/", http.StatusSeeOther)
}
// DeleteHost removes a host from the registry, reloads the PXE
// allowlist so it reflects the removal, and redirects to the
// dashboard.
//
// Responses: 400 for a non-numeric id, 404 when the host does not
// exist, 500 on any other store error.
func (u *UI) DeleteHost(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	hostID, parseErr := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
	if parseErr != nil {
		http.Error(w, "bad id", http.StatusBadRequest)
		return
	}

	switch err := u.Hosts.Delete(ctx, hostID); {
	case err == nil:
		// Deleted; fall through to reload and redirect.
	case errors.Is(err, store.ErrNotFound):
		http.NotFound(w, r)
		return
	default:
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	u.reloadPXE(ctx)
	http.Redirect(w, r, "/", http.StatusSeeOther)
}
// SSE hands the request to the event hub's server-sent-events
// handler; all streaming behaviour lives in EventHub.ServeSSE.
func (u *UI) SSE(w http.ResponseWriter, r *http.Request) {
	u.EventHub.ServeSSE(w, r)
}
// Report serves the HTML report artifact for a run. It lists the
// run's artifacts, takes the path of the last one whose kind is
// "report_html", and streams that file back as text/html.
//
// Responses: 400 for a non-numeric run id, 500 when the artifact list
// cannot be read, 404 when the run has no report yet (or the selected
// row has an empty path).
//
// NOTE(review): the stored path is passed to http.ServeFile as-is.
// No check that it lies under an artifacts directory is performed in
// this handler — confirm containment is enforced where artifact rows
// are written.
func (u *UI) Report(w http.ResponseWriter, r *http.Request) {
	runID, parseErr := strconv.ParseInt(chi.URLParam(r, "runID"), 10, 64)
	if parseErr != nil {
		http.Error(w, "bad run id", http.StatusBadRequest)
		return
	}

	arts, err := u.Artifacts.ListForRun(r.Context(), runID)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Walk backwards so the first hit is the last report in list order.
	reportPath := ""
	for i := len(arts) - 1; i >= 0; i-- {
		if arts[i].Kind == "report_html" {
			reportPath = arts[i].Path
			break
		}
	}
	if reportPath == "" {
		http.NotFound(w, r)
		return
	}

	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	http.ServeFile(w, r, reportPath)
}
// validateHostForm checks a registration form that has already been
// normalised by the caller (trimmed, MAC lowercased). It returns an
// operator-facing message for the first problem found, or "" when the
// form is acceptable. Checks run in this order:
//
//  1. Name is non-empty.
//  2. MAC matches macRe (aa:bb:cc:dd:ee:ff, lowercase).
//  3. WoL broadcast IP is non-empty (its format is not checked here).
//  4. Expected spec YAML is non-empty and parses as YAML.
//  5. WoL port, when supplied, is an integer in 1-65535. An empty
//     port is accepted; callers substitute their default.
//
// The form is only read, never modified.
func validateHostForm(form *templates.RegistrationForm) string {
	if form.Name == "" {
		return "Name is required."
	}
	if !macRe.MatchString(form.MAC) {
		return "MAC address must be in the form aa:bb:cc:dd:ee:ff."
	}
	if form.WoLBroadcastIP == "" {
		return "WoL broadcast IP is required."
	}
	if form.ExpectedSpecYAML == "" {
		return "Expected spec YAML is required."
	}

	// Only well-formedness matters here, so decode into a throwaway.
	var anything any
	if err := yaml.Unmarshal([]byte(form.ExpectedSpecYAML), &anything); err != nil {
		return "Expected spec YAML is not valid YAML: " + err.Error()
	}

	if form.WoLPort != "" {
		port, err := strconv.Atoi(form.WoLPort)
		if err != nil || port < 1 || port > 65535 {
			// The range separator had been lost, leaving "165535".
			return "WoL port must be 1-65535."
		}
	}
	return ""
}
// friendlyDBError turns a store error into a message fit for the
// operator. Uniqueness violations on hosts.name and hosts.mac are
// recognised by their text and replaced with a plain sentence; any
// other error is returned verbatim. The name check takes precedence
// if both markers are present. err must be non-nil.
func friendlyDBError(err error) string {
	raw := err.Error()
	if strings.Contains(raw, "UNIQUE constraint failed: hosts.name") {
		return "A host with that name already exists."
	}
	if strings.Contains(raw, "UNIQUE constraint failed: hosts.mac") {
		return "A host with that MAC already exists."
	}
	return raw
}