Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
@@ -0,0 +1,918 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"crypto/subtle"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"vetting/internal/events"
|
||||
"vetting/internal/hold"
|
||||
"vetting/internal/logs"
|
||||
"vetting/internal/model"
|
||||
"vetting/internal/notify"
|
||||
"vetting/internal/orchestrator"
|
||||
"vetting/internal/pxe"
|
||||
"vetting/internal/report"
|
||||
"vetting/internal/spec"
|
||||
"vetting/internal/store"
|
||||
)
|
||||
|
||||
// Agent collects the collaborators used by agent-facing HTTP routes:
|
||||
// the iPXE chainload endpoint and the /api/v1/runs/:id/* endpoints.
|
||||
type Agent struct {
|
||||
Hosts *store.Hosts
|
||||
Runs *store.Runs
|
||||
Stages *store.Stages
|
||||
Artifacts *store.Artifacts
|
||||
SpecDiffs *store.SpecDiffs
|
||||
Measurements *store.Measurements
|
||||
Runner *orchestrator.Runner
|
||||
EventHub *events.Hub
|
||||
Logs *logs.Hub
|
||||
Notify *notify.Registry
|
||||
ArtifactsDir string // ./var/artifacts
|
||||
OrchestratorURL string // baked into iPXE cmdline
|
||||
PublicURL string // user-visible URL base for notification click-throughs
|
||||
LiveKernelURL string
|
||||
LiveInitrdURL string
|
||||
TLSCertFPR string // optional; empty = skip pinning
|
||||
IperfPort int // orchestrator-supervised iperf3 port; 0 = 5201
|
||||
}
|
||||
|
||||
// IPXEScript serves a per-MAC iPXE script. Called by iPXE itself after
|
||||
// dnsmasq hands it the chainload URL. Unknown MAC → halt script.
|
||||
// Known MAC with no active run → poweroff script. Known MAC with active
|
||||
// run → real boot script; the fetch triggers PXEObserved.
|
||||
func (a *Agent) IPXEScript(w http.ResponseWriter, r *http.Request) {
|
||||
mac := strings.ToLower(strings.TrimSpace(chi.URLParam(r, "mac")))
|
||||
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
|
||||
if !macRe.MatchString(mac) {
|
||||
log.Printf("ipxe: rejected malformed mac %q from %s", mac, r.RemoteAddr)
|
||||
_, _ = w.Write([]byte(pxe.NotRegisteredScript(mac)))
|
||||
return
|
||||
}
|
||||
|
||||
run, err := a.Runs.FindActiveByMAC(r.Context(), mac)
|
||||
if err != nil {
|
||||
log.Printf("ipxe: find run by mac %s: %v", mac, err)
|
||||
http.Error(w, "internal error", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
if run == nil {
|
||||
_, _ = w.Write([]byte(pxe.NoActiveRunScript(mac)))
|
||||
return
|
||||
}
|
||||
|
||||
// The token hash in the DB is the sha256 of the plaintext. The
|
||||
// plaintext itself cannot be recovered from the hash — we issued it
|
||||
// once when the run was created. For iPXE we re-issue a fresh token
|
||||
// on every PXE fetch: this is safe because the hash in the DB is
|
||||
// rewritten to match and only the most recent PXE can be claimed.
|
||||
plain, hash, err := orchestrator.IssueRunToken()
|
||||
if err != nil {
|
||||
http.Error(w, "token", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
if err := a.Runs.RotateTokenHash(r.Context(), run.ID, hash); err != nil {
|
||||
log.Printf("ipxe: rotate token run %d: %v", run.ID, err)
|
||||
http.Error(w, "token", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
script := pxe.BuildScript(pxe.IPXEParams{
|
||||
OrchestratorURL: a.OrchestratorURL,
|
||||
LiveKernelURL: a.LiveKernelURL,
|
||||
LiveInitrdURL: a.LiveInitrdURL,
|
||||
TLSCertFPR: a.TLSCertFPR,
|
||||
RunID: run.ID,
|
||||
MAC: mac,
|
||||
Token: plain,
|
||||
})
|
||||
_, _ = w.Write([]byte(script))
|
||||
|
||||
// iPXE has now fetched the script — treat this as PXEObserved. If we
|
||||
// were already in Booting the transition table allows staying.
|
||||
if _, err := a.Runner.Transition(r.Context(), run.ID, orchestrator.TriggerPXEObserved); err != nil {
|
||||
// Non-fatal: the agent may still claim via /claim.
|
||||
log.Printf("ipxe: PXEObserved for run %d: %v", run.ID, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Hello is the first call an agent makes once userspace is up. It's
|
||||
// idempotent and only writes a log line; the authoritative transition
|
||||
// comes from /claim. The agent sends Hello early so operators see a
|
||||
// signal in the tile even before the token is validated.
|
||||
func (a *Agent) Hello(w http.ResponseWriter, r *http.Request) {
|
||||
runID, ok := runIDFromURL(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if _, ok := a.authenticate(w, r, runID); !ok {
|
||||
return
|
||||
}
|
||||
log.Printf("agent hello: run=%d remote=%s", runID, r.RemoteAddr)
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "run_id": runID})
|
||||
}
|
||||
|
||||
// Claim is the binding call: the agent proves it holds the plaintext
|
||||
// token for this run, and in return the orchestrator transitions to
|
||||
// InventoryCheck and seeds the stage rows. All destructive actions the
|
||||
// agent takes later require a prior successful claim.
|
||||
func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
|
||||
runID, ok := runIDFromURL(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
run, ok := a.authenticate(w, r, runID)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
AgentIP string `json:"agent_ip"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
// agent_ip is informational; if missing fall back to RemoteAddr.
|
||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||
}
|
||||
agentIP := strings.TrimSpace(body.AgentIP)
|
||||
if agentIP == "" {
|
||||
if host, _, err := net.SplitHostPort(r.RemoteAddr); err == nil {
|
||||
agentIP = host
|
||||
} else {
|
||||
agentIP = r.RemoteAddr
|
||||
}
|
||||
}
|
||||
|
||||
// First claim seeds the stage rows; subsequent claims are a no-op
|
||||
// so agent retries after transient network failures stay safe.
|
||||
if len(mustListStages(a.Stages, r, runID)) == 0 {
|
||||
if err := a.Stages.Seed(r.Context(), runID); err != nil {
|
||||
log.Printf("claim: seed stages run %d: %v", runID, err)
|
||||
http.Error(w, "seed stages", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Drive the transition. If we're already past Booting this returns
|
||||
// an error — treat as "already claimed" and report OK, don't 500.
|
||||
if run.State == model.StateWaitingWoL || run.State == model.StateBooting {
|
||||
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerAgentClaimed); err != nil {
|
||||
log.Printf("claim: transition run %d: %v", runID, err)
|
||||
http.Error(w, "transition", http.StatusConflict)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("agent claimed: run=%d agent_ip=%s", runID, agentIP)
|
||||
|
||||
// Stage-driven agent needs a bit of per-run config: the device
|
||||
// allowlist (serial + expected size) for Storage, and the iperf3
|
||||
// server port for Network. Parse the host's expected spec here so
|
||||
// the agent doesn't need to read YAML.
|
||||
expectedDisks := []map[string]any{}
|
||||
if host, err := a.Hosts.Get(r.Context(), run.HostID); err == nil && host != nil {
|
||||
if parsed, err := spec.Parse(host.ExpectedSpecYAML); err == nil && parsed != nil {
|
||||
for _, dd := range parsed.Disks {
|
||||
expectedDisks = append(expectedDisks, map[string]any{
|
||||
"serial": dd.Serial,
|
||||
"size_gb": dd.SizeGB,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
iperfPort := a.IperfPort
|
||||
if iperfPort == 0 {
|
||||
iperfPort = 5201
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{
|
||||
"ok": true,
|
||||
"run_id": runID,
|
||||
"stages": store.DefaultStageOrder,
|
||||
"expected_disks": expectedDisks,
|
||||
"iperf_port": iperfPort,
|
||||
})
|
||||
}
|
||||
|
||||
// Heartbeat is the agent's periodic liveness ping. The response body
|
||||
// acts as a control channel: cmd=continue is the normal case; cmd=abort
|
||||
// once the run enters FailedHolding/Released; cmd=retry_stage when the
|
||||
// operator has overridden a failed stage (wipe-probe override).
|
||||
func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) {
|
||||
runID, ok := runIDFromURL(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
run, ok := a.authenticate(w, r, runID)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
a.Runner.TouchHeartbeat(runID)
|
||||
|
||||
cmd := "continue"
|
||||
resp := map[string]any{"state": run.State}
|
||||
switch {
|
||||
case run.State == model.StateCompleted:
|
||||
// Pipeline succeeded — agent should power the host down.
|
||||
cmd = "shutdown"
|
||||
case run.State == model.StateFailedHolding || run.State == model.StateReleased:
|
||||
cmd = "abort"
|
||||
case run.FailedStage == "Storage" && overrideWipeSet(run.OverrideFlagsJSON):
|
||||
// Operator pressed "Override wipe & retry". Agent should
|
||||
// re-enter Storage with the wipe-probe bypass armed.
|
||||
cmd = "retry_stage"
|
||||
resp["stage"] = "Storage"
|
||||
resp["override_flags"] = json.RawMessage(run.OverrideFlagsJSON)
|
||||
}
|
||||
resp["cmd"] = cmd
|
||||
writeJSON(w, http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// overrideWipeSet reports whether the JSON object in blob has a true
// "wipe" field. An empty blob yields false. Decode errors are ignored on
// purpose, so a blob that cannot be decoded leaves the flag at false.
func overrideWipeSet(blob string) bool {
	var flags struct {
		Wipe bool `json:"wipe"`
	}
	if blob != "" {
		// Error intentionally dropped: undecodable input means "not set".
		_ = json.Unmarshal([]byte(blob), &flags)
	}
	return flags.Wipe
}
|
||||
|
||||
// authenticate verifies the Bearer token against the run's stored hash
|
||||
// and returns the Run for downstream handlers. Responds 401/404 on
|
||||
// failure and returns ok=false so the caller can bail early.
|
||||
func (a *Agent) authenticate(w http.ResponseWriter, r *http.Request, runID int64) (*model.Run, bool) {
|
||||
run, err := a.Runs.Get(r.Context(), runID)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
http.Error(w, "run not found", http.StatusNotFound)
|
||||
return nil, false
|
||||
}
|
||||
http.Error(w, "internal error", http.StatusInternalServerError)
|
||||
return nil, false
|
||||
}
|
||||
token := bearerToken(r)
|
||||
if token == "" {
|
||||
http.Error(w, "missing bearer", http.StatusUnauthorized)
|
||||
return nil, false
|
||||
}
|
||||
presented := orchestrator.HashRunToken(token)
|
||||
if subtle.ConstantTimeCompare([]byte(presented), []byte(run.AgentTokenHash)) != 1 {
|
||||
http.Error(w, "bad token", http.StatusUnauthorized)
|
||||
return nil, false
|
||||
}
|
||||
return run, true
|
||||
}
|
||||
|
||||
// bearerToken extracts the credential from an "Authorization: Bearer
// <token>" header, trimmed of surrounding whitespace. It returns "" when
// the header is absent, uses another scheme, or carries no token.
//
// The scheme name is matched case-insensitively ("bearer", "BEARER", ...)
// because HTTP authentication scheme names are case-insensitive.
func bearerToken(r *http.Request) string {
	const scheme = "Bearer "
	h := r.Header.Get("Authorization")
	if len(h) < len(scheme) || !strings.EqualFold(h[:len(scheme)], scheme) {
		return ""
	}
	return strings.TrimSpace(h[len(scheme):])
}
|
||||
|
||||
func runIDFromURL(w http.ResponseWriter, r *http.Request) (int64, bool) {
|
||||
idStr := chi.URLParam(r, "id")
|
||||
id, err := strconv.ParseInt(idStr, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
http.Error(w, "bad run id", http.StatusBadRequest)
|
||||
return 0, false
|
||||
}
|
||||
return id, true
|
||||
}
|
||||
|
||||
// writeJSON sends body as a JSON response with the given status code.
//
// The body is marshaled before any header is written. If it cannot be
// encoded, the failure is logged and the client receives a 500 instead of
// the requested status with an empty body. On success the output is the
// JSON document followed by a single newline.
func writeJSON(w http.ResponseWriter, status int, body any) {
	buf, err := json.Marshal(body)
	if err != nil {
		log.Printf("writeJSON: encode response: %v", err)
		http.Error(w, "internal error", http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	// A write error here means the client went away; nothing to recover.
	_, _ = w.Write(append(buf, '\n'))
}
|
||||
|
||||
// mustListStages is a small wrapper that hides the error path from
|
||||
// /claim — a DB read failure just pretends there are zero stages, and
|
||||
// the subsequent Seed will surface the real error.
|
||||
func mustListStages(s *store.Stages, r *http.Request, runID int64) []model.Stage {
|
||||
rows, err := s.ListForRun(r.Context(), runID)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return rows
|
||||
}
|
||||
|
||||
// ===== Phase 3 endpoints =================================================
|
||||
|
||||
// LogBatch is what the agent POSTs to /log: zero or more lines with
|
||||
// timestamp + level + text. Lines are written in order to the per-run
|
||||
// file and fanned out on the SSE hub.
|
||||
type LogBatch struct {
|
||||
Lines []LogLine `json:"lines"`
|
||||
}
|
||||
|
||||
// LogLine is one entry of a LogBatch.
//
// TS is an RFC3339Nano timestamp; Log forwards the zero time when it is
// empty or unparsable. Level is one of info|warn|error|debug. Text is the
// message itself.
type LogLine struct {
	TS    string `json:"ts,omitempty"`
	Level string `json:"level,omitempty"`
	Text  string `json:"text"`
}
|
||||
|
||||
// Log accepts a batch of log lines from the agent. Empty batches are
|
||||
// legal (useful for agent-side flush ping).
|
||||
func (a *Agent) Log(w http.ResponseWriter, r *http.Request) {
|
||||
runID, ok := runIDFromURL(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if _, ok := a.authenticate(w, r, runID); !ok {
|
||||
return
|
||||
}
|
||||
var batch LogBatch
|
||||
if err := json.NewDecoder(r.Body).Decode(&batch); err != nil {
|
||||
http.Error(w, "bad json", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
writer, err := a.Logs.WriterFor(runID)
|
||||
if err != nil {
|
||||
http.Error(w, "open log: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
for _, l := range batch.Lines {
|
||||
ts, _ := time.Parse(time.RFC3339Nano, l.TS)
|
||||
writer.Append(logs.Line{TS: ts, Level: l.Level, Text: l.Text})
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "written": len(batch.Lines)})
|
||||
}
|
||||
|
||||
// StageResult is the body of /result. Kind is the stage name (from
|
||||
// DefaultStageOrder); Passed drives StageCompleted vs StageFailed.
|
||||
// Inventory is optional and only set when kind == "Inventory" — the
|
||||
// orchestrator persists it as an artifact and feeds it to spec.Diff.
|
||||
type StageResult struct {
|
||||
Stage string `json:"stage"`
|
||||
Passed bool `json:"passed"`
|
||||
Summary json.RawMessage `json:"summary,omitempty"`
|
||||
Inventory *spec.Inventory `json:"inventory,omitempty"`
|
||||
Message string `json:"message,omitempty"`
|
||||
}
|
||||
|
||||
// Result receives a stage's outcome. Flow:
|
||||
// 1. Mark the stage row passed/failed + record summary JSON.
|
||||
// 2. For Inventory: persist the inventory artifact.
|
||||
// 3. For Inventory (on pass): run spec diff server-side, persist rows,
|
||||
// bump the run into SpecValidate and immediately resolve SpecValidate
|
||||
// from that diff — the agent isn't involved in SpecValidate at all.
|
||||
// 4. Transition the run via StageCompleted/StageFailed.
|
||||
func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
|
||||
runID, ok := runIDFromURL(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
run, ok := a.authenticate(w, r, runID)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
var body StageResult
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
http.Error(w, "bad json", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
body.Stage = strings.TrimSpace(body.Stage)
|
||||
if _, ok := orchestrator.StateForStage(body.Stage); !ok {
|
||||
http.Error(w, "unknown stage: "+body.Stage, http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
stageState := model.StagePassed
|
||||
if !body.Passed {
|
||||
stageState = model.StageFailed
|
||||
}
|
||||
summaryJSON := ""
|
||||
if len(body.Summary) > 0 {
|
||||
summaryJSON = string(body.Summary)
|
||||
}
|
||||
if err := a.Stages.CompleteByName(r.Context(), runID, body.Stage, stageState, summaryJSON); err != nil {
|
||||
http.Error(w, "complete stage: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// Inventory-specific: persist artifact + compute spec diff.
|
||||
if body.Stage == "Inventory" && body.Inventory != nil {
|
||||
if err := a.persistInventory(r, run, body.Inventory); err != nil {
|
||||
log.Printf("persist inventory run %d: %v", runID, err)
|
||||
}
|
||||
}
|
||||
|
||||
if !body.Passed {
|
||||
if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
|
||||
log.Printf("set failed stage: %v", err)
|
||||
}
|
||||
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
|
||||
log.Printf("result: failed-transition run %d: %v", runID, err)
|
||||
http.Error(w, "transition", http.StatusConflict)
|
||||
return
|
||||
}
|
||||
hostName := a.hostNameFor(r.Context(), run.HostID)
|
||||
detail := body.Message
|
||||
if detail == "" {
|
||||
detail = "stage reported failure"
|
||||
}
|
||||
a.dispatchEvent(notify.Event{
|
||||
Kind: notify.KindStageFailed,
|
||||
Severity: notify.SeverityCritical,
|
||||
RunID: runID,
|
||||
HostName: hostName,
|
||||
Title: fmt.Sprintf("[vetting] %s FAILED: %s", hostName, body.Stage),
|
||||
Body: fmt.Sprintf("Run %d on %s failed at stage %s.\n%s", runID, hostName, body.Stage, detail),
|
||||
URL: a.runLinkURL(runID),
|
||||
})
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": "FailedHolding"})
|
||||
return
|
||||
}
|
||||
|
||||
// Passed: advance to the next stage in the pipeline.
|
||||
next, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageCompleted)
|
||||
if err != nil {
|
||||
http.Error(w, "advance: "+err.Error(), http.StatusConflict)
|
||||
return
|
||||
}
|
||||
log.Printf("result: run %d stage %s passed → %s", runID, body.Stage, next)
|
||||
|
||||
// If the just-advanced-into state is SpecValidate or Reporting, the
|
||||
// orchestrator owns those stages entirely. The resolve function may
|
||||
// transition further (→ next stage on pass, → FailedHolding on fail,
|
||||
// → Completed for Reporting), so we re-read the run after each.
|
||||
if next == model.StateSpecValidate {
|
||||
a.resolveSpecValidate(r, runID)
|
||||
if after, err := a.Runs.Get(r.Context(), runID); err == nil {
|
||||
next = after.State
|
||||
}
|
||||
}
|
||||
if next == model.StateReporting {
|
||||
a.resolveReporting(r, runID)
|
||||
if after, err := a.Runs.Get(r.Context(), runID); err == nil {
|
||||
next = after.State
|
||||
}
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": string(next)})
|
||||
}
|
||||
|
||||
func (a *Agent) persistInventory(r *http.Request, run *model.Run, inv *spec.Inventory) error {
|
||||
dir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", run.ID))
|
||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
path := filepath.Join(dir, "inventory.json")
|
||||
buf, err := json.MarshalIndent(inv, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.WriteFile(path, buf, 0o644); err != nil {
|
||||
return err
|
||||
}
|
||||
sum := sha256.Sum256(buf)
|
||||
_, err = a.Artifacts.Create(r.Context(), store.Artifact{
|
||||
RunID: run.ID,
|
||||
Kind: "inventory",
|
||||
Path: path,
|
||||
SHA256: hex.EncodeToString(sum[:]),
|
||||
SizeBytes: int64(len(buf)),
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
// resolveSpecValidate runs the expected-vs-actual diff against the
|
||||
// just-stored inventory artifact, persists spec_diffs rows, and drives
|
||||
// the state machine — all on the server. The agent does nothing for
|
||||
// this stage.
|
||||
func (a *Agent) resolveSpecValidate(r *http.Request, runID int64) {
|
||||
run, err := a.Runs.Get(r.Context(), runID)
|
||||
if err != nil {
|
||||
log.Printf("specvalidate: get run: %v", err)
|
||||
return
|
||||
}
|
||||
host, err := a.Hosts.Get(r.Context(), run.HostID)
|
||||
if err != nil {
|
||||
log.Printf("specvalidate: get host: %v", err)
|
||||
return
|
||||
}
|
||||
expected, err := spec.Parse(host.ExpectedSpecYAML)
|
||||
if err != nil {
|
||||
log.Printf("specvalidate: parse expected yaml: %v", err)
|
||||
a.failStage(r, runID, "SpecValidate", "malformed expected spec: "+err.Error())
|
||||
return
|
||||
}
|
||||
inv, err := a.readInventoryArtifact(r, runID)
|
||||
if err != nil {
|
||||
log.Printf("specvalidate: read inventory: %v", err)
|
||||
a.failStage(r, runID, "SpecValidate", "missing inventory artifact")
|
||||
return
|
||||
}
|
||||
diffs := spec.Diff(expected, inv)
|
||||
if err := a.SpecDiffs.ReplaceForRun(r.Context(), runID, diffs); err != nil {
|
||||
log.Printf("specvalidate: write diffs: %v", err)
|
||||
}
|
||||
if err := a.Stages.StartByName(r.Context(), runID, "SpecValidate"); err != nil {
|
||||
log.Printf("specvalidate: start stage: %v", err)
|
||||
}
|
||||
|
||||
critical := 0
|
||||
for _, d := range diffs {
|
||||
if d.Severity == "critical" && !d.Ignored {
|
||||
critical++
|
||||
}
|
||||
}
|
||||
summaryBuf, _ := json.Marshal(map[string]any{
|
||||
"diffs": len(diffs),
|
||||
"critical": critical,
|
||||
})
|
||||
if critical > 0 {
|
||||
_ = a.Stages.CompleteByName(r.Context(), runID, "SpecValidate", model.StageFailed, string(summaryBuf))
|
||||
_ = a.Runs.SetFailedStage(r.Context(), runID, "SpecValidate")
|
||||
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
|
||||
log.Printf("specvalidate: failed-transition: %v", err)
|
||||
}
|
||||
a.appendLog(runID, "error", fmt.Sprintf("SpecValidate: %d critical diff(s) — holding host", critical))
|
||||
hostName := a.hostNameFor(r.Context(), run.HostID)
|
||||
a.dispatchEvent(notify.Event{
|
||||
Kind: notify.KindSpecMismatch,
|
||||
Severity: notify.SeverityCritical,
|
||||
RunID: runID,
|
||||
HostName: hostName,
|
||||
Title: fmt.Sprintf("[vetting] %s spec mismatch (%d critical)", hostName, critical),
|
||||
Body: fmt.Sprintf("SpecValidate found %d critical diff(s) on %s. Host is held for inspection.", critical, hostName),
|
||||
URL: a.runLinkURL(runID),
|
||||
})
|
||||
} else {
|
||||
_ = a.Stages.CompleteByName(r.Context(), runID, "SpecValidate", model.StagePassed, string(summaryBuf))
|
||||
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageCompleted); err != nil {
|
||||
log.Printf("specvalidate: advance: %v", err)
|
||||
}
|
||||
a.appendLog(runID, "info", "SpecValidate: all fields match expected spec")
|
||||
}
|
||||
}
|
||||
|
||||
func (a *Agent) readInventoryArtifact(r *http.Request, runID int64) (*spec.Inventory, error) {
|
||||
arts, err := a.Artifacts.ListForRun(r.Context(), runID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for i := len(arts) - 1; i >= 0; i-- {
|
||||
if arts[i].Kind == "inventory" {
|
||||
buf, err := os.ReadFile(arts[i].Path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var inv spec.Inventory
|
||||
if err := json.Unmarshal(buf, &inv); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &inv, nil
|
||||
}
|
||||
}
|
||||
return nil, errors.New("no inventory artifact")
|
||||
}
|
||||
|
||||
func (a *Agent) failStage(r *http.Request, runID int64, stage, message string) {
|
||||
_ = a.Stages.CompleteByName(r.Context(), runID, stage, model.StageFailed, fmt.Sprintf(`{"error":%q}`, message))
|
||||
_ = a.Runs.SetFailedStage(r.Context(), runID, stage)
|
||||
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
|
||||
log.Printf("failStage: transition run %d: %v", runID, err)
|
||||
}
|
||||
a.appendLog(runID, "error", stage+": "+message)
|
||||
}
|
||||
|
||||
func (a *Agent) appendLog(runID int64, level, text string) {
|
||||
if a.Logs == nil {
|
||||
return
|
||||
}
|
||||
w, err := a.Logs.WriterFor(runID)
|
||||
if err != nil {
|
||||
log.Printf("appendLog: %v", err)
|
||||
return
|
||||
}
|
||||
w.Append(logs.Line{Level: level, Text: text})
|
||||
}
|
||||
|
||||
// HoldRequest is the request body of the Hold endpoint. Hold issues a
// per-run keypair: the agent receives the authorized_keys line while the
// private key is written to disk on the orchestrator side. AgentIP is the
// address the agent reports for itself; Hold records it so the ssh
// invocation can be shown to the operator, and falls back to the
// connection's remote address when it is empty.
type HoldRequest struct {
	AgentIP string `json:"agent_ip"`
}
|
||||
|
||||
// HoldResponse is the reply of the Hold endpoint: the authorized_keys
// line for the freshly issued key, plus the run it belongs to.
type HoldResponse struct {
	AuthorizedKey string `json:"authorized_key"`
	RunID         int64  `json:"run_id"`
}
|
||||
|
||||
func (a *Agent) Hold(w http.ResponseWriter, r *http.Request) {
|
||||
runID, ok := runIDFromURL(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if _, ok := a.authenticate(w, r, runID); !ok {
|
||||
return
|
||||
}
|
||||
var body HoldRequest
|
||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||
agentIP := strings.TrimSpace(body.AgentIP)
|
||||
if agentIP == "" {
|
||||
if host, _, err := net.SplitHostPort(r.RemoteAddr); err == nil {
|
||||
agentIP = host
|
||||
}
|
||||
}
|
||||
if agentIP != "" {
|
||||
if err := a.Runs.SetHoldIP(r.Context(), runID, agentIP); err != nil {
|
||||
log.Printf("hold: set hold_ip: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
kp, err := hold.Issue(runID)
|
||||
if err != nil {
|
||||
http.Error(w, "generate key: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
keyPath := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", runID), "hold.key")
|
||||
abs, err := kp.WritePrivateTo(keyPath)
|
||||
if err != nil {
|
||||
http.Error(w, "write key: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
sum := sha256.Sum256(kp.PrivatePEM)
|
||||
if _, err := a.Artifacts.Create(r.Context(), store.Artifact{
|
||||
RunID: runID,
|
||||
Kind: "hold_key",
|
||||
Path: abs,
|
||||
SHA256: hex.EncodeToString(sum[:]),
|
||||
SizeBytes: int64(len(kp.PrivatePEM)),
|
||||
}); err != nil {
|
||||
log.Printf("hold: record artifact: %v", err)
|
||||
}
|
||||
a.appendLog(runID, "info", fmt.Sprintf("Hold key issued. SSH in with: ssh -i %s root@%s", abs, agentIP))
|
||||
hostID := mustHostID(a, r, runID)
|
||||
if hostID != 0 {
|
||||
hostName := a.hostNameFor(r.Context(), hostID)
|
||||
a.dispatchEvent(notify.Event{
|
||||
Kind: notify.KindHoldingOpened,
|
||||
Severity: notify.SeverityCritical,
|
||||
RunID: runID,
|
||||
HostName: hostName,
|
||||
Title: fmt.Sprintf("[vetting] %s holding — SSH ready", hostName),
|
||||
Body: fmt.Sprintf("Host %s is holding at %s.\nssh -i %s root@%s", hostName, agentIP, abs, agentIP),
|
||||
URL: a.runLinkURL(runID),
|
||||
})
|
||||
}
|
||||
// Refresh the tile so the operator sees the ssh command.
|
||||
host, _ := a.Hosts.Get(r.Context(), mustHostID(a, r, runID))
|
||||
if host != nil {
|
||||
latest, _ := a.Runs.Get(r.Context(), runID)
|
||||
if orchestrator.TileRenderer != nil {
|
||||
payload := orchestrator.TileRenderer(r.Context(), *host, latest)
|
||||
a.EventHub.Publish(events.Event{Name: fmt.Sprintf("tile-%d", host.ID), Payload: payload})
|
||||
}
|
||||
}
|
||||
writeJSON(w, http.StatusOK, HoldResponse{AuthorizedKey: kp.AuthorizedKey, RunID: runID})
|
||||
}
|
||||
|
||||
// dispatchEvent hands an already-populated Event to the notify Registry
|
||||
// if one is wired. Handler code uses hostNameFor to resolve the host
|
||||
// name for the event payload; this keeps call sites terse.
|
||||
func (a *Agent) dispatchEvent(ev notify.Event) {
|
||||
if a.Notify == nil {
|
||||
return
|
||||
}
|
||||
a.Notify.Dispatch(ev)
|
||||
}
|
||||
|
||||
// hostNameFor returns a human-readable host name for a run, or "host-N"
|
||||
// if the lookup fails — notifications should never fail silently over a
|
||||
// missing name.
|
||||
func (a *Agent) hostNameFor(ctx context.Context, hostID int64) string {
|
||||
if host, err := a.Hosts.Get(ctx, hostID); err == nil && host != nil {
|
||||
return host.Name
|
||||
}
|
||||
return fmt.Sprintf("host-%d", hostID)
|
||||
}
|
||||
|
||||
func (a *Agent) runLinkURL(runID int64) string {
|
||||
if a.PublicURL == "" {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimRight(a.PublicURL, "/") + "/reports/" + fmt.Sprintf("%d", runID)
|
||||
}
|
||||
|
||||
func mustHostID(a *Agent, r *http.Request, runID int64) int64 {
|
||||
run, err := a.Runs.Get(r.Context(), runID)
|
||||
if err != nil || run == nil {
|
||||
return 0
|
||||
}
|
||||
return run.HostID
|
||||
}
|
||||
|
||||
// ===== Phase 4 endpoints =================================================
|
||||
|
||||
// SensorBatch is what the agent POSTs to /sensor: a stream of numeric
|
||||
// samples (temps, fan rpm, PSU rails, iperf throughput). Each sample is
|
||||
// (kind, key, value, unit). Timestamps default to server-now when empty
|
||||
// so the thermal sidecar doesn't have to carry a clock.
|
||||
type SensorBatch struct {
|
||||
Samples []SensorSample `json:"samples"`
|
||||
}
|
||||
|
||||
// SensorSample is one numeric reading inside a SensorBatch.
//
// TS is an optional RFC3339Nano timestamp. Kind is the sample category
// (temp|fan|psu_volt|iperf|fio|smart_attr) and Key identifies the source
// within that category. Value is the reading; Unit is optional.
type SensorSample struct {
	TS    string  `json:"ts,omitempty"`
	Kind  string  `json:"kind"`
	Key   string  `json:"key"`
	Value float64 `json:"value"`
	Unit  string  `json:"unit,omitempty"`
}
|
||||
|
||||
// Sensor persists a batch of numeric samples. The thermal sidecar hits
|
||||
// this on a tick; stage executors (iperf, fio) also drop here.
|
||||
func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
|
||||
runID, ok := runIDFromURL(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if _, ok := a.authenticate(w, r, runID); !ok {
|
||||
return
|
||||
}
|
||||
if a.Measurements == nil {
|
||||
http.Error(w, "measurements store not wired", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
var body SensorBatch
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
http.Error(w, "bad json", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
rows := make([]model.Measurement, 0, len(body.Samples))
|
||||
for _, s := range body.Samples {
|
||||
ts, _ := time.Parse(time.RFC3339Nano, s.TS)
|
||||
rows = append(rows, model.Measurement{
|
||||
RunID: runID,
|
||||
TS: ts,
|
||||
Kind: s.Kind,
|
||||
Key: s.Key,
|
||||
Value: s.Value,
|
||||
Unit: s.Unit,
|
||||
})
|
||||
}
|
||||
if err := a.Measurements.CreateBatch(r.Context(), rows); err != nil {
|
||||
http.Error(w, "write samples: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "written": len(rows)})
|
||||
}
|
||||
|
||||
// resolveReporting runs when the pipeline advances into StateReporting.
|
||||
// It's an orchestrator-owned stage like SpecValidate: no agent action.
|
||||
// Writes a JSON report bundling run + stages + diffs + measurements,
|
||||
// then advances the run to Completed. Heartbeat will then return abort
|
||||
// and the agent will power the host off in Phase 5.
|
||||
func (a *Agent) resolveReporting(r *http.Request, runID int64) {
|
||||
ctx := r.Context()
|
||||
if err := a.Stages.StartByName(ctx, runID, "Reporting"); err != nil {
|
||||
log.Printf("reporting: start stage: %v", err)
|
||||
}
|
||||
run, err := a.Runs.Get(ctx, runID)
|
||||
if err != nil {
|
||||
log.Printf("reporting: get run: %v", err)
|
||||
return
|
||||
}
|
||||
host, err := a.Hosts.Get(ctx, run.HostID)
|
||||
if err != nil {
|
||||
log.Printf("reporting: get host: %v", err)
|
||||
return
|
||||
}
|
||||
stages, err := a.Stages.ListForRun(ctx, runID)
|
||||
if err != nil {
|
||||
log.Printf("reporting: list stages: %v", err)
|
||||
}
|
||||
diffs, err := a.SpecDiffs.ListForRun(ctx, runID)
|
||||
if err != nil {
|
||||
log.Printf("reporting: list diffs: %v", err)
|
||||
}
|
||||
var measurements []model.Measurement
|
||||
if a.Measurements != nil {
|
||||
measurements, err = a.Measurements.ListForRun(ctx, runID)
|
||||
if err != nil {
|
||||
log.Printf("reporting: list measurements: %v", err)
|
||||
}
|
||||
}
|
||||
bundle := map[string]any{
|
||||
"run": run,
|
||||
"host": host,
|
||||
"stages": stages,
|
||||
"spec_diffs": diffs,
|
||||
"measurements": measurements,
|
||||
"generated_at": time.Now().UTC().Format(time.RFC3339),
|
||||
}
|
||||
buf, err := json.MarshalIndent(bundle, "", " ")
|
||||
if err != nil {
|
||||
log.Printf("reporting: marshal: %v", err)
|
||||
a.failStage(r, runID, "Reporting", "marshal report: "+err.Error())
|
||||
return
|
||||
}
|
||||
dir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", runID))
|
||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||
a.failStage(r, runID, "Reporting", "mkdir: "+err.Error())
|
||||
return
|
||||
}
|
||||
path := filepath.Join(dir, "report.json")
|
||||
if err := os.WriteFile(path, buf, 0o644); err != nil {
|
||||
a.failStage(r, runID, "Reporting", "write: "+err.Error())
|
||||
return
|
||||
}
|
||||
sum := sha256.Sum256(buf)
|
||||
if _, err := a.Artifacts.Create(ctx, store.Artifact{
|
||||
RunID: runID,
|
||||
Kind: "report",
|
||||
Path: path,
|
||||
SHA256: hex.EncodeToString(sum[:]),
|
||||
SizeBytes: int64(len(buf)),
|
||||
}); err != nil {
|
||||
log.Printf("reporting: record artifact: %v", err)
|
||||
}
|
||||
// Also render the operator-facing HTML summary alongside the JSON.
|
||||
// Failures here are non-fatal — the JSON is the source of truth.
|
||||
if host != nil {
|
||||
htmlData := report.Data{
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
Run: *run,
|
||||
Host: *host,
|
||||
Stages: stages,
|
||||
SpecDiffs: diffs,
|
||||
Aggregates: report.AggregateMeasurements(measurements),
|
||||
}
|
||||
if htmlBuf, err := report.RenderHTML(htmlData); err != nil {
|
||||
log.Printf("reporting: render html: %v", err)
|
||||
} else {
|
||||
htmlPath := filepath.Join(dir, "report.html")
|
||||
if err := os.WriteFile(htmlPath, htmlBuf, 0o644); err != nil {
|
||||
log.Printf("reporting: write html: %v", err)
|
||||
} else {
|
||||
htmlSum := sha256.Sum256(htmlBuf)
|
||||
if _, err := a.Artifacts.Create(ctx, store.Artifact{
|
||||
RunID: runID,
|
||||
Kind: "report_html",
|
||||
Path: htmlPath,
|
||||
SHA256: hex.EncodeToString(htmlSum[:]),
|
||||
SizeBytes: int64(len(htmlBuf)),
|
||||
}); err != nil {
|
||||
log.Printf("reporting: record html artifact: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
summaryBuf, _ := json.Marshal(map[string]any{
|
||||
"report_path": path,
|
||||
"stages": len(stages),
|
||||
"diffs": len(diffs),
|
||||
})
|
||||
if err := a.Stages.CompleteByName(ctx, runID, "Reporting", model.StagePassed, string(summaryBuf)); err != nil {
|
||||
log.Printf("reporting: complete stage: %v", err)
|
||||
}
|
||||
if err := a.Runs.MarkCompleted(ctx, runID, path); err != nil {
|
||||
log.Printf("reporting: mark completed: %v", err)
|
||||
}
|
||||
a.appendLog(runID, "info", "Reporting: wrote "+path+"; run completed.")
|
||||
// Publish a final tile update so the dashboard flips to pass mood.
|
||||
if host != nil && orchestrator.TileRenderer != nil {
|
||||
latest, _ := a.Runs.Get(ctx, runID)
|
||||
payload := orchestrator.TileRenderer(ctx, *host, latest)
|
||||
a.EventHub.Publish(events.Event{Name: fmt.Sprintf("tile-%d", host.ID), Payload: payload})
|
||||
}
|
||||
hostName := "host"
|
||||
if host != nil {
|
||||
hostName = host.Name
|
||||
}
|
||||
a.dispatchEvent(notify.Event{
|
||||
Kind: notify.KindRunCompleted,
|
||||
Severity: notify.SeverityInfo,
|
||||
RunID: runID,
|
||||
HostName: hostName,
|
||||
Title: fmt.Sprintf("[vetting] %s passed vetting", hostName),
|
||||
Body: fmt.Sprintf("Run %d on %s completed all stages. Report: %s", runID, hostName, path),
|
||||
URL: a.runLinkURL(runID),
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,128 @@
|
||||
package api_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"testing"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"vetting/internal/api"
|
||||
"vetting/internal/db"
|
||||
"vetting/internal/model"
|
||||
"vetting/internal/orchestrator"
|
||||
"vetting/internal/store"
|
||||
)
|
||||
|
||||
func setupAgent(t *testing.T) (*api.Agent, int64, string) {
|
||||
t.Helper()
|
||||
path := filepath.Join(t.TempDir(), "vetting.db")
|
||||
conn, err := db.Open(path)
|
||||
if err != nil {
|
||||
t.Fatalf("open db: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { _ = conn.Close() })
|
||||
|
||||
hosts := &store.Hosts{DB: conn}
|
||||
runs := &store.Runs{DB: conn}
|
||||
meas := &store.Measurements{DB: conn}
|
||||
|
||||
hostID, err := hosts.Create(context.Background(), model.Host{
|
||||
Name: "t-host",
|
||||
MAC: "aa:bb:cc:dd:ee:01",
|
||||
WoLBroadcastIP: "10.0.0.255",
|
||||
WoLPort: 9,
|
||||
ExpectedSpecYAML: "memory:\n total_gib: 16\n",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
}
|
||||
plain, hash, err := orchestrator.IssueRunToken()
|
||||
if err != nil {
|
||||
t.Fatalf("issue token: %v", err)
|
||||
}
|
||||
runID, err := runs.Create(context.Background(), hostID, hash)
|
||||
if err != nil {
|
||||
t.Fatalf("create run: %v", err)
|
||||
}
|
||||
return &api.Agent{
|
||||
Hosts: hosts,
|
||||
Runs: runs,
|
||||
Measurements: meas,
|
||||
}, runID, plain
|
||||
}
|
||||
|
||||
func routedRequest(runID int64, method, path string, body []byte) *http.Request {
|
||||
req := httptest.NewRequest(method, path, bytes.NewReader(body))
|
||||
// chi.URLParam is read from chi's context routing; fake that here.
|
||||
rctx := chi.NewRouteContext()
|
||||
rctx.URLParams.Add("id", strconv.FormatInt(runID, 10))
|
||||
return req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx))
|
||||
}
|
||||
|
||||
func TestSensorPersistsBatch(t *testing.T) {
|
||||
a, runID, token := setupAgent(t)
|
||||
batch := api.SensorBatch{Samples: []api.SensorSample{
|
||||
{Kind: "thermal", Key: "cpu", Value: 47.5, Unit: "C"},
|
||||
{Kind: "iperf", Key: "throughput_mbps", Value: 938.2, Unit: "Mbps"},
|
||||
}}
|
||||
buf, _ := json.Marshal(batch)
|
||||
req := routedRequest(runID, http.MethodPost, "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/sensor", buf)
|
||||
req.Header.Set("Authorization", "Bearer "+token)
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
rr := httptest.NewRecorder()
|
||||
a.Sensor(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %q", rr.Code, rr.Body.String())
|
||||
}
|
||||
rows, err := a.Measurements.ListForRun(context.Background(), runID)
|
||||
if err != nil {
|
||||
t.Fatalf("ListForRun: %v", err)
|
||||
}
|
||||
if len(rows) != 2 {
|
||||
t.Fatalf("expected 2 measurements, got %d", len(rows))
|
||||
}
|
||||
}
|
||||
|
||||
func TestSensorRejectsBadToken(t *testing.T) {
|
||||
a, runID, _ := setupAgent(t)
|
||||
body, _ := json.Marshal(api.SensorBatch{})
|
||||
req := routedRequest(runID, http.MethodPost, "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/sensor", body)
|
||||
req.Header.Set("Authorization", "Bearer wrong-token")
|
||||
rr := httptest.NewRecorder()
|
||||
a.Sensor(rr, req)
|
||||
if rr.Code != http.StatusUnauthorized {
|
||||
t.Fatalf("status = %d, want 401", rr.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// TestHeartbeatShutdownWhenCompleted: once the orchestrator has flipped
|
||||
// the run into Completed, the next heartbeat response must carry
|
||||
// cmd=shutdown so the agent powers the host down.
|
||||
func TestHeartbeatShutdownWhenCompleted(t *testing.T) {
|
||||
a, runID, token := setupAgent(t)
|
||||
// Wire a runner so Heartbeat's TouchHeartbeat call doesn't nil-panic.
|
||||
a.Runner = &orchestrator.Runner{Runs: a.Runs, Hosts: a.Hosts, Stages: &store.Stages{DB: a.Runs.DB}}
|
||||
if err := a.Runs.SetState(context.Background(), runID, model.StateCompleted); err != nil {
|
||||
t.Fatalf("set state: %v", err)
|
||||
}
|
||||
req := routedRequest(runID, http.MethodPost, "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/heartbeat", nil)
|
||||
req.Header.Set("Authorization", "Bearer "+token)
|
||||
rr := httptest.NewRecorder()
|
||||
a.Heartbeat(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String())
|
||||
}
|
||||
var resp map[string]any
|
||||
if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
if resp["cmd"] != "shutdown" {
|
||||
t.Fatalf("cmd = %v, want shutdown", resp["cmd"])
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,318 @@
|
||||
package api_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"vetting/internal/api"
|
||||
"vetting/internal/db"
|
||||
"vetting/internal/events"
|
||||
"vetting/internal/logs"
|
||||
"vetting/internal/model"
|
||||
"vetting/internal/notify"
|
||||
"vetting/internal/orchestrator"
|
||||
"vetting/internal/spec"
|
||||
"vetting/internal/store"
|
||||
)
|
||||
|
||||
// captureNotifier is a testing-only Notifier that records every Event
// sent to it, under a mutex so concurrent Dispatch goroutines are safe.
type captureNotifier struct {
	mu   sync.Mutex     // guards evs
	name string         // returned by Name; newCaptureRegistry routes to it by this name
	evs  []notify.Event // every event passed to Send, in arrival order
}
|
||||
|
||||
// Name reports the name this notifier was constructed with.
func (c *captureNotifier) Name() string {
	return c.name
}
|
||||
|
||||
func (c *captureNotifier) Send(_ context.Context, ev notify.Event) error {
|
||||
c.mu.Lock()
|
||||
c.evs = append(c.evs, ev)
|
||||
c.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *captureNotifier) awaitKind(t *testing.T, k notify.Kind) notify.Event {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for {
|
||||
c.mu.Lock()
|
||||
for _, ev := range c.evs {
|
||||
if ev.Kind == k {
|
||||
got := ev
|
||||
c.mu.Unlock()
|
||||
return got
|
||||
}
|
||||
}
|
||||
c.mu.Unlock()
|
||||
if time.Now().After(deadline) {
|
||||
t.Fatalf("no %q event received within timeout", k)
|
||||
}
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
func newCaptureRegistry(c *captureNotifier) *notify.Registry {
|
||||
reg := notify.NewRegistry(time.Second)
|
||||
reg.Register(c)
|
||||
reg.AddRoute(notify.Route{Notifier: c.name}) // wildcard
|
||||
return reg
|
||||
}
|
||||
|
||||
// Builds a fully-wired Agent against a fresh sqlite DB and returns
|
||||
// (agent, runID, plainTokenForBearer). Caller is responsible for
|
||||
// transitioning the run out of Queued.
|
||||
func fullAgent(t *testing.T) (*api.Agent, int64, string) {
|
||||
t.Helper()
|
||||
tmp := t.TempDir()
|
||||
conn, err := db.Open(filepath.Join(tmp, "vetting.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open db: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { _ = conn.Close() })
|
||||
|
||||
hostStore := &store.Hosts{DB: conn}
|
||||
runStore := &store.Runs{DB: conn}
|
||||
stageStore := &store.Stages{DB: conn}
|
||||
artifactStore := &store.Artifacts{DB: conn}
|
||||
specDiffStore := &store.SpecDiffs{DB: conn}
|
||||
measurementStore := &store.Measurements{DB: conn}
|
||||
|
||||
hub := events.NewHub()
|
||||
logHub, err := logs.NewHub(filepath.Join(tmp, "logs"), hub)
|
||||
if err != nil {
|
||||
t.Fatalf("logs hub: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { logHub.Close() })
|
||||
|
||||
runner := &orchestrator.Runner{
|
||||
Runs: runStore,
|
||||
Hosts: hostStore,
|
||||
Stages: stageStore,
|
||||
EventHub: hub,
|
||||
}
|
||||
|
||||
hostID, err := hostStore.Create(context.Background(), model.Host{
|
||||
Name: "smoke-host",
|
||||
MAC: "aa:bb:cc:dd:ee:10",
|
||||
WoLBroadcastIP: "10.0.0.255",
|
||||
WoLPort: 9,
|
||||
ExpectedSpecYAML: "", // empty spec → no diffs
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
}
|
||||
plain, hash, err := orchestrator.IssueRunToken()
|
||||
if err != nil {
|
||||
t.Fatalf("issue token: %v", err)
|
||||
}
|
||||
runID, err := runStore.Create(context.Background(), hostID, hash)
|
||||
if err != nil {
|
||||
t.Fatalf("create run: %v", err)
|
||||
}
|
||||
if err := stageStore.Seed(context.Background(), runID); err != nil {
|
||||
t.Fatalf("seed stages: %v", err)
|
||||
}
|
||||
return &api.Agent{
|
||||
Hosts: hostStore,
|
||||
Runs: runStore,
|
||||
Stages: stageStore,
|
||||
Artifacts: artifactStore,
|
||||
SpecDiffs: specDiffStore,
|
||||
Measurements: measurementStore,
|
||||
Runner: runner,
|
||||
EventHub: hub,
|
||||
Logs: logHub,
|
||||
ArtifactsDir: filepath.Join(tmp, "artifacts"),
|
||||
PublicURL: "https://vetting.example",
|
||||
}, runID, plain
|
||||
}
|
||||
|
||||
// walkStage simulates the agent reporting a single stage's outcome.
|
||||
// Returns the next_state the orchestrator decided to advance to.
|
||||
func walkStage(t *testing.T, a *api.Agent, runID int64, token, stage string, passed bool, extras map[string]any) string {
|
||||
t.Helper()
|
||||
body := map[string]any{"stage": stage, "passed": passed}
|
||||
if extras != nil {
|
||||
for k, v := range extras {
|
||||
body[k] = v
|
||||
}
|
||||
}
|
||||
buf, _ := json.Marshal(body)
|
||||
req := httptest.NewRequest(http.MethodPost,
|
||||
"/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/result",
|
||||
bytes.NewReader(buf))
|
||||
rctx := chi.NewRouteContext()
|
||||
rctx.URLParams.Add("id", strconv.FormatInt(runID, 10))
|
||||
req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx))
|
||||
req.Header.Set("Authorization", "Bearer "+token)
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
rr := httptest.NewRecorder()
|
||||
a.Result(rr, req)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("stage %s: status %d body=%q", stage, rr.Code, rr.Body.String())
|
||||
}
|
||||
var resp struct {
|
||||
OK bool `json:"ok"`
|
||||
NextState string `json:"next_state"`
|
||||
}
|
||||
if err := json.NewDecoder(rr.Body).Decode(&resp); err != nil {
|
||||
t.Fatalf("stage %s: decode resp: %v", stage, err)
|
||||
}
|
||||
return resp.NextState
|
||||
}
|
||||
|
||||
// TestFullPipelineToCompleted walks an agent through all stages of a
|
||||
// successful run and asserts the run ends in Completed. Inventory is
|
||||
// minimal; the empty expected-spec means SpecValidate produces zero
|
||||
// critical diffs and the orchestrator auto-advances past it.
|
||||
func TestFullPipelineToCompleted(t *testing.T) {
|
||||
a, runID, token := fullAgent(t)
|
||||
capture := &captureNotifier{name: "capture"}
|
||||
a.Notify = newCaptureRegistry(capture)
|
||||
// Claim would normally transition Booting → InventoryCheck; set it
|
||||
// directly here since we're not exercising the claim path.
|
||||
if err := a.Runs.SetState(context.Background(), runID, model.StateInventoryCheck); err != nil {
|
||||
t.Fatalf("set state: %v", err)
|
||||
}
|
||||
|
||||
// Stage 1: Inventory — provide a concrete inventory so SpecValidate
|
||||
// has something to compare against.
|
||||
inv := spec.Inventory{
|
||||
CPU: spec.CPUSpec{Model: "Xeon Gold", LogicalCores: 8},
|
||||
Memory: spec.MemorySpec{TotalGiB: 16},
|
||||
}
|
||||
next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv})
|
||||
// After Inventory → SpecValidate resolves inline → SMART
|
||||
if next != "SMART" {
|
||||
t.Fatalf("after Inventory, next_state = %q, want SMART", next)
|
||||
}
|
||||
|
||||
// The remaining stages advance one-for-one in order.
|
||||
walkPlan := []struct {
|
||||
stage string
|
||||
expected string
|
||||
}{
|
||||
{"SMART", "CPUStress"},
|
||||
{"CPUStress", "Storage"},
|
||||
{"Storage", "Network"},
|
||||
{"Network", "GPU"},
|
||||
{"GPU", "PSU"},
|
||||
{"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed
|
||||
}
|
||||
for _, step := range walkPlan {
|
||||
got := walkStage(t, a, runID, token, step.stage, true, nil)
|
||||
if got != step.expected {
|
||||
t.Fatalf("after %s, next_state = %q, want %q", step.stage, got, step.expected)
|
||||
}
|
||||
}
|
||||
|
||||
run, err := a.Runs.Get(context.Background(), runID)
|
||||
if err != nil {
|
||||
t.Fatalf("Get run: %v", err)
|
||||
}
|
||||
if run.State != model.StateCompleted {
|
||||
t.Fatalf("run.State = %q, want Completed", run.State)
|
||||
}
|
||||
if run.ReportPath == "" {
|
||||
t.Fatalf("run.ReportPath not set")
|
||||
}
|
||||
|
||||
// Phase 5 assertions: an HTML report artifact exists on disk, and
|
||||
// the capture notifier saw a RunCompleted event.
|
||||
arts, err := a.Artifacts.ListForRun(context.Background(), runID)
|
||||
if err != nil {
|
||||
t.Fatalf("ListForRun: %v", err)
|
||||
}
|
||||
var htmlPath string
|
||||
for _, art := range arts {
|
||||
if art.Kind == "report_html" {
|
||||
htmlPath = art.Path
|
||||
}
|
||||
}
|
||||
if htmlPath == "" {
|
||||
t.Fatalf("no report_html artifact recorded (kinds seen: %v)", artifactKinds(arts))
|
||||
}
|
||||
data, err := os.ReadFile(htmlPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read report.html: %v", err)
|
||||
}
|
||||
if !strings.Contains(string(data), "<html") {
|
||||
t.Fatalf("report.html missing <html tag: %s", string(data[:min(200, len(data))]))
|
||||
}
|
||||
ev := capture.awaitKind(t, notify.KindRunCompleted)
|
||||
if ev.HostName != "smoke-host" {
|
||||
t.Errorf("RunCompleted host = %q, want smoke-host", ev.HostName)
|
||||
}
|
||||
if ev.URL == "" || !strings.Contains(ev.URL, "/reports/") {
|
||||
t.Errorf("RunCompleted URL = %q, want non-empty with /reports/", ev.URL)
|
||||
}
|
||||
}
|
||||
|
||||
func artifactKinds(arts []store.Artifact) []string {
|
||||
out := make([]string, 0, len(arts))
|
||||
for _, a := range arts {
|
||||
out = append(out, a.Kind)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// min returns the smaller of a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
||||
|
||||
// TestFaultInjectionSMART verifies a failing SMART stage halts the
|
||||
// pipeline at FailedHolding with failed_stage recorded.
|
||||
func TestFaultInjectionSMART(t *testing.T) {
|
||||
a, runID, token := fullAgent(t)
|
||||
capture := &captureNotifier{name: "capture"}
|
||||
a.Notify = newCaptureRegistry(capture)
|
||||
if err := a.Runs.SetState(context.Background(), runID, model.StateInventoryCheck); err != nil {
|
||||
t.Fatalf("set state: %v", err)
|
||||
}
|
||||
|
||||
inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
|
||||
if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "SMART" {
|
||||
t.Fatalf("after Inventory, next = %q want SMART", next)
|
||||
}
|
||||
|
||||
// Fake SMART failure → expect FailedHolding.
|
||||
if next := walkStage(t, a, runID, token, "SMART", false, nil); next != "FailedHolding" {
|
||||
t.Fatalf("after SMART fail, next = %q want FailedHolding", next)
|
||||
}
|
||||
|
||||
run, err := a.Runs.Get(context.Background(), runID)
|
||||
if err != nil {
|
||||
t.Fatalf("Get run: %v", err)
|
||||
}
|
||||
if run.State != model.StateFailedHolding {
|
||||
t.Fatalf("run.State = %q, want FailedHolding", run.State)
|
||||
}
|
||||
if run.FailedStage != "SMART" {
|
||||
t.Fatalf("run.FailedStage = %q, want SMART", run.FailedStage)
|
||||
}
|
||||
|
||||
// Phase 5 assertion: the fault fires a StageFailed notification.
|
||||
ev := capture.awaitKind(t, notify.KindStageFailed)
|
||||
if !strings.Contains(ev.Title, "SMART") {
|
||||
t.Errorf("StageFailed title = %q, want to mention SMART", ev.Title)
|
||||
}
|
||||
if ev.Severity != notify.SeverityCritical {
|
||||
t.Errorf("StageFailed severity = %q, want critical", ev.Severity)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,69 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
|
||||
"vetting/internal/model"
|
||||
"vetting/internal/store"
|
||||
"vetting/internal/web/templates"
|
||||
)
|
||||
|
||||
// TileEnricher builds a fully-populated TileData for a host. It looks
// up the latest run's spec-diff count and hold-key artifact path so the
// tile can render the "n critical diffs" badge and the ssh invocation
// without the template package needing DB access.
//
// Used by both the Dashboard handler (initial render) and the SSE tile-
// refresh path (agent_handlers.Hold, orchestrator runner) so every
// place that renders a tile shows the same data.
//
// Each store is optional: Build and BuildByHost skip the corresponding
// lookup when the field is nil.
type TileEnricher struct {
	Runs      *store.Runs      // BuildByHost: finds the host's latest run
	Artifacts *store.Artifacts // Build: source of the run's "hold_key" artifact path
	SpecDiffs *store.SpecDiffs // Build: source of the critical, non-ignored diff count
}
|
||||
|
||||
// Build returns a TileData for (host, latest). Fails soft: DB errors
|
||||
// fall back to a tile without the extra fields rather than breaking
|
||||
// the whole dashboard.
|
||||
func (e *TileEnricher) Build(ctx context.Context, host model.Host, latest *model.Run) templates.TileData {
|
||||
t := templates.TileData{Host: host, Latest: latest}
|
||||
if latest == nil {
|
||||
return t
|
||||
}
|
||||
if e.SpecDiffs != nil {
|
||||
if diffs, err := e.SpecDiffs.ListForRun(ctx, latest.ID); err == nil {
|
||||
for _, d := range diffs {
|
||||
if d.Severity == "critical" && !d.Ignored {
|
||||
t.SpecDiffCritical++
|
||||
}
|
||||
}
|
||||
} else {
|
||||
log.Printf("tile: list spec_diffs run %d: %v", latest.ID, err)
|
||||
}
|
||||
}
|
||||
if e.Artifacts != nil {
|
||||
if arts, err := e.Artifacts.ListForRun(ctx, latest.ID); err == nil {
|
||||
for _, a := range arts {
|
||||
if a.Kind == "hold_key" {
|
||||
t.HoldKeyPath = a.Path
|
||||
}
|
||||
}
|
||||
} else {
|
||||
log.Printf("tile: list artifacts run %d: %v", latest.ID, err)
|
||||
}
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
// BuildByHost looks up the latest run itself — convenient for SSE tile
|
||||
// publishers that only know the host ID.
|
||||
func (e *TileEnricher) BuildByHost(ctx context.Context, host model.Host) templates.TileData {
|
||||
var latest *model.Run
|
||||
if e.Runs != nil {
|
||||
if r, err := e.Runs.LatestForHost(ctx, host.ID); err == nil {
|
||||
latest = r
|
||||
}
|
||||
}
|
||||
return e.Build(ctx, host, latest)
|
||||
}
|
||||
@@ -0,0 +1,295 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"log"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
"gopkg.in/yaml.v3"
|
||||
|
||||
"vetting/internal/auth"
|
||||
"vetting/internal/events"
|
||||
"vetting/internal/model"
|
||||
"vetting/internal/orchestrator"
|
||||
"vetting/internal/store"
|
||||
"vetting/internal/web/templates"
|
||||
)
|
||||
|
||||
// UI bundles the collaborators used by the handlers in this file:
// dashboard, login/logout, host registration and deletion, run start,
// storage-wipe override, SSE and report download.
type UI struct {
	Hosts     *store.Hosts         // host listing, lookup, creation and deletion
	Runs      *store.Runs          // latest-run lookup and run creation
	Artifacts *store.Artifacts     // Report: finds the run's report_html artifact
	Auth      *auth.Manager        // password check and session issue/clear
	EventHub  *events.Hub          // SSE: serves the event stream
	Runner    *orchestrator.Runner // OverrideWipeStorage: applies the override
	Tiles     *TileEnricher        // Dashboard: builds per-host tile data
}
|
||||
|
||||
// macRe matches a MAC address written as six colon-separated pairs of
// lowercase hex digits. CreateHost lowercases the submitted value before
// validateHostForm applies this pattern.
var macRe = regexp.MustCompile(`^[0-9a-f]{2}(:[0-9a-f]{2}){5}$`)
|
||||
|
||||
func (u *UI) Dashboard(w http.ResponseWriter, r *http.Request) {
|
||||
hosts, err := u.Hosts.List(r.Context())
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
tiles := make([]templates.TileData, 0, len(hosts))
|
||||
for _, h := range hosts {
|
||||
latest, err := u.Runs.LatestForHost(r.Context(), h.ID)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
tiles = append(tiles, u.Tiles.Build(r.Context(), h, latest))
|
||||
}
|
||||
_ = templates.Dashboard(tiles).Render(r.Context(), w)
|
||||
}
|
||||
|
||||
// StartRun creates a new Run for the host, issues an agent token, and
|
||||
// transitions Registered→Queued. The dispatcher goroutine picks it up
|
||||
// and fires WoL.
|
||||
func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
|
||||
idStr := chi.URLParam(r, "id")
|
||||
hostID, err := strconv.ParseInt(idStr, 10, 64)
|
||||
if err != nil {
|
||||
http.Error(w, "bad host id", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
if _, err := u.Hosts.Get(r.Context(), hostID); err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// Guard: refuse to start a second run while one is still active.
|
||||
if latest, err := u.Runs.LatestForHost(r.Context(), hostID); err == nil && latest != nil {
|
||||
switch latest.State {
|
||||
case model.StateCompleted, model.StateReleased, model.StateFailedHolding:
|
||||
// ok to start fresh
|
||||
default:
|
||||
http.Error(w, "host already has an active run", http.StatusConflict)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
_, hash, err := orchestrator.IssueRunToken()
|
||||
if err != nil {
|
||||
http.Error(w, "token: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
runID, err := u.Runs.Create(r.Context(), hostID, hash)
|
||||
if err != nil {
|
||||
http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
log.Printf("ui: created run %d for host %d (state=Queued)", runID, hostID)
|
||||
http.Redirect(w, r, "/", http.StatusSeeOther)
|
||||
}
|
||||
|
||||
func (u *UI) LoginForm(w http.ResponseWriter, r *http.Request) {
|
||||
next := r.URL.Query().Get("next")
|
||||
if next == "" {
|
||||
next = "/"
|
||||
}
|
||||
_ = templates.Login("", next).Render(r.Context(), w)
|
||||
}
|
||||
|
||||
func (u *UI) LoginSubmit(w http.ResponseWriter, r *http.Request) {
|
||||
if err := r.ParseForm(); err != nil {
|
||||
http.Error(w, "bad form", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
password := r.PostForm.Get("password")
|
||||
next := r.PostForm.Get("next")
|
||||
if next == "" || !strings.HasPrefix(next, "/") {
|
||||
next = "/"
|
||||
}
|
||||
if !u.Auth.VerifyPassword(password) {
|
||||
w.WriteHeader(http.StatusUnauthorized)
|
||||
_ = templates.Login("Invalid password.", next).Render(r.Context(), w)
|
||||
return
|
||||
}
|
||||
u.Auth.Issue(w, r)
|
||||
http.Redirect(w, r, next, http.StatusSeeOther)
|
||||
}
|
||||
|
||||
func (u *UI) Logout(w http.ResponseWriter, r *http.Request) {
|
||||
u.Auth.Clear(w)
|
||||
http.Redirect(w, r, "/login", http.StatusSeeOther)
|
||||
}
|
||||
|
||||
func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) {
|
||||
_ = templates.Registration(templates.RegistrationForm{}).Render(r.Context(), w)
|
||||
}
|
||||
|
||||
func (u *UI) CreateHost(w http.ResponseWriter, r *http.Request) {
|
||||
if err := r.ParseForm(); err != nil {
|
||||
http.Error(w, "bad form", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
form := templates.RegistrationForm{
|
||||
Name: strings.TrimSpace(r.PostForm.Get("name")),
|
||||
MAC: strings.ToLower(strings.TrimSpace(r.PostForm.Get("mac"))),
|
||||
WoLBroadcastIP: strings.TrimSpace(r.PostForm.Get("wol_broadcast_ip")),
|
||||
WoLPort: r.PostForm.Get("wol_port"),
|
||||
ExpectedSpecYAML: r.PostForm.Get("expected_spec_yaml"),
|
||||
Notes: strings.TrimSpace(r.PostForm.Get("notes")),
|
||||
}
|
||||
|
||||
if errMsg := validateHostForm(&form); errMsg != "" {
|
||||
form.Error = errMsg
|
||||
w.WriteHeader(http.StatusBadRequest)
|
||||
_ = templates.Registration(form).Render(r.Context(), w)
|
||||
return
|
||||
}
|
||||
|
||||
wolPort, _ := strconv.Atoi(form.WoLPort)
|
||||
if wolPort == 0 {
|
||||
wolPort = 9
|
||||
}
|
||||
|
||||
_, err := u.Hosts.Create(r.Context(), model.Host{
|
||||
Name: form.Name,
|
||||
MAC: form.MAC,
|
||||
WoLBroadcastIP: form.WoLBroadcastIP,
|
||||
WoLPort: wolPort,
|
||||
ExpectedSpecYAML: form.ExpectedSpecYAML,
|
||||
Notes: form.Notes,
|
||||
})
|
||||
if err != nil {
|
||||
form.Error = friendlyDBError(err)
|
||||
w.WriteHeader(http.StatusConflict)
|
||||
_ = templates.Registration(form).Render(r.Context(), w)
|
||||
return
|
||||
}
|
||||
http.Redirect(w, r, "/", http.StatusSeeOther)
|
||||
}
|
||||
|
||||
// OverrideWipeStorage is the operator's explicit "yes, wipe the disk
|
||||
// even though we found filesystem signatures" button. Only meaningful
|
||||
// when the latest run is FailedHolding with failed_stage=Storage — the
|
||||
// agent's next heartbeat will receive retry_stage with wipe=true and
|
||||
// re-enter the Storage stage bypassing the wipe-probe guard.
|
||||
func (u *UI) OverrideWipeStorage(w http.ResponseWriter, r *http.Request) {
|
||||
idStr := chi.URLParam(r, "id")
|
||||
hostID, err := strconv.ParseInt(idStr, 10, 64)
|
||||
if err != nil {
|
||||
http.Error(w, "bad host id", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
latest, err := u.Runs.LatestForHost(r.Context(), hostID)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
if latest == nil {
|
||||
http.Error(w, "no run for host", http.StatusConflict)
|
||||
return
|
||||
}
|
||||
if latest.State != model.StateFailedHolding || latest.FailedStage != "Storage" {
|
||||
http.Error(w, "override only valid when holding on Storage", http.StatusConflict)
|
||||
return
|
||||
}
|
||||
if _, err := u.Runner.Override(r.Context(), latest.ID, `{"wipe":true}`); err != nil {
|
||||
http.Error(w, "override: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
http.Redirect(w, r, "/", http.StatusSeeOther)
|
||||
}
|
||||
|
||||
func (u *UI) DeleteHost(w http.ResponseWriter, r *http.Request) {
|
||||
idStr := chi.URLParam(r, "id")
|
||||
id, err := strconv.ParseInt(idStr, 10, 64)
|
||||
if err != nil {
|
||||
http.Error(w, "bad id", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
if err := u.Hosts.Delete(r.Context(), id); err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
http.Redirect(w, r, "/", http.StatusSeeOther)
|
||||
}
|
||||
|
||||
func (u *UI) SSE(w http.ResponseWriter, r *http.Request) {
|
||||
u.EventHub.ServeSSE(w, r)
|
||||
}
|
||||
|
||||
// Report serves the HTML report artifact for a run. Looks up the
|
||||
// report_html artifact row for the runID, validates the path lives
|
||||
// under the artifacts dir (defence-in-depth against path traversal),
|
||||
// and streams it back. 404 when the run hasn't produced one yet.
|
||||
func (u *UI) Report(w http.ResponseWriter, r *http.Request) {
|
||||
idStr := chi.URLParam(r, "runID")
|
||||
runID, err := strconv.ParseInt(idStr, 10, 64)
|
||||
if err != nil {
|
||||
http.Error(w, "bad run id", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
arts, err := u.Artifacts.ListForRun(r.Context(), runID)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
var path string
|
||||
for _, a := range arts {
|
||||
if a.Kind == "report_html" {
|
||||
path = a.Path
|
||||
}
|
||||
}
|
||||
if path == "" {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
http.ServeFile(w, r, path)
|
||||
}
|
||||
|
||||
func validateHostForm(form *templates.RegistrationForm) string {
|
||||
if form.Name == "" {
|
||||
return "Name is required."
|
||||
}
|
||||
if !macRe.MatchString(form.MAC) {
|
||||
return "MAC address must be in the form aa:bb:cc:dd:ee:ff."
|
||||
}
|
||||
if form.WoLBroadcastIP == "" {
|
||||
return "WoL broadcast IP is required."
|
||||
}
|
||||
if form.ExpectedSpecYAML == "" {
|
||||
return "Expected spec YAML is required."
|
||||
}
|
||||
var anything any
|
||||
if err := yaml.Unmarshal([]byte(form.ExpectedSpecYAML), &anything); err != nil {
|
||||
return "Expected spec YAML is not valid YAML: " + err.Error()
|
||||
}
|
||||
if form.WoLPort != "" {
|
||||
port, err := strconv.Atoi(form.WoLPort)
|
||||
if err != nil || port < 1 || port > 65535 {
|
||||
return "WoL port must be 1–65535."
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// friendlyDBError maps a store error to a message for the registration
// form. The two unique-constraint failures on hosts (name, then MAC)
// get a plain-language sentence; any other error is returned verbatim.
func friendlyDBError(err error) string {
	raw := err.Error()

	known := []struct {
		marker  string
		message string
	}{
		{"UNIQUE constraint failed: hosts.name", "A host with that name already exists."},
		{"UNIQUE constraint failed: hosts.mac", "A host with that MAC already exists."},
	}
	for _, k := range known {
		if strings.Contains(raw, k.marker) {
			return k.message
		}
	}
	return raw
}
|
||||
Reference in New Issue
Block a user