chore: cleanup sprint — dead CSS, dedup helpers, handler refactor
CI / Lint + build + test (push) Successful in 1m34s
Release / detect (push) Successful in 4s
Release / build-live-image (push) Has been skipped
Release / bundle (push) Successful in 1m5s

Remove ~126 lines of orphaned CSS from tile slim-down and old detail
layout. Consolidate 4 duplicate duration formatters into shared
elapsed()/fmtElapsed() helpers. Break 160-line Result handler into
focused sub-functions. Implement real Hub.Shutdown() (was a no-op).
Standardize agent error responses to JSON. Replace panic() in router
init with error return. Extract magic numbers as named constants.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-21 20:39:38 -04:00
parent c11573eeeb
commit 17ec55cb85
17 changed files with 242 additions and 438 deletions
+106 -99
View File
@@ -170,7 +170,7 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
if len(mustListStages(a.Stages, r, runID)) == 0 {
if err := a.Stages.Seed(r.Context(), runID); err != nil {
log.Printf("claim: seed stages run %d: %v", runID, err)
http.Error(w, "seed stages", http.StatusInternalServerError)
writeJSONErr(w, http.StatusInternalServerError, "seed stages")
return
}
}
@@ -180,7 +180,7 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
if run.State == model.StateWaitingWoL || run.State == model.StateBooting {
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerAgentClaimed); err != nil {
log.Printf("claim: transition run %d: %v", runID, err)
http.Error(w, "transition", http.StatusConflict)
writeJSONErr(w, http.StatusConflict, "transition")
return
}
}
@@ -369,6 +369,10 @@ func writeJSON(w http.ResponseWriter, status int, body any) {
_ = json.NewEncoder(w).Encode(body)
}
// writeJSONErr answers the request with the given HTTP status and a JSON
// error envelope of the form {"ok": false, "error": msg}. Encoding and
// header handling are delegated to writeJSON.
func writeJSONErr(w http.ResponseWriter, status int, msg string) {
	envelope := map[string]any{
		"ok":    false,
		"error": msg,
	}
	writeJSON(w, status, envelope)
}
// mustListStages is a small wrapper that hides the error path from
// /claim — on a DB read failure it reports zero stages, and the
// subsequent Seed call surfaces the real error.
@@ -408,12 +412,12 @@ func (a *Agent) Log(w http.ResponseWriter, r *http.Request) {
}
var batch LogBatch
if err := json.NewDecoder(r.Body).Decode(&batch); err != nil {
http.Error(w, "bad json", http.StatusBadRequest)
writeJSONErr(w, http.StatusBadRequest, "bad json")
return
}
writer, err := a.Logs.WriterFor(runID)
if err != nil {
http.Error(w, "open log: "+err.Error(), http.StatusInternalServerError)
writeJSONErr(w, http.StatusInternalServerError, "open log: "+err.Error())
return
}
for _, l := range batch.Lines {
@@ -470,9 +474,7 @@ type SubStepResultLine struct {
// Result receives a stage's outcome. Flow:
// 1. Mark the stage row passed/failed + record summary JSON.
// 2. For Inventory: persist the inventory artifact.
// 3. For Inventory (on pass): run spec diff server-side, persist rows,
// bump the run into SpecValidate and immediately resolve SpecValidate
// from that diff — the agent isn't involved in SpecValidate at all.
// 3. For Firmware: persist firmware snapshots.
// 4. Transition the run via StageCompleted/StageFailed.
func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
runID, ok := runIDFromURL(w, r)
@@ -485,64 +487,20 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
}
var body StageResult
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
http.Error(w, "bad json", http.StatusBadRequest)
writeJSONErr(w, http.StatusBadRequest, "bad json")
return
}
body.Stage = strings.TrimSpace(body.Stage)
if _, ok := orchestrator.StateForStage(body.Stage); !ok {
http.Error(w, "unknown stage: "+body.Stage, http.StatusBadRequest)
writeJSONErr(w, http.StatusBadRequest, "unknown stage: "+body.Stage)
return
}
// Silent-skip guard. Orchestrator advances the run state via
// TriggerStageCompleted against the *current* state, not against
// body.Stage — so an Inventory result posted while the run is in
// StateCPUStress would silently advance CPUStress → Storage and mark
// CPUStress as passed without it ever running. That's exactly what
// happened on Orion when the agent OOM-crashed mid-CPUStress,
// systemd restarted it, and the restarted agent (which hardcoded
// "Inventory" as its first stage) re-ran Inventory and reported it.
// Guard: if body.Stage doesn't match the stage the run is currently
// in, park the run in FailedHolding so the operator can investigate
// rather than trusting the claim and cascading silent passes.
expectedStage := orchestrator.StageNameForState(run.State)
if expectedStage != "" && body.Stage != expectedStage {
failedLabel := fmt.Sprintf("%s (expected %s)", body.Stage, expectedStage)
if err := a.Runs.SetFailedStage(r.Context(), runID, failedLabel); err != nil {
log.Printf("result: set failed stage on mismatch run %d: %v", runID, err)
}
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageMismatch); err != nil {
log.Printf("result: stage-mismatch transition run %d: %v", runID, err)
}
hostName := a.hostNameFor(r.Context(), run.HostID)
a.dispatchEvent(notify.Event{
Kind: notify.KindStageFailed,
Severity: notify.SeverityCritical,
RunID: runID,
HostName: hostName,
Title: fmt.Sprintf("[vetting] %s stage mismatch: %s", hostName, body.Stage),
Body: fmt.Sprintf("Run %d reported stage %s while orchestrator expected %s — parked in FailedHolding to prevent silent skip.",
runID, body.Stage, expectedStage),
URL: a.runLinkURL(runID),
})
log.Printf("result: stage mismatch run=%d got=%s expected=%s — parked", runID, body.Stage, expectedStage)
http.Error(w, "stage mismatch: got "+body.Stage+", expected "+expectedStage, http.StatusConflict)
if a.resultStageMismatch(w, r, runID, run, &body) {
return
}
// Aggregate threshold gate: flip Passed=false server-side when any
// critical breach landed for this stage. The agent's verdict is
// advisory — a stage-executor can miss a runaway sample that the
// sidecar caught. We check this *before* writing the stage state
// so the DB reflects the server-side decision.
thresholdDetail := ""
if body.Passed {
if breached, detail := a.stageHadCriticalBreach(r.Context(), runID, body.Stage); breached {
body.Passed = false
thresholdDetail = detail
a.appendLog(runID, "error", fmt.Sprintf("%s reported passed but %s — flipping to failed", body.Stage, detail))
}
}
thresholdDetail := a.resultCheckThresholds(r.Context(), runID, &body)
stageState := model.StagePassed
if !body.Passed {
@@ -553,73 +511,122 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
summaryJSON = string(body.Summary)
}
if err := a.Runner.CompleteStage(r.Context(), runID, body.Stage, stageState, summaryJSON); err != nil {
http.Error(w, "complete stage: "+err.Error(), http.StatusInternalServerError)
writeJSONErr(w, http.StatusInternalServerError, "complete stage: "+err.Error())
return
}
if thresholdDetail != "" && body.Message == "" {
body.Message = thresholdDetail
}
// Agent-authored sub-steps: persist in slice order (ordinal = index)
// and fan out a per-row SSE event each so the detail pane shows them
// without a reload. Best-effort — a persistence error is logged but
// doesn't fail the whole /result.
a.persistSubSteps(r.Context(), runID, body.Stage, body.SubSteps)
a.resultPersistArtifacts(r, run, runID, &body)
// Inventory-specific: persist artifact + compute spec diff.
if !body.Passed {
a.resultHandleFailed(w, r, runID, run, &body)
return
}
a.resultAdvance(w, r, runID, &body)
}
// resultStageMismatch is the silent-skip guard for /result. The
// orchestrator applies TriggerStageCompleted against the run's *current*
// state rather than against body.Stage, so a result posted for some other
// stage would advance the pipeline and mark the current stage as passed
// without it ever having run. When the reported stage differs from the
// stage implied by run.State, the run is parked via TriggerStageMismatch,
// a critical notification is dispatched, and the agent receives a 409.
//
// It returns true when the mismatch response has already been written, in
// which case the caller must return without writing anything further. It
// returns false (and writes nothing) when the stages agree or when the
// current state maps to no stage name.
func (a *Agent) resultStageMismatch(w http.ResponseWriter, r *http.Request, runID int64, run *model.Run, body *StageResult) bool {
	want := orchestrator.StageNameForState(run.State)
	if want == "" || want == body.Stage {
		return false
	}

	ctx := r.Context()
	got := body.Stage

	// Both bookkeeping steps are best-effort: a failure is logged and the
	// mismatch is still reported to the operator and the agent.
	label := fmt.Sprintf("%s (expected %s)", got, want)
	if err := a.Runs.SetFailedStage(ctx, runID, label); err != nil {
		log.Printf("result: set failed stage on mismatch run %d: %v", runID, err)
	}
	if _, err := a.Runner.Transition(ctx, runID, orchestrator.TriggerStageMismatch); err != nil {
		log.Printf("result: stage-mismatch transition run %d: %v", runID, err)
	}

	host := a.hostNameFor(ctx, run.HostID)
	evt := notify.Event{
		Kind:     notify.KindStageFailed,
		Severity: notify.SeverityCritical,
		RunID:    runID,
		HostName: host,
		Title:    fmt.Sprintf("[vetting] %s stage mismatch: %s", host, got),
		Body: fmt.Sprintf("Run %d reported stage %s while orchestrator expected %s — parked in FailedHolding to prevent silent skip.",
			runID, got, want),
		URL: a.runLinkURL(runID),
	}
	a.dispatchEvent(evt)

	log.Printf("result: stage mismatch run=%d got=%s expected=%s — parked", runID, got, want)
	reply := "stage mismatch: got " + got + ", expected " + want
	writeJSONErr(w, http.StatusConflict, reply)
	return true
}
// resultCheckThresholds applies the server-side threshold gate to a stage
// result. The agent's verdict is treated as advisory: if the agent reported
// a pass but stageHadCriticalBreach reports a breach for that stage,
// body.Passed is flipped to false, an error line is appended to the run
// log, and the breach detail is returned. In every other case (the agent
// already reported failure, or no breach was recorded) body is left
// untouched and the empty string is returned.
func (a *Agent) resultCheckThresholds(ctx context.Context, runID int64, body *StageResult) string {
	if body.Passed {
		if hit, reason := a.stageHadCriticalBreach(ctx, runID, body.Stage); hit {
			body.Passed = false
			line := fmt.Sprintf("%s reported passed but %s — flipping to failed", body.Stage, reason)
			a.appendLog(runID, "error", line)
			return reason
		}
	}
	return ""
}
// resultPersistArtifacts stores the stage-specific payload that came with
// a result: the inventory document for the "Inventory" stage and the
// firmware snapshots for the "Firmware" stage. Other stages, and results
// that carry no payload, are a no-op. Persistence is best-effort: an error
// is logged and never propagated to the caller.
func (a *Agent) resultPersistArtifacts(r *http.Request, run *model.Run, runID int64, body *StageResult) {
	switch body.Stage {
	case "Inventory":
		if body.Inventory == nil {
			return
		}
		if err := a.persistInventory(r, run, body.Inventory); err != nil {
			log.Printf("persist inventory run %d: %v", runID, err)
		}
	case "Firmware":
		if len(body.Firmware) == 0 {
			return
		}
		// Each snapshot goes into firmware_snapshots; SpecValidate reads
		// them back to diff against expected_firmware.
		if err := a.persistFirmware(r.Context(), runID, body.Firmware); err != nil {
			log.Printf("persist firmware run %d: %v", runID, err)
		}
	}
}
if !body.Passed {
if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
log.Printf("set failed stage: %v", err)
}
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
log.Printf("result: failed-transition run %d: %v", runID, err)
http.Error(w, "transition", http.StatusConflict)
return
}
hostName := a.hostNameFor(r.Context(), run.HostID)
detail := body.Message
if detail == "" {
detail = "stage reported failure"
}
a.dispatchEvent(notify.Event{
Kind: notify.KindStageFailed,
Severity: notify.SeverityCritical,
RunID: runID,
HostName: hostName,
Title: fmt.Sprintf("[vetting] %s FAILED: %s", hostName, body.Stage),
Body: fmt.Sprintf("Run %d on %s failed at stage %s.\n%s", runID, hostName, body.Stage, detail),
URL: a.runLinkURL(runID),
})
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": "FailedHolding"})
// resultHandleFailed finishes a /result request whose final verdict is
// "failed". It records body.Stage as the run's failed stage, fires
// TriggerStageFailed on the run, dispatches a critical KindStageFailed
// notification, and answers the agent with
// {"ok": true, "next_state": "FailedHolding"}.
//
// Error handling:
//   - a SetFailedStage error is logged and otherwise ignored (best-effort);
//   - a Transition error is logged, answered with a 409 JSON error, and no
//     notification is sent.
//
// This function always writes the HTTP response; the caller must return
// immediately afterwards.
func (a *Agent) resultHandleFailed(w http.ResponseWriter, r *http.Request, runID int64, run *model.Run, body *StageResult) {
	if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
		// Carry the run ID and the "result:" prefix like every other log
		// line in this handler so the failure can be tied back to a run.
		log.Printf("result: set failed stage run %d: %v", runID, err)
	}
	if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
		log.Printf("result: failed-transition run %d: %v", runID, err)
		writeJSONErr(w, http.StatusConflict, "transition")
		return
	}

	hostName := a.hostNameFor(r.Context(), run.HostID)
	// Fall back to a generic line when the result carried no message.
	detail := body.Message
	if detail == "" {
		detail = "stage reported failure"
	}
	a.dispatchEvent(notify.Event{
		Kind:     notify.KindStageFailed,
		Severity: notify.SeverityCritical,
		RunID:    runID,
		HostName: hostName,
		Title:    fmt.Sprintf("[vetting] %s FAILED: %s", hostName, body.Stage),
		Body:     fmt.Sprintf("Run %d on %s failed at stage %s.\n%s", runID, hostName, body.Stage, detail),
		URL:      a.runLinkURL(runID),
	})
	writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": "FailedHolding"})
}
// Passed: advance to the next stage in the pipeline.
// resultAdvance transitions a passed stage to the next pipeline state,
// auto-resolving server-owned stages (SpecValidate, Reporting).
func (a *Agent) resultAdvance(w http.ResponseWriter, r *http.Request, runID int64, body *StageResult) {
next, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageCompleted)
if err != nil {
http.Error(w, "advance: "+err.Error(), http.StatusConflict)
writeJSONErr(w, http.StatusConflict, "advance: "+err.Error())
return
}
log.Printf("result: run %d stage %s passed → %s", runID, body.Stage, next)
// If the just-advanced-into state is SpecValidate or Reporting, the
// orchestrator owns those stages entirely. The resolve function may
// transition further (→ next stage on pass, → FailedHolding on fail,
// → Completed for Reporting), so we re-read the run after each.
if next == model.StateSpecValidate {
a.resolveSpecValidate(r, runID)
if after, err := a.Runs.Get(r.Context(), runID); err == nil {
@@ -912,13 +919,13 @@ func (a *Agent) Hold(w http.ResponseWriter, r *http.Request) {
kp, err := hold.Issue(runID)
if err != nil {
http.Error(w, "generate key: "+err.Error(), http.StatusInternalServerError)
writeJSONErr(w, http.StatusInternalServerError, "generate key: "+err.Error())
return
}
keyPath := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", runID), "hold.key")
abs, err := kp.WritePrivateTo(keyPath)
if err != nil {
http.Error(w, "write key: "+err.Error(), http.StatusInternalServerError)
writeJSONErr(w, http.StatusInternalServerError, "write key: "+err.Error())
return
}
sum := sha256.Sum256(kp.PrivatePEM)
@@ -1021,12 +1028,12 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
return
}
if a.Measurements == nil {
http.Error(w, "measurements store not wired", http.StatusInternalServerError)
writeJSONErr(w, http.StatusInternalServerError, "measurements store not wired")
return
}
var body SensorBatch
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
http.Error(w, "bad json", http.StatusBadRequest)
writeJSONErr(w, http.StatusBadRequest, "bad json")
return
}
rows := make([]model.Measurement, 0, len(body.Samples))
@@ -1050,7 +1057,7 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
sampleStages = append(sampleStages, orchestrator.StageNameForState(run.State))
}
if err := a.Measurements.CreateBatch(r.Context(), rows); err != nil {
http.Error(w, "write samples: "+err.Error(), http.StatusInternalServerError)
writeJSONErr(w, http.StatusInternalServerError, "write samples: "+err.Error())
return
}
critical := a.evaluateSensorBatch(r.Context(), runID, rows, sampleStages)