chore: cleanup sprint — dead CSS, dedup helpers, handler refactor

Remove ~126 lines of orphaned CSS from tile slim-down and old detail layout. Consolidate 4 duplicate duration formatters into shared elapsed()/fmtElapsed() helpers. Break 160-line Result handler into focused sub-functions. Implement real Hub.Shutdown() (was a no-op). Standardize agent error responses to JSON. Replace panic() in router init with error return. Extract magic numbers as named constants.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-21 20:39:38 -04:00
parent c11573eeeb
commit 17ec55cb85
17 changed files with 242 additions and 438 deletions
@@ -170,7 +170,7 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
 	if len(mustListStages(a.Stages, r, runID)) == 0 {
 		if err := a.Stages.Seed(r.Context(), runID); err != nil {
 			log.Printf("claim: seed stages run %d: %v", runID, err)
-			http.Error(w, "seed stages", http.StatusInternalServerError)
+			writeJSONErr(w, http.StatusInternalServerError, "seed stages")
 			return
 		}
 	}
@@ -180,7 +180,7 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
 	if run.State == model.StateWaitingWoL || run.State == model.StateBooting {
 		if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerAgentClaimed); err != nil {
 			log.Printf("claim: transition run %d: %v", runID, err)
-			http.Error(w, "transition", http.StatusConflict)
+			writeJSONErr(w, http.StatusConflict, "transition")
 			return
 		}
 	}
@@ -369,6 +369,10 @@ func writeJSON(w http.ResponseWriter, status int, body any) {
 	_ = json.NewEncoder(w).Encode(body)
 }

+func writeJSONErr(w http.ResponseWriter, status int, msg string) {
+	writeJSON(w, status, map[string]any{"ok": false, "error": msg})
+}
+
 // mustListStages is a small wrapper that hides the error path from
 // /claim — a DB read failure just pretends there are zero stages, and
 // the subsequent Seed will surface the real error.
@@ -408,12 +412,12 @@ func (a *Agent) Log(w http.ResponseWriter, r *http.Request) {
 	}
 	var batch LogBatch
 	if err := json.NewDecoder(r.Body).Decode(&batch); err != nil {
-		http.Error(w, "bad json", http.StatusBadRequest)
+		writeJSONErr(w, http.StatusBadRequest, "bad json")
 		return
 	}
 	writer, err := a.Logs.WriterFor(runID)
 	if err != nil {
-		http.Error(w, "open log: "+err.Error(), http.StatusInternalServerError)
+		writeJSONErr(w, http.StatusInternalServerError, "open log: "+err.Error())
 		return
 	}
 	for _, l := range batch.Lines {
@@ -470,9 +474,7 @@ type SubStepResultLine struct {
 // Result receives a stage's outcome. Flow:
 //  1. Mark the stage row passed/failed + record summary JSON.
 //  2. For Inventory: persist the inventory artifact.
-//  3. For Inventory (on pass): run spec diff server-side, persist rows,
-//     bump the run into SpecValidate and immediately resolve SpecValidate
-//     from that diff — the agent isn't involved in SpecValidate at all.
+//  3. For Firmware: persist firmware snapshots.
 //  4. Transition the run via StageCompleted/StageFailed.
 func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
 	runID, ok := runIDFromURL(w, r)
@@ -485,64 +487,20 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
 	}
 	var body StageResult
 	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
-		http.Error(w, "bad json", http.StatusBadRequest)
+		writeJSONErr(w, http.StatusBadRequest, "bad json")
 		return
 	}
 	body.Stage = strings.TrimSpace(body.Stage)
 	if _, ok := orchestrator.StateForStage(body.Stage); !ok {
-		http.Error(w, "unknown stage: "+body.Stage, http.StatusBadRequest)
+		writeJSONErr(w, http.StatusBadRequest, "unknown stage: "+body.Stage)
 		return
 	}

-	// Silent-skip guard. Orchestrator advances the run state via
-	// TriggerStageCompleted against the *current* state, not against
-	// body.Stage — so an Inventory result posted while the run is in
-	// StateCPUStress would silently advance CPUStress → Storage and mark
-	// CPUStress as passed without it ever running. That's exactly what
-	// happened on Orion when the agent OOM-crashed mid-CPUStress,
-	// systemd restarted it, and the restarted agent (which hardcoded
-	// "Inventory" as its first stage) re-ran Inventory and reported it.
-	// Guard: if body.Stage doesn't match the stage the run is currently
-	// in, park the run in FailedHolding so the operator can investigate
-	// rather than trusting the claim and cascading silent passes.
-	expectedStage := orchestrator.StageNameForState(run.State)
-	if expectedStage != "" && body.Stage != expectedStage {
-		failedLabel := fmt.Sprintf("%s (expected %s)", body.Stage, expectedStage)
-		if err := a.Runs.SetFailedStage(r.Context(), runID, failedLabel); err != nil {
-			log.Printf("result: set failed stage on mismatch run %d: %v", runID, err)
-		}
-		if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageMismatch); err != nil {
-			log.Printf("result: stage-mismatch transition run %d: %v", runID, err)
-		}
-		hostName := a.hostNameFor(r.Context(), run.HostID)
-		a.dispatchEvent(notify.Event{
-			Kind:     notify.KindStageFailed,
-			Severity: notify.SeverityCritical,
-			RunID:    runID,
-			HostName: hostName,
-			Title:    fmt.Sprintf("[vetting] %s stage mismatch: %s", hostName, body.Stage),
-			Body: fmt.Sprintf("Run %d reported stage %s while orchestrator expected %s — parked in FailedHolding to prevent silent skip.",
-				runID, body.Stage, expectedStage),
-			URL: a.runLinkURL(runID),
-		})
-		log.Printf("result: stage mismatch run=%d got=%s expected=%s — parked", runID, body.Stage, expectedStage)
-		http.Error(w, "stage mismatch: got "+body.Stage+", expected "+expectedStage, http.StatusConflict)
+	if a.resultStageMismatch(w, r, runID, run, &body) {
 		return
 	}

-	// Aggregate threshold gate: flip Passed=false server-side when any
-	// critical breach landed for this stage. The agent's verdict is
-	// advisory — a stage-executor can miss a runaway sample that the
-	// sidecar caught. We check this *before* writing the stage state
-	// so the DB reflects the server-side decision.
-	thresholdDetail := ""
-	if body.Passed {
-		if breached, detail := a.stageHadCriticalBreach(r.Context(), runID, body.Stage); breached {
-			body.Passed = false
-			thresholdDetail = detail
-			a.appendLog(runID, "error", fmt.Sprintf("%s reported passed but %s — flipping to failed", body.Stage, detail))
-		}
-	}
+	thresholdDetail := a.resultCheckThresholds(r.Context(), runID, &body)

 	stageState := model.StagePassed
 	if !body.Passed {
@@ -553,73 +511,122 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
 		summaryJSON = string(body.Summary)
 	}
 	if err := a.Runner.CompleteStage(r.Context(), runID, body.Stage, stageState, summaryJSON); err != nil {
-		http.Error(w, "complete stage: "+err.Error(), http.StatusInternalServerError)
+		writeJSONErr(w, http.StatusInternalServerError, "complete stage: "+err.Error())
 		return
 	}
 	if thresholdDetail != "" && body.Message == "" {
 		body.Message = thresholdDetail
 	}

-	// Agent-authored sub-steps: persist in slice order (ordinal = index)
-	// and fan out a per-row SSE event each so the detail pane shows them
-	// without a reload. Best-effort — a persistence error is logged but
-	// doesn't fail the whole /result.
 	a.persistSubSteps(r.Context(), runID, body.Stage, body.SubSteps)
+	a.resultPersistArtifacts(r, run, runID, &body)

-	// Inventory-specific: persist artifact + compute spec diff.
+	if !body.Passed {
+		a.resultHandleFailed(w, r, runID, run, &body)
+		return
+	}
+	a.resultAdvance(w, r, runID, &body)
+}
+
+// resultStageMismatch parks the run in FailedHolding when body.Stage is not
+// the run's current stage: transitions act on the current state, so a stale
+// result would silently pass that stage. Returns true if it wrote the 409.
+func (a *Agent) resultStageMismatch(w http.ResponseWriter, r *http.Request, runID int64, run *model.Run, body *StageResult) bool {
+	expectedStage := orchestrator.StageNameForState(run.State)
+	if expectedStage == "" || body.Stage == expectedStage {
+		return false
+	}
+	failedLabel := fmt.Sprintf("%s (expected %s)", body.Stage, expectedStage)
+	if err := a.Runs.SetFailedStage(r.Context(), runID, failedLabel); err != nil {
+		log.Printf("result: set failed stage on mismatch run %d: %v", runID, err)
+	}
+	if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageMismatch); err != nil {
+		log.Printf("result: stage-mismatch transition run %d: %v", runID, err)
+	}
+	hostName := a.hostNameFor(r.Context(), run.HostID)
+	a.dispatchEvent(notify.Event{
+		Kind:     notify.KindStageFailed,
+		Severity: notify.SeverityCritical,
+		RunID:    runID,
+		HostName: hostName,
+		Title:    fmt.Sprintf("[vetting] %s stage mismatch: %s", hostName, body.Stage),
+		Body: fmt.Sprintf("Run %d reported stage %s while orchestrator expected %s — parked in FailedHolding to prevent silent skip.",
+			runID, body.Stage, expectedStage),
+		URL: a.runLinkURL(runID),
+	})
+	log.Printf("result: stage mismatch run=%d got=%s expected=%s — parked", runID, body.Stage, expectedStage)
+	writeJSONErr(w, http.StatusConflict, "stage mismatch: got "+body.Stage+", expected "+expectedStage)
+	return true
+}
+
+// resultCheckThresholds flips body.Passed to false when a critical threshold
+// breach was recorded for the stage; returns the breach detail ("" if none).
+func (a *Agent) resultCheckThresholds(ctx context.Context, runID int64, body *StageResult) string {
+	if !body.Passed {
+		return ""
+	}
+	breached, detail := a.stageHadCriticalBreach(ctx, runID, body.Stage)
+	if !breached {
+		return ""
+	}
+	body.Passed = false
+	a.appendLog(runID, "error", fmt.Sprintf("%s reported passed but %s — flipping to failed", body.Stage, detail))
+	return detail
+}
+
+// resultPersistArtifacts handles stage-specific artifact persistence
+// (inventory JSON, firmware snapshots). Best-effort — errors are logged.
+func (a *Agent) resultPersistArtifacts(r *http.Request, run *model.Run, runID int64, body *StageResult) {
 	if body.Stage == "Inventory" && body.Inventory != nil {
 		if err := a.persistInventory(r, run, body.Inventory); err != nil {
 			log.Printf("persist inventory run %d: %v", runID, err)
 		}
 	}
-
-	// Firmware-specific: persist each snapshot into firmware_snapshots.
-	// SpecValidate reads them back to diff against expected_firmware.
 	if body.Stage == "Firmware" && len(body.Firmware) > 0 {
 		if err := a.persistFirmware(r.Context(), runID, body.Firmware); err != nil {
 			log.Printf("persist firmware run %d: %v", runID, err)
 		}
 	}
+}

-	if !body.Passed {
-		if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
-			log.Printf("set failed stage: %v", err)
-		}
-		if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
-			log.Printf("result: failed-transition run %d: %v", runID, err)
-			http.Error(w, "transition", http.StatusConflict)
-			return
-		}
-		hostName := a.hostNameFor(r.Context(), run.HostID)
-		detail := body.Message
-		if detail == "" {
-			detail = "stage reported failure"
-		}
-		a.dispatchEvent(notify.Event{
-			Kind:     notify.KindStageFailed,
-			Severity: notify.SeverityCritical,
-			RunID:    runID,
-			HostName: hostName,
-			Title:    fmt.Sprintf("[vetting] %s FAILED: %s", hostName, body.Stage),
-			Body:     fmt.Sprintf("Run %d on %s failed at stage %s.\n%s", runID, hostName, body.Stage, detail),
-			URL:      a.runLinkURL(runID),
-		})
-		writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": "FailedHolding"})
+// resultHandleFailed transitions a failed stage into FailedHolding and
+// fires the failure notification.
+func (a *Agent) resultHandleFailed(w http.ResponseWriter, r *http.Request, runID int64, run *model.Run, body *StageResult) {
+	if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
+		log.Printf("set failed stage: %v", err)
+	}
+	if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
+		log.Printf("result: failed-transition run %d: %v", runID, err)
+		writeJSONErr(w, http.StatusConflict, "transition")
 		return
 	}
+	hostName := a.hostNameFor(r.Context(), run.HostID)
+	detail := body.Message
+	if detail == "" {
+		detail = "stage reported failure"
+	}
+	a.dispatchEvent(notify.Event{
+		Kind:     notify.KindStageFailed,
+		Severity: notify.SeverityCritical,
+		RunID:    runID,
+		HostName: hostName,
+		Title:    fmt.Sprintf("[vetting] %s FAILED: %s", hostName, body.Stage),
+		Body:     fmt.Sprintf("Run %d on %s failed at stage %s.\n%s", runID, hostName, body.Stage, detail),
+		URL:      a.runLinkURL(runID),
+	})
+	writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": "FailedHolding"})
+}

-	// Passed: advance to the next stage in the pipeline.
+// resultAdvance transitions a passed stage to the next pipeline state,
+// auto-resolving server-owned stages (SpecValidate, Reporting).
+func (a *Agent) resultAdvance(w http.ResponseWriter, r *http.Request, runID int64, body *StageResult) {
 	next, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageCompleted)
 	if err != nil {
-		http.Error(w, "advance: "+err.Error(), http.StatusConflict)
+		writeJSONErr(w, http.StatusConflict, "advance: "+err.Error())
 		return
 	}
 	log.Printf("result: run %d stage %s passed → %s", runID, body.Stage, next)

-	// If the just-advanced-into state is SpecValidate or Reporting, the
-	// orchestrator owns those stages entirely. The resolve function may
-	// transition further (→ next stage on pass, → FailedHolding on fail,
-	// → Completed for Reporting), so we re-read the run after each.
 	if next == model.StateSpecValidate {
 		a.resolveSpecValidate(r, runID)
 		if after, err := a.Runs.Get(r.Context(), runID); err == nil {
@@ -912,13 +919,13 @@ func (a *Agent) Hold(w http.ResponseWriter, r *http.Request) {

 	kp, err := hold.Issue(runID)
 	if err != nil {
-		http.Error(w, "generate key: "+err.Error(), http.StatusInternalServerError)
+		writeJSONErr(w, http.StatusInternalServerError, "generate key: "+err.Error())
 		return
 	}
 	keyPath := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", runID), "hold.key")
 	abs, err := kp.WritePrivateTo(keyPath)
 	if err != nil {
-		http.Error(w, "write key: "+err.Error(), http.StatusInternalServerError)
+		writeJSONErr(w, http.StatusInternalServerError, "write key: "+err.Error())
 		return
 	}
 	sum := sha256.Sum256(kp.PrivatePEM)
@@ -1021,12 +1028,12 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 	if a.Measurements == nil {
-		http.Error(w, "measurements store not wired", http.StatusInternalServerError)
+		writeJSONErr(w, http.StatusInternalServerError, "measurements store not wired")
 		return
 	}
 	var body SensorBatch
 	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
-		http.Error(w, "bad json", http.StatusBadRequest)
+		writeJSONErr(w, http.StatusBadRequest, "bad json")
 		return
 	}
 	rows := make([]model.Measurement, 0, len(body.Samples))
@@ -1050,7 +1057,7 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
 		sampleStages = append(sampleStages, orchestrator.StageNameForState(run.State))
 	}
 	if err := a.Measurements.CreateBatch(r.Context(), rows); err != nil {
-		http.Error(w, "write samples: "+err.Error(), http.StatusInternalServerError)
+		writeJSONErr(w, http.StatusInternalServerError, "write samples: "+err.Error())
 		return
 	}
 	critical := a.evaluateSensorBatch(r.Context(), runID, rows, sampleStages)