chore: cleanup sprint — dead CSS, dedup helpers, handler refactor
Remove ~126 lines of orphaned CSS from tile slim-down and old detail layout. Consolidate 4 duplicate duration formatters into shared elapsed()/fmtElapsed() helpers. Break 160-line Result handler into focused sub-functions. Implement real Hub.Shutdown() (was a no-op). Standardize agent error responses to JSON. Replace panic() in router init with error return. Extract magic numbers as named constants. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+106
-99
@@ -170,7 +170,7 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
|
||||
if len(mustListStages(a.Stages, r, runID)) == 0 {
|
||||
if err := a.Stages.Seed(r.Context(), runID); err != nil {
|
||||
log.Printf("claim: seed stages run %d: %v", runID, err)
|
||||
http.Error(w, "seed stages", http.StatusInternalServerError)
|
||||
writeJSONErr(w, http.StatusInternalServerError, "seed stages")
|
||||
return
|
||||
}
|
||||
}
|
||||
@@ -180,7 +180,7 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
|
||||
if run.State == model.StateWaitingWoL || run.State == model.StateBooting {
|
||||
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerAgentClaimed); err != nil {
|
||||
log.Printf("claim: transition run %d: %v", runID, err)
|
||||
http.Error(w, "transition", http.StatusConflict)
|
||||
writeJSONErr(w, http.StatusConflict, "transition")
|
||||
return
|
||||
}
|
||||
}
|
||||
@@ -369,6 +369,10 @@ func writeJSON(w http.ResponseWriter, status int, body any) {
|
||||
_ = json.NewEncoder(w).Encode(body)
|
||||
}
|
||||
|
||||
func writeJSONErr(w http.ResponseWriter, status int, msg string) {
|
||||
writeJSON(w, status, map[string]any{"ok": false, "error": msg})
|
||||
}
|
||||
|
||||
// mustListStages is a small wrapper that hides the error path from
|
||||
// /claim — a DB read failure just pretends there are zero stages, and
|
||||
// the subsequent Seed will surface the real error.
|
||||
@@ -408,12 +412,12 @@ func (a *Agent) Log(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
var batch LogBatch
|
||||
if err := json.NewDecoder(r.Body).Decode(&batch); err != nil {
|
||||
http.Error(w, "bad json", http.StatusBadRequest)
|
||||
writeJSONErr(w, http.StatusBadRequest, "bad json")
|
||||
return
|
||||
}
|
||||
writer, err := a.Logs.WriterFor(runID)
|
||||
if err != nil {
|
||||
http.Error(w, "open log: "+err.Error(), http.StatusInternalServerError)
|
||||
writeJSONErr(w, http.StatusInternalServerError, "open log: "+err.Error())
|
||||
return
|
||||
}
|
||||
for _, l := range batch.Lines {
|
||||
@@ -470,9 +474,7 @@ type SubStepResultLine struct {
|
||||
// Result receives a stage's outcome. Flow:
|
||||
// 1. Mark the stage row passed/failed + record summary JSON.
|
||||
// 2. For Inventory: persist the inventory artifact.
|
||||
// 3. For Inventory (on pass): run spec diff server-side, persist rows,
|
||||
// bump the run into SpecValidate and immediately resolve SpecValidate
|
||||
// from that diff — the agent isn't involved in SpecValidate at all.
|
||||
// 4. For Firmware: persist firmware snapshots.
|
||||
// 5. Transition the run via StageCompleted/StageFailed.
|
||||
func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
|
||||
runID, ok := runIDFromURL(w, r)
|
||||
@@ -485,64 +487,20 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
var body StageResult
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
http.Error(w, "bad json", http.StatusBadRequest)
|
||||
writeJSONErr(w, http.StatusBadRequest, "bad json")
|
||||
return
|
||||
}
|
||||
body.Stage = strings.TrimSpace(body.Stage)
|
||||
if _, ok := orchestrator.StateForStage(body.Stage); !ok {
|
||||
http.Error(w, "unknown stage: "+body.Stage, http.StatusBadRequest)
|
||||
writeJSONErr(w, http.StatusBadRequest, "unknown stage: "+body.Stage)
|
||||
return
|
||||
}
|
||||
|
||||
// Silent-skip guard. Orchestrator advances the run state via
|
||||
// TriggerStageCompleted against the *current* state, not against
|
||||
// body.Stage — so an Inventory result posted while the run is in
|
||||
// StateCPUStress would silently advance CPUStress → Storage and mark
|
||||
// CPUStress as passed without it ever running. That's exactly what
|
||||
// happened on Orion when the agent OOM-crashed mid-CPUStress,
|
||||
// systemd restarted it, and the restarted agent (which hardcoded
|
||||
// "Inventory" as its first stage) re-ran Inventory and reported it.
|
||||
// Guard: if body.Stage doesn't match the stage the run is currently
|
||||
// in, park the run in FailedHolding so the operator can investigate
|
||||
// rather than trusting the claim and cascading silent passes.
|
||||
expectedStage := orchestrator.StageNameForState(run.State)
|
||||
if expectedStage != "" && body.Stage != expectedStage {
|
||||
failedLabel := fmt.Sprintf("%s (expected %s)", body.Stage, expectedStage)
|
||||
if err := a.Runs.SetFailedStage(r.Context(), runID, failedLabel); err != nil {
|
||||
log.Printf("result: set failed stage on mismatch run %d: %v", runID, err)
|
||||
}
|
||||
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageMismatch); err != nil {
|
||||
log.Printf("result: stage-mismatch transition run %d: %v", runID, err)
|
||||
}
|
||||
hostName := a.hostNameFor(r.Context(), run.HostID)
|
||||
a.dispatchEvent(notify.Event{
|
||||
Kind: notify.KindStageFailed,
|
||||
Severity: notify.SeverityCritical,
|
||||
RunID: runID,
|
||||
HostName: hostName,
|
||||
Title: fmt.Sprintf("[vetting] %s stage mismatch: %s", hostName, body.Stage),
|
||||
Body: fmt.Sprintf("Run %d reported stage %s while orchestrator expected %s — parked in FailedHolding to prevent silent skip.",
|
||||
runID, body.Stage, expectedStage),
|
||||
URL: a.runLinkURL(runID),
|
||||
})
|
||||
log.Printf("result: stage mismatch run=%d got=%s expected=%s — parked", runID, body.Stage, expectedStage)
|
||||
http.Error(w, "stage mismatch: got "+body.Stage+", expected "+expectedStage, http.StatusConflict)
|
||||
if a.resultStageMismatch(w, r, runID, run, &body) {
|
||||
return
|
||||
}
|
||||
|
||||
// Aggregate threshold gate: flip Passed=false server-side when any
|
||||
// critical breach landed for this stage. The agent's verdict is
|
||||
// advisory — a stage-executor can miss a runaway sample that the
|
||||
// sidecar caught. We check this *before* writing the stage state
|
||||
// so the DB reflects the server-side decision.
|
||||
thresholdDetail := ""
|
||||
if body.Passed {
|
||||
if breached, detail := a.stageHadCriticalBreach(r.Context(), runID, body.Stage); breached {
|
||||
body.Passed = false
|
||||
thresholdDetail = detail
|
||||
a.appendLog(runID, "error", fmt.Sprintf("%s reported passed but %s — flipping to failed", body.Stage, detail))
|
||||
}
|
||||
}
|
||||
thresholdDetail := a.resultCheckThresholds(r.Context(), runID, &body)
|
||||
|
||||
stageState := model.StagePassed
|
||||
if !body.Passed {
|
||||
@@ -553,73 +511,122 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
|
||||
summaryJSON = string(body.Summary)
|
||||
}
|
||||
if err := a.Runner.CompleteStage(r.Context(), runID, body.Stage, stageState, summaryJSON); err != nil {
|
||||
http.Error(w, "complete stage: "+err.Error(), http.StatusInternalServerError)
|
||||
writeJSONErr(w, http.StatusInternalServerError, "complete stage: "+err.Error())
|
||||
return
|
||||
}
|
||||
if thresholdDetail != "" && body.Message == "" {
|
||||
body.Message = thresholdDetail
|
||||
}
|
||||
|
||||
// Agent-authored sub-steps: persist in slice order (ordinal = index)
|
||||
// and fan out a per-row SSE event each so the detail pane shows them
|
||||
// without a reload. Best-effort — a persistence error is logged but
|
||||
// doesn't fail the whole /result.
|
||||
a.persistSubSteps(r.Context(), runID, body.Stage, body.SubSteps)
|
||||
a.resultPersistArtifacts(r, run, runID, &body)
|
||||
|
||||
// Inventory-specific: persist artifact + compute spec diff.
|
||||
if !body.Passed {
|
||||
a.resultHandleFailed(w, r, runID, run, &body)
|
||||
return
|
||||
}
|
||||
a.resultAdvance(w, r, runID, &body)
|
||||
}
|
||||
|
||||
// resultStageMismatch parks the run in FailedHolding when the reported
|
||||
// stage doesn't match what the orchestrator expects. Returns true if the
|
||||
// response has been written (caller should return).
|
||||
func (a *Agent) resultStageMismatch(w http.ResponseWriter, r *http.Request, runID int64, run *model.Run, body *StageResult) bool {
|
||||
expectedStage := orchestrator.StageNameForState(run.State)
|
||||
if expectedStage == "" || body.Stage == expectedStage {
|
||||
return false
|
||||
}
|
||||
failedLabel := fmt.Sprintf("%s (expected %s)", body.Stage, expectedStage)
|
||||
if err := a.Runs.SetFailedStage(r.Context(), runID, failedLabel); err != nil {
|
||||
log.Printf("result: set failed stage on mismatch run %d: %v", runID, err)
|
||||
}
|
||||
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageMismatch); err != nil {
|
||||
log.Printf("result: stage-mismatch transition run %d: %v", runID, err)
|
||||
}
|
||||
hostName := a.hostNameFor(r.Context(), run.HostID)
|
||||
a.dispatchEvent(notify.Event{
|
||||
Kind: notify.KindStageFailed,
|
||||
Severity: notify.SeverityCritical,
|
||||
RunID: runID,
|
||||
HostName: hostName,
|
||||
Title: fmt.Sprintf("[vetting] %s stage mismatch: %s", hostName, body.Stage),
|
||||
Body: fmt.Sprintf("Run %d reported stage %s while orchestrator expected %s — parked in FailedHolding to prevent silent skip.",
|
||||
runID, body.Stage, expectedStage),
|
||||
URL: a.runLinkURL(runID),
|
||||
})
|
||||
log.Printf("result: stage mismatch run=%d got=%s expected=%s — parked", runID, body.Stage, expectedStage)
|
||||
writeJSONErr(w, http.StatusConflict, "stage mismatch: got "+body.Stage+", expected "+expectedStage)
|
||||
return true
|
||||
}
|
||||
|
||||
// resultCheckThresholds flips body.Passed to false when the server-side
|
||||
// threshold sidecar recorded a critical breach the agent missed.
|
||||
func (a *Agent) resultCheckThresholds(ctx context.Context, runID int64, body *StageResult) string {
|
||||
if !body.Passed {
|
||||
return ""
|
||||
}
|
||||
breached, detail := a.stageHadCriticalBreach(ctx, runID, body.Stage)
|
||||
if !breached {
|
||||
return ""
|
||||
}
|
||||
body.Passed = false
|
||||
a.appendLog(runID, "error", fmt.Sprintf("%s reported passed but %s — flipping to failed", body.Stage, detail))
|
||||
return detail
|
||||
}
|
||||
|
||||
// resultPersistArtifacts handles stage-specific artifact persistence
|
||||
// (inventory JSON, firmware snapshots). Best-effort — errors are logged.
|
||||
func (a *Agent) resultPersistArtifacts(r *http.Request, run *model.Run, runID int64, body *StageResult) {
|
||||
if body.Stage == "Inventory" && body.Inventory != nil {
|
||||
if err := a.persistInventory(r, run, body.Inventory); err != nil {
|
||||
log.Printf("persist inventory run %d: %v", runID, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Firmware-specific: persist each snapshot into firmware_snapshots.
|
||||
// SpecValidate reads them back to diff against expected_firmware.
|
||||
if body.Stage == "Firmware" && len(body.Firmware) > 0 {
|
||||
if err := a.persistFirmware(r.Context(), runID, body.Firmware); err != nil {
|
||||
log.Printf("persist firmware run %d: %v", runID, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !body.Passed {
|
||||
if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
|
||||
log.Printf("set failed stage: %v", err)
|
||||
}
|
||||
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
|
||||
log.Printf("result: failed-transition run %d: %v", runID, err)
|
||||
http.Error(w, "transition", http.StatusConflict)
|
||||
return
|
||||
}
|
||||
hostName := a.hostNameFor(r.Context(), run.HostID)
|
||||
detail := body.Message
|
||||
if detail == "" {
|
||||
detail = "stage reported failure"
|
||||
}
|
||||
a.dispatchEvent(notify.Event{
|
||||
Kind: notify.KindStageFailed,
|
||||
Severity: notify.SeverityCritical,
|
||||
RunID: runID,
|
||||
HostName: hostName,
|
||||
Title: fmt.Sprintf("[vetting] %s FAILED: %s", hostName, body.Stage),
|
||||
Body: fmt.Sprintf("Run %d on %s failed at stage %s.\n%s", runID, hostName, body.Stage, detail),
|
||||
URL: a.runLinkURL(runID),
|
||||
})
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": "FailedHolding"})
|
||||
// resultHandleFailed transitions a failed stage into FailedHolding and
|
||||
// fires the failure notification.
|
||||
func (a *Agent) resultHandleFailed(w http.ResponseWriter, r *http.Request, runID int64, run *model.Run, body *StageResult) {
|
||||
if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
|
||||
log.Printf("set failed stage: %v", err)
|
||||
}
|
||||
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
|
||||
log.Printf("result: failed-transition run %d: %v", runID, err)
|
||||
writeJSONErr(w, http.StatusConflict, "transition")
|
||||
return
|
||||
}
|
||||
hostName := a.hostNameFor(r.Context(), run.HostID)
|
||||
detail := body.Message
|
||||
if detail == "" {
|
||||
detail = "stage reported failure"
|
||||
}
|
||||
a.dispatchEvent(notify.Event{
|
||||
Kind: notify.KindStageFailed,
|
||||
Severity: notify.SeverityCritical,
|
||||
RunID: runID,
|
||||
HostName: hostName,
|
||||
Title: fmt.Sprintf("[vetting] %s FAILED: %s", hostName, body.Stage),
|
||||
Body: fmt.Sprintf("Run %d on %s failed at stage %s.\n%s", runID, hostName, body.Stage, detail),
|
||||
URL: a.runLinkURL(runID),
|
||||
})
|
||||
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": "FailedHolding"})
|
||||
}
|
||||
|
||||
// Passed: advance to the next stage in the pipeline.
|
||||
// resultAdvance transitions a passed stage to the next pipeline state,
|
||||
// auto-resolving server-owned stages (SpecValidate, Reporting).
|
||||
func (a *Agent) resultAdvance(w http.ResponseWriter, r *http.Request, runID int64, body *StageResult) {
|
||||
next, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageCompleted)
|
||||
if err != nil {
|
||||
http.Error(w, "advance: "+err.Error(), http.StatusConflict)
|
||||
writeJSONErr(w, http.StatusConflict, "advance: "+err.Error())
|
||||
return
|
||||
}
|
||||
log.Printf("result: run %d stage %s passed → %s", runID, body.Stage, next)
|
||||
|
||||
// If the just-advanced-into state is SpecValidate or Reporting, the
|
||||
// orchestrator owns those stages entirely. The resolve function may
|
||||
// transition further (→ next stage on pass, → FailedHolding on fail,
|
||||
// → Completed for Reporting), so we re-read the run after each.
|
||||
if next == model.StateSpecValidate {
|
||||
a.resolveSpecValidate(r, runID)
|
||||
if after, err := a.Runs.Get(r.Context(), runID); err == nil {
|
||||
@@ -912,13 +919,13 @@ func (a *Agent) Hold(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
kp, err := hold.Issue(runID)
|
||||
if err != nil {
|
||||
http.Error(w, "generate key: "+err.Error(), http.StatusInternalServerError)
|
||||
writeJSONErr(w, http.StatusInternalServerError, "generate key: "+err.Error())
|
||||
return
|
||||
}
|
||||
keyPath := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", runID), "hold.key")
|
||||
abs, err := kp.WritePrivateTo(keyPath)
|
||||
if err != nil {
|
||||
http.Error(w, "write key: "+err.Error(), http.StatusInternalServerError)
|
||||
writeJSONErr(w, http.StatusInternalServerError, "write key: "+err.Error())
|
||||
return
|
||||
}
|
||||
sum := sha256.Sum256(kp.PrivatePEM)
|
||||
@@ -1021,12 +1028,12 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
if a.Measurements == nil {
|
||||
http.Error(w, "measurements store not wired", http.StatusInternalServerError)
|
||||
writeJSONErr(w, http.StatusInternalServerError, "measurements store not wired")
|
||||
return
|
||||
}
|
||||
var body SensorBatch
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
http.Error(w, "bad json", http.StatusBadRequest)
|
||||
writeJSONErr(w, http.StatusBadRequest, "bad json")
|
||||
return
|
||||
}
|
||||
rows := make([]model.Measurement, 0, len(body.Samples))
|
||||
@@ -1050,7 +1057,7 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
|
||||
sampleStages = append(sampleStages, orchestrator.StageNameForState(run.State))
|
||||
}
|
||||
if err := a.Measurements.CreateBatch(r.Context(), rows); err != nil {
|
||||
http.Error(w, "write samples: "+err.Error(), http.StatusInternalServerError)
|
||||
writeJSONErr(w, http.StatusInternalServerError, "write samples: "+err.Error())
|
||||
return
|
||||
}
|
||||
critical := a.evaluateSensorBatch(r.Context(), runID, rows, sampleStages)
|
||||
|
||||
Reference in New Issue
Block a user