Heartbeat-first dispatch: retire WoL-as-default, add WaitingReboot
CI / Lint + build + test (push) Has been cancelled
CI / Lint + build + test (push) Has been cancelled
Every supported host runs vetting-reporter in-OS and heartbeats every 30s. WoL was never the thing that started vetting — the heartbeat response's reboot_for_vetting command was. Firing WoL first only crowded the run log with misleading diagnostics when the real failure mode is "reporter isn't installed." - StartRun 409s if the host hasn't heartbeated within 60s, pointing the operator at /register/quick.sh. - Dispatcher re-checks LastSeenAt at dispatch time (run may sit in Queued long enough for the host to go offline); stale hosts mark the run Failed with failed_stage=dispatch instead of looping. - New StateWaitingReboot + TriggerRebootCommanded capture the actual semantics. StateWaitingWoL kept as the hook point for a future manual-override button. - Tile disables the Start button with a quick.sh tooltip when the host is offline, matching the server-side 409. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -11,7 +11,8 @@ type Trigger string
|
||||
|
||||
const (
|
||||
TriggerStartRequested Trigger = "StartRequested" // user clicks Start Vetting
|
||||
TriggerDispatched Trigger = "Dispatched" // dispatcher picked this run
|
||||
TriggerDispatched Trigger = "Dispatched" // dispatcher picked this run (manual-WoL override path; dormant in happy path)
|
||||
TriggerRebootCommanded Trigger = "RebootCommanded" // dispatcher (or heartbeat race) told the reporter to reboot
|
||||
TriggerPXEObserved Trigger = "PXEObserved" // iPXE fetched cmdline for MAC
|
||||
TriggerAgentClaimed Trigger = "AgentClaimed" // agent POSTed /claim with valid token
|
||||
TriggerStageFailed Trigger = "StageFailed" // a stage reported failure
|
||||
@@ -59,8 +60,9 @@ type transition struct {
|
||||
var table = map[Trigger]transition{
|
||||
TriggerStartRequested: {from: []model.RunState{model.StateRegistered}, to: model.StateQueued},
|
||||
TriggerDispatched: {from: []model.RunState{model.StateQueued}, to: model.StateWaitingWoL},
|
||||
TriggerPXEObserved: {from: []model.RunState{model.StateWaitingWoL, model.StateBooting}, to: model.StateBooting},
|
||||
TriggerAgentClaimed: {from: []model.RunState{model.StateBooting, model.StateWaitingWoL}, to: model.StateInventoryCheck},
|
||||
TriggerRebootCommanded: {from: []model.RunState{model.StateQueued}, to: model.StateWaitingReboot},
|
||||
TriggerPXEObserved: {from: []model.RunState{model.StateWaitingReboot, model.StateWaitingWoL, model.StateBooting}, to: model.StateBooting},
|
||||
TriggerAgentClaimed: {from: []model.RunState{model.StateBooting, model.StateWaitingReboot, model.StateWaitingWoL}, to: model.StateInventoryCheck},
|
||||
TriggerStageFailed: {from: allActiveStates(), to: model.StateFailedHolding},
|
||||
TriggerAllStagesPassed: {from: []model.RunState{model.StateReporting}, to: model.StateCompleted},
|
||||
TriggerOperatorReleased: {from: []model.RunState{model.StateFailedHolding}, to: model.StateReleased},
|
||||
@@ -121,7 +123,7 @@ func nextStageState(current model.RunState) (model.RunState, error) {
|
||||
|
||||
func allActiveStates() []model.RunState {
|
||||
return []model.RunState{
|
||||
model.StateQueued, model.StateWaitingWoL, model.StateBooting,
|
||||
model.StateQueued, model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting,
|
||||
model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART,
|
||||
model.StateCPUStress, model.StateStorage, model.StateNetwork,
|
||||
model.StateGPU, model.StatePSU, model.StateReporting,
|
||||
|
||||
Reference in New Issue
Block a user