Heartbeat command channel: reboot_for_vetting skips WoL
CI / Lint + build + test (push) Failing after 5m13s
CI / Lint + build + test (push) Failing after 5m13s
When the operator clicks "Start vetting" while the host is heartbeating, the heartbeat response now carries cmd=reboot_for_vetting plus the run_id. The handler drives the Queued → WaitingWoL transition through the existing state machine, so if the heartbeat races the dispatcher's 2s poll, the state machine refuses the second transition and the run is not double-dispatched. A run in WaitingWoL keeps receiving the reboot command for 10 minutes to cover a host that crashed mid-reboot; after that it falls back to operator action.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"log"
|
||||
@@ -243,8 +244,10 @@ func (u *UI) CreateHostJSON(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
// Heartbeat is called every ~30s by a host-mode vetting-agent running
|
||||
// as a systemd service on the registered host. LAN-trusted, no auth —
|
||||
// same threat model as the browser UI and quick-register. Phase 1
|
||||
// just stamps last_seen_at and flips the dashboard tile to "online".
|
||||
// same threat model as the browser UI and quick-register. Stamps
|
||||
// last_seen_at, flips the dashboard tile to "online", and — if the
|
||||
// operator has clicked Start vetting since the last heartbeat — replies
|
||||
// with cmd=reboot_for_vetting so the host boots into PXE without WoL.
|
||||
func (u *UI) Heartbeat(w http.ResponseWriter, r *http.Request) {
|
||||
mac := strings.ToLower(strings.TrimSpace(chi.URLParam(r, "mac")))
|
||||
if !macRe.MatchString(mac) {
|
||||
@@ -268,10 +271,69 @@ func (u *UI) Heartbeat(w http.ResponseWriter, r *http.Request) {
|
||||
if u.Runner != nil {
|
||||
u.Runner.PublishTileUpdate(r.Context(), host.ID)
|
||||
}
|
||||
cmd, runID := u.pickHostCommand(r.Context(), host.ID)
|
||||
resp := heartbeatResponse{OK: true, Cmd: cmd, RunID: runID}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{"ok": true})
|
||||
_ = json.NewEncoder(w).Encode(resp)
|
||||
}
|
||||
|
||||
// heartbeatResponse is the JSON body the host-mode agent decodes on every
// heartbeat. Cmd and RunID are tagged omitempty, so in the idle case
// (Cmd == "" and RunID == 0) both are dropped and the body carries only
// the "ok" field.
type heartbeatResponse struct {
	// OK is always serialized, under the key "ok".
	OK bool `json:"ok"`
	// Cmd names the action the agent should take; the empty string means
	// "nothing to do" and is omitted from the encoded JSON.
	Cmd string `json:"cmd,omitempty"`
	// RunID is the run that Cmd applies to; zero is omitted from the
	// encoded JSON.
	RunID int64 `json:"run_id,omitempty"`
}
|
||||
|
||||
// pickHostCommand decides what the host-mode agent should do on the
|
||||
// back of this heartbeat. Returns ("", 0) when there's nothing to do.
|
||||
//
|
||||
// - Queued run → Transition(Dispatched) and tell the agent to reboot.
|
||||
// The dispatcher would have WoL'd it anyway; we beat it to the
|
||||
// punch so the host skips the WoL dance.
|
||||
// - WaitingWoL run created <10min ago → also return reboot, covering
|
||||
// "host crashed mid-reboot, systemd brought the reporter back".
|
||||
// - anything else → idle.
|
||||
func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64) {
|
||||
if u.Runs == nil || u.Runner == nil {
|
||||
return "", 0
|
||||
}
|
||||
run, err := u.Runs.LatestForHost(ctx, hostID)
|
||||
if err != nil {
|
||||
log.Printf("heartbeat: latest run for host %d: %v", hostID, err)
|
||||
return "", 0
|
||||
}
|
||||
if run == nil {
|
||||
return "", 0
|
||||
}
|
||||
switch run.State {
|
||||
case model.StateQueued:
|
||||
if _, err := u.Runner.Transition(ctx, run.ID, orchestrator.TriggerDispatched); err != nil {
|
||||
// Benign race with the dispatcher's own 2s poll — the
|
||||
// state machine refuses the second transition; we just
|
||||
// log and return idle so the agent doesn't reboot on a
|
||||
// run that another path is already driving.
|
||||
log.Printf("heartbeat: transition run %d: %v", run.ID, err)
|
||||
return "", 0
|
||||
}
|
||||
log.Printf("heartbeat: dispatched run %d for host %d via heartbeat (no WoL)", run.ID, hostID)
|
||||
return cmdRebootForVetting, run.ID
|
||||
case model.StateWaitingWoL:
|
||||
// Tolerate a crashed-mid-reboot retry: the reporter is the
|
||||
// only thing that could be telling us about this host right
|
||||
// now, and WoL is only the fallback anyway. Bound it so a
|
||||
// perpetually-broken PXE doesn't reboot-loop the box.
|
||||
if time.Since(run.StartedAt) < 10*time.Minute {
|
||||
return cmdRebootForVetting, run.ID
|
||||
}
|
||||
return "", 0
|
||||
}
|
||||
return "", 0
|
||||
}
|
||||
|
||||
// cmdRebootForVetting is the command string pickHostCommand returns when
// the host-mode agent should reboot for a vetting run.
const cmdRebootForVetting = "reboot_for_vetting"
|
||||
|
||||
func writeJSONError(w http.ResponseWriter, status int, msg string) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(status)
|
||||
|
||||
Reference in New Issue
Block a user