Heartbeat command channel: reboot_for_vetting skips WoL
CI / Lint + build + test (push) Failing after 5m13s

When the operator clicks Start vetting and the host is heartbeating,
the heartbeat response now carries cmd=reboot_for_vetting + run_id.
The handler drives the Queued → WaitingWoL transition through the
existing state machine, so if it races the dispatcher's 2s poll the
losing transition is refused rather than double-dispatched. While the
run sits in WaitingWoL the reboot command is re-sent on each heartbeat
for up to 10 minutes, covering a host that crashed mid-reboot; after
that the run falls back to operator action.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-17 23:37:01 -04:00
parent a0c0fb114f
commit 9b16ed80e6
2 changed files with 222 additions and 6 deletions
+65 -3
View File
@@ -1,6 +1,7 @@
package api
import (
"context"
"encoding/json"
"errors"
"log"
@@ -243,8 +244,10 @@ func (u *UI) CreateHostJSON(w http.ResponseWriter, r *http.Request) {
// Heartbeat is called every ~30s by a host-mode vetting-agent running
// as a systemd service on the registered host. LAN-trusted, no auth —
// same threat model as the browser UI and quick-register. Phase 1
// just stamps last_seen_at and flips the dashboard tile to "online".
// same threat model as the browser UI and quick-register. Stamps
// last_seen_at, flips the dashboard tile to "online", and — if the
// operator has clicked Start vetting since the last heartbeat — replies
// with cmd=reboot_for_vetting so the host boots into PXE without WoL.
func (u *UI) Heartbeat(w http.ResponseWriter, r *http.Request) {
mac := strings.ToLower(strings.TrimSpace(chi.URLParam(r, "mac")))
if !macRe.MatchString(mac) {
@@ -268,10 +271,69 @@ func (u *UI) Heartbeat(w http.ResponseWriter, r *http.Request) {
if u.Runner != nil {
u.Runner.PublishTileUpdate(r.Context(), host.ID)
}
cmd, runID := u.pickHostCommand(r.Context(), host.ID)
resp := heartbeatResponse{OK: true, Cmd: cmd, RunID: runID}
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(map[string]any{"ok": true})
_ = json.NewEncoder(w).Encode(resp)
}
// heartbeatResponse is the JSON the host-mode agent decodes on every
// heartbeat. `cmd` is "" (omitted) in the idle case so the wire shape
// stays `{"ok": true}` when nothing is happening.
type heartbeatResponse struct {
// OK has no omitempty, so it is always present on the wire; the
// Heartbeat handler sets it to true.
OK bool `json:"ok"`
// Cmd is the command for the agent to act on. Empty means idle and
// the field is dropped from the JSON.
Cmd string `json:"cmd,omitempty"`
// RunID is the run the command applies to. Zero means no run and
// the field is dropped from the JSON.
RunID int64 `json:"run_id,omitempty"`
}
// pickHostCommand picks the command, if any, to hand back to the
// host-mode agent in the heartbeat response for hostID. The result is
// (command, run ID); ("", 0) means "stay idle".
//
//   - No Runs or Runner configured, the latest-run lookup fails (the
//     error is logged), or the host has no run → idle.
//   - Latest run is Queued → fire TriggerDispatched through
//     Runner.Transition. On success the agent is told to reboot for
//     that run; if the transition is refused the error is logged and
//     the agent stays idle.
//   - Latest run is WaitingWoL and its StartedAt is under 10 minutes
//     old → tell the agent to reboot again, covering a host that
//     crashed mid-reboot and came back up.
//   - Any other state → idle.
func (u *UI) pickHostCommand(ctx context.Context, hostID int64) (string, int64) {
	if u.Runs == nil || u.Runner == nil {
		return "", 0
	}

	latest, lookupErr := u.Runs.LatestForHost(ctx, hostID)
	if lookupErr != nil {
		log.Printf("heartbeat: latest run for host %d: %v", hostID, lookupErr)
		return "", 0
	}
	if latest == nil {
		return "", 0
	}

	if latest.State == model.StateQueued {
		_, transitionErr := u.Runner.Transition(ctx, latest.ID, orchestrator.TriggerDispatched)
		if transitionErr != nil {
			// Most likely the dispatcher's own 2s poll got there first
			// and the state machine refused our transition. Stay idle
			// so the agent doesn't reboot for a run that another path
			// is already driving.
			log.Printf("heartbeat: transition run %d: %v", latest.ID, transitionErr)
			return "", 0
		}
		log.Printf("heartbeat: dispatched run %d for host %d via heartbeat (no WoL)", latest.ID, hostID)
		return cmdRebootForVetting, latest.ID
	}

	// A heartbeat arriving while the run is still WaitingWoL means the
	// reporter is alive on the host, so re-issue the reboot instead of
	// relying on WoL. The window is bounded so a permanently broken
	// PXE setup cannot reboot-loop the box.
	if latest.State == model.StateWaitingWoL && time.Since(latest.StartedAt) < 10*time.Minute {
		return cmdRebootForVetting, latest.ID
	}

	return "", 0
}
// cmdRebootForVetting is the heartbeat response `cmd` value that
// pickHostCommand returns when the host-mode agent should reboot for
// a vetting run.
const cmdRebootForVetting = "reboot_for_vetting"
func writeJSONError(w http.ResponseWriter, status int, msg string) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(status)