Heartbeat-first dispatch: retire WoL-as-default, add WaitingReboot
CI / Lint + build + test (push) Has been cancelled

Every supported host runs vetting-reporter in-OS and heartbeats every
30s. WoL was never the thing that started vetting — the heartbeat
response's reboot_for_vetting command was. Firing WoL first only
crowded the run log with misleading diagnostics when the real failure
mode is "reporter isn't installed."

- StartRun 409s if the host hasn't heartbeated within 60s, pointing
  the operator at /register/quick.sh.
- Dispatcher re-checks LastSeenAt at dispatch time (a run may sit in
  Queued long enough for the host to go offline); if the host is stale,
  the run is marked Failed with failed_stage=dispatch instead of looping.
- New StateWaitingReboot + TriggerRebootCommanded capture the actual
  semantics. StateWaitingWoL kept as the hook point for a future
  manual-override button.
- Tile disables the Start button with a quick.sh tooltip when the
  host is offline, matching the server-side 409.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 01:10:34 -04:00
parent c9927ca2bf
commit d0bfae14c8
17 changed files with 632 additions and 155 deletions
+130
View File
@@ -0,0 +1,130 @@
package api_test
import (
"context"
"fmt"
"net/http"
"net/http/httptest"
"path/filepath"
"strings"
"testing"
"time"
"github.com/go-chi/chi/v5"
"vetting/internal/api"
"vetting/internal/db"
"vetting/internal/events"
"vetting/internal/model"
"vetting/internal/orchestrator"
"vetting/internal/store"
)
// setupStartRun opens a database at vetting.db inside t.TempDir() and
// returns an api.UI wired to it, together with the host and run stores
// that share the same connection. The connection is closed via t.Cleanup.
func setupStartRun(t *testing.T) (*api.UI, *store.Hosts, *store.Runs) {
	t.Helper()

	dbPath := filepath.Join(t.TempDir(), "vetting.db")
	conn, err := db.Open(dbPath)
	if err != nil {
		t.Fatalf("open db: %v", err)
	}
	// Best-effort close: the test is already over when this runs, so the
	// error is deliberately discarded.
	t.Cleanup(func() { _ = conn.Close() })

	var (
		hostStore = &store.Hosts{DB: conn}
		runStore  = &store.Runs{DB: conn}
		hub       = events.NewHub()
	)

	// The UI and its Runner share the same stores and event hub.
	ui := &api.UI{
		Hosts:    hostStore,
		Runs:     runStore,
		EventHub: hub,
		Runner: &orchestrator.Runner{
			Runs:     runStore,
			Hosts:    hostStore,
			Stages:   &store.Stages{DB: conn},
			EventHub: hub,
		},
	}
	return ui, hostStore, runStore
}
// startRunReq returns a POST request for /hosts/<id>/start whose context
// carries a chi route context with the "id" URL parameter set, so tests
// can call the handler directly instead of routing through a chi mux.
func startRunReq(id int64) *http.Request {
	idParam := fmt.Sprintf("%d", id)

	routeCtx := chi.NewRouteContext()
	routeCtx.URLParams.Add("id", idParam)

	r := httptest.NewRequest(http.MethodPost, "/hosts/"+idParam+"/start", nil)
	ctx := context.WithValue(r.Context(), chi.RouteCtxKey, routeCtx)
	return r.WithContext(ctx)
}
// TestStartRun_RefusesWhenNeverHeartbeated covers a Start request for a
// host that was created but never had a last-seen time stamped. StartRun
// is expected to answer 409 Conflict, and the response body must mention
// quick.sh so the operator is pointed at the install path.
func TestStartRun_RefusesWhenNeverHeartbeated(t *testing.T) {
	ui, hosts, _ := setupStartRun(t)

	// No UpdateLastSeen call follows: the host has never heartbeated.
	fixture := model.Host{
		Name:             "sr-never",
		MAC:              "aa:bb:cc:dd:ee:60",
		WoLBroadcastIP:   "10.0.0.255",
		WoLPort:          9,
		ExpectedSpecYAML: "memory:\n total_gib: 16\n",
	}
	id, err := hosts.Create(context.Background(), fixture)
	if err != nil {
		t.Fatalf("create host: %v", err)
	}

	rec := httptest.NewRecorder()
	ui.StartRun(rec, startRunReq(id))

	body := rec.Body.String()
	if got := rec.Code; got != http.StatusConflict {
		t.Fatalf("status = %d, want 409, body=%q", got, body)
	}
	if !strings.Contains(body, "quick.sh") {
		t.Fatalf("body missing quick.sh hint: %s", body)
	}
}
// TestStartRun_RefusesWhenHeartbeatStale covers a Start request for a
// host whose last heartbeat was stamped 5 minutes ago — well past the 60s
// staleness cutoff. StartRun is expected to answer 409 Conflict, the same
// outcome as for a host that has never been seen.
func TestStartRun_RefusesWhenHeartbeatStale(t *testing.T) {
	// One constant for the MAC so the Create and UpdateLastSeen calls
	// cannot drift apart.
	const mac = "aa:bb:cc:dd:ee:61"

	ui, hosts, _ := setupStartRun(t)
	ctx := context.Background()

	id, err := hosts.Create(ctx, model.Host{
		Name:             "sr-stale",
		MAC:              mac,
		WoLBroadcastIP:   "10.0.0.255",
		WoLPort:          9,
		ExpectedSpecYAML: "memory:\n total_gib: 16\n",
	})
	if err != nil {
		t.Fatalf("create host: %v", err)
	}

	// Backdate the heartbeat so it falls outside the staleness window.
	staleAt := time.Now().UTC().Add(-5 * time.Minute)
	if err := hosts.UpdateLastSeen(ctx, mac, staleAt); err != nil {
		t.Fatalf("stamp: %v", err)
	}

	rr := httptest.NewRecorder()
	ui.StartRun(rr, startRunReq(id))

	// Include the response body on failure, as the sibling tests do, so an
	// unexpected status can be diagnosed from the test output alone.
	if rr.Code != http.StatusConflict {
		t.Fatalf("status = %d, want 409, body=%q", rr.Code, rr.Body.String())
	}
}
// TestStartRun_AcceptsWhenHeartbeating covers a Start request for a host
// whose heartbeat was stamped just now, i.e. inside the staleness window.
// StartRun is expected to answer 303 See Other, and the host's latest run
// in the store must be in the Queued state.
func TestStartRun_AcceptsWhenHeartbeating(t *testing.T) {
	const mac = "aa:bb:cc:dd:ee:62"

	ui, hosts, runs := setupStartRun(t)
	ctx := context.Background()

	fixture := model.Host{
		Name:             "sr-live",
		MAC:              mac,
		WoLBroadcastIP:   "10.0.0.255",
		WoLPort:          9,
		ExpectedSpecYAML: "memory:\n total_gib: 16\n",
	}
	id, err := hosts.Create(ctx, fixture)
	if err != nil {
		t.Fatalf("create host: %v", err)
	}

	// Stamp a heartbeat at the current time so the host counts as online.
	if err := hosts.UpdateLastSeen(ctx, mac, time.Now().UTC()); err != nil {
		t.Fatalf("stamp: %v", err)
	}

	rec := httptest.NewRecorder()
	ui.StartRun(rec, startRunReq(id))
	if got := rec.Code; got != http.StatusSeeOther {
		t.Fatalf("status = %d, want 303, body=%q", got, rec.Body.String())
	}

	latest, err := runs.LatestForHost(ctx, id)
	if err != nil {
		t.Fatalf("latest: %v", err)
	}
	if latest != nil && latest.State == model.StateQueued {
		return
	}
	t.Fatalf("latest run = %+v, want Queued", latest)
}