Add host-mode heartbeat: vetting-agent host + last-seen badge
CI / Lint + build + test (push) Has been cancelled

vetting-agent gains a `host` subcommand that runs as a systemd service
installed by the quick-register one-liner, POSTing every 30s to
/api/v1/hosts/{mac}/heartbeat so the dashboard tile shows "online" or
"Nm ago" without waiting on WoL. Ships dormant client code for the
Phase 2 reboot_for_vetting command so the server can flip it on later
without a binary redeploy.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-17 23:34:15 -04:00
parent d24207427f
commit a0c0fb114f
28 changed files with 1106 additions and 165 deletions
+65
View File
@@ -0,0 +1,65 @@
package hostmode
import (
"bufio"
"context"
"log"
"os"
"os/exec"
"strings"
)
// setPXEBootNext asks the firmware to use a network-boot entry for the
// next boot only, by running `efibootmgr --bootnext <num>`.
//
// The operation is best-effort: every failure path (no /sys/firmware/efi,
// efibootmgr not on PATH, listing fails, no matching entry, --bootnext
// fails) is logged and then abandoned without reporting an error to the
// caller. ctx bounds both efibootmgr invocations.
func setPXEBootNext(ctx context.Context) {
	// /sys/firmware/efi is only stat-able when booted via UEFI.
	if _, statErr := os.Stat("/sys/firmware/efi"); statErr != nil {
		log.Printf("hostmode: not a UEFI system; skipping efibootmgr")
		return
	}

	tool, lookErr := exec.LookPath("efibootmgr")
	if lookErr != nil {
		log.Printf("hostmode: efibootmgr not installed; skipping")
		return
	}

	// List the boot entries so a network-boot candidate can be picked.
	listing, listErr := exec.CommandContext(ctx, tool, "-v").Output()
	if listErr != nil {
		log.Printf("hostmode: efibootmgr -v: %v", listErr)
		return
	}

	entry := findPXEBootNum(string(listing))
	if entry == "" {
		log.Printf("hostmode: no PXE boot entry found")
		return
	}

	setNext := exec.CommandContext(ctx, tool, "--bootnext", entry)
	if runErr := setNext.Run(); runErr != nil {
		log.Printf("hostmode: efibootmgr --bootnext %s: %v", entry, runErr)
		return
	}
	log.Printf("hostmode: efibootmgr --bootnext %s", entry)
}
// findPXEBootNum returns the four-hex-digit number of the first BootXXXX
// entry in out whose line looks like a network boot, or "" when there is
// none. Matching is a case-insensitive substring search for "pxe",
// "ipv4", "ipv6" or "network" anywhere on the line.
//
// efibootmgr -v output lines look like:
//
//	Boot0003* UEFI: IPv4 Intel I225-V PciRoot(0x0)/Pci(...)/MAC(...)
//	Boot0001* ubuntu HD(1,GPT,...)/File(\EFI\ubuntu\shimx64.efi)
//
// Lines that merely start with "Boot" (BootCurrent, BootOrder, BootNext,
// or anything else) are skipped unless the four characters after the
// prefix are hex digits, so the result is always usable as a --bootnext
// argument.
func findPXEBootNum(out string) string {
	keywords := []string{"pxe", "ipv4", "ipv6", "network"}

	scan := bufio.NewScanner(strings.NewReader(out))
	for scan.Scan() {
		line := scan.Text()
		if len(line) < 8 || !strings.HasPrefix(line, "Boot") {
			continue
		}
		num := line[4:8]
		if !isHexBootNum(num) {
			// Header line such as "BootOrder: ..." — not an entry.
			continue
		}
		low := strings.ToLower(line)
		for _, kw := range keywords {
			if strings.Contains(low, kw) {
				return num
			}
		}
	}
	return ""
}

// isHexBootNum reports whether s is non-empty and consists solely of
// hexadecimal digits (either case).
func isHexBootNum(s string) bool {
	if s == "" {
		return false
	}
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c >= '0' && c <= '9':
		case c >= 'a' && c <= 'f':
		case c >= 'A' && c <= 'F':
		default:
			return false
		}
	}
	return true
}
+55
View File
@@ -0,0 +1,55 @@
package hostmode
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"time"
)
// heartbeatResponse is what the orchestrator sends back.
// Phase 1 only populates Ok. Phase 2 adds Cmd + RunID.
type heartbeatResponse struct {
	// Ok is decoded from the "ok" JSON field.
	Ok bool `json:"ok"`
	// Cmd is an optional command name; handleResponse ignores the
	// response when it is empty.
	Cmd string `json:"cmd,omitempty"`
	// RunID accompanies Cmd; in this package it is only logged.
	RunID int64 `json:"run_id,omitempty"`
}
// hostClient is a thin HTTP client bound to one orchestrator base URL.
type hostClient struct {
	base string       // orchestrator base URL, prepended to every request path
	h    *http.Client // underlying HTTP client used for all requests
}

// newHostClient returns a hostClient for base whose HTTP client gives up
// on any single request after five seconds.
func newHostClient(base string) *hostClient {
	httpClient := &http.Client{Timeout: 5 * time.Second}
	return &hostClient{base: base, h: httpClient}
}
// heartbeat POSTs an empty JSON object to
// {base}/api/v1/hosts/{mac}/heartbeat and decodes the JSON reply.
//
// Any status other than 200 OK is returned as an error that carries the
// status code and up to the first 1024 bytes of the response body. A
// body that fails to decode as JSON yields a wrapped "decode" error.
func (c *hostClient) heartbeat(ctx context.Context, mac string) (*heartbeatResponse, error) {
	endpoint := fmt.Sprintf("%s/api/v1/hosts/%s/heartbeat", c.base, mac)
	payload := bytes.NewReader([]byte("{}"))

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, payload)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := c.h.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code != http.StatusOK {
		// The read error is deliberately dropped: the snippet is only
		// diagnostic context for the status error returned below.
		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
		return nil, fmt.Errorf("status %d: %s", code, string(snippet))
	}

	decoded := new(heartbeatResponse)
	if err := json.NewDecoder(resp.Body).Decode(decoded); err != nil {
		return nil, fmt.Errorf("decode: %w", err)
	}
	return decoded, nil
}
+41
View File
@@ -0,0 +1,41 @@
package hostmode
import (
"context"
"log"
"os/exec"
)
// cmdRebootForVetting is the Phase 2 command the orchestrator sends
// when the operator clicked "Start vetting" and the host is actively
// heartbeating — the agent redirects next boot to PXE and reboots
// itself, obviating WoL. It is the value handleResponse matches against
// the heartbeat response's Cmd field.
const cmdRebootForVetting = "reboot_for_vetting"
// handleResponse acts on the command, if any, carried by a heartbeat
// response. A nil response or an empty Cmd is a no-op (per the original
// author, Phase 1 servers omit the field). reboot_for_vetting is logged
// with its run ID and handed to rebootForVetting; any other command is
// logged and ignored.
func handleResponse(ctx context.Context, resp *heartbeatResponse) {
	if resp == nil {
		return
	}
	switch cmd := resp.Cmd; cmd {
	case "":
		// No command attached to this heartbeat.
	case cmdRebootForVetting:
		log.Printf("hostmode: orchestrator requested reboot_for_vetting (run=%d)", resp.RunID)
		rebootForVetting(ctx)
	default:
		log.Printf("hostmode: unknown cmd %q, ignoring", cmd)
	}
}
// rebootForVetting first makes a best-effort attempt to point the next
// boot at PXE (setPXEBootNext) and then runs `systemctl reboot`. A
// failure of the reboot command is logged, not returned. Per the
// original author, BIOS/legacy hosts typically PXE-boot via the DHCP
// chain anyway, which is why the efibootmgr step is allowed to fail.
func rebootForVetting(ctx context.Context) {
	setPXEBootNext(ctx)

	log.Printf("hostmode: executing systemctl reboot")
	reboot := exec.CommandContext(ctx, "systemctl", "reboot")
	err := reboot.Run()
	if err == nil {
		return
	}
	log.Printf("hostmode: systemctl reboot failed: %v", err)
}
+47
View File
@@ -0,0 +1,47 @@
package hostmode
import (
"bufio"
"errors"
"fmt"
"os"
"os/exec"
"strings"
)
// primaryMAC returns the lower-cased, whitespace-trimmed MAC address of
// the interface that carries the default IPv4 route, read from
// /sys/class/net/<iface>/address.
//
// Per the original author this mirrors quick.sh.tmpl's primary_iface so
// the agent reports the same MAC that was registered (important on
// Proxmox, where vmbr0 inherits its physical NIC's MAC).
func primaryMAC() (string, error) {
	iface, err := defaultRouteIface()
	if err != nil {
		return "", err
	}

	addrPath := fmt.Sprintf("/sys/class/net/%s/address", iface)
	contents, err := os.ReadFile(addrPath)
	if err != nil {
		return "", fmt.Errorf("read mac for %s: %w", iface, err)
	}

	mac := strings.TrimSpace(string(contents))
	return strings.ToLower(mac), nil
}
// defaultRouteIface returns the name of the interface behind the default
// IPv4 route: it runs `ip -o -4 route show default` and returns the word
// that follows the first "dev" token in the output.
//
// The original author chose to shell out to `ip` rather than parse
// /proc/net/route, on the stated assumption that the service runs as
// root on Linux where `ip` is available.
func defaultRouteIface() (string, error) {
	cmd := exec.Command("ip", "-o", "-4", "route", "show", "default")
	routes, err := cmd.Output()
	if err != nil {
		return "", fmt.Errorf("ip route: %w", err)
	}

	lines := bufio.NewScanner(strings.NewReader(string(routes)))
	for lines.Scan() {
		words := strings.Fields(lines.Text())
		// Stop one short of the end: a trailing "dev" has no name after it.
		for idx := 0; idx+1 < len(words); idx++ {
			if words[idx] == "dev" {
				return words[idx+1], nil
			}
		}
	}
	return "", errors.New("no default IPv4 route")
}
+101
View File
@@ -0,0 +1,101 @@
// Package hostmode implements the "persistent reporter" mode of
// vetting-agent. It runs as a systemd service on the host (not in
// the live image), heartbeats to the orchestrator every ~30s, and
// in Phase 2 accepts commands — most importantly reboot-for-vetting.
package hostmode
import (
"context"
"errors"
"fmt"
"log"
"os"
"strings"
"time"
"gopkg.in/yaml.v3"
)
// Config mirrors /etc/vetting/host-agent.yaml. All fields are
// optional except OrchestratorURL — the rest have reasonable
// defaults so a single `orchestrator_url:` line works.
type Config struct {
	// OrchestratorURL is the base URL heartbeats are sent to. LoadConfig
	// trims surrounding whitespace and trailing slashes and rejects an
	// empty value.
	OrchestratorURL string `yaml:"orchestrator_url"`
	// MAC identifies this host to the orchestrator. LoadConfig trims and
	// lower-cases it; when empty, Run resolves it via primaryMAC.
	MAC string `yaml:"mac,omitempty"`
	// Interval is the heartbeat period. It is never read from YAML;
	// LoadConfig derives it from IntervalRaw (30s when that is empty).
	Interval time.Duration `yaml:"-"`
	// IntervalRaw is the textual `interval:` value, parsed by LoadConfig
	// with time.ParseDuration; values under one second are rejected.
	IntervalRaw string `yaml:"interval,omitempty"`
}
// LoadConfig reads the YAML file at path and returns the normalized
// configuration.
//
// OrchestratorURL is stripped of surrounding whitespace and trailing
// slashes and must be non-empty. MAC is trimmed and lower-cased.
// Interval defaults to 30s when no `interval:` is given; otherwise the
// value must parse with time.ParseDuration and be at least one second.
// Read, parse and validation failures are all returned as errors.
func LoadConfig(path string) (*Config, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("read %s: %w", path, err)
	}

	cfg := new(Config)
	if err := yaml.Unmarshal(raw, cfg); err != nil {
		return nil, fmt.Errorf("parse %s: %w", path, err)
	}

	baseURL := strings.TrimSpace(cfg.OrchestratorURL)
	cfg.OrchestratorURL = strings.TrimRight(baseURL, "/")
	if cfg.OrchestratorURL == "" {
		return nil, errors.New("orchestrator_url is required")
	}
	cfg.MAC = strings.ToLower(strings.TrimSpace(cfg.MAC))

	// Start from the default and override only when an interval was given.
	cfg.Interval = 30 * time.Second
	if cfg.IntervalRaw != "" {
		parsed, err := time.ParseDuration(cfg.IntervalRaw)
		if err != nil {
			return nil, fmt.Errorf("parse interval: %w", err)
		}
		if parsed < time.Second {
			return nil, fmt.Errorf("interval %s is too aggressive", parsed)
		}
		cfg.Interval = parsed
	}
	return cfg, nil
}
// Run loads the configuration at cfgPath and then heartbeats until ctx
// is cancelled, at which point it returns ctx.Err().
//
// Only start-up problems are returned early: a config that fails to
// load, or a MAC that is neither configured nor resolvable from the
// default route. Heartbeat failures are handled inside tick and never
// end the loop. One heartbeat is sent immediately, before the first
// ticker interval elapses, so the dashboard updates on service start.
func Run(ctx context.Context, cfgPath string) error {
	cfg, err := LoadConfig(cfgPath)
	if err != nil {
		return err
	}

	// Fall back to the default-route interface when no MAC is configured.
	if cfg.MAC == "" {
		if cfg.MAC, err = primaryMAC(); err != nil {
			return fmt.Errorf("resolve primary MAC: %w", err)
		}
	}

	log.Printf("hostmode: reporting to %s as %s every %s",
		cfg.OrchestratorURL, cfg.MAC, cfg.Interval)

	client := newHostClient(cfg.OrchestratorURL)
	tick(ctx, client, cfg)

	ticker := time.NewTicker(cfg.Interval)
	defer ticker.Stop()

	done := ctx.Done()
	for {
		select {
		case <-done:
			return ctx.Err()
		case <-ticker.C:
			tick(ctx, client, cfg)
		}
	}
}
// tick performs one heartbeat for cfg.MAC. A failed heartbeat is logged
// and dropped; a successful one has its response passed to
// handleResponse.
func tick(ctx context.Context, c *hostClient, cfg *Config) {
	if resp, err := c.heartbeat(ctx, cfg.MAC); err != nil {
		log.Printf("hostmode: heartbeat: %v", err)
	} else {
		handleResponse(ctx, resp)
	}
}