Add host-mode heartbeat: vetting-agent host + last-seen badge
CI / Lint + build + test (push) Has been cancelled
CI / Lint + build + test (push) Has been cancelled
vetting-agent gains a `host` subcommand that runs as a systemd service
installed by the quick-register one-liner, POSTing every 30s to
/api/v1/hosts/{mac}/heartbeat so the dashboard tile shows "online" or
"Nm ago" without waiting on WoL. Ships dormant client code for the
Phase 2 reboot_for_vetting command so the server can flip it on later
without a binary redeploy.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,65 @@
|
||||
package hostmode
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// setPXEBootNext points the next boot at a PXE-capable BootOrder
|
||||
// entry via efibootmgr --bootnext. Best-effort: absent efibootmgr,
|
||||
// non-UEFI firmware, or zero PXE entries all fall through silently —
|
||||
// the operator's BIOS/DHCP chain will still PXE-boot on most hosts.
|
||||
func setPXEBootNext(ctx context.Context) {
|
||||
if _, err := os.Stat("/sys/firmware/efi"); err != nil {
|
||||
log.Printf("hostmode: not a UEFI system; skipping efibootmgr")
|
||||
return
|
||||
}
|
||||
bin, err := exec.LookPath("efibootmgr")
|
||||
if err != nil {
|
||||
log.Printf("hostmode: efibootmgr not installed; skipping")
|
||||
return
|
||||
}
|
||||
boots, err := exec.CommandContext(ctx, bin, "-v").Output()
|
||||
if err != nil {
|
||||
log.Printf("hostmode: efibootmgr -v: %v", err)
|
||||
return
|
||||
}
|
||||
num := findPXEBootNum(string(boots))
|
||||
if num == "" {
|
||||
log.Printf("hostmode: no PXE boot entry found")
|
||||
return
|
||||
}
|
||||
if err := exec.CommandContext(ctx, bin, "--bootnext", num).Run(); err != nil {
|
||||
log.Printf("hostmode: efibootmgr --bootnext %s: %v", num, err)
|
||||
return
|
||||
}
|
||||
log.Printf("hostmode: efibootmgr --bootnext %s", num)
|
||||
}
|
||||
|
||||
// findPXEBootNum returns the four-hex-digit number of the first BootXXXX
// entry in efibootmgr -v output whose line looks like a network boot,
// or "" when there is no such entry. efibootmgr -v lines look like:
//
//	Boot0003* UEFI: IPv4 Intel I225-V PciRoot(0x0)/Pci(...)/MAC(...)
//	Boot0001* ubuntu HD(1,GPT,...)/File(\EFI\ubuntu\shimx64.efi)
//
// A line counts as a network boot when, case-insensitively, it contains
// "pxe", "ipv4", "ipv6" or "network". Only lines of the form
// "Boot" + four hex digits are considered, so header lines such as
// "BootOrder: ..." or "BootNext: ..." can never be mistaken for an
// entry, whatever text they carry.
func findPXEBootNum(out string) string {
	scan := bufio.NewScanner(strings.NewReader(out))
	for scan.Scan() {
		line := scan.Text()
		if len(line) < 8 || !strings.HasPrefix(line, "Boot") {
			continue
		}
		num := line[4:8]
		if !isBootNum(num) {
			// "BootOrder", "BootCurrent", "BootNext", or unrelated text.
			continue
		}
		low := strings.ToLower(line)
		if strings.Contains(low, "pxe") ||
			strings.Contains(low, "ipv4") ||
			strings.Contains(low, "ipv6") ||
			strings.Contains(low, "network") {
			return num
		}
	}
	return ""
}

// isBootNum reports whether s consists solely of hexadecimal digits
// (either case). The empty string is rejected.
func isBootNum(s string) bool {
	if s == "" {
		return false
	}
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c >= '0' && c <= '9':
		case c >= 'a' && c <= 'f':
		case c >= 'A' && c <= 'F':
		default:
			return false
		}
	}
	return true
}
|
||||
@@ -0,0 +1,55 @@
|
||||
package hostmode
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
// heartbeatResponse is the JSON body the orchestrator returns for a
// heartbeat POST. Per the original design note, Phase 1 servers only
// populate Ok; Cmd and RunID are added in Phase 2.
type heartbeatResponse struct {
	// Ok is decoded from the "ok" key.
	Ok bool `json:"ok"`

	// Cmd is an optional command name decoded from "cmd"; it stays
	// empty when the key is absent.
	Cmd string `json:"cmd,omitempty"`

	// RunID is decoded from "run_id" and is zero when the key is absent.
	RunID int64 `json:"run_id,omitempty"`
}
|
||||
|
||||
// hostClient is the small HTTP client the host agent uses to talk to
// the orchestrator.
type hostClient struct {
	// base is the orchestrator URL prefix; request paths are appended
	// to it verbatim.
	base string

	// h is the underlying HTTP client used for every request.
	h *http.Client
}
|
||||
|
||||
func newHostClient(base string) *hostClient {
|
||||
return &hostClient{
|
||||
base: base,
|
||||
h: &http.Client{Timeout: 5 * time.Second},
|
||||
}
|
||||
}
|
||||
|
||||
func (c *hostClient) heartbeat(ctx context.Context, mac string) (*heartbeatResponse, error) {
|
||||
url := fmt.Sprintf("%s/api/v1/hosts/%s/heartbeat", c.base, mac)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, url,
|
||||
bytes.NewReader([]byte(`{}`)))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
resp, err := c.h.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
|
||||
return nil, fmt.Errorf("status %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
var out heartbeatResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
|
||||
return nil, fmt.Errorf("decode: %w", err)
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
@@ -0,0 +1,41 @@
|
||||
package hostmode
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"os/exec"
|
||||
)
|
||||
|
||||
// Command names the orchestrator may place in a heartbeat response.
const (
	// cmdRebootForVetting is the Phase 2 command sent when the operator
	// clicked "Start vetting" while the host is actively heartbeating:
	// the agent redirects the next boot to PXE and reboots itself, so
	// no Wake-on-LAN is needed.
	cmdRebootForVetting = "reboot_for_vetting"
)
|
||||
|
||||
// handleResponse dispatches on the heartbeat response. Phase 1 never
|
||||
// sees a non-empty Cmd (the server omits the field). Phase 2 adds
|
||||
// reboot_for_vetting handling.
|
||||
func handleResponse(ctx context.Context, resp *heartbeatResponse) {
|
||||
if resp == nil || resp.Cmd == "" {
|
||||
return
|
||||
}
|
||||
switch resp.Cmd {
|
||||
case cmdRebootForVetting:
|
||||
log.Printf("hostmode: orchestrator requested reboot_for_vetting (run=%d)", resp.RunID)
|
||||
rebootForVetting(ctx)
|
||||
default:
|
||||
log.Printf("hostmode: unknown cmd %q, ignoring", resp.Cmd)
|
||||
}
|
||||
}
|
||||
|
||||
// rebootForVetting redirects next boot to PXE (best-effort on UEFI
|
||||
// via efibootmgr) and triggers a clean reboot. BIOS/legacy hosts
|
||||
// typically PXE-boot via DHCP chain on every boot, so efibootmgr
|
||||
// missing is non-fatal.
|
||||
func rebootForVetting(ctx context.Context) {
|
||||
setPXEBootNext(ctx)
|
||||
log.Printf("hostmode: executing systemctl reboot")
|
||||
if err := exec.CommandContext(ctx, "systemctl", "reboot").Run(); err != nil {
|
||||
log.Printf("hostmode: systemctl reboot failed: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
package hostmode
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// primaryMAC resolves the MAC of the iface that carries the default
|
||||
// IPv4 route. Mirrors quick.sh.tmpl's primary_iface so the agent
|
||||
// reports the same MAC that was registered (important on Proxmox
|
||||
// where vmbr0 inherits its physical NIC's MAC).
|
||||
func primaryMAC() (string, error) {
|
||||
iface, err := defaultRouteIface()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
raw, err := os.ReadFile(fmt.Sprintf("/sys/class/net/%s/address", iface))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read mac for %s: %w", iface, err)
|
||||
}
|
||||
return strings.ToLower(strings.TrimSpace(string(raw))), nil
|
||||
}
|
||||
|
||||
// defaultRouteIface returns the interface name that follows the "dev"
// keyword on the first matching line of
// `ip -o -4 route show default`.
//
// The original author chose to shell out to `ip` rather than parse
// /proc/net/route, citing the hex-swap logic that would require, and
// relies on `ip` being present because the service runs as root on a
// Linux box.
//
// It fails when the command fails or when no line has a "dev" token
// followed by a name.
func defaultRouteIface() (string, error) {
	cmd := exec.Command("ip", "-o", "-4", "route", "show", "default")
	raw, err := cmd.Output()
	if err != nil {
		return "", fmt.Errorf("ip route: %w", err)
	}

	lines := bufio.NewScanner(strings.NewReader(string(raw)))
	for lines.Scan() {
		words := strings.Fields(lines.Text())
		// Stop one short of the end: "dev" only counts when a name
		// follows it on the same line.
		for i := 0; i+1 < len(words); i++ {
			if words[i] == "dev" {
				return words[i+1], nil
			}
		}
	}
	return "", errors.New("no default IPv4 route")
}
|
||||
@@ -0,0 +1,101 @@
|
||||
// Package hostmode implements the "persistent reporter" mode of
|
||||
// vetting-agent. It runs as a systemd service on the host (not in
|
||||
// the live image), heartbeats to the orchestrator every ~30s, and
|
||||
// in Phase 2 accepts commands — most importantly reboot-for-vetting.
|
||||
package hostmode
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// Config is the in-memory form of /etc/vetting/host-agent.yaml. Only
// OrchestratorURL is mandatory; LoadConfig supplies a default for the
// interval, so a file containing a single `orchestrator_url:` line is
// enough.
type Config struct {
	// OrchestratorURL is read from the "orchestrator_url" key.
	OrchestratorURL string `yaml:"orchestrator_url"`

	// MAC is read from the optional "mac" key.
	MAC string `yaml:"mac,omitempty"`

	// Interval is excluded from YAML; LoadConfig derives it from
	// IntervalRaw.
	Interval time.Duration `yaml:"-"`

	// IntervalRaw is the textual duration read from the optional
	// "interval" key.
	IntervalRaw string `yaml:"interval,omitempty"`
}
|
||||
|
||||
func LoadConfig(path string) (*Config, error) {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read %s: %w", path, err)
|
||||
}
|
||||
var c Config
|
||||
if err := yaml.Unmarshal(b, &c); err != nil {
|
||||
return nil, fmt.Errorf("parse %s: %w", path, err)
|
||||
}
|
||||
c.OrchestratorURL = strings.TrimRight(strings.TrimSpace(c.OrchestratorURL), "/")
|
||||
if c.OrchestratorURL == "" {
|
||||
return nil, errors.New("orchestrator_url is required")
|
||||
}
|
||||
c.MAC = strings.ToLower(strings.TrimSpace(c.MAC))
|
||||
if c.IntervalRaw == "" {
|
||||
c.Interval = 30 * time.Second
|
||||
} else {
|
||||
d, err := time.ParseDuration(c.IntervalRaw)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse interval: %w", err)
|
||||
}
|
||||
if d < time.Second {
|
||||
return nil, fmt.Errorf("interval %s is too aggressive", d)
|
||||
}
|
||||
c.Interval = d
|
||||
}
|
||||
return &c, nil
|
||||
}
|
||||
|
||||
// Run blocks until ctx is cancelled, heartbeating on an interval.
|
||||
// Errors never abort the loop — the service is `Restart=on-failure`
|
||||
// in systemd, and a transient HTTP failure is not a reason to exit.
|
||||
func Run(ctx context.Context, cfgPath string) error {
|
||||
cfg, err := LoadConfig(cfgPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if cfg.MAC == "" {
|
||||
mac, err := primaryMAC()
|
||||
if err != nil {
|
||||
return fmt.Errorf("resolve primary MAC: %w", err)
|
||||
}
|
||||
cfg.MAC = mac
|
||||
}
|
||||
log.Printf("hostmode: reporting to %s as %s every %s",
|
||||
cfg.OrchestratorURL, cfg.MAC, cfg.Interval)
|
||||
|
||||
client := newHostClient(cfg.OrchestratorURL)
|
||||
|
||||
// Fire one heartbeat immediately so the dashboard lights up on
|
||||
// service start, without waiting for the first tick.
|
||||
tick(ctx, client, cfg)
|
||||
|
||||
t := time.NewTicker(cfg.Interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-t.C:
|
||||
tick(ctx, client, cfg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func tick(ctx context.Context, c *hostClient, cfg *Config) {
|
||||
resp, err := c.heartbeat(ctx, cfg.MAC)
|
||||
if err != nil {
|
||||
log.Printf("hostmode: heartbeat: %v", err)
|
||||
return
|
||||
}
|
||||
handleResponse(ctx, resp)
|
||||
}
|
||||
Reference in New Issue
Block a user