Initial commit: full Phases 1-6 implementation
CI / Lint + build + test (push) Has been cancelled

Post-repair hardware validation pipeline for Proxmox cluster hosts.
Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq
PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
+45
View File
@@ -0,0 +1,45 @@
name: CI
on:
push:
branches: [main]
pull_request:
branches: [main]
permissions:
contents: read
jobs:
lint-and-test:
name: Lint + build + test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: "1.26.x"
cache: true
- name: Install templ
run: go install github.com/a-h/templ/cmd/templ@v0.3.1001
- name: Generate templ
run: templ generate
- name: Verify go.mod + go.sum are tidy
run: |
go mod tidy
git diff --exit-code go.mod go.sum
- name: Vet
run: go vet ./...
- name: Build (host)
run: |
go build ./...
GOOS=linux GOARCH=amd64 go build ./...
- name: Test
run: go test -race -count=1 ./...
+59
View File
@@ -0,0 +1,59 @@
name: E2E (manual)
# The E2E job builds the live image (mkosi, requires apt package
# updates) and boots a QEMU VM against a running orchestrator. It's
# slow and needs a Linux runner with nested virtualization, so it runs
# only on workflow_dispatch.
on:
workflow_dispatch:
inputs:
ref:
description: "Git ref to test (default: main)"
required: false
default: main
permissions:
contents: read
jobs:
e2e:
runs-on: ubuntu-latest
timeout-minutes: 45
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: "1.26.x"
cache: true
- name: Install live-image build dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
mkosi debootstrap squashfs-tools qemu-system-x86 qemu-utils \
dnsmasq iperf3 ipxe-qemu
- name: Install templ
run: go install github.com/a-h/templ/cmd/templ@v0.3.1001
- name: Build orchestrator + agent
run: |
templ generate
make orchestrator-linux agent-linux
- name: Build live image
run: make live-image
- name: Run E2E suite
# The E2E test expects a registered host + queued run; in CI we
# don't have an operator, so it's skipped unless VETTING_E2E_RUN_ID
# is supplied. When someone stands up the orchestrator for a
# dispatch, they can set it as a repository variable (vars.VETTING_E2E_RUN_ID).
env:
VETTING_E2E_RUN_ID: ${{ vars.VETTING_E2E_RUN_ID }}
run: sudo -E go test -tags=e2e -count=1 -v ./test/e2e/...
+17
View File
@@ -0,0 +1,17 @@
/bin/
/out/
/dist/
/tmp/
/var/
/data/
*.db
*.db-shm
*.db-wal
*.exe
*.log
vetting.yaml
!deploy/vetting.example.yaml
live-image/out/
.vscode/
.idea/
.claude/
+18
View File
@@ -0,0 +1,18 @@
run:
timeout: 3m
linters:
enable:
- govet
- errcheck
- staticcheck
- ineffassign
- unused
- gofmt
- goimports
- misspell
- revive
issues:
exclude-dirs:
- internal/web/templates
+79
View File
@@ -0,0 +1,79 @@
.DEFAULT_GOAL := help
UNAME_S := $(shell uname -s 2>/dev/null || echo Windows)
GOOS_LINUX := GOOS=linux GOARCH=amd64
GIT_SHA := $(shell git rev-parse --short HEAD 2>/dev/null || echo dev)
LDFLAGS := -s -w -X vetting/internal/version.GitSHA=$(GIT_SHA)
.PHONY: help
help: ## Show targets
@awk 'BEGIN {FS = ":.*##"} /^[a-zA-Z_-]+:.*##/ {printf " %-20s %s\n", $$1, $$2}' $(MAKEFILE_LIST)
.PHONY: templ
templ: ## Generate templ .go files
templ generate
.PHONY: orchestrator
orchestrator: templ ## Build orchestrator for host OS
go build -ldflags="$(LDFLAGS)" -o bin/vetting$(if $(filter Windows%,$(UNAME_S)),.exe,) ./cmd/vetting
.PHONY: orchestrator-linux
orchestrator-linux: templ ## Cross-build orchestrator for linux-amd64
$(GOOS_LINUX) go build -ldflags="$(LDFLAGS)" -o bin/vetting-linux-amd64 ./cmd/vetting
.PHONY: agent
agent: ## Build agent for host OS (handy for unit testing only — real agent runs in the live image)
go build -ldflags="$(LDFLAGS)" -o bin/vetting-agent$(if $(filter Windows%,$(UNAME_S)),.exe,) ./cmd/vetting-agent
.PHONY: agent-linux
agent-linux: ## Cross-build agent for linux-amd64 (consumed by live-image build)
$(GOOS_LINUX) go build -ldflags="$(LDFLAGS)" -o bin/vetting-agent.linux-amd64 ./cmd/vetting-agent
.PHONY: gen-admin-password
gen-admin-password: ## Build the bcrypt password generator
go build -o bin/gen-admin-password$(if $(filter Windows%,$(UNAME_S)),.exe,) ./tools/gen-admin-password
.PHONY: tidy
tidy: ## go mod tidy
go mod tidy
.PHONY: fmt
fmt: ## go fmt
go fmt ./...
.PHONY: vet
vet: ## go vet
go vet ./...
.PHONY: test
test: templ ## Run tests
go test ./...
.PHONY: test-race
test-race: templ ## Run tests with the race detector
go test -race -count=1 ./...
# -E preserves the caller's environment across sudo so VETTING_E2E_RUN_ID
# reaches the test (the CI workflow invokes it the same way); -count=1
# disables the test cache so each invocation re-runs the suite.
.PHONY: e2e
e2e: ## Run the QEMU PXE E2E test (Linux, root, live image required)
	sudo -E go test -tags=e2e -count=1 -v ./test/e2e/...
.PHONY: live-image
live-image: agent-linux ## Build reproducible live image (requires Linux/WSL + mkosi)
ifneq ($(findstring Windows,$(UNAME_S))$(findstring MINGW,$(UNAME_S))$(findstring MSYS,$(UNAME_S)),)
@echo "ERROR: live-image must be built under Linux (use WSL: wsl make live-image)." && exit 1
endif
$(MAKE) -C live-image all
.PHONY: all
all: orchestrator agent gen-admin-password ## Build everything buildable on host OS
.PHONY: run
run: orchestrator ## Build and run orchestrator with example config
./bin/vetting$(if $(filter Windows%,$(UNAME_S)),.exe,) --config deploy/vetting.example.yaml
.PHONY: install
install: orchestrator-linux ## Run deploy/install.sh (must be run on the target LXC as root)
sudo ./deploy/install.sh --binary ./bin/vetting-linux-amd64
.PHONY: clean
clean: ## Remove build artifacts
rm -rf bin out dist tmp
+85
View File
@@ -0,0 +1,85 @@
# Vetting
Post-repair hardware validation pipeline for Proxmox cluster hosts.
Register a host, click **Start Vetting**, and the orchestrator will
PXE-boot it into a custom Linux live image and run it through a
consistent battery of tests (CPU stress, RAM stress, SMART, disk I/O,
network throughput, GPU, PSU telemetry). Pass → auto-shutdown + HTML
report. Fail → pipeline halts, SSH drops in, notification fires.
Built for solo-operator home labs: one Go binary, SQLite + flat files,
HTMX + SSE UI, bundled dnsmasq, optional ntfy / Discord / SMTP
notifications.
## Documentation
- [docs/operations.md](docs/operations.md) — install + first run +
troubleshooting
- [docs/architecture.md](docs/architecture.md) — packages, state
machine, protocol
- [docs/test-suite.md](docs/test-suite.md) — what each stage measures
## Quick start (local, against QEMU)
```bash
# 1. Build
make all
# 2. Generate an admin password hash and paste it into the config.
./bin/gen-admin-password 'your-password'
# Edit deploy/vetting.example.yaml:
# auth.admin_password_bcrypt = <that hash>
# auth.session_secret_hex = $(openssl rand -hex 32)
# 3. Run
./bin/vetting --config deploy/vetting.example.yaml
# → http://localhost:8080
```
For a full end-to-end QEMU walk-through (bridge setup, host registration,
PXE boot), see [docs/operations.md § First vetting run](docs/operations.md#first-vetting-run).
## Production install (Proxmox LXC)
```bash
make orchestrator-linux
scp -r bin deploy lxc:/opt/vetting/
ssh lxc "cd /opt/vetting && sudo ./deploy/install.sh"
# Edit /etc/vetting/vetting.yaml, then:
ssh lxc "sudo systemctl enable --now vetting"
```
See [docs/operations.md § Install](docs/operations.md#install-proxmox-lxc)
for the full walkthrough.
## Repository layout
```
cmd/ orchestrator + agent entrypoints
internal/ core packages (see docs/architecture.md for the map)
agent/ in-image agent logic (claim loop, stage dispatch, probes)
live-image/ mkosi config for the PXE-bootable Debian live image
deploy/ systemd unit + install.sh + example config
docs/ operator + developer docs
test/e2e/ build-tag-gated QEMU + PXE full-stack test
tools/ small CLI helpers (e.g. gen-admin-password)
```
## Development
- `make test` — Go unit + smoke tests (cross-platform)
- `make vet` — `go vet` on the whole module
- `make live-image` — Linux-only; run under WSL from Windows
- `make e2e` — requires Linux root + live image + running orchestrator
- `make run` — build + launch the orchestrator with the example config
Windows hosts: everything except `live-image` and `e2e` works natively.
The live image build calls `mkosi` which needs a real Linux userspace,
so use WSL for those targets.
## Status
All six phases in the original plan are implemented. The E2E QEMU
harness is wired in `test/e2e/qemu_test.go` but requires a running
orchestrator + registered host + queued run as preconditions — it's a
developer-facing integration harness, not a unit test.
+64
View File
@@ -0,0 +1,64 @@
// Package bootstate parses kernel cmdline parameters that the
// orchestrator baked into the iPXE script. The agent consumes these
// on startup to learn which run it belongs to and how to reach back.
package bootstate
import (
"errors"
"fmt"
"os"
"strconv"
"strings"
)
// Params holds the vetting.* values extracted from the kernel command
// line by ParseCmdlineString.
type Params struct {
	// OrchestratorURL is the value of vetting.orchestrator.
	OrchestratorURL string
	// RunID is the value of vetting.run_id, parsed as a base-10 int64.
	RunID int64
	// MAC is the value of vetting.mac, lowercased by the parser.
	MAC string
	// Token is the value of vetting.token.
	Token string
	// TLSCertFPR is the value of vetting.cert_fpr; it is optional.
	TLSCertFPR string
}
// ParseCmdline loads a kernel command line from path and hands its
// contents to ParseCmdlineString. An empty path selects /proc/cmdline;
// tests can point it at a fixture file instead. A read failure is
// returned wrapped with the path that was attempted.
func ParseCmdline(path string) (*Params, error) {
	src := path
	if src == "" {
		src = "/proc/cmdline"
	}
	raw, err := os.ReadFile(src)
	if err != nil {
		return nil, fmt.Errorf("read %s: %w", src, err)
	}
	return ParseCmdlineString(string(raw))
}
// ParseCmdlineString extracts the vetting.* parameters from a kernel
// command line. The line is split on whitespace; tokens without "=" and
// keys other than the vetting.* ones below are ignored.
//
// vetting.orchestrator, vetting.run_id, vetting.mac and vetting.token
// are required; vetting.cert_fpr is optional. The MAC is lowercased.
//
// It returns an error when vetting.run_id is not a positive base-10
// integer, or when any required parameter is absent or empty; the
// error names every missing parameter.
func ParseCmdlineString(s string) (*Params, error) {
	var p Params
	// strings.Fields already discards leading/trailing whitespace.
	for _, f := range strings.Fields(s) {
		k, v, ok := strings.Cut(f, "=")
		if !ok {
			continue
		}
		switch k {
		case "vetting.orchestrator":
			p.OrchestratorURL = v
		case "vetting.run_id":
			id, err := strconv.ParseInt(v, 10, 64)
			if err != nil {
				return nil, fmt.Errorf("vetting.run_id=%q: %w", v, err)
			}
			// Zero doubles as the "not set" marker below, and a negative
			// ID can never name a run, so reject both explicitly instead
			// of reporting a confusing "missing" error (or accepting it).
			if id <= 0 {
				return nil, fmt.Errorf("vetting.run_id=%q: must be a positive integer", v)
			}
			p.RunID = id
		case "vetting.mac":
			p.MAC = strings.ToLower(v)
		case "vetting.token":
			p.Token = v
		case "vetting.cert_fpr":
			p.TLSCertFPR = v
		}
	}

	var missing []string
	if p.OrchestratorURL == "" {
		missing = append(missing, "vetting.orchestrator")
	}
	if p.RunID == 0 {
		missing = append(missing, "vetting.run_id")
	}
	if p.MAC == "" {
		missing = append(missing, "vetting.mac")
	}
	if p.Token == "" {
		missing = append(missing, "vetting.token")
	}
	if len(missing) > 0 {
		return nil, errors.New("cmdline missing required parameter(s): " + strings.Join(missing, ", "))
	}
	return &p, nil
}
+35
View File
@@ -0,0 +1,35 @@
package bootstate
import (
"testing"
)
// TestParseCmdlineGoldenPath feeds a realistic command line that mixes
// vetting.* parameters with unrelated kernel options and checks every
// parsed field.
func TestParseCmdlineGoldenPath(t *testing.T) {
	const cmdline = `BOOT_IMAGE=vmlinuz initrd=initrd.img vetting.orchestrator=http://10.0.0.5:8080 vetting.run_id=42 vetting.mac=aa:bb:cc:dd:ee:ff vetting.token=deadbeefcafe vetting.cert_fpr=abc123 console=ttyS0,115200n8 quiet`
	got, err := ParseCmdlineString(cmdline)
	if err != nil {
		t.Fatalf("ParseCmdlineString: %v", err)
	}
	want := Params{
		OrchestratorURL: "http://10.0.0.5:8080",
		RunID:           42,
		MAC:             "aa:bb:cc:dd:ee:ff",
		Token:           "deadbeefcafe",
		TLSCertFPR:      "abc123",
	}
	if *got != want {
		t.Fatalf("parsed wrong: %+v", got)
	}
}
// TestParseCmdlineMissingRequired checks that a command line without
// vetting.run_id is rejected.
func TestParseCmdlineMissingRequired(t *testing.T) {
	const cmdline = `vetting.orchestrator=http://x vetting.mac=aa:bb:cc:dd:ee:ff vetting.token=t`
	_, err := ParseCmdlineString(cmdline)
	if err == nil {
		t.Fatalf("expected error when vetting.run_id missing")
	}
}
// TestParseCmdlineLowercasesMAC checks that an upper-case vetting.mac
// value comes back lowercased.
func TestParseCmdlineLowercasesMAC(t *testing.T) {
	const cmdline = `vetting.orchestrator=http://x vetting.run_id=1 vetting.mac=AA:BB:CC:DD:EE:FF vetting.token=t`
	got, err := ParseCmdlineString(cmdline)
	if err != nil {
		t.Fatalf("ParseCmdlineString: %v", err)
	}
	if want := "aa:bb:cc:dd:ee:ff"; got.MAC != want {
		t.Fatalf("MAC not lowercased: %q", got.MAC)
	}
}
+181
View File
@@ -0,0 +1,181 @@
package agent
import (
"bytes"
"context"
"crypto/sha256"
"crypto/tls"
"crypto/x509"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"
)
// Client talks to the orchestrator's /api/v1/runs/:id/* endpoints.
type Client struct {
	BaseURL    string // orchestrator base URL with trailing slashes removed
	RunID      int64  // run whose endpoints are addressed
	Token      string // sent as "Authorization: Bearer <token>"
	TLSCertFPR string // optional sha256 hex fingerprint
	HTTP       *http.Client
}

// NewClient builds a Client for one run.
//
// baseURL has any trailing "/" removed. When tlsCertFPR is non-empty it
// is treated as the hex SHA-256 of a certificate's DER encoding (colons,
// case and surrounding whitespace are ignored) and normal chain
// verification is replaced by pinning: a connection is accepted only if
// at least one certificate presented by the peer has that fingerprint.
// This lets the orchestrator use a self-signed certificate inside the
// LAN. With an empty tlsCertFPR the default verification applies.
//
// The HTTP client uses a 30-second overall timeout.
func NewClient(baseURL string, runID int64, token, tlsCertFPR string) *Client {
	tlsCfg := &tls.Config{MinVersion: tls.VersionTLS12}
	if tlsCertFPR != "" {
		want := strings.ToLower(strings.ReplaceAll(strings.TrimSpace(tlsCertFPR), ":", ""))
		// InsecureSkipVerify only disables the built-in chain check; the
		// pin below is the actual trust decision.
		tlsCfg.InsecureSkipVerify = true
		// VerifyConnection (rather than VerifyPeerCertificate) is used
		// because it also runs on resumed sessions, so the pin can never
		// be bypassed by session resumption.
		tlsCfg.VerifyConnection = func(cs tls.ConnectionState) error {
			if certMatchesPin(cs.PeerCertificates, want) {
				return nil
			}
			return fmt.Errorf("agent: no presented cert matched pinned fingerprint")
		}
	}
	return &Client{
		BaseURL:    strings.TrimRight(baseURL, "/"),
		RunID:      runID,
		Token:      token,
		TLSCertFPR: tlsCertFPR,
		HTTP: &http.Client{
			Timeout:   30 * time.Second,
			Transport: &http.Transport{TLSClientConfig: tlsCfg},
		},
	}
}

// certMatchesPin reports whether any certificate in certs has a DER
// SHA-256 whose lowercase hex form equals want.
func certMatchesPin(certs []*x509.Certificate, want string) bool {
	for _, cert := range certs {
		sum := sha256.Sum256(cert.Raw)
		if hex.EncodeToString(sum[:]) == want {
			return true
		}
	}
	return false
}
// Hello POSTs an empty request to the run's /hello endpoint and
// ignores any response body.
func (c *Client) Hello(ctx context.Context) error {
	err := c.postJSON(ctx, "/hello", nil, nil)
	return err
}
// Claim POSTs {"agent_ip": agentIP} to the run's /claim endpoint and
// returns the decoded response, or nil and the error on failure.
func (c *Client) Claim(ctx context.Context, agentIP string) (*ClaimResponse, error) {
	req := map[string]any{"agent_ip": agentIP}
	resp := new(ClaimResponse)
	if err := c.postJSON(ctx, "/claim", req, resp); err != nil {
		return nil, err
	}
	return resp, nil
}
// Heartbeat POSTs an empty request to the run's /heartbeat endpoint
// and returns the decoded response, or nil and the error on failure.
func (c *Client) Heartbeat(ctx context.Context) (*HeartbeatResponse, error) {
	resp := new(HeartbeatResponse)
	if err := c.postJSON(ctx, "/heartbeat", nil, resp); err != nil {
		return nil, err
	}
	return resp, nil
}
// Log POSTs {"lines": lines} to the run's /log endpoint.
func (c *Client) Log(ctx context.Context, lines []LogLine) error {
	payload := map[string]any{"lines": lines}
	return c.postJSON(ctx, "/log", payload, nil)
}
// Result POSTs result (JSON-encoded as given) to the run's /result
// endpoint and returns the decoded response, or nil and the error on
// failure.
func (c *Client) Result(ctx context.Context, result any) (*ResultResponse, error) {
	resp := new(ResultResponse)
	if err := c.postJSON(ctx, "/result", result, resp); err != nil {
		return nil, err
	}
	return resp, nil
}
// Hold POSTs {"agent_ip": agentIP} to the run's /hold endpoint and
// returns the decoded response, or nil and the error on failure.
func (c *Client) Hold(ctx context.Context, agentIP string) (*HoldResponse, error) {
	req := map[string]any{"agent_ip": agentIP}
	resp := new(HoldResponse)
	if err := c.postJSON(ctx, "/hold", req, resp); err != nil {
		return nil, err
	}
	return resp, nil
}
// Sensor POSTs {"samples": samples} to the run's /sensor endpoint. The
// batch carries numeric samples (thermal readings, fio IOPS, iperf
// throughput, PSU voltages); no size check is made here, so an empty
// batch is sent as-is.
func (c *Client) Sensor(ctx context.Context, samples []SensorSample) error {
	payload := map[string]any{"samples": samples}
	return c.postJSON(ctx, "/sensor", payload, nil)
}
// SensorSample is the JSON shape of one entry in a /sensor batch. Per
// the original note, the server stores each sample as a row in its
// measurements table.
type SensorSample struct {
	TS    string  `json:"ts,omitempty"` // timestamp; omitted from JSON when empty
	Kind  string  `json:"kind"`
	Key   string  `json:"key"`
	Value float64 `json:"value"`
	Unit  string  `json:"unit,omitempty"` // omitted from JSON when empty
}
// ClaimResponse is the decoded body of a /claim reply.
type ClaimResponse struct {
	OK            bool                    `json:"ok"`
	RunID         int64                   `json:"run_id"`
	Stages        []string                `json:"stages"`
	ExpectedDisks []ClaimExpectedDiskSpec `json:"expected_disks"`
	IperfPort     int                     `json:"iperf_port"`
}
// ClaimExpectedDiskSpec is one entry of ClaimResponse.ExpectedDisks: a
// disk serial paired with a size in GB.
type ClaimExpectedDiskSpec struct {
	Serial string `json:"serial"`
	SizeGB int    `json:"size_gb"`
}
// HeartbeatResponse is the decoded body of a /heartbeat reply.
// OverrideFlags is kept as raw JSON so the consumer can decode it
// later into whatever shape it expects.
type HeartbeatResponse struct {
	Cmd           string          `json:"cmd"`
	State         string          `json:"state"`
	Stage         string          `json:"stage,omitempty"`
	OverrideFlags json.RawMessage `json:"override_flags,omitempty"`
}
// LogLine is one entry in the "lines" array posted to /log. TS and
// Level are left out of the JSON when empty.
type LogLine struct {
	TS    string `json:"ts,omitempty"`
	Level string `json:"level,omitempty"`
	Text  string `json:"text"`
}
// ResultResponse is the decoded body of a /result reply. NextState is
// the run state the orchestrator reports after recording the result.
type ResultResponse struct {
	OK        bool   `json:"ok"`
	NextState string `json:"next_state"`
}
// HoldResponse is the decoded body of a /hold reply. AuthorizedKey is
// the line the agent appends to root's authorized_keys file.
type HoldResponse struct {
	AuthorizedKey string `json:"authorized_key"`
	RunID         int64  `json:"run_id"`
}
// postJSON sends a POST to <BaseURL>/api/v1/runs/<RunID><path> with the
// bearer token attached.
//
// When in is non-nil it is JSON-encoded as the request body and
// Content-Type is set to application/json; otherwise the request has no
// body. When out is non-nil the response body is JSON-decoded into it.
//
// Any status >= 300 is returned as an error containing the method,
// path, status code and (a bounded prefix of) the response body.
func (c *Client) postJSON(ctx context.Context, path string, in, out any) error {
	const (
		maxErrBody = 8 << 10  // cap on how much of an error body is echoed into the error
		maxDrain   = 64 << 10 // cap on leftover bytes read to keep the connection reusable
	)

	var body io.Reader
	if in != nil {
		buf, err := json.Marshal(in)
		if err != nil {
			return fmt.Errorf("encode %s request: %w", path, err)
		}
		body = bytes.NewReader(buf)
	}
	endpoint := fmt.Sprintf("%s/api/v1/runs/%d%s", c.BaseURL, c.RunID, path)
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, body)
	if err != nil {
		return err
	}
	req.Header.Set("Authorization", "Bearer "+c.Token)
	if in != nil {
		req.Header.Set("Content-Type", "application/json")
	}
	resp, err := c.HTTP.Do(req)
	if err != nil {
		return err
	}
	defer func() {
		// The transport can only reuse a keep-alive connection when the
		// body was read to EOF before Close; drain a bounded remainder.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxDrain))
		_ = resp.Body.Close()
	}()
	if resp.StatusCode >= 300 {
		// Best-effort: a read failure just yields a shorter message.
		b, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrBody))
		return fmt.Errorf("%s %s: %d %s", req.Method, path, resp.StatusCode, strings.TrimSpace(string(b)))
	}
	if out != nil {
		if err := json.NewDecoder(resp.Body).Decode(out); err != nil {
			return fmt.Errorf("%s %s: decode response: %w", req.Method, path, err)
		}
	}
	return nil
}
+264
View File
@@ -0,0 +1,264 @@
// Package probes collects hardware facts from a booted Linux system.
// Phase 3 only needs enough to feed the spec diff: CPU model/cores,
// total RAM, per-disk serial+size, per-NIC MAC+speed, per-GPU model.
//
// Every probe is tolerant of missing files or tools — if /sys isn't
// available the field is just left empty. The orchestrator's diff
// engine will surface missing expected fields as failures; missing
// fields that weren't expected stay silent.
package probes
import (
"bufio"
"fmt"
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"vetting/internal/spec"
)
// Collect runs the CPU, memory, disk, NIC and GPU probes (in that
// order) and returns the assembled inventory. Each probe is
// best-effort and reports a failure as an empty result rather than an
// error, so the returned error is currently always nil.
func Collect() (*spec.Inventory, error) {
	inv := &spec.Inventory{
		CPU:    probeCPU(),
		Memory: probeMemory(),
		Disks:  probeDisks(),
		NICs:   probeNICs(),
		GPUs:   probeGPUs(),
	}
	return inv, nil
}
// ----- CPU --------------------------------------------------------------
// probeCPU reports the logical core count from runtime.NumCPU and the
// CPU model from the first "model name" line of /proc/cpuinfo that
// contains a colon. If /proc/cpuinfo cannot be opened, or has no such
// line, Model stays empty.
func probeCPU() spec.CPUSpec {
	cpu := spec.CPUSpec{LogicalCores: runtime.NumCPU()}
	fh, err := os.Open("/proc/cpuinfo")
	if err != nil {
		return cpu
	}
	defer func() { _ = fh.Close() }()

	for sc := bufio.NewScanner(fh); sc.Scan(); {
		line := sc.Text()
		if !strings.HasPrefix(line, "model name") {
			continue
		}
		_, val, found := strings.Cut(line, ":")
		if !found {
			continue
		}
		cpu.Model = strings.TrimSpace(val)
		break
	}
	return cpu
}
// ----- Memory -----------------------------------------------------------
// probeMemory reads the MemTotal line of /proc/meminfo (a value in kB)
// and reports it in whole GiB, truncating any fraction. It returns the
// zero MemorySpec when the file cannot be opened or holds no parseable
// MemTotal line.
func probeMemory() spec.MemorySpec {
	const kibPerGiB = 1024 * 1024

	fh, err := os.Open("/proc/meminfo")
	if err != nil {
		return spec.MemorySpec{}
	}
	defer func() { _ = fh.Close() }()

	for sc := bufio.NewScanner(fh); sc.Scan(); {
		cols := strings.Fields(sc.Text())
		if len(cols) < 2 || cols[0] != "MemTotal:" {
			continue
		}
		kb, convErr := strconv.ParseInt(cols[1], 10, 64)
		if convErr != nil {
			// Keep scanning in case a later line parses.
			continue
		}
		return spec.MemorySpec{TotalGiB: int(kb / kibPerGiB)}
	}
	return spec.MemorySpec{}
}
// ----- Disks ------------------------------------------------------------
// probeDisks lists /sys/class/block and returns one DiskSpec for every
// entry that isRealDisk accepts, filled from diskSizeGB and diskSerial.
// An entry is dropped only when it yields neither a size nor a serial;
// a disk with a serial but a zero size (or the reverse) is still
// reported. It returns nil when the directory cannot be read.
func probeDisks() []spec.DiskSpec {
	const root = "/sys/class/block"
	devs, err := os.ReadDir(root)
	if err != nil {
		return nil
	}
	var disks []spec.DiskSpec
	for _, dev := range devs {
		name := dev.Name()
		if !isRealDisk(name) {
			continue
		}
		sizeGB := diskSizeGB(filepath.Join(root, name))
		serial := diskSerial(name)
		if sizeGB != 0 || serial != "" {
			disks = append(disks, spec.DiskSpec{Serial: serial, SizeGB: sizeGB})
		}
	}
	return disks
}
// isRealDisk reports whether a /sys/class/block entry should be
// treated as a whole disk. Names starting with loop, ram, zram or dm-
// are rejected, as is any entry that has a "partition" attribute file
// under /sys/class/block/<name>.
func isRealDisk(name string) bool {
	for _, virtual := range []string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, virtual) {
			return false
		}
	}
	// Only partitions carry a "partition" attribute, so a failed stat
	// means this entry is not one.
	_, err := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return err != nil
}
// diskSizeGB reads <base>/size, a count of 512-byte sectors, and
// converts it to decimal gigabytes (10^9 bytes), truncating. It
// returns 0 when the file is missing or not an integer.
func diskSizeGB(base string) int {
	const (
		sectorBytes = 512 // sysfs always counts 512-byte sectors
		bytesPerGB  = 1_000_000_000
	)
	raw, err := os.ReadFile(filepath.Join(base, "size"))
	if err != nil {
		return 0
	}
	text := strings.TrimSpace(string(raw))
	sectors, err := strconv.ParseInt(text, 10, 64)
	if err != nil {
		return 0
	}
	return int(sectors * sectorBytes / bytesPerGB)
}
// diskSerial returns the serial number of block device name (e.g.
// "sda", "nvme0n1"), or "" when none can be found.
//
// Sources are tried in order and the first non-empty value wins:
//  1. /sys/block/<name>/device/serial (plain text)
//  2. /sys/block/<name>/device/vpd_pg80 (binary SCSI VPD page 0x80)
//  3. /sys/block/<name>/serial (plain text)
//  4. ID_SERIAL_SHORT from `udevadm info --query=property` (best effort;
//     a missing or failing udevadm yields "")
func diskSerial(name string) string {
	dev := filepath.Join("/sys/block", name)
	sources := []struct {
		path string
		vpd  bool // file is a raw VPD page rather than text
	}{
		{path: filepath.Join(dev, "device", "serial")},
		{path: filepath.Join(dev, "device", "vpd_pg80"), vpd: true},
		{path: filepath.Join(dev, "serial")},
	}
	for _, src := range sources {
		raw, err := os.ReadFile(src.path)
		if err != nil {
			continue
		}
		var serial string
		if src.vpd {
			// Reading the page as text would leave its binary header
			// bytes glued to the front of the serial.
			serial = vpdUnitSerial(raw)
		} else {
			serial = strings.TrimSpace(string(raw))
		}
		if serial != "" {
			return serial
		}
	}

	// Fallback: udevadm often knows the serial. Best-effort.
	out, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output()
	if err != nil {
		return ""
	}
	for _, line := range strings.Split(string(out), "\n") {
		if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok {
			return strings.TrimSpace(v)
		}
	}
	return ""
}

// vpdUnitSerial extracts the serial from a raw SCSI "Unit Serial
// Number" VPD page (0x80): a 4-byte header (device type, page code,
// 16-bit big-endian payload length) followed by the ASCII serial.
// Padding spaces and NULs are trimmed. It returns "" when the buffer is
// shorter than the header or is not page 0x80; a length field larger
// than the available data is clamped to what is present.
func vpdUnitSerial(page []byte) string {
	const headerLen = 4
	if len(page) < headerLen || page[1] != 0x80 {
		return ""
	}
	payload := page[headerLen:]
	if n := int(page[2])<<8 | int(page[3]); n < len(payload) {
		payload = payload[:n]
	}
	return strings.Trim(string(payload), "\x00 \t\r\n")
}
// ----- NICs -------------------------------------------------------------
// probeNICs lists /sys/class/net and returns one NICSpec per interface
// other than "lo" whose address file holds a non-empty, non-zero MAC.
// The speed file (Mbps, or -1 when the link is down) is converted to
// whole Gbps with integer division, so a link slower than 1000 Mbps, a
// down link or an unreadable speed all report 0. It returns nil when
// the directory cannot be read.
func probeNICs() []spec.NICSpec {
	const root = "/sys/class/net"
	ifaces, err := os.ReadDir(root)
	if err != nil {
		return nil
	}
	var nics []spec.NICSpec
	for _, iface := range ifaces {
		if iface.Name() == "lo" {
			continue
		}
		dir := filepath.Join(root, iface.Name())
		mac := readLine(filepath.Join(dir, "address"))
		switch mac {
		case "", "00:00:00:00:00:00":
			continue
		}
		gbps := 0
		if raw, err := os.ReadFile(filepath.Join(dir, "speed")); err == nil {
			mbps, convErr := strconv.Atoi(strings.TrimSpace(string(raw)))
			if convErr == nil && mbps > 0 {
				gbps = mbps / 1000
			}
		}
		nics = append(nics, spec.NICSpec{MAC: strings.ToLower(mac), SpeedGbps: gbps})
	}
	return nics
}
// ----- GPUs -------------------------------------------------------------
// probeGPUs runs `lspci -mm -nnk` and returns one GPUSpec per output
// line that mentions "VGA compatible controller" or "3D controller"
// (case-insensitive). The model string joins the third and fourth
// fields returned by splitQuoted; lines with fewer than four fields
// are ignored. When lspci is missing or fails it returns nil, so no
// GPU expectation can match.
func probeGPUs() []spec.GPUSpec {
	raw, err := exec.Command("lspci", "-mm", "-nnk").Output()
	if err != nil {
		return nil
	}
	var found []spec.GPUSpec
	for _, line := range strings.Split(string(raw), "\n") {
		lower := strings.ToLower(line)
		isGPU := strings.Contains(lower, "vga compatible controller") ||
			strings.Contains(lower, "3d controller")
		if !isGPU {
			continue
		}
		parts := splitQuoted(line)
		if len(parts) < 4 {
			continue
		}
		found = append(found, spec.GPUSpec{Model: fmt.Sprintf("%s %s", parts[2], parts[3])})
	}
	return found
}
// splitQuoted splits one line of `lspci -mm` output into fields.
//
// Fields are separated by unquoted spaces or tabs. A double-quoted
// field may contain spaces, and inside quotes a backslash escapes the
// next character. Unquoted tokens (the slot address, flags such as
// -r06) are returned as fields of their own, so indexes line up with
// the lspci column order: slot, class, vendor, device, ...
//
// An empty quoted field ("") yields an empty string. Text left over at
// end of line, including an unterminated quoted field, is returned as
// the last field.
func splitQuoted(line string) []string {
	var (
		out     []string
		cur     strings.Builder
		inQ     bool
		escaped bool
	)
	flush := func() {
		out = append(out, cur.String())
		cur.Reset()
	}
	for _, r := range line {
		switch {
		case escaped:
			cur.WriteRune(r)
			escaped = false
		case inQ && r == '\\':
			escaped = true
		case r == '"':
			if inQ {
				// Closing quote: emit even when empty.
				flush()
			} else if cur.Len() > 0 {
				// A bare token directly abutting an opening quote.
				flush()
			}
			inQ = !inQ
		case !inQ && (r == ' ' || r == '\t'):
			if cur.Len() > 0 {
				flush()
			}
		default:
			cur.WriteRune(r)
		}
	}
	if cur.Len() > 0 {
		flush()
	}
	return out
}
// ----- shared helpers ---------------------------------------------------
// readLine returns the contents of path with surrounding whitespace
// trimmed, or "" when the file cannot be read.
func readLine(path string) string {
	if data, err := os.ReadFile(path); err == nil {
		return strings.TrimSpace(string(data))
	}
	return ""
}
+67
View File
@@ -0,0 +1,67 @@
package probes
import (
"os"
"path/filepath"
"strconv"
"strings"
)
// ThermalSample is a single temperature reading taken from
// /sys/class/hwmon by Thermals.
type ThermalSample struct {
	Kind  string  // always "temp" for samples produced by Thermals
	Key   string  // sensor label, or "<chip name>/temp<N>" when unlabeled
	Value float64 // reading in degrees Celsius
	Unit  string  // "C" for samples produced by Thermals
}
// Thermals scans every chip directory under /sys/class/hwmon for
// temp<N>_input files and returns one sample per file that holds an
// integer. The raw value is divided by 1000 (hwmon reports
// millidegrees Celsius). The sample key is the content of the matching
// temp<N>_label file, falling back to "<chip name>/temp<N>" when that
// label is missing or empty.
//
// Nothing is cached between calls: directories and files are re-read
// each time, so sensors that appear later are picked up by the next
// call. It returns nil when /sys/class/hwmon cannot be listed.
func Thermals() []ThermalSample {
	const root = "/sys/class/hwmon"
	chips, err := os.ReadDir(root)
	if err != nil {
		return nil
	}
	var samples []ThermalSample
	for _, chip := range chips {
		dir := filepath.Join(root, chip.Name())
		chipName := strings.TrimSpace(readFileStr(filepath.Join(dir, "name")))
		entries, err := os.ReadDir(dir)
		if err != nil {
			continue
		}
		for _, ent := range entries {
			file := ent.Name()
			rest, isTemp := strings.CutPrefix(file, "temp")
			if !isTemp {
				continue
			}
			idx, isInput := strings.CutSuffix(rest, "_input")
			if !isInput {
				continue
			}
			key := strings.TrimSpace(readFileStr(filepath.Join(dir, "temp"+idx+"_label")))
			if key == "" {
				key = chipName + "/temp" + idx
			}
			milli, err := strconv.Atoi(strings.TrimSpace(readFileStr(filepath.Join(dir, file))))
			if err != nil {
				continue
			}
			samples = append(samples, ThermalSample{
				Kind:  "temp",
				Key:   key,
				Value: float64(milli) / 1000,
				Unit:  "C",
			})
		}
	}
	return samples
}
// readFileStr returns the whole content of p as a string, untrimmed,
// or "" when the file cannot be read.
func readFileStr(p string) string {
	if data, err := os.ReadFile(p); err == nil {
		return string(data)
	}
	return ""
}
+498
View File
@@ -0,0 +1,498 @@
// Package agent implements the in-live-image control loop.
//
// Phase 4 scope: after /claim, the agent walks through every stage the
// orchestrator advertises, dispatching on the stage name to a function
// in agent/tests. Each stage posts a /result; the response carries the
// orchestrator's next_state, which the loop uses to pick the next
// stage. Stages the orchestrator owns (SpecValidate, Reporting) resolve
// server-side inside /result so the agent never sees them as "its turn".
//
// Terminal states:
// - FailedHolding → request hold key, install authorized_keys, wait
// on heartbeats for a retry_stage directive.
// - Completed → heartbeat carries cmd=shutdown; agent runs
// `systemctl poweroff` and exits.
//
// Thermal sidecar runs from the moment the agent claims until ctx
// cancel; it posts a handful of /sys/class/hwmon samples every 5s.
package agent
import (
"context"
"encoding/json"
"fmt"
"log"
"net"
"os"
"os/exec"
"path/filepath"
"sync"
"time"
"vetting/agent/bootstate"
"vetting/agent/probes"
"vetting/agent/tests"
"vetting/internal/spec"
)
// Run is the agent's long-lived entry point.
//
// It says hello (a failure is only logged), claims the run (a failure
// is returned), starts the thermal sidecar and heartbeat loop, and
// then executes stages beginning with "Inventory". After each stage it
// posts the result and follows the orchestrator's next_state:
//
//   - FailedHolding: install the hold key, then hand over to
//     waitForOverride and return whatever it returns.
//   - Completed or empty: log completion and block until ctx ends.
//   - a state with no matching stage: log a warning and block until
//     ctx ends.
//   - anything else: run the stage mapped by stageForState.
//
// It returns ctx.Err() once ctx is cancelled, or the first error from
// claiming, posting a result or requesting the hold key.
func Run(ctx context.Context, p *bootstate.Params) error {
	c := NewClient(p.OrchestratorURL, p.RunID, p.Token, p.TLSCertFPR)
	fwd := newLogForwarder(ctx, c)
	defer fwd.close()

	ip := localIP()
	fwd.info(fmt.Sprintf("agent starting on %s (run=%d mac=%s)", ip, p.RunID, p.MAC))

	// Hello is advisory; claim below is what gates progress.
	if err := callWithBackoff(ctx, "hello", c.Hello); err != nil {
		fwd.warn("hello never succeeded: " + err.Error())
	}

	var claim *ClaimResponse
	claimOnce := func(ctx context.Context) error {
		got, err := c.Claim(ctx, ip)
		if err != nil {
			return err
		}
		claim = got
		return nil
	}
	if err := callWithBackoff(ctx, "claim", claimOnce); err != nil {
		return err
	}
	fwd.info(fmt.Sprintf("claimed run; stages=%v", claim.Stages))

	go thermalSidecar(ctx, c, fwd)
	hbCh := make(chan HeartbeatResponse, 4)
	go heartbeatLoop(ctx, c, fwd, hbCh)

	// Stages the orchestrator resolves itself never come back as a
	// next_state that stageForState maps, so they are never dispatched.
	stage := "Inventory"
	for stage != "" {
		if err := ctx.Err(); err != nil {
			return err
		}
		fwd.info("stage: starting " + stage)
		outcome := runStage(ctx, stage, claim, fwd, c, overrideFlags{})
		resp, err := postResult(ctx, c, stage, outcome)
		if err != nil {
			fwd.error("submit result for " + stage + ": " + err.Error())
			return err
		}
		fwd.info(fmt.Sprintf("stage %s → next_state=%s", stage, resp.NextState))

		switch resp.NextState {
		case "FailedHolding":
			if err := requestHold(ctx, c, fwd); err != nil {
				return err
			}
			// Park and wait for an override directive.
			return waitForOverride(ctx, c, fwd, hbCh, claim)
		case "Completed", "":
			fwd.info("pipeline complete")
			<-ctx.Done()
			return ctx.Err()
		}

		stage = stageForState(resp.NextState)
		if stage == "" {
			// Unmapped state: park rather than loop.
			fwd.warn("no stage maps to state " + resp.NextState + "; parking")
			<-ctx.Done()
			return ctx.Err()
		}
	}
	<-ctx.Done()
	return ctx.Err()
}
// runStage executes one stage by name and returns its outcome.
//
// "Inventory" runs probes.Collect and attaches the inventory to the
// outcome so it can be sent along with the result. SMART, CPUStress,
// Storage, Network, GPU and PSU call the matching function in the
// tests package; Network is given the orchestrator URL, the claimed
// iperf port and a 10-second duration. Any other name yields a failed
// outcome with the message "unknown stage <name>".
func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, ovr overrideFlags) stageOutcome {
	deps := newDeps(ctx, c, fwd, ovr, claim)

	if stage == "Inventory" {
		fwd.info("Inventory: probing host hardware")
		inv, err := probes.Collect()
		if err != nil {
			failed := tests.Outcome{Passed: false, Message: err.Error(), Summary: "probe error"}
			return stageOutcome{Outcome: failed}
		}
		fwd.info("Inventory: " + inventorySummary(inv))
		passed := tests.Outcome{Passed: true, Summary: inventorySummary(inv)}
		return stageOutcome{Outcome: passed, Inventory: inv}
	}

	var result tests.Outcome
	switch stage {
	case "SMART":
		result = tests.SMART(ctx, deps)
	case "CPUStress":
		result = tests.CPUStress(ctx, deps)
	case "Storage":
		result = tests.Storage(ctx, deps)
	case "Network":
		cfg := tests.NetworkConfig{
			OrchestratorURL: c.BaseURL,
			IperfPort:       claim.IperfPort,
			Duration:        10 * time.Second,
		}
		result = tests.Network(ctx, deps, cfg)
	case "GPU":
		result = tests.GPU(ctx, deps)
	case "PSU":
		result = tests.PSU(ctx, deps)
	default:
		result = tests.Outcome{
			Passed:  false,
			Message: "unknown stage " + stage,
		}
	}
	return stageOutcome{Outcome: result}
}
// stageOutcome is what runStage hands to postResult: the generic test
// outcome plus, for the Inventory stage only, the collected inventory.
type stageOutcome struct {
	Outcome   tests.Outcome
	Inventory *spec.Inventory // nil for every stage except Inventory
}
// overrideFlags is the decoded form of a heartbeat's override_flags
// object. Wipe is forwarded to the stage dependencies as OverrideWipe.
type overrideFlags struct {
	Wipe bool `json:"wipe"`
}
// newDeps assembles the tests.Deps handed to each stage: the log
// forwarder's info/warn/error functions, the wipe override, the
// expected disks copied from the claim, a two-minute stage timeout and
// a Sensor callback that converts samples and posts them through c.
func newDeps(ctx context.Context, c *Client, fwd *logForwarder, ovr overrideFlags, claim *ClaimResponse) tests.Deps {
	// Stays nil when the claim lists no disks.
	var disks []tests.ExpectedDisk
	for _, d := range claim.ExpectedDisks {
		disks = append(disks, tests.ExpectedDisk{Serial: d.Serial, SizeGB: d.SizeGB})
	}

	postSamples := func(ctx context.Context, samples []tests.Sample) error {
		wire := make([]SensorSample, len(samples))
		for i, s := range samples {
			wire[i] = SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit}
		}
		return c.Sensor(ctx, wire)
	}

	return tests.Deps{
		Info:          fwd.info,
		Warn:          fwd.warn,
		Error:         fwd.error,
		OverrideWipe:  ovr.Wipe,
		ExpectedDisks: disks,
		StageTimeout:  2 * time.Minute,
		Sensor:        postSamples,
	}
}
// postResult builds the /result payload for one stage and submits it.
// "stage" and "passed" are always present. "summary" is added only
// when the marshaled summary is longer than two bytes, "message" only
// when the outcome has one, and "inventory" only when the stage
// produced an inventory.
func postResult(ctx context.Context, c *Client, stage string, s stageOutcome) (*ResultResponse, error) {
	// A marshal error is deliberately ignored; it just means no summary.
	raw, _ := s.Outcome.MarshalSummary()

	payload := map[string]any{
		"stage":  stage,
		"passed": s.Outcome.Passed,
	}
	if len(raw) > 2 {
		payload["summary"] = json.RawMessage(raw)
	}
	if msg := s.Outcome.Message; msg != "" {
		payload["message"] = msg
	}
	if inv := s.Inventory; inv != nil {
		payload["inventory"] = inv
	}
	return c.Result(ctx, payload)
}
// stageForState translates a run state into the name of the stage the
// agent executes for it. "InventoryCheck" maps to "Inventory"; SMART,
// CPUStress, Storage, Network, GPU and PSU map to themselves. Every
// other state, including the orchestrator-owned SpecValidate and
// Reporting, maps to "".
func stageForState(state string) string {
	if state == "InventoryCheck" {
		return "Inventory"
	}
	for _, name := range [...]string{"SMART", "CPUStress", "Storage", "Network", "GPU", "PSU"} {
		if name == state {
			return name
		}
	}
	return ""
}
// waitForOverride parks the agent while the run is in FailedHolding.
//
// It waits on hb for a heartbeat with cmd "retry_stage" and a non-empty
// stage; other heartbeats are ignored. The named stage is re-run with
// the heartbeat's override flags (malformed flags are logged and
// treated as "no overrides"), then the orchestrator's next_state
// decides what happens:
//
//   - FailedHolding: keep holding and wait for the next directive.
//   - Completed: return nil.
//   - otherwise: continue the pipeline from that state. If a later
//     stage lands in FailedHolding the agent goes back to holding
//     rather than exiting, so the operator can retry again.
//
// A failure to submit the retried stage's result is logged and the
// agent keeps holding; a failure while continuing the pipeline is
// returned. It returns ctx.Err() when ctx is cancelled and nil when hb
// is closed.
//
// NOTE(review): the hold key is not re-requested when the agent falls
// back into holding, matching the existing retry-failed path — confirm
// the orchestrator does not expect another /hold call.
func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-chan HeartbeatResponse, claim *ClaimResponse) error {
	fwd.info("holding: awaiting operator decision (heartbeat directive or ctx cancel)")
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case cmd, ok := <-hb:
			if !ok {
				return nil
			}
			if cmd.Cmd != "retry_stage" || cmd.Stage == "" {
				continue
			}
			fwd.info("operator override: retrying stage " + cmd.Stage)
			var ovr overrideFlags
			if len(cmd.OverrideFlags) > 0 {
				if err := json.Unmarshal(cmd.OverrideFlags, &ovr); err != nil {
					// Fail safe: never act on half-decoded flags.
					fwd.warn("override: ignoring malformed override_flags: " + err.Error())
					ovr = overrideFlags{}
				}
			}
			outcome := runStage(ctx, cmd.Stage, claim, fwd, c, ovr)
			resp, err := postResult(ctx, c, cmd.Stage, outcome)
			if err != nil {
				fwd.error("override: submit result: " + err.Error())
				continue
			}
			fwd.info(fmt.Sprintf("override stage %s → next_state=%s", cmd.Stage, resp.NextState))
			switch resp.NextState {
			case "FailedHolding":
				// Still broken; keep holding.
				continue
			case "Completed":
				return nil
			}
			// Successful retry — continue walking the pipeline from the
			// state the orchestrator advanced us into.
			holding, err := resumeStagesAfterOverride(ctx, c, fwd, claim, stageForState(resp.NextState))
			if err != nil {
				return err
			}
			if holding {
				fwd.warn("holding: pipeline failed again after override; awaiting operator decision")
				continue
			}
			return nil
		}
	}
}

// resumeStagesAfterOverride runs stages starting at first (without
// override flags) until the orchestrator stops handing out a state that
// maps to a stage. It reports true when a stage ended in FailedHolding,
// false when the pipeline completed or reached an unmapped state, and
// an error when ctx is cancelled or a result cannot be submitted.
func resumeStagesAfterOverride(ctx context.Context, c *Client, fwd *logForwarder, claim *ClaimResponse, first string) (bool, error) {
	for stage := first; stage != ""; {
		if err := ctx.Err(); err != nil {
			return false, err
		}
		fwd.info("stage: starting " + stage)
		out := runStage(ctx, stage, claim, fwd, c, overrideFlags{})
		rr, err := postResult(ctx, c, stage, out)
		if err != nil {
			fwd.error("submit result for " + stage + ": " + err.Error())
			return false, err
		}
		fwd.info(fmt.Sprintf("stage %s → next_state=%s", stage, rr.NextState))
		switch rr.NextState {
		case "FailedHolding":
			return true, nil
		case "Completed", "":
			return false, nil
		}
		stage = stageForState(rr.NextState)
	}
	return false, nil
}
// requestHold asks the orchestrator for the per-run public key and
// appends it to /root/.ssh/authorized_keys (creating the directory and
// file if needed) so the operator can SSH into the held host. Every
// failure is forwarded as an error log line and returned unchanged.
func requestHold(ctx context.Context, c *Client, fwd *logForwarder) error {
	const keyFile = "/root/.ssh/authorized_keys"

	// fail logs err under the given prefix and hands it back to the caller.
	fail := func(prefix string, err error) error {
		fwd.error(prefix + err.Error())
		return err
	}

	fwd.warn("entering FailedHolding; requesting hold key")
	hold, err := c.Hold(ctx, localIP())
	if err != nil {
		return fail("hold request failed: ", err)
	}
	if err = os.MkdirAll(filepath.Dir(keyFile), 0o700); err != nil {
		return fail("mkdir .ssh: ", err)
	}
	keys, err := os.OpenFile(keyFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o600)
	if err != nil {
		return fail("open authorized_keys: ", err)
	}
	defer func() { _ = keys.Close() }()

	if _, err = fmt.Fprintln(keys, hold.AuthorizedKey); err != nil {
		return fail("write authorized_keys: ", err)
	}
	fwd.info("hold key installed; SSH is available to root@" + localIP())
	return nil
}
// inventorySummary renders a one-line digest of an inventory: CPU model
// and logical core count, total RAM in GiB, and the number of disks,
// NICs and GPUs found.
func inventorySummary(inv *spec.Inventory) string {
	const layout = "cpu=%q cores=%d ram=%dGiB disks=%d nics=%d gpus=%d"
	disks, nics, gpus := len(inv.Disks), len(inv.NICs), len(inv.GPUs)
	return fmt.Sprintf(layout,
		inv.CPU.Model, inv.CPU.LogicalCores, inv.Memory.TotalGiB,
		disks, nics, gpus)
}
// thermalSidecar runs until ctx is cancelled, collecting
// probes.Thermals() every 5s and posting the readings as one sensor
// batch. Ticks that yield no readings post nothing. Send failures are
// logged as warnings and never stop the loop — partial thermal data is
// preferable to killing the agent over an I/O hiccup.
func thermalSidecar(ctx context.Context, c *Client, fwd *logForwarder) {
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	// publish gathers the current readings and sends them, bounded by
	// its own 5s timeout.
	publish := func() {
		readings := probes.Thermals()
		if len(readings) == 0 {
			return
		}
		batch := make([]SensorSample, len(readings))
		for i, r := range readings {
			batch[i] = SensorSample{Kind: r.Kind, Key: r.Key, Value: r.Value, Unit: r.Unit}
		}
		sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
		defer cancel()
		if err := c.Sensor(sendCtx, batch); err != nil {
			fwd.warn("thermal sidecar: " + err.Error())
		}
	}

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			publish()
		}
	}
}
// heartbeatLoop sends a heartbeat every 10s (each bounded by a 5s
// timeout) until ctx is cancelled, and acts on the command in the reply:
//
//   - "abort": log and stop heartbeating.
//   - "shutdown": power the host off (systemctl poweroff, falling back
//     to shutdown -h now) and stop heartbeating.
//   - "retry_stage": hand the response to out without blocking; if
//     nobody is ready to receive, the directive is dropped.
//
// Heartbeat errors are logged as warnings and the loop carries on.
func heartbeatLoop(ctx context.Context, c *Client, fwd *logForwarder, out chan<- HeartbeatResponse) {
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}

		hbCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
		resp, err := c.Heartbeat(hbCtx)
		cancel()
		if err != nil {
			fwd.warn("heartbeat error: " + err.Error())
			continue
		}

		switch resp.Cmd {
		case "abort":
			fwd.warn("orchestrator said abort; stopping loop")
			return
		case "shutdown":
			fwd.info("orchestrator said shutdown; powering off host")
			// Best effort: systemd first, sysvinit as the fallback. Either
			// way, return so the agent stops issuing heartbeats.
			if perr := exec.Command("systemctl", "poweroff").Run(); perr != nil {
				fwd.warn("systemctl poweroff failed: " + perr.Error())
				_ = exec.Command("shutdown", "-h", "now").Run()
			}
			return
		case "retry_stage":
			select {
			case out <- *resp:
			default:
			}
		}
	}
}
// callWithBackoff invokes f until it succeeds, giving each call its own
// 10s timeout derived from ctx. After a failure it logs the attempt
// (tagged with label) and sleeps before trying again; the sleep starts
// at 2s and doubles while it is still under 30s. It returns nil on the
// first success, f's last error once the 21st attempt has failed, or
// ctx.Err() if ctx is cancelled during a sleep.
func callWithBackoff(ctx context.Context, label string, f func(context.Context) error) error {
	wait := 2 * time.Second
	attempt := 0
	for {
		attempt++

		callCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
		err := f(callCtx)
		cancel()

		switch {
		case err == nil:
			return nil
		case attempt > 20:
			return err
		}

		log.Printf("agent: %s attempt %d failed: %v (retry in %s)", label, attempt, err, wait)
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(wait):
		}
		if wait < 30*time.Second {
			wait += wait
		}
	}
}
// localIP returns the first non-loopback IPv4 address reported by
// net.InterfaceAddrs, or "" when the lookup fails or no such address
// exists.
func localIP() string {
	addrs, err := net.InterfaceAddrs()
	if err != nil {
		return ""
	}
	for _, addr := range addrs {
		if n, isNet := addr.(*net.IPNet); isNet && !n.IP.IsLoopback() {
			if ip4 := n.IP.To4(); ip4 != nil {
				return ip4.String()
			}
		}
	}
	return ""
}
// ----- log forwarder -----------------------------------------------------
// logForwarder buffers agent log lines in memory and ships them to the
// orchestrator in batches from a background goroutine (see loop/flush).
type logForwarder struct {
// c is the client used to deliver batched lines via c.Log.
c *Client
// mu guards buf.
mu sync.Mutex
// buf holds lines pushed since the last flush.
buf []LogLine
// wg tracks the background loop goroutine so close can wait for it.
wg sync.WaitGroup
// cancel stops the background loop's context.
cancel context.CancelFunc
}
// newLogForwarder builds a forwarder that delivers through c and starts
// its background flush loop. The loop's context is derived from parent
// and is also cancelled by close.
func newLogForwarder(parent context.Context, c *Client) *logForwarder {
	fwd := new(logForwarder)
	fwd.c = c

	var loopCtx context.Context
	loopCtx, fwd.cancel = context.WithCancel(parent)

	fwd.wg.Add(1)
	go fwd.loop(loopCtx)
	return fwd
}
// loop flushes the buffer every 2s until ctx is cancelled, then performs
// one last flush and marks the goroutine done on the wait group.
func (f *logForwarder) loop(ctx context.Context) {
	defer f.wg.Done()
	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()

	for stopping := false; !stopping; {
		select {
		case <-ctx.Done():
			stopping = true
		case <-ticker.C:
		}
		// Both a tick and cancellation end in a flush; cancellation just
		// makes it the final one.
		f.flush()
	}
}
// push prints the message to the local logger and queues it, stamped
// with the current UTC time in RFC 3339 (nanosecond) form, for the next
// flush.
func (f *logForwarder) push(level, text string) {
	entry := LogLine{
		TS:    time.Now().UTC().Format(time.RFC3339Nano),
		Level: level,
		Text:  text,
	}
	log.Printf("[%s] %s", level, text)

	f.mu.Lock()
	defer f.mu.Unlock()
	f.buf = append(f.buf, entry)
}
// info queues s at level "info".
func (f *logForwarder) info(s string) {
	f.push("info", s)
}

// warn queues s at level "warn".
func (f *logForwarder) warn(s string) {
	f.push("warn", s)
}

// error queues s at level "error".
func (f *logForwarder) error(s string) {
	f.push("error", s)
}
// flush takes ownership of everything buffered so far and sends it in a
// single c.Log call bounded by a 5s timeout. The timeout hangs off
// context.Background rather than the loop's context, so the flush issued
// after cancellation can still go out. A failed send is reported on the
// local logger; the taken lines are not re-queued.
func (f *logForwarder) flush() {
	f.mu.Lock()
	pending := f.buf
	if len(pending) > 0 {
		f.buf = nil
	}
	f.mu.Unlock()

	if len(pending) == 0 {
		return
	}

	sendCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	if err := f.c.Log(sendCtx, pending); err != nil {
		log.Printf("log forward failed: %v", err)
	}
}
// close cancels the background loop and blocks until it has finished,
// which includes the loop's final flush.
func (f *logForwarder) close() {
	defer f.wg.Wait()
	f.cancel()
}
+97
View File
@@ -0,0 +1,97 @@
package tests
import (
"context"
"fmt"
"os/exec"
"runtime"
"strconv"
"strings"
"time"
)
// CPUStress drives stress-ng with one CPU worker and one memory ("vm")
// worker per logical core. The memory stressors stand in for a
// Memtest86+ pass — per the plan, running under Linux gives exit-code
// based pass/fail and log capture that Memtest cannot provide without
// IPMI serial redirection.
//
// Outcomes:
//   - stress-ng not on PATH: the stage is skipped and counts as a pass.
//   - stress-ng ends with an error (non-zero exit, or killed when the
//     run context expires): the stage fails.
//   - stress-ng exits 0: the stage passes.
//
// The run length is Deps.StageTimeout, defaulting to 2 minutes when that
// is zero or negative; the process itself gets a further 30s of grace
// before its context kills it.
func CPUStress(ctx context.Context, d Deps) Outcome {
	if _, lookErr := exec.LookPath("stress-ng"); lookErr != nil {
		d.Warn("CPUStress: stress-ng not found in PATH — skipping stage")
		return Outcome{
			Passed:  true,
			Summary: "skipped (stress-ng missing)",
			Extras:  map[string]any{"skipped": true, "reason": "stress_ng_missing"},
		}
	}

	// Deps.StageTimeout may be zero in tests; fall back to 2 minutes.
	budget := d.StageTimeout
	if budget <= 0 {
		budget = 2 * time.Minute
	}

	cores := runtime.NumCPU()
	workers := strconv.Itoa(cores)
	limit := durationSeconds(budget)

	// NOTE(review): the original comment states that each of the N vm
	// workers touches 90% of RAM; confirm against stress-ng's --vm-bytes
	// semantics for percentage values.
	argv := []string{
		"--cpu", workers,
		"--cpu-method", "all",
		"--vm", workers,
		"--vm-bytes", "90%",
		"--timeout", limit,
		"--metrics-brief",
		"--verify",
	}
	d.Info(fmt.Sprintf("CPUStress: stress-ng --cpu %d --vm %d --vm-bytes 90%% --timeout %s",
		cores, cores, limit))

	runCtx, cancel := context.WithTimeout(ctx, budget+30*time.Second)
	defer cancel()

	proc := exec.CommandContext(runCtx, "stress-ng", argv...)
	began := time.Now()
	output, runErr := proc.CombinedOutput()
	took := time.Since(began).Round(time.Second)

	extras := map[string]any{
		"cores":        cores,
		"elapsed_secs": took.Seconds(),
		"output_tail":  tailLines(string(output), 20),
	}

	if runErr == nil {
		d.Info(fmt.Sprintf("CPUStress: stress-ng completed cleanly in %s", took))
		return Outcome{
			Passed:  true,
			Summary: fmt.Sprintf("stress-ng PASSED after %s (%d cores + 90%% RAM)", took, cores),
			Extras:  extras,
		}
	}

	d.Error("CPUStress: stress-ng failed: " + runErr.Error())
	return Outcome{
		Passed:  false,
		Message: "stress-ng returned non-zero: " + runErr.Error(),
		Summary: fmt.Sprintf("failed after %s", took),
		Extras:  extras,
	}
}
// durationSeconds formats d as a whole number of seconds followed by
// "s" (e.g. "120s"). Fractions are truncated, and anything shorter than
// one second — including zero and negative durations — becomes "1s".
func durationSeconds(d time.Duration) string {
	whole := int(d.Seconds())
	if whole >= 1 {
		return strconv.Itoa(whole) + "s"
	}
	return "1s"
}
// tailLines returns the last n non-empty lines of s joined with "\n",
// for use in stage summaries. Lines that are empty or contain only
// whitespace are dropped before the tail is taken, so blank padding in
// tool output never crowds real diagnostics out of the n-line budget.
// It returns "" when n <= 0 or when s has no non-empty lines.
func tailLines(s string, n int) string {
	if n <= 0 {
		return ""
	}
	var lines []string
	for _, line := range strings.Split(s, "\n") {
		if strings.TrimSpace(line) != "" {
			lines = append(lines, line)
		}
	}
	if len(lines) > n {
		lines = lines[len(lines)-n:]
	}
	return strings.Join(lines, "\n")
}
+86
View File
@@ -0,0 +1,86 @@
package tests
import (
"context"
"os/exec"
"strings"
)
// GPU looks for VGA / 3D PCI devices. With none present the stage is
// skipped and counts as a pass (a CPU-only server has nothing to
// exercise here). With devices present the stage passes on PCI presence
// alone; whatever nvidia-smi reports is attached to the extras and
// logged when available.
func GPU(ctx context.Context, d Deps) Outcome {
	found := listGPUPCI(ctx)
	if len(found) == 0 {
		d.Info("GPU: no VGA/3D PCI devices found — skipping stage")
		return Outcome{
			Passed:  true,
			Summary: "skipped (no GPU present)",
			Extras:  map[string]any{"skipped": true, "reason": "no_gpu_present"},
		}
	}
	d.Info("GPU: found " + joinDevices(found))

	extras := map[string]any{
		"pci_devices": found,
		"skipped":     false,
	}
	if cards := nvidiaSmiList(ctx); len(cards) > 0 {
		extras["nvidia"] = cards
		d.Info("GPU: nvidia-smi reports: " + strings.Join(cards, ", "))
	}

	return Outcome{
		Passed:  true,
		Summary: formatCount(len(found), "GPU present"),
		Extras:  extras,
	}
}
// listGPUPCI runs `lspci -mm` and returns the trimmed output lines that
// mention a "VGA compatible controller" or a "3D controller"
// (case-insensitive). When lspci cannot be run it returns nil, which the
// caller treats as "no GPU" and auto-skips.
func listGPUPCI(ctx context.Context) []string {
	raw, err := exec.CommandContext(ctx, "lspci", "-mm").Output()
	if err != nil {
		return nil
	}

	isGPU := func(lowered string) bool {
		return strings.Contains(lowered, "vga compatible controller") ||
			strings.Contains(lowered, "3d controller")
	}

	var found []string
	for _, row := range strings.Split(string(raw), "\n") {
		if isGPU(strings.ToLower(row)) {
			found = append(found, strings.TrimSpace(row))
		}
	}
	return found
}
// nvidiaSmiList runs `nvidia-smi -L` and returns its non-blank output
// lines, trimmed. It returns nil when nvidia-smi isn't installed or
// exits with an error.
func nvidiaSmiList(ctx context.Context) []string {
	raw, err := exec.CommandContext(ctx, "nvidia-smi", "-L").Output()
	if err != nil {
		return nil
	}

	var cards []string
	for _, row := range strings.Split(string(raw), "\n") {
		if trimmed := strings.TrimSpace(row); trimmed != "" {
			cards = append(cards, trimmed)
		}
	}
	return cards
}
// joinDevices condenses a device list for a log line: "" for an empty
// list, the device itself for a single entry, and otherwise the first
// device followed by a parenthesised count of the remaining ones.
func joinDevices(devs []string) string {
	switch len(devs) {
	case 0:
		return ""
	case 1:
		return devs[0]
	}
	rest := strings.TrimSpace(formatCount(len(devs)-1, "more"))
	return devs[0] + " (+" + rest + ")"
}
+144
View File
@@ -0,0 +1,144 @@
package tests
import (
"context"
"encoding/json"
"fmt"
"net/url"
"os/exec"
"strconv"
"strings"
"time"
)
// NetworkConfig is what the agent passes to Network: the orchestrator's
// iperf3 server address and port. We derive host from OrchestratorURL.
type NetworkConfig struct {
// OrchestratorURL is the orchestrator base URL; only its hostname is
// used, as the iperf3 server address.
OrchestratorURL string
IperfPort int // 0 = 5201
// Duration is how long the iperf3 client transmits; Network falls back
// to 10s when this is zero or negative.
Duration time.Duration
}
// Network runs the iperf3 client against the orchestrator's bundled
// server and records the measured bandwidth as a sensor sample.
//
// Outcomes:
//   - iperf3 not on PATH, or no host derivable from
//     cfg.OrchestratorURL: the stage is skipped and counts as a pass.
//   - the iperf3 client errors (e.g. server unreachable), its JSON
//     output cannot be parsed, or it reports zero throughput: fail.
//   - otherwise: pass, with the throughput in the summary.
//
// cfg.IperfPort defaults to 5201 and cfg.Duration to 10s. The transmit
// time handed to iperf3 is in whole seconds and never less than one.
func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
	if _, err := exec.LookPath("iperf3"); err != nil {
		d.Warn("Network: iperf3 not found — skipping stage")
		return Outcome{
			Passed:  true,
			Summary: "skipped (iperf3 missing)",
			Extras:  map[string]any{"skipped": true, "reason": "iperf3_missing"},
		}
	}
	host, err := deriveHost(cfg.OrchestratorURL)
	if err != nil || host == "" {
		d.Warn("Network: can't derive orchestrator host from URL — skipping stage")
		return Outcome{
			Passed:  true,
			Summary: "skipped (no orchestrator host)",
			Extras:  map[string]any{"skipped": true, "reason": "no_host"},
		}
	}
	port := cfg.IperfPort
	if port == 0 {
		port = 5201
	}
	duration := cfg.Duration
	if duration <= 0 {
		duration = 10 * time.Second
	}
	// iperf3 takes whole seconds. A sub-second duration would truncate
	// to "-t 0", which iperf3 does not treat as "very short" (it means
	// no time limit), so clamp to at least one second.
	secs := int(duration.Seconds())
	if secs < 1 {
		secs = 1
	}
	duration = time.Duration(secs) * time.Second

	args := []string{
		"-c", host,
		"-p", strconv.Itoa(port),
		"-t", strconv.Itoa(secs),
		"-J", // JSON output
	}
	d.Info(fmt.Sprintf("Network: iperf3 -c %s -p %d -t %s", host, port, duration))

	// Give the client 30s beyond the transmit time to connect and report.
	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
	defer cancel()
	cmd := exec.CommandContext(runCtx, "iperf3", args...)
	out, err := cmd.Output()
	if err != nil {
		d.Error("Network: iperf3 client failed: " + err.Error())
		// Output captures stderr on the ExitError. With -J iperf3 also
		// reports errors on stdout, so keep that tail too, and fall
		// back to it for stderr_tail when stderr was empty.
		stdoutTail := tailLines(string(out), 20)
		stderrTail := ""
		if exitErr, ok := err.(*exec.ExitError); ok {
			stderrTail = tailLines(string(exitErr.Stderr), 20)
		}
		if stderrTail == "" {
			stderrTail = stdoutTail
		}
		return Outcome{
			Passed:  false,
			Message: "iperf3 client error: " + err.Error(),
			Summary: "iperf3 failed",
			Extras: map[string]any{
				"stderr_tail": stderrTail,
				"stdout_tail": stdoutTail,
			},
		}
	}

	mbps, parsed, err := parseIperfJSON(out)
	if err != nil {
		d.Error("Network: parse iperf3 output: " + err.Error())
		return Outcome{
			Passed:  false,
			Message: "parse iperf3 json: " + err.Error(),
			Summary: "parse error",
			Extras:  map[string]any{"raw": string(out)},
		}
	}

	if d.Sensor != nil {
		// Best effort: a lost measurement must not fail the stage.
		_ = d.Sensor(ctx, []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}})
	}

	extras := map[string]any{
		"throughput_mbps": mbps,
		"iperf_end":       parsed,
	}
	if mbps <= 0 {
		return Outcome{
			Passed:  false,
			Message: "iperf3 reported zero throughput",
			Summary: "zero throughput",
			Extras:  extras,
		}
	}

	d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps", mbps))
	return Outcome{
		Passed:  true,
		Summary: fmt.Sprintf("%.1f Mbps to %s", mbps, host),
		Extras:  extras,
	}
}
// deriveHost extracts the hostname (no port, no IPv6 brackets) from a
// base URL such as https://host:port. It returns an error for an empty
// string or a URL that fails to parse; a URL that parses but has no host
// yields "" with a nil error.
func deriveHost(raw string) (string, error) {
	if len(raw) == 0 {
		return "", fmt.Errorf("empty url")
	}
	parsed, err := url.Parse(raw)
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(parsed.Hostname()), nil
}
// parseIperfJSON reads throughput out of `iperf3 -J` output. It looks in
// the "end" object for the first of "sum_sent", "sum_received" or "sum"
// that carries a numeric "bits_per_second" and converts it to Mbps.
//
// Returns (Mbps, map, err). On success, and when no usable
// bits_per_second exists, the map is the "end" object; when "end" itself
// is missing it is the whole document; when the input is not valid JSON
// it is nil.
func parseIperfJSON(b []byte) (float64, map[string]any, error) {
	var doc map[string]any
	if err := json.Unmarshal(b, &doc); err != nil {
		return 0, nil, err
	}

	end, found := doc["end"].(map[string]any)
	if !found {
		return 0, doc, fmt.Errorf("missing end")
	}

	for _, name := range [...]string{"sum_sent", "sum_received", "sum"} {
		// A failed assertion leaves section nil; indexing a nil map is
		// safe and simply finds nothing.
		section, _ := end[name].(map[string]any)
		if bps, isNum := section["bits_per_second"].(float64); isNum {
			return bps / 1_000_000, end, nil
		}
	}
	return 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
}
+153
View File
@@ -0,0 +1,153 @@
package tests
import (
"context"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
)
// PSU checks the PSU voltage rails that scanPSURails finds under
// /sys/class/hwmon. With no rails the stage is skipped and counts as a
// pass. Every rail is reported as a "psu_volt" sensor sample; the stage
// fails if voltageInRange rejects any of them (±10% around the rail's
// nominal voltage).
func PSU(ctx context.Context, d Deps) Outcome {
	rails := scanPSURails()
	if len(rails) == 0 {
		d.Info("PSU: no voltage rails found under /sys/class/hwmon — skipping stage")
		return Outcome{
			Passed:  true,
			Summary: "skipped (no PSU sensors)",
			Extras:  map[string]any{"skipped": true, "reason": "no_hwmon_voltages"},
		}
	}

	samples := make([]Sample, 0, len(rails))
	// Deliberately non-nil so the extras always carry a list.
	problems := make([]string, 0)
	for _, r := range rails {
		samples = append(samples, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
		ok, why := voltageInRange(r)
		if ok {
			continue
		}
		problems = append(problems, fmt.Sprintf("%s=%.2fV (%s)", r.Label, r.Volts, why))
	}
	if d.Sensor != nil {
		_ = d.Sensor(ctx, samples)
	}

	extras := map[string]any{
		"rails":    rails,
		"problems": problems,
	}
	if len(problems) == 0 {
		d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal", len(rails)))
		return Outcome{
			Passed:  true,
			Summary: fmt.Sprintf("%d rails nominal", len(rails)),
			Extras:  extras,
		}
	}

	listing := strings.Join(problems, ", ")
	d.Error("PSU: out-of-range rails: " + listing)
	return Outcome{
		Passed:  false,
		Message: "PSU rails out of range: " + listing,
		Summary: fmt.Sprintf("%d rails, %d failing", len(rails), len(problems)),
		Extras:  extras,
	}
}
// psuRail is one voltage reading taken from hwmon: the sensor's label
// and its value converted from millivolts to volts.
type psuRail struct {
Label string `json:"label"`
Volts float64 `json:"volts"`
}
// scanPSURails visits every chip directory under /sys/class/hwmon and
// collects the in*_input readings (millivolts) whose in*_label passes
// isPSULabel. Unlabelled or unrecognised inputs are skipped rather than
// flagged — motherboard VRMs report many rails that aren't PSU outputs.
// Unreadable directories and non-integer readings are skipped as well.
func scanPSURails() []psuRail {
	const root = "/sys/class/hwmon"
	chips, err := os.ReadDir(root)
	if err != nil {
		return nil
	}

	// readRail turns one directory entry into a rail, reporting false
	// when the entry isn't a labelled PSU voltage input.
	readRail := func(dir, file string) (psuRail, bool) {
		if !strings.HasPrefix(file, "in") || !strings.HasSuffix(file, "_input") {
			return psuRail{}, false
		}
		index := strings.TrimSuffix(strings.TrimPrefix(file, "in"), "_input")
		label := strings.TrimSpace(readFileStr(filepath.Join(dir, "in"+index+"_label")))
		if !isPSULabel(label) {
			return psuRail{}, false
		}
		millivolts, err := strconv.Atoi(strings.TrimSpace(readFileStr(filepath.Join(dir, file))))
		if err != nil {
			return psuRail{}, false
		}
		return psuRail{Label: label, Volts: float64(millivolts) / 1000}, true
	}

	var rails []psuRail
	for _, chip := range chips {
		dir := filepath.Join(root, chip.Name())
		entries, err := os.ReadDir(dir)
		if err != nil {
			continue
		}
		for _, entry := range entries {
			if rail, ok := readRail(dir, entry.Name()); ok {
				rails = append(rails, rail)
			}
		}
	}
	return rails
}
// isPSULabel reports whether a hwmon label looks like a PSU rail. The
// match is a case-insensitive substring test against a small allowlist,
// kept short to avoid flagging CPU VRM rails as PSU failures.
func isPSULabel(label string) bool {
	lowered := strings.ToLower(label)
	for _, marker := range [...]string{"12v", "5v", "3.3v", "3v3", "vccin"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	return false
}
// voltageInRange reports whether a rail reads within ±10% of the nominal
// voltage implied by its label, plus a reason string when it does not.
// Rails whose label maps to no known nominal always pass.
func voltageInRange(r psuRail) (bool, string) {
	nominal := nominalFor(r.Label)
	if nominal == 0 {
		return true, ""
	}

	// Relative deviation from nominal, as a magnitude.
	deviation := (r.Volts - nominal) / nominal
	if deviation < 0 {
		deviation = -deviation
	}
	if deviation <= 0.10 {
		return true, ""
	}
	return false, fmt.Sprintf("expected ~%.1fV", nominal)
}
// nominalFor maps a hwmon label to the nominal voltage of the PSU rail
// it names: 12.0 for "12V", 5.0 for "5V", 3.3 for "3.3V"/"3V3"
// (case-insensitive, anywhere in the label). It returns 0 when the label
// names none of these, which callers treat as "unknown, don't check".
//
// A rail name only counts when it is not the tail of a longer number:
// "+1.5V", "2.5V" and "15V" contain "5v" but are not the 5 V rail, so
// they yield 0 instead of being checked against a nominal they can never
// meet.
func nominalFor(label string) float64 {
	l := strings.ToLower(label)

	// names reports whether tok occurs in l without a digit or decimal
	// point immediately before it.
	names := func(tok string) bool {
		for from := 0; from < len(l); {
			i := strings.Index(l[from:], tok)
			if i < 0 {
				return false
			}
			i += from
			if i == 0 {
				return true
			}
			if prev := l[i-1]; prev != '.' && (prev < '0' || prev > '9') {
				return true
			}
			from = i + 1
		}
		return false
	}

	switch {
	case names("12v"):
		return 12.0
	case names("5v"):
		return 5.0
	case names("3.3v"), names("3v3"):
		return 3.3
	}
	return 0
}
// readFileStr returns the contents of the file at p as a string, or ""
// when the file cannot be read.
func readFileStr(p string) string {
	if data, err := os.ReadFile(p); err == nil {
		return string(data)
	}
	return ""
}
+152
View File
@@ -0,0 +1,152 @@
package tests
import (
"context"
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
)
// SMART runs smartctl on each block device the kernel exposes and keys
// on smart_status.passed in its JSON output (the overall-health
// verdict). The full parsed JSON — including, where present,
// ata_smart_attributes and nvme_smart_health_information_log — is kept
// per disk under "raw".
//
// A disk is recorded as skipped when smartctl cannot be run or parsed
// for it, or when its output has no smart_status (e.g. QEMU
// virtio-blk). Skipped disks are not counted as tested. The stage only
// fails if at least one disk reports passed=false; if no disk yields a
// verdict, the stage is reported as skipped and counts as a pass.
func SMART(ctx context.Context, d Deps) Outcome {
	disks, err := listBlockDisks()
	if err != nil {
		d.Warn("SMART: failed to enumerate /sys/class/block: " + err.Error())
		return Outcome{Passed: true, Summary: "skipped (no block devices enumerable)", Extras: map[string]any{"skipped": true}}
	}
	if len(disks) == 0 {
		d.Info("SMART: no physical disks found — skipping stage")
		return Outcome{Passed: true, Summary: "skipped (no disks)", Extras: map[string]any{"skipped": true}}
	}

	type diskReport struct {
		Device  string         `json:"device"`
		Passed  bool           `json:"passed"`
		Skipped bool           `json:"skipped,omitempty"`
		Reason  string         `json:"reason,omitempty"`
		Raw     map[string]any `json:"raw,omitempty"`
	}

	reports := make([]diskReport, 0, len(disks))
	failed := 0 // disks whose verdict is passed=false
	usable := 0 // disks that produced a verdict at all
	for _, dev := range disks {
		rep := diskReport{Device: dev}
		out, err := runSmartctl(ctx, dev)
		if err != nil {
			rep.Skipped = true
			rep.Reason = err.Error()
			reports = append(reports, rep)
			d.Info("SMART: " + dev + " skipped (" + err.Error() + ")")
			continue
		}
		rep.Raw = out

		passed, ok := smartPassed(out)
		if !ok {
			// No verdict: don't count this disk as tested, otherwise the
			// summary would claim "all PASSED" for disks that never
			// reported a SMART status.
			rep.Skipped = true
			rep.Reason = "no smart_status in output"
			reports = append(reports, rep)
			d.Info("SMART: " + dev + " skipped (no smart_status in output)")
			continue
		}

		usable++
		rep.Passed = passed
		if passed {
			d.Info(fmt.Sprintf("SMART: %s PASSED", dev))
		} else {
			failed++
			d.Error(fmt.Sprintf("SMART: %s reports FAILED", dev))
		}
		reports = append(reports, rep)
	}

	extras := map[string]any{
		"disks":   reports,
		"tested":  usable,
		"failing": failed,
	}
	if failed > 0 {
		return Outcome{
			Passed:  false,
			Message: fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
			Summary: fmt.Sprintf("%d/%d failing", failed, usable),
			Extras:  extras,
		}
	}
	summary := fmt.Sprintf("%d disks, %d SMART-reporting, all PASSED", len(disks), usable)
	if usable == 0 {
		summary = "skipped (no smartctl data on any disk)"
		extras["skipped"] = true
	}
	return Outcome{Passed: true, Summary: summary, Extras: extras}
}
// listBlockDisks returns the /dev/<name> path of every entry under
// /sys/class/block that isRealBlockDisk accepts. It returns an error only
// when the directory itself cannot be read.
func listBlockDisks() ([]string, error) {
	entries, err := os.ReadDir("/sys/class/block")
	if err != nil {
		return nil, err
	}

	var devices []string
	for _, entry := range entries {
		if name := entry.Name(); isRealBlockDisk(name) {
			devices = append(devices, "/dev/"+name)
		}
	}
	return devices, nil
}
// isRealBlockDisk reports whether a /sys/class/block entry is a whole
// disk worth probing. Loop, ram, zram and device-mapper devices are
// rejected by name prefix; partitions are rejected by the presence of a
// "partition" attribute in their sysfs directory.
func isRealBlockDisk(name string) bool {
	for _, virtual := range [...]string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, virtual) {
			return false
		}
	}
	// The attribute exists only for partitions, so a failed Stat means
	// this entry is a whole device.
	_, err := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return err != nil
}
// runSmartctl invokes `smartctl -aj <dev>` and returns the parsed JSON
// document. smartctl's exit status is consulted only when it printed
// nothing: as long as stdout parses as JSON the structured result is
// trusted, even after a non-zero exit. An error comes back when there is
// no output at all or the output isn't valid JSON.
func runSmartctl(ctx context.Context, dev string) (map[string]any, error) {
	raw, runErr := exec.CommandContext(ctx, "smartctl", "-aj", dev).Output()

	switch {
	case len(raw) == 0 && runErr != nil:
		return nil, fmt.Errorf("smartctl: %w", runErr)
	case len(raw) == 0:
		return nil, fmt.Errorf("empty smartctl output")
	}

	var doc map[string]any
	if err := json.Unmarshal(raw, &doc); err != nil {
		return nil, fmt.Errorf("parse smartctl output: %w", err)
	}
	return doc, nil
}
// smartPassed pulls smart_status.passed out of a parsed smartctl JSON
// document. It returns (passed, present); present is false when
// smart_status is absent or not an object, or when passed is absent or
// not a boolean, so callers can tell "passed=false" from "no verdict".
func smartPassed(out map[string]any) (bool, bool) {
	if status, isObj := out["smart_status"].(map[string]any); isObj {
		verdict, isBool := status["passed"].(bool)
		return verdict, isBool
	}
	return false, false
}
+67
View File
@@ -0,0 +1,67 @@
// Package tests contains the per-stage executors the agent runs on the
// host under test. Each stage is a plain function that receives a
// context.Context plus a Deps bundle (loggers, sensor sink, run
// parameters) and returns an Outcome that the caller POSTs to /result.
package tests
import (
"context"
"encoding/json"
"time"
)
// Outcome is what a stage returns; it maps directly to the /result body.
// - Passed reports the stage verdict. A skipped stage returns
// Passed=true with Extras["skipped"]=true and a Summary such as
// "skipped (no GPU present)", so it counts as a pass but still
// surfaces in the tile summary.
// - Message is only used on failure; the UI displays it in the log.
// - Extras is merged into the posted summary so stages can add
// their own shape (e.g. Storage returns per-disk probe results).
type Outcome struct {
Passed bool
Message string
Summary string // short human-readable one-liner
Extras map[string]any // merged into posted summary JSON
}
// MarshalSummary serializes the summary JSON object POSTed to /result:
// every entry of Extras, plus a "summary" key holding the human-readable
// line when Summary is non-empty. A "summary" key in Extras is
// overwritten by a non-empty Summary.
func (o Outcome) MarshalSummary() (json.RawMessage, error) {
	merged := make(map[string]any, len(o.Extras)+1)
	for key, val := range o.Extras {
		merged[key] = val
	}
	if o.Summary != "" {
		merged["summary"] = o.Summary
	}
	return json.Marshal(merged)
}
// Deps bundles what stages need without pulling in the whole agent.
// Logger methods print to stdout + forward to the orchestrator; Sensor
// drops numeric samples; OverrideWipe carries the operator-set bypass
// for Storage's wipe probe.
type Deps struct {
// Info, Warn and Error are the per-level log sinks.
Info func(string)
Warn func(string)
Error func(string)
// Sensor publishes numeric samples. Stages check it for nil before
// calling it.
Sensor func(ctx context.Context, samples []Sample) error
// OverrideWipe lets Storage proceed even when the wipe probe finds
// existing data on a target disk.
OverrideWipe bool
ExpectedDisks []ExpectedDisk // serials + sizes from host.expected_spec
// StageTimeout bounds a stage's run time; CPUStress substitutes 2
// minutes when it is zero or negative.
StageTimeout time.Duration
}
// Sample mirrors the server's SensorSample but lives in the tests
// package so probe code doesn't import internal/api.
type Sample struct {
// Kind groups related samples (stages here use "iperf", "psu_volt"
// and "fio").
Kind string
// Key identifies the individual measurement within its kind.
Key string
// Value is the reading, expressed in Unit.
Value float64
Unit string
}
// ExpectedDisk is the subset of internal/spec.DiskSpec that Storage
// needs: a device allowlist keyed on serial.
type ExpectedDisk struct {
// Serial is matched case-insensitively against the serial the kernel
// reports; entries with an empty Serial never match a device.
Serial string
SizeGB int
}
+298
View File
@@ -0,0 +1,298 @@
package tests
import (
"context"
"encoding/json"
"fmt"
"os/exec"
"strings"
"time"
)
// Storage is the destructive stage: badblocks (write-mode sample) + fio
// random IO, persisting IOPS as measurements. Pre-gates:
//
//  1. Device allowlist: only act on /dev/<X> where the kernel-reported
//     serial matches one of Deps.ExpectedDisks. This is the operator's
//     contract for what can be written to. USB sticks and unexpected
//     drives are excluded.
//  2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
//     signatures, partition tables, or LVM metadata → fail with
//     UnexpectedData unless Deps.OverrideWipe is set.
//
// Only after those pass does the stage run `badblocks -b 4096 -c 64 -w`
// and `fio` in write mode. This matches the plan's "destructive disk
// tests are always-on, gated by layered safety."
//
// Outcomes: skipped (pass) when the spec lists no expected disks; fail
// when none of the expected disks are present, when the wipe probe
// halts, or when badblocks or fio fails on any target; pass once every
// target has cleared both tools.
func Storage(ctx context.Context, d Deps) Outcome {
	if len(d.ExpectedDisks) == 0 {
		d.Info("Storage: no expected disks in spec — skipping stage")
		return Outcome{
			Passed:  true,
			Summary: "skipped (no expected disks)",
			Extras:  map[string]any{"skipped": true, "reason": "no_expected_disks"},
		}
	}

	targets := resolveTargets(d.ExpectedDisks)
	if len(targets) == 0 {
		d.Error("Storage: none of the expected disks are present on this host")
		return Outcome{
			Passed:  false,
			Message: "device allowlist matched zero disks",
			Summary: "no allowed disks present",
			Extras:  map[string]any{"expected": d.ExpectedDisks},
		}
	}

	// Wipe probe on every target. A single dirty disk halts the stage
	// unless the operator has set OverrideWipe via the UI.
	probes := map[string]wipeProbeResult{}
	dirty := []string{}
	for _, t := range targets {
		probe := probeWipe(ctx, t.Device)
		probes[t.Device] = probe
		if probe.HasData {
			dirty = append(dirty, t.Device)
		}
	}
	if len(dirty) > 0 && !d.OverrideWipe {
		d.Error("Storage: wipe probe found existing data on: " + strings.Join(dirty, ", "))
		return Outcome{
			Passed:  false,
			Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)",
			Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
			Extras: map[string]any{
				"wipe_probe":    probes,
				"override_hint": "click 'Override wipe & retry' in the held tile",
				"dirty_devices": dirty,
			},
		}
	}
	if d.OverrideWipe && len(dirty) > 0 {
		d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
	}

	// Per target: short badblocks write sample + fio random-read/write.
	var samples []Sample
	perDisk := map[string]any{}
	for _, t := range targets {
		d.Info("Storage: running badblocks write sample on " + t.Device)
		bb := runBadblocks(ctx, t.Device)
		d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
		fr := runFio(ctx, t.Device)
		perDisk[t.Device] = map[string]any{
			"badblocks": bb,
			"fio":       fr,
		}
		if !bb.OK {
			d.Error("Storage: badblocks failed on " + t.Device)
			return Outcome{
				Passed:  false,
				Message: "badblocks found errors on " + t.Device,
				Summary: "badblocks failed on " + t.Device,
				Extras:  map[string]any{"per_disk": perDisk, "wipe_probe": probes},
			}
		}
		// A fio error means the disk never demonstrated that it services
		// IO, so it must not be reported as having passed fio.
		if fr.Error != "" {
			d.Error("Storage: fio failed on " + t.Device + ": " + fr.Error)
			return Outcome{
				Passed:  false,
				Message: "fio failed on " + t.Device + ": " + fr.Error,
				Summary: "fio failed on " + t.Device,
				Extras:  map[string]any{"per_disk": perDisk, "wipe_probe": probes},
			}
		}
		samples = append(samples,
			Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
			Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
		)
	}
	if d.Sensor != nil {
		// Best effort: losing the measurements must not fail the stage.
		_ = d.Sensor(ctx, samples)
	}
	d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets)))
	return Outcome{
		Passed:  true,
		Summary: fmt.Sprintf("%d disks passed", len(targets)),
		Extras:  map[string]any{"per_disk": perDisk, "wipe_probe": probes},
	}
}
// diskTarget pairs an expected disk's serial with the /dev path it
// resolved to on this host.
type diskTarget struct {
Serial string
Device string
}
// resolveTargets maps expected-disk serials onto the /dev/<X> paths
// present on this host. Candidate devices come from listBlockDisks and
// their serials from diskSerialFromSys; serials are compared
// case-insensitively. Expected entries with an empty serial, or with no
// matching device, are left out. It returns nil when the block devices
// cannot be listed.
func resolveTargets(expected []ExpectedDisk) []diskTarget {
	devices, err := listBlockDisks()
	if err != nil {
		return nil
	}

	// Lower-cased serial → device path.
	bySerial := make(map[string]string, len(devices))
	for _, dev := range devices {
		if serial := diskSerialFromSys(strings.TrimPrefix(dev, "/dev/")); serial != "" {
			bySerial[strings.ToLower(serial)] = dev
		}
	}

	var matched []diskTarget
	for _, want := range expected {
		if want.Serial == "" {
			continue
		}
		dev, present := bySerial[strings.ToLower(want.Serial)]
		if !present {
			continue
		}
		matched = append(matched, diskTarget{Serial: want.Serial, Device: dev})
	}
	return matched
}
// diskSerialFromSys looks up the serial number of block device name
// (e.g. "sda"). It tries /sys/block/<name>/device/serial, then
// /sys/block/<name>/serial, and finally the ID_SERIAL_SHORT property
// from udevadm, which is more reliable on SCSI. It returns "" when none
// of them yields a value.
//
// This is a smaller copy of probes.diskSerial; importing it from
// internal/probes would cause a cycle, so the short lookup is
// duplicated. If it drifts from the inventory probe, Storage fails
// because the serial doesn't match — which is the correct behavior.
func diskSerialFromSys(name string) string {
	sysDir := "/sys/block/" + name
	for _, candidate := range [...]string{sysDir + "/device/serial", sysDir + "/serial"} {
		raw, err := readFileBytes(candidate)
		if err != nil {
			continue
		}
		if serial := strings.TrimSpace(string(raw)); serial != "" {
			return serial
		}
	}

	props, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output()
	if err != nil {
		return ""
	}
	const key = "ID_SERIAL_SHORT="
	for _, prop := range strings.Split(string(props), "\n") {
		if strings.HasPrefix(prop, key) {
			return strings.TrimSpace(prop[len(key):])
		}
	}
	return ""
}
// readFileBytes returns the raw contents of the file at p by delegating
// to readFile, which is defined elsewhere in this package.
func readFileBytes(p string) ([]byte, error) {
return readFile(p)
}
// ---------- wipe probe ----------
// wipeProbeResult is the outcome of the pre-write data check on one
// device.
type wipeProbeResult struct {
Device string `json:"device"`
// HasData is true when any probe reported something on the device.
HasData bool `json:"has_data"`
// Findings lists what was reported, each prefixed with the name of
// the tool that reported it.
Findings []string `json:"findings,omitempty"`
}
// probeWipe runs blkid + wipefs -n against device. Any signature either
// tool reports is a "has data" signal. This is deliberately
// conservative: we'd rather halt on a bare ext4 signature than hand
// badblocks a disk with real bytes on it.
//
// The probe also fails closed. A tool that cannot be run, or that exits
// with an unexpected status, proves nothing about the disk, so it is
// recorded as a finding and sets HasData — the operator then has to
// look (or override) before anything is written. The one error that
// counts as clean is blkid's exit status 2, which is how blkid reports
// that it found no signature.
func probeWipe(ctx context.Context, device string) wipeProbeResult {
	out := wipeProbeResult{Device: device}
	flag := func(finding string) {
		out.Findings = append(out.Findings, finding)
		out.HasData = true
	}

	b, err := exec.CommandContext(ctx, "blkid", "-o", "full", device).Output()
	switch {
	case err == nil:
		if s := strings.TrimSpace(string(b)); s != "" {
			flag("blkid: " + s)
		}
	default:
		exitErr, exited := err.(*exec.ExitError)
		if !exited || exitErr.ExitCode() != 2 {
			flag("blkid: probe failed: " + err.Error())
		}
	}

	b, err = exec.CommandContext(ctx, "wipefs", "--no-act", device).Output()
	if err != nil {
		flag("wipefs: probe failed: " + err.Error())
		return out
	}
	// wipefs prints a header line even on a clean disk; keep only
	// lines with actual signature data.
	for _, line := range strings.Split(strings.TrimSpace(string(b)), "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "DEVICE") || strings.HasPrefix(line, "offset") {
			continue
		}
		flag("wipefs: " + line)
	}
	return out
}
// ---------- badblocks ----------
// badblocksResult records one badblocks run on a device.
type badblocksResult struct {
// OK is true only when badblocks exited cleanly and printed nothing.
OK bool `json:"ok"`
// Elapsed is the wall-clock run time, rounded to the second.
Elapsed string `json:"elapsed"`
// Error holds the run error, or "bad blocks found" when badblocks
// exited cleanly but produced output.
Error string `json:"error,omitempty"`
// OutputTail is the tail of the combined stdout/stderr.
OutputTail string `json:"output_tail,omitempty"`
}
// runBadblocks runs a destructive badblocks write sample on device,
// bounded by a 5-minute timeout, and reports whether it came back clean.
//
// Flags: -b 4096 block size, -c 64 blocks per check, -w destructive
// write, -t random pattern, with a last-block argument of 65536 so only
// the first 256MiB (65536 × 4k) is sampled and the stage stays bounded.
// A real burn-in would run the whole disk; that belongs in a separate
// "deep" stage.
func runBadblocks(ctx context.Context, device string) badblocksResult {
	began := time.Now()
	runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
	defer cancel()

	raw, err := exec.CommandContext(runCtx, "badblocks",
		"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536").CombinedOutput()

	res := badblocksResult{
		Elapsed:    time.Since(began).Round(time.Second).String(),
		OutputTail: tailLines(string(raw), 10),
	}
	switch {
	case err != nil:
		res.Error = err.Error()
	case strings.TrimSpace(string(raw)) == "":
		// badblocks prints each bad block it finds; silence means clean.
		res.OK = true
	default:
		res.Error = "bad blocks found"
	}
	return res
}
// ---------- fio ----------
// fioResult carries the headline numbers from the fio health job, or the
// reason the job could not be run or parsed.
type fioResult struct {
	ReadIOPS    float64 `json:"read_iops"`
	WriteIOPS   float64 `json:"write_iops"`
	ReadBWKBps  float64 `json:"read_bw_kbps"`
	WriteBWKBps float64 `json:"write_bw_kbps"`
	// Error is set when fio failed to run or its output was unusable;
	// the numeric fields are zero in that case.
	Error string `json:"error,omitempty"`
}

// runFio kicks off a tiny random-rw job: 2 jobs × 64MB × 4k blocks.
// This is a health bar, not a benchmark — we want to know the disk
// services IO, not how fast it is at p99.
//
// The run is capped at five minutes. On any failure the returned result
// has only Error set: the command error text if fio itself failed, or a
// "parse fio json: ..." message if its output could not be decoded or
// contained no jobs.
func runFio(ctx context.Context, device string) fioResult {
	runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
	defer cancel()
	args := []string{
		"--name=health", "--filename=" + device, "--rw=randrw",
		"--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0",
		"--group_reporting", "--output-format=json", "--direct=1",
	}
	cmd := exec.CommandContext(runCtx, "fio", args...)
	out, err := cmd.Output()
	if err != nil {
		return fioResult{Error: err.Error()}
	}
	// Only the fields we report are decoded; everything else in fio's
	// JSON document is ignored.
	var top struct {
		Jobs []struct {
			Read struct {
				IOPS float64 `json:"iops"`
				BW   float64 `json:"bw"`
			} `json:"read"`
			Write struct {
				IOPS float64 `json:"iops"`
				BW   float64 `json:"bw"`
			} `json:"write"`
		} `json:"jobs"`
	}
	if err := json.Unmarshal(out, &top); err != nil {
		return fioResult{Error: fmt.Sprintf("parse fio json: %v", err)}
	}
	// Valid JSON with an empty job list is a separate failure; report it
	// explicitly instead of formatting a nil error.
	if len(top.Jobs) == 0 {
		return fioResult{Error: "parse fio json: no jobs in output"}
	}
	// --group_reporting folds the per-job stats into the first entry.
	j := top.Jobs[0]
	return fioResult{
		ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS,
		ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW,
	}
}
+21
View File
@@ -0,0 +1,21 @@
package tests
import (
"fmt"
"os"
)
// readFile loads the whole file at p. Stages call it to peek at /sys
// entries without importing the agent's probes package, which would
// introduce an import cycle.
func readFile(p string) ([]byte, error) {
	contents, err := os.ReadFile(p)
	return contents, err
}
// formatCount renders "<n> <label>", appending a plain "s" to label for
// every count except exactly 1: (0, "disk") → "0 disks", (1, "disk") →
// "1 disk", (7, "disk") → "7 disks". Used to keep log lines tidy.
func formatCount(n int, label string) string {
	suffix := "s"
	if n == 1 {
		suffix = ""
	}
	return fmt.Sprintf("%d %s%s", n, label, suffix)
}
+39
View File
@@ -0,0 +1,39 @@
package main
import (
	"context"
	"errors"
	"flag"
	"log"
	"os"
	"os/signal"
	"syscall"

	"vetting/agent"
	"vetting/agent/bootstate"
)
// main is the vetting-agent entrypoint. It parses the boot parameters
// from the kernel cmdline, runs the agent until it finishes or the
// process receives SIGINT/SIGTERM, and exits non-zero on any error other
// than that cancellation.
func main() {
	cmdlinePath := flag.String("cmdline", "/proc/cmdline", "path to kernel cmdline (override for local testing)")
	flag.Parse()

	p, err := bootstate.ParseCmdline(*cmdlinePath)
	if err != nil {
		log.Fatalf("bootstate: %v", err)
	}
	log.Printf("vetting-agent starting: run=%d mac=%s orchestrator=%s", p.RunID, p.MAC, p.OrchestratorURL)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Translate the first termination signal into context cancellation
	// so agent.Run can unwind.
	sig := make(chan os.Signal, 1)
	signal.Notify(sig, os.Interrupt, syscall.SIGTERM)
	go func() {
		<-sig
		log.Printf("vetting-agent: signal received, shutting down")
		cancel()
	}()

	// errors.Is also matches a wrapped context.Canceled, so a
	// signal-driven shutdown is not reported as a fatal error just
	// because a lower layer added context to the error.
	if err := agent.Run(ctx, p); err != nil && !errors.Is(err, context.Canceled) {
		log.Fatalf("agent: %v", err)
	}
}
+249
View File
@@ -0,0 +1,249 @@
package main
import (
"context"
"crypto/tls"
"errors"
"flag"
"log"
"net/http"
"os"
"os/signal"
"path/filepath"
"syscall"
"time"
"vetting/internal/api"
"vetting/internal/auth"
"vetting/internal/config"
"vetting/internal/db"
"vetting/internal/events"
"vetting/internal/httpserver"
"vetting/internal/janitor"
"vetting/internal/logs"
"vetting/internal/model"
"vetting/internal/notify"
"vetting/internal/orchestrator"
"vetting/internal/pxe"
"vetting/internal/store"
"vetting/internal/web/templates"
)
// main wires up and runs the orchestrator: it loads the config, opens the
// database, builds the stores, hubs and API handlers, starts the
// background services and the HTTP server, then blocks until SIGINT or
// SIGTERM and shuts everything down in order.
func main() {
configPath := flag.String("config", "deploy/vetting.example.yaml", "path to vetting.yaml")
flag.Parse()
cfg, err := config.Load(*configPath)
if err != nil {
log.Fatalf("load config: %v", err)
}
// Make sure the directories for the DB file, artifacts and logs exist
// before anything tries to open files in them.
for _, dir := range []string{
filepath.Dir(cfg.Database.Path),
cfg.Artifacts.Dir,
cfg.Logs.Dir,
} {
if err := os.MkdirAll(dir, 0o755); err != nil {
log.Fatalf("mkdir %s: %v", dir, err)
}
}
conn, err := db.Open(cfg.Database.Path)
if err != nil {
log.Fatalf("open db: %v", err)
}
defer func() { _ = conn.Close() }()
secret, err := cfg.Auth.SessionSecret()
if err != nil {
log.Fatalf("auth: %v", err)
}
authMgr := &auth.Manager{
PasswordHash: cfg.Auth.AdminPasswordBcrypt,
Secret: secret,
TTL: time.Duration(cfg.Auth.SessionTTLHours) * time.Hour,
}
// Refuse to start with an empty, placeholder or malformed admin hash.
if err := validateAuth(cfg, authMgr); err != nil {
log.Fatalf("auth: %v", err)
}
// Every store shares the single DB connection opened above.
hostStore := &store.Hosts{DB: conn}
runStore := &store.Runs{DB: conn}
stageStore := &store.Stages{DB: conn}
artifactStore := &store.Artifacts{DB: conn}
specDiffStore := &store.SpecDiffs{DB: conn}
measurementStore := &store.Measurements{DB: conn}
hub := events.NewHub()
logHub, err := logs.NewHub(cfg.Logs.Dir, hub)
if err != nil {
log.Fatalf("logs hub: %v", err)
}
defer logHub.Close()
runner := &orchestrator.Runner{
Runs: runStore,
Hosts: hostStore,
Stages: stageStore,
EventHub: hub,
}
tiles := &api.TileEnricher{
Runs: runStore,
Artifacts: artifactStore,
SpecDiffs: specDiffStore,
}
// Inject a templ renderer so the Runner can publish tile-refresh
// fragments via SSE without pulling web/templates into the
// orchestrator package. The closure enriches the tile with spec-
// diff count and hold-key path so every tile render shows the
// same data, whether it came from /events or an initial page load.
orchestrator.TileRenderer = func(ctx context.Context, host model.Host, latest *model.Run) string {
return templates.RenderTileString(tiles.Build(ctx, host, latest))
}
notifyReg, err := notify.BuildRegistry(cfg.Notifiers, cfg.Routes)
if err != nil {
log.Fatalf("notify: %v", err)
}
// Handler set for the operator-facing UI.
ui := &api.UI{
Hosts: hostStore,
Runs: runStore,
Artifacts: artifactStore,
Auth: authMgr,
EventHub: hub,
Runner: runner,
Tiles: tiles,
}
// Handler set for the agent-facing API.
agentAPI := &api.Agent{
Hosts: hostStore,
Runs: runStore,
Stages: stageStore,
Artifacts: artifactStore,
SpecDiffs: specDiffStore,
Measurements: measurementStore,
Runner: runner,
EventHub: hub,
Logs: logHub,
Notify: notifyReg,
ArtifactsDir: cfg.Artifacts.Dir,
OrchestratorURL: cfg.PXE.OrchestratorURL,
PublicURL: cfg.Server.PublicURL,
IperfPort: cfg.Network.IperfPort,
}
agentAPI.LiveKernelURL, agentAPI.LiveInitrdURL = pxe.BuildLiveURLs(cfg.PXE.OrchestratorURL)
dispatcher := orchestrator.NewDispatcher(cfg.Dispatcher.MaxConcurrentRuns, runStore, hostStore, runner)
iperfSup := orchestrator.NewIperfSupervisor(cfg.Network.IperfPort)
// Retention settings are configured in days/minutes; convert them to
// durations for the janitor.
janitorSvc := janitor.New(janitor.Config{
ArtifactRetention: time.Duration(cfg.Artifacts.RetentionDays) * 24 * time.Hour,
LogRetention: time.Duration(cfg.Logs.RetentionDays) * 24 * time.Hour,
Interval: time.Duration(cfg.Janitor.IntervalMinutes) * time.Minute,
}, &janitor.StoreAdapter{Runs: runStore, Artifacts: artifactStore, Logs: logHub})
// When no TFTP root is configured, default to a "tftp" directory next
// to the logs directory.
tftpRoot := cfg.PXE.TFTPRoot
if tftpRoot == "" {
tftpRoot = filepath.Join(cfg.Logs.Dir, "..", "tftp")
}
// The PXE supervisor is only constructed when PXE is enabled; it stays
// nil otherwise and is skipped at start and shutdown.
var supervisor *pxe.Supervisor
if cfg.PXE.Enabled {
supervisor = pxe.NewSupervisor(pxe.SupervisorConfig{
Enabled: true,
Interface: cfg.PXE.Interface,
DHCPRange: cfg.PXE.DHCPRange,
OrchestratorURL: cfg.PXE.OrchestratorURL,
RuntimeDir: filepath.Join(cfg.Logs.Dir, "..", "pxe"),
TFTPRoot: tftpRoot,
})
}
router := httpserver.NewRouter(httpserver.Deps{
Auth: authMgr,
UI: ui,
Agent: agentAPI,
LiveDir: cfg.PXE.LiveDir,
})
srv := &http.Server{
Addr: cfg.Server.Bind,
Handler: router,
ReadHeaderTimeout: 10 * time.Second,
}
// With TLS enabled, require at least TLS 1.2.
if cfg.Server.TLS.Enabled {
srv.TLSConfig = &tls.Config{MinVersion: tls.VersionTLS12}
}
// Register for termination signals before starting any services.
shutdown := make(chan os.Signal, 1)
signal.Notify(shutdown, os.Interrupt, syscall.SIGTERM)
// rootCtx is handed to every background service started below.
rootCtx, cancelRoot := context.WithCancel(context.Background())
defer cancelRoot()
dispatcher.Start(rootCtx)
janitorSvc.Start(rootCtx)
if err := iperfSup.Start(rootCtx); err != nil {
log.Fatalf("start iperf3: %v", err)
}
// The supervisor is started with the currently registered hosts.
if supervisor != nil {
hosts, err := hostStore.List(rootCtx)
if err != nil {
log.Fatalf("list hosts for dnsmasq: %v", err)
}
if err := supervisor.Start(rootCtx, hosts); err != nil {
log.Fatalf("start dnsmasq: %v", err)
}
}
// Serve in the background so main can block on the shutdown signal.
// NOTE(review): log.Fatalf in this goroutine exits the process
// immediately, so a listener failure skips the shutdown sequence below.
go func() {
log.Printf("vetting listening on %s (tls=%v, db=%s)", cfg.Server.Bind, cfg.Server.TLS.Enabled, cfg.Database.Path)
var err error
if cfg.Server.TLS.Enabled {
err = srv.ListenAndServeTLS(cfg.Server.TLS.CertFile, cfg.Server.TLS.KeyFile)
} else {
err = srv.ListenAndServe()
}
if err != nil && !errors.Is(err, http.ErrServerClosed) {
log.Fatalf("server: %v", err)
}
}()
<-shutdown
log.Printf("shutting down")
// Stop background services first, then drain the HTTP server and the
// event hub under a shared 10-second deadline.
dispatcher.Stop()
janitorSvc.Stop()
_ = iperfSup.Shutdown(3 * time.Second)
if supervisor != nil {
_ = supervisor.Shutdown(5 * time.Second)
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
if err := srv.Shutdown(ctx); err != nil {
log.Printf("server shutdown: %v", err)
}
_ = hub.Shutdown(ctx)
}
// validateAuth rejects configurations whose admin password hash is
// unusable: empty, still the shipped placeholder, shorter than four
// bytes, or not starting with '$'. Every rejection returns
// errPlaceholderPassword; the second parameter is unused.
func validateAuth(cfg *config.Config, _ *auth.Manager) error {
	const placeholder = "$2a$10$REPLACE_ME_WITH_A_REAL_BCRYPT_HASH_0123456789abcdefABCDEFxx"

	hash := cfg.Auth.AdminPasswordBcrypt
	switch {
	case hash == "", hash == placeholder:
		return errPlaceholderPassword
	case len(hash) < 4, hash[0] != '$':
		// The empty string was handled above, so hash[0] is safe here.
		return errPlaceholderPassword
	}
	return nil
}
// plainErr is a string-backed error type, so error values can be declared
// directly from a string conversion.
type plainErr string

// Error implements the error interface by returning the underlying string.
func (msg plainErr) Error() string { return string(msg) }

// errPlaceholderPassword tells the operator that the configured admin
// password hash has not been replaced with a real one.
var errPlaceholderPassword = plainErr("auth.admin_password_bcrypt is the placeholder; run bin/gen-admin-password and paste the hash into your config")
+136
View File
@@ -0,0 +1,136 @@
#!/usr/bin/env bash
# install.sh — one-shot installer for the vetting orchestrator on a
# Proxmox LXC (or any Debian/Ubuntu host).
#
# What it does:
# 1. apt-installs runtime dependencies (dnsmasq, iperf3, ca-certs).
# 2. Creates the `vetting` system user with /var/lib/vetting homedir.
# 3. Copies the pre-built `vetting` binary into /usr/local/bin.
# 4. Installs the systemd unit into /etc/systemd/system and the example config into /etc/vetting.
# 5. Reminds the operator to edit the config and set a bcrypt
# password before enabling the service — we don't auto-start
# because a placeholder password would just refuse to boot.
#
# What it deliberately does NOT do:
# - Build the orchestrator (this script assumes you ran
# `make orchestrator-linux` beforehand so that ../bin/vetting-linux-amd64
# exists relative to this script; otherwise pass --binary PATH).
# - Install the live image or TFTP payloads — those are separate,
# since most operators want to build them from a pinned CI artifact
# rather than on the LXC itself.
#
# Usage:
# sudo ./install.sh [--binary PATH] [--config-dir /etc/vetting]
#
# Abort on any failing command, unset variable, or failing pipeline stage.
set -euo pipefail
# Binary to install; empty means auto-detect (overridden by --binary).
BINARY=""
# Where vetting.yaml is installed (overridden by --config-dir).
CONFIG_DIR="/etc/vetting"
# Home directory of the service user.
STATE_DIR="/var/lib/vetting"
# Log directory, created and owned by the service user.
LOG_DIR="/var/log/vetting"
# System account created for the service.
SERVICE_USER="vetting"
# usage prints the command-line help text to stdout.
usage() {
cat <<EOF
Usage: $0 [--binary PATH] [--config-dir DIR]
--binary PATH Path to a pre-built vetting binary (default:
auto-detect ../bin/vetting-linux-amd64 relative to
this script).
--config-dir DIR Where to install vetting.yaml + systemd unit drop
(default: /etc/vetting).
-h, --help Print this message.
EOF
}
# Parse command-line options. Options that take a value are checked for
# one explicitly: under `set -u`, reading a missing "$2" would abort the
# script with a bare "unbound variable" error instead of a usable message.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --binary)
      if [[ $# -lt 2 ]]; then
        echo "$1 requires a value" >&2; usage; exit 2
      fi
      BINARY="$2"; shift 2 ;;
    --config-dir)
      if [[ $# -lt 2 ]]; then
        echo "$1 requires a value" >&2; usage; exit 2
      fi
      CONFIG_DIR="$2"; shift 2 ;;
    -h|--help) usage; exit 0 ;;
    *) echo "unknown arg: $1" >&2; usage; exit 2 ;;
  esac
done
# Everything below writes to system locations, so require root up front.
if [[ $EUID -ne 0 ]]; then
echo "install.sh must be run as root (try: sudo $0)" >&2
exit 1
fi
# Resolve this script's directory and its parent (the repo root) so the
# lookups below work regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
# With no --binary given, take the first executable among the known
# build output locations.
if [[ -z "${BINARY}" ]]; then
for cand in \
"${REPO_ROOT}/bin/vetting-linux-amd64" \
"${REPO_ROOT}/bin/vetting" \
"${SCRIPT_DIR}/vetting"; do
if [[ -x "${cand}" ]]; then BINARY="${cand}"; break; fi
done
fi
if [[ -z "${BINARY}" || ! -x "${BINARY}" ]]; then
echo "could not find a vetting binary to install; pass --binary PATH or run 'make orchestrator-linux' first" >&2
exit 1
fi
echo "==> installing runtime dependencies"
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y --no-install-recommends \
ca-certificates dnsmasq iperf3
echo "==> creating ${SERVICE_USER} user"
# Only create the account when it does not already exist, so re-running
# the installer is safe.
if ! id -u "${SERVICE_USER}" >/dev/null 2>&1; then
useradd --system \
--home-dir "${STATE_DIR}" \
--shell /usr/sbin/nologin \
"${SERVICE_USER}"
fi
echo "==> preparing directories"
# State and log directories belong to the service user; the config
# directory keeps the default (root) ownership.
install -d -m 0755 -o "${SERVICE_USER}" -g "${SERVICE_USER}" "${STATE_DIR}"
install -d -m 0755 -o "${SERVICE_USER}" -g "${SERVICE_USER}" "${LOG_DIR}"
install -d -m 0755 "${CONFIG_DIR}"
echo "==> installing binary"
install -m 0755 "${BINARY}" /usr/local/bin/vetting
echo "==> installing config and systemd unit"
# Never overwrite an existing config. A fresh one is installed as
# root:${SERVICE_USER} with mode 0640.
if [[ ! -f "${CONFIG_DIR}/vetting.yaml" ]]; then
install -m 0640 -o root -g "${SERVICE_USER}" \
"${SCRIPT_DIR}/vetting.example.yaml" \
"${CONFIG_DIR}/vetting.yaml"
echo " -> installed default config at ${CONFIG_DIR}/vetting.yaml"
else
echo " -> preserving existing ${CONFIG_DIR}/vetting.yaml"
fi
# NOTE(review): the unit file is copied verbatim; confirm its --config
# path still matches when --config-dir is not /etc/vetting.
install -m 0644 "${SCRIPT_DIR}/vetting.service" /etc/systemd/system/vetting.service
# Disable the distro's dnsmasq so only the orchestrator-supervised
# instance owns DHCP/TFTP. Operators who want to keep dnsmasq for
# something else can re-enable it after configuring a disjoint listen
# address.
if systemctl is-enabled --quiet dnsmasq 2>/dev/null; then
echo "==> disabling distro dnsmasq (orchestrator supervises its own)"
systemctl disable --now dnsmasq
fi
# Make systemd pick up the newly installed unit file.
systemctl daemon-reload
# Print the post-install checklist. The service is deliberately left
# disabled until the operator has replaced the placeholder credentials.
# The password hash comes from the standalone bin/gen-admin-password
# helper; the installed `vetting` binary only accepts --config.
cat <<EOF
vetting is installed but not yet enabled.
Next steps:
1. Edit ${CONFIG_DIR}/vetting.yaml and set:
- auth.admin_password_bcrypt (run: bin/gen-admin-password 'YOURPW')
- auth.session_secret_hex (run: openssl rand -hex 32)
- server.public_url (the URL you'll browse to)
- pxe.* if you want PXE boot support
- notifiers + routes (optional)
2. Start the service:
systemctl enable --now vetting
3. Watch the logs:
journalctl -fu vetting
EOF
+89
View File
@@ -0,0 +1,89 @@
server:
bind: "127.0.0.1:8080"
# Base URL the orchestrator is reachable at from the operator's
# browser. Used as the click-through link in notifications, so it
# should be the *external* URL (e.g. https://vetting.lan:8443),
# not the bind address.
public_url: "http://127.0.0.1:8080"
tls:
enabled: false
cert_file: ""
key_file: ""
database:
path: "./var/vetting.db"
artifacts:
dir: "./var/artifacts"
# Days to keep per-run artifact files (report.html, report.json, fio,
# iperf, inventory.json, hold keys). DB rows are preserved. 0 = forever.
retention_days: 30
logs:
dir: "./var/logs"
# Days to keep per-run log files. 0 = forever.
retention_days: 30
janitor:
# Interval between cleanup sweeps. 0 defaults to 60.
interval_minutes: 60
auth:
# bcrypt hash of your admin password.
# Generate via: ./bin/gen-admin-password "your-password"
admin_password_bcrypt: "$2a$10$REPLACE_ME_WITH_A_REAL_BCRYPT_HASH_0123456789abcdefABCDEFxx"
# Random 32-byte hex string used to sign session cookies.
# Generate via: openssl rand -hex 32 (or use PowerShell equivalent)
session_secret_hex: "0000000000000000000000000000000000000000000000000000000000000000"
session_ttl_hours: 24
dispatcher:
max_concurrent_runs: 3
# PXE boot support. When enabled, the orchestrator starts and supervises its own dnsmasq using the settings below.
pxe:
enabled: false
interface: "" # e.g. "eth0"
dhcp_range: "" # e.g. "10.77.0.100,10.77.0.200,12h"
orchestrator_url: "" # e.g. "http://10.77.0.1:8080"
tftp_root: "" # holds ipxe.efi + undionly.kpxe
live_dir: "" # holds vmlinuz + initrd.img; served at /live/*
# Notifications fire on StageFailed, SpecMismatch, HoldingOpened,
# RunCompleted. Declare one or more notifiers and route each event
# kind (and optionally severity) to a notifier by name. Delivery is
# fire-and-forget (one attempt per event, logged on failure).
#
# Example (uncomment and fill in):
#
# notifiers:
# - name: ops-ntfy
# type: ntfy
# server: https://ntfy.sh
# topic: vetting-YOUR-TOPIC
# - name: ops-discord
# type: discord
# webhook_url: https://discord.com/api/webhooks/XXX/YYY
# - name: ops-email
# type: smtp
# smtp:
# host: mail.lan
# port: 25
# from: vetting@lan.local
# to: [ops@lan.local]
#
# routes:
# # Critical events (failures / holds) fire on all three channels.
# - match_severity: [critical]
# notifier: ops-ntfy
# - match_severity: [critical]
# notifier: ops-discord
# - match_severity: [critical]
# notifier: ops-email
# # RunCompleted is informational — push to ntfy only.
# - match_kind: [RunCompleted]
# notifier: ops-ntfy
notifiers: []
routes: []
+53
View File
@@ -0,0 +1,53 @@
[Unit]
Description=Vetting orchestrator (post-repair hardware validation)
Documentation=https://github.com/your-org/vetting
After=network-online.target
Wants=network-online.target
# Start rate limiting is a [Unit] setting. With RestartSec=5 below, a
# unit that keeps failing on startup is given up on after 5 attempts
# within 60 seconds instead of being restarted forever.
StartLimitBurst=5
StartLimitIntervalSec=60

[Service]
Type=simple
User=vetting
Group=vetting
ExecStart=/usr/local/bin/vetting --config /etc/vetting/vetting.yaml

# The orchestrator embeds dnsmasq and sends raw WoL broadcasts. Rather
# than run as root, grant just the caps we need:
#   CAP_NET_BIND_SERVICE — if the operator binds :443 or :80
#   CAP_NET_RAW          — WoL magic packet via DGRAM broadcast; not
#                          strictly required when using UDP broadcast to
#                          255.255.255.255 on port 9, but safer to carry
#                          so custom ports work.
#   CAP_NET_ADMIN        — dnsmasq needs this to create the DHCP socket
#                          and to bind to a specific interface.
AmbientCapabilities=CAP_NET_BIND_SERVICE CAP_NET_RAW CAP_NET_ADMIN
CapabilityBoundingSet=CAP_NET_BIND_SERVICE CAP_NET_RAW CAP_NET_ADMIN

# Filesystem: the orchestrator needs to write to /var/lib/vetting and
# /var/log/vetting. Everything else is read-only.
ReadWritePaths=/var/lib/vetting /var/log/vetting
ProtectSystem=strict
ProtectHome=true
NoNewPrivileges=true
PrivateTmp=true
PrivateDevices=true
ProtectControlGroups=true
ProtectKernelTunables=true
ProtectKernelModules=true
RestrictSUIDSGID=true
RestrictNamespaces=true
LockPersonality=true

# Restart policy — recover from transient failures; the start limit in
# [Unit] stops the restart loop when startup keeps failing.
Restart=on-failure
RestartSec=5

# Logs go to journald; the orchestrator's own per-run log files live
# under /var/log/vetting regardless.
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=multi-user.target
+178
View File
@@ -0,0 +1,178 @@
# Architecture
A single Go binary runs the orchestrator. A second Go binary runs
inside a custom Debian live image (built with mkosi) and becomes the
per-run test agent. The two talk over HTTP + SSE.
```
Operator browser (HTMX + SSE, admin login)
│ HTTPS
┌───────────────────────────────────────────────────────────────┐
│ Orchestrator LXC — single Go binary `vetting` │
│ │
│ UI (Templ) ─┬─ Agent API ─┬─ SSE hub │
│ │ │ │
│ Orchestrator core (state machine, dispatcher sem=3, │
│ stage executors, WoL sender, token issuer) │
│ │ │
│ ┌─────┴─────┬──────────┐ │
│ ▼ ▼ ▼ │
│ SQLite flat-file logs dnsmasq subprocess │
│ (DHCP+TFTP+HTTP, MAC allowlist)│
│ │
│ Janitor goroutine (retention-based cleanup) │
│ Notifier registry (ntfy/discord/smtp) │
└─────────────────────────────────────────┬─────────────────────┘
│ LAN
Host under test (×23)
PXE → iPXE → Linux live image
└─ vetting-agent (HTTP+SSE back)
```
## Packages
| Package | Purpose |
|---|---|
| `cmd/vetting` | Orchestrator entrypoint. Wires config, stores, runner, dispatcher, iperf supervisor, PXE supervisor, janitor, HTTP router. |
| `cmd/vetting-agent` | In-image agent entrypoint. Reads kernel cmdline params, starts the agent loop. |
| `internal/config` | YAML loader + types. |
| `internal/db` | SQLite open + embedded migrations. Pure Go via modernc.org/sqlite. |
| `internal/model` | Plain structs: `Host`, `Run`, `Stage`, `Measurement`, `SpecDiff`, `Artifact`. |
| `internal/store` | Repository layer; SQL is hand-written. |
| `internal/orchestrator` | State machine, dispatcher, per-run runner, WoL sender, HMAC run tokens, iperf supervisor. |
| `internal/api` | HTTP handlers: `agent_handlers.go` (the agent-facing API) and `ui_handlers.go` (HTMX fragments + SSE). |
| `internal/httpserver` | chi router assembly — lives here to avoid `api ↔ orchestrator` cyclic imports. |
| `internal/web` | Embedded static assets + compiled Templ templates. |
| `internal/auth` | Single-admin bcrypt + signed-cookie sessions. |
| `internal/pxe` | dnsmasq subprocess supervisor + per-MAC iPXE script generator. |
| `internal/events` | In-process SSE hub (fan-out to live browser clients). |
| `internal/logs` | Per-run flat-file writer + SSE fan-out of live log tail. |
| `internal/spec` | Expected-vs-actual diff engine with severity classification. |
| `internal/notify` | Pluggable notifier registry (ntfy, Discord webhook, SMTP). |
| `internal/report` | HTML + JSON report generation (html/template, self-contained). |
| `internal/hold` | Per-run SSH key issuance for `FailedHolding`. |
| `internal/janitor` | Retention-based cleanup of old artifact files + log files. |
| `agent/` | In-image agent: claim loop, stage dispatch, heartbeat, log forwarder, thermal sidecar. |
| `agent/probes` | lshw, dmidecode, smartctl, lspci, hwmon, nvidia-smi wrappers. |
| `agent/tests` | Per-stage test implementations (SMART, CPUStress, Storage, Network, GPU, PSU). |
| `live-image/` | mkosi config + postinst for the Debian live image. |
| `deploy/` | systemd unit + example config + install.sh. |
| `test/e2e/` | Build-tagged (`-tags=e2e`) QEMU + PXE full-stack test. |
## State machine
Per-run state is the single source of truth; the UI is a pure
projection of DB + event stream.
```
Registered → Queued → WaitingWoL → Booting → InventoryCheck
→ SpecValidate → SMART → CPUStress → Storage → Network
→ GPU → PSU → Reporting → Completed
any stage → Failed → FailedHolding → Released
```
Key points:
- **Transitions are table-driven** (`internal/orchestrator/statemachine.go`).
Each `(state, event) → (next, action)` is encoded once.
- **Orchestrator-owned stages resolve inside `/result`:** `SpecValidate`
and `Reporting` flip state forward as part of the preceding stage's
result handler, so the agent never sees them as "its turn".
- **Stage rows persist before SSE fan-out** — the UI can re-derive
state by reading SQLite, and an SSE reconnect mid-run just fetches
fresh tile fragments.
## Agent ↔ orchestrator protocol
```
GET /ipxe/{MAC} → per-MAC iPXE script
POST /api/v1/runs/{id}/hello → "I booted; here's my address"
POST /api/v1/runs/{id}/claim → validate token, receive stage list
POST /api/v1/runs/{id}/heartbeat → liveness ping; response carries cmd
POST /api/v1/runs/{id}/log → batch of log lines
POST /api/v1/runs/{id}/sensor → batch of measurements (thermals, throughput)
POST /api/v1/runs/{id}/result → stage result; response says next_state
POST /api/v1/runs/{id}/hold → on FailedHolding, receive authorized_key
```
Auth on every `/api/v1/*` call: the agent presents its run token as a
bearer token; the orchestrator stores only a bcrypt hash of it in
`runs.agent_token_hash` and compares in constant time. The plaintext
travels in the kernel cmdline, so it can only be obtained by a machine
on the trusted bridge: the iPXE script is issued per-MAC, and the MAC
must already be in the dnsmasq allowlist.
### Heartbeat control channel
The heartbeat response carries a `cmd` field the agent acts on:
| cmd | When fired | Agent action |
|---|---|---|
| `continue` | Normal case | No-op; keep running current stage |
| `shutdown` | Run reached `Completed` | `systemctl poweroff` |
| `abort` | Run in `FailedHolding` or `Released` | Stop heartbeat loop; let the operator drive |
| `retry_stage` | Operator pressed **Override wipe-probe** | Re-enter the named stage with `override_flags` armed |
## Safety: destructive disk tests
Four layered gates:
1. **MAC allowlist** — dnsmasq only answers DHCP for registered MACs.
2. **Signed run token** — orchestrator issues a per-run HMAC token in
the iPXE kernel cmdline; the agent submits it on `/claim` and the
orchestrator verifies before handing back the stage list.
3. **Wipe probe** — before `badblocks`, the agent scans for filesystem
signatures / LVM metadata / partition tables. Anything found →
`FailedHolding` on `Storage`. The operator explicitly clicks
**Override wipe-probe** to proceed.
4. **Device allowlist** — the agent only targets block devices matching
the inventory's `expected_disks`. USB sticks and surprise disks are
skipped.
## Notifications
Fire-and-forget. The orchestrator fires four event kinds:
| Kind | Severity | When |
|---|---|---|
| `StageFailed` | critical | Any stage returns `passed=false` |
| `SpecMismatch` | critical | `SpecValidate` finds critical diffs |
| `HoldingOpened` | critical | Agent POSTs `/hold` (operator can SSH in) |
| `RunCompleted` | info | Pipeline reaches `Completed` |
The config maps event kinds and severities to one or more notifiers
(ntfy, Discord webhook, SMTP). Each notifier gets one attempt per
event with a 10s timeout; delivery failures are logged, nothing is
persisted.
## Why a separate notify package?
Keeps the `/result` and `/hold` handlers non-blocking. Each dispatch
starts a goroutine per target; a slow ntfy server doesn't back up an
SMTP notifier or delay the HTTP response to the agent.
## Data retention
The janitor goroutine (`internal/janitor`) runs a sweep every
`janitor.interval_minutes` (default 60) and deletes:
- artifact files older than `artifacts.retention_days`, plus their
`artifacts` table rows
- log files older than `logs.retention_days`
`runs`, `hosts`, `stages`, `measurements`, `spec_diffs` rows are
**never** deleted by the janitor — host histories and aggregate
metrics survive cleanups.
## Reproducible builds
The orchestrator and agent are pure Go; `make orchestrator-linux`
cross-compiles to `linux-amd64` from Windows or macOS.
The live image requires Linux-side tooling (mkosi, debootstrap,
squashfs-tools) so `make live-image` fails loudly on Windows and
redirects to `wsl make live-image`. Pinning to snapshot.debian.org in
`live-image/mkosi.conf` keeps image bits stable across time for a
given git SHA.
+171
View File
@@ -0,0 +1,171 @@
# Operations
Operator-facing runbook for the vetting orchestrator. If you're looking
for the "what does the system do" overview, see
[architecture.md](architecture.md). For what each test stage actually
measures, see [test-suite.md](test-suite.md).
## Install (Proxmox LXC)
Target: a Debian/Ubuntu LXC on a Proxmox host in the cluster whose
nodes you are vetting. The LXC must be on the same L2 segment as the
repaired nodes so DHCP and WoL work.
1. On your workstation, cross-build the binary:
```
make orchestrator-linux
```
This produces `bin/vetting-linux-amd64`.
2. Copy the repo tree (or just `bin/`, `deploy/`) into the LXC, then
from inside the LXC:
```
sudo ./deploy/install.sh
```
The installer:
- `apt install`s `dnsmasq`, `iperf3`, `ca-certificates`
- creates the `vetting` system user (home = `/var/lib/vetting`)
- installs the binary into `/usr/local/bin/vetting`
- drops `vetting.example.yaml` into `/etc/vetting/vetting.yaml`
(only if there's no existing config — existing configs are
preserved)
- drops `/etc/systemd/system/vetting.service`
- disables the distro-default dnsmasq (the orchestrator supervises
its own)
The installer does **not** enable the service, because the default
config has a placeholder bcrypt password that the binary refuses to
start with.
3. Generate an admin password hash and a session secret, then edit
`/etc/vetting/vetting.yaml`:
```
./bin/gen-admin-password 'your-password-here' # prints a bcrypt hash
openssl rand -hex 32 # prints a 64-char hex string
```
Required fields:
- `auth.admin_password_bcrypt` — the bcrypt hash
- `auth.session_secret_hex` — the 32-byte hex string
- `server.public_url` — the URL your browser hits the LXC on
(e.g. `https://vetting.lan:8443`). This is used as the
click-through link in notifications, so it must be the *external*
URL, not the bind address.
4. (Optional) Configure notifiers in the same file — see the
commented-out example block for ntfy / Discord / SMTP.
5. Enable and start:
```
sudo systemctl enable --now vetting
sudo journalctl -fu vetting
```
## First vetting run
Against a QEMU VM first, before you point it at real hardware:
1. On the Proxmox host (or wherever your LXC lives):
```
sudo ip link add br-vetting type bridge
sudo ip addr add 10.77.0.1/24 dev br-vetting
sudo ip link set br-vetting up
```
2. In the UI at `https://<lxc>:8443`, log in and register a host:
- Name: `qemu-test`
- MAC: `52:54:00:12:34:56`
- WoL broadcast IP: `10.77.0.255`
- Expected spec: paste a minimal YAML like
```yaml
memory: { total_gib: 4 }
cpu: { logical_cores: 4 }
```
3. Click **Start Vetting**. The UI tile will sit at `Queued → WaitingWoL`.
4. Launch the QEMU VM on the bridge so it PXE-boots from dnsmasq:
```
sudo qemu-system-x86_64 \
-enable-kvm -cpu host -smp 4 -m 4096 \
-netdev bridge,id=n0,br=br-vetting \
-device virtio-net-pci,netdev=n0,mac=52:54:00:12:34:56 \
-drive file=/tmp/test-disk.img,format=raw,if=virtio \
-boot n -serial mon:stdio -display none
```
5. Watch the tile advance through stages. On success, the tile shows
**View report** and the VM auto-shuts-down.
For real repaired hardware: same flow, but register the node's actual
MAC + expected spec, and make sure the node's BIOS is set to PXE-boot
from the NIC that's on the `br-vetting` network.
## A failed run — SSH to the held host
When a stage fails, the pipeline halts at `FailedHolding` and the
agent installs an orchestrator-issued SSH key into the live-image's
`/root/.ssh/authorized_keys`. The UI tile surfaces the IP and the
exact `ssh` command.
The hold key is **per-run**. Once you're done:
1. Power the host off (`poweroff` from the SSH session).
2. In the UI, click **Override wipe-probe** only when the failure was
at the `Storage` stage *and* you're sure the disks are expendable.
Otherwise click **Start vetting** on a fresh run from the host
dashboard after fixing the underlying issue.
## Log + artifact layout
```
/var/lib/vetting/
vetting.db # SQLite: hosts, runs, stages, artifacts, spec_diffs, measurements
artifacts/
run-<N>/
report.html # operator-facing summary
report.json # machine-readable summary
inventory.json # raw probe output
fio-<disk>.log # storage stage output
iperf-<nic>.json # network stage output
hold-<N>.pub # per-run SSH pubkey (only if held)
/var/log/vetting/
run-<N>.log # append-only per-run log tail
```
Retention is governed by the `artifacts.retention_days` and
`logs.retention_days` settings. DB rows (run history) are preserved
indefinitely; only on-disk files get pruned.
## Troubleshooting
| Symptom | First check |
|---|---|
| Service refuses to start with `auth.admin_password_bcrypt is the placeholder` | You didn't replace the bcrypt hash in the config. Run `gen-admin-password`. |
| PXE client gets no DHCP offer | `journalctl -u vetting` for dnsmasq errors; confirm the LXC has `CAP_NET_ADMIN` (the shipped systemd unit does); confirm the host MAC is actually registered (`sqlite3 /var/lib/vetting/vetting.db 'SELECT name, mac FROM hosts;'`). |
| Agent `/hello` never fires | Check the live image is actually loading the agent binary — SSH into the live env (use the hold key path), `systemctl status vetting-agent`. |
| Tile stuck on `Booting` | Most likely the live image booted but the agent can't reach the orchestrator. Verify `vetting.orchestrator=` in the kernel cmdline resolves from the host's network. |
| UI shows stale stage | Force a reload; the SSE stream reconnects automatically, but the browser keeps showing the last state it received across brief network blips. |
| Notification didn't fire | `journalctl -u vetting \| grep notify:` — delivery is fire-and-forget and the failure reason is logged but not persisted. |
## Upgrading
1. `make orchestrator-linux` on your workstation.
2. `scp bin/vetting-linux-amd64 lxc:/tmp/vetting.new`
3. On the LXC:
```
sudo systemctl stop vetting
sudo install -m 0755 /tmp/vetting.new /usr/local/bin/vetting
sudo systemctl start vetting
```
The DB migration runs at startup and is append-only — no manual schema
work unless a release's notes call it out.
+166
View File
@@ -0,0 +1,166 @@
# Test suite
What each stage measures, what "pass" means, and where the results
land. Stages run strictly in order. Any stage returning `passed=false`
halts the pipeline at `FailedHolding` — the operator decides whether
to fix, override, or abandon.
## Stage order
```
Inventory → SpecValidate → SMART → CPUStress → Storage
→ Network → GPU → PSU → Reporting
```
Stages marked *orchestrator-owned* resolve inside `/result` and never
show up as "the agent's turn".
---
## Inventory
**Owner:** agent.
**What it does:** `dmidecode`, `lscpu`, `lshw`, `lspci`, `smartctl -i`
over each block device, `nvidia-smi -q` if present. The raw output is
merged into a single JSON blob.
**Pass:** the probes run to completion; missing optional tools (e.g.
`nvidia-smi` on a GPU-less host) are tolerated.
**Artifacts:** `inventory.json` under `artifacts/run-<N>/`.
## SpecValidate *(orchestrator-owned)*
**Owner:** orchestrator (resolves inline inside the `/result` for the
preceding Inventory stage).
**What it does:** diffs the submitted inventory against the host's
`expected_spec_yaml`. The diff engine classifies each field as
`critical`, `warning`, or `info`.
**Pass:** zero `critical` diffs.
**Fail mode:** fires a `SpecMismatch` notification; transitions run
to `Failed → FailedHolding`.
**Artifacts:** `spec_diffs` table rows (one per divergence).
## SMART
**Owner:** agent.
**What it does:** `smartctl -a /dev/<disk>` for each disk in the
inventory's `expected_disks`. Parses reallocated-sector counts, pending
sectors, end-to-end error counters, overall-health attribute.
**Pass:** SMART overall-health is PASSED on every expected disk and
reallocated-sector count is below threshold.
**Artifacts:** `smart-<disk>.txt` raw output.
## CPUStress
**Owner:** agent.
**What it does:** runs `stress-ng --cpu N --vm M --vm-bytes 90% -t
120s` with `N = logical_cores` and `M ≈ logical_cores/2`. The `--vm`
flag is the **stand-in for Memtest86+**: it exercises the memory
subsystem under load and will fail if the RAM has latent faults that
surface under thermal + allocator pressure.
**Pass:** `stress-ng` exits 0 and thermal samples taken by the sidecar
stay below the configured per-host `max_temp_c`.
**Caveat:** weaker than a dedicated memtest pass; see
[architecture.md](architecture.md) for the reasoning (Memtest86+
can't be signalled back without IPMI serial).
## Storage
**Owner:** agent (destructive).
**What it does:**
1. **Wipe probe** — scans for filesystem signatures, LVM metadata,
partition tables on the expected disks. Any hit → halt with
`UnexpectedData`; operator must click **Override wipe-probe**.
2. `badblocks -svw` (destructive read/write) on each expected disk.
3. `fio --rw=randrw --bs=4k --iodepth=32 --runtime=60 --size=1G` on
each disk; captures IOPS and p99 latency.
**Pass:** badblocks reports zero bad blocks; fio IOPS above a
per-class floor (configurable).
**Artifacts:** `fio-<disk>.json` per disk.
**Safety gate:** the wipe-probe + device allowlist are the second and
third lines of defense against wiping the wrong disk. See
[architecture.md § Safety](architecture.md#safety-destructive-disk-tests).
## Network
**Owner:** agent.
**What it does:** `iperf3 -c <orchestrator> -p <iperf_port> -t 10 -J`
to measure throughput to the orchestrator. The orchestrator-side
`iperf3 -s` is supervised by `internal/orchestrator/iperf.go` and
binds to the configured `network.iperf_port`.
**Pass:** throughput ≥ per-class floor (1 Gbps for 1GbE NICs, 9 Gbps
for 10GbE).
**Artifacts:** `iperf-<nic>.json`.
## GPU
**Owner:** agent.
**What it does:** runs `nvidia-smi -q` and a short compute workload
(`gpu-burn` if present, else `nvidia-smi dmon` during a `stress-ng
--gpu` burst). Skipped cleanly when no GPU is present.
**Pass:** no ECC errors reported; temperature below threshold; compute
workload exits 0.
## PSU
**Owner:** agent.
**What it does:** reads `/sys/class/hwmon/*/power_average` and `in*_input`
during a synthetic load burst (CPU + disk + NIC simultaneously) to
look for voltage sag or wattage anomalies. Records the full envelope
as `measurements` rows with `kind=psu`.
**Pass:** no voltage dip below threshold across the load burst.
**Caveat:** only reports on what the BMC exposes via hwmon — servers
without exposed PSU telemetry pass trivially. Documented limitation.
## Reporting *(orchestrator-owned)*
**Owner:** orchestrator (resolves inline inside the `/result` for PSU).
**What it does:**
1. Gathers run, host, stages, spec_diffs, and measurement aggregates.
2. Renders `report.html` via `internal/report` (html/template with
inlined CSS; self-contained offline-viewable).
3. Writes `report.json` with the same data in machine-readable form.
4. Records both as `report_html` / `report_json` artifact rows.
5. Transitions run → `Completed`.
6. Fires `RunCompleted` notification.
7. The next agent heartbeat returns `cmd=shutdown`.
## Thermal sidecar
**Owner:** agent (always-on from `Booting` until the agent exits).
**What it does:** every 5 seconds, walks `/sys/class/hwmon/*` and
POSTs temperature samples as a batch to `/sensor`. Populates the
`measurements` table with `kind=thermal`.
**No pass/fail** on its own — stages that care about thermals read the
sidecar's data via `measurements`. A dead sensor just drops out of
the next batch.
---
## Where pass/fail lives
- `runs.state` — authoritative terminal state (`Completed`,
`FailedHolding`, `Released`).
- `runs.result` — `pass` or `fail` string once the run completes.
- `runs.failed_stage` — name of the stage that halted the pipeline, if
any. Cleared when the operator overrides and re-enters.
- `stages` — one row per attempted stage with `passed`, `started_at`,
`completed_at`, `summary_json`, `message`.
- `measurements` — time-series samples from the thermal sidecar and
from stages that capture numeric outputs.
- `artifacts` — on-disk files (report, fio logs, iperf logs, etc).
- `spec_diffs` — one row per expected-vs-actual divergence.
## Adding a new stage
1. Add the name to `store.DefaultStageOrder`.
2. Add a `model.State<Name>` const and wire it into
`internal/orchestrator/statemachine.go` (both the forward
transition table and the stage-for-state lookup).
3. Add a case to `agent/runner.go`'s `runStage` dispatch.
4. Drop the implementation into `agent/tests/`.
5. If the stage is orchestrator-owned, add a `resolve<Name>` helper to
`internal/api/agent_handlers.go` and invoke it from the `/result`
handler after the preceding stage's `NextState` resolves.
+27
View File
@@ -0,0 +1,27 @@
module vetting
go 1.23.0
require (
github.com/a-h/templ v0.3.1001
github.com/go-chi/chi/v5 v5.1.0
golang.org/x/crypto v0.28.0
gopkg.in/yaml.v3 v3.0.1
modernc.org/sqlite v1.33.1
)
require (
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/ncruces/go-strftime v0.1.9 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
golang.org/x/sys v0.34.0 // indirect
modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 // indirect
modernc.org/libc v1.55.3 // indirect
modernc.org/mathutil v1.6.0 // indirect
modernc.org/memory v1.8.0 // indirect
modernc.org/strutil v1.2.0 // indirect
modernc.org/token v1.1.0 // indirect
)
+63
View File
@@ -0,0 +1,63 @@
github.com/a-h/templ v0.3.1001 h1:yHDTgexACdJttyiyamcTHXr2QkIeVF1MukLy44EAhMY=
github.com/a-h/templ v0.3.1001/go.mod h1:oCZcnKRf5jjsGpf2yELzQfodLphd2mwecwG4Crk5HBo=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/go-chi/chi/v5 v5.1.0 h1:acVI1TYaD+hhedDJ3r54HyA6sExp3HfXq7QWEEY/xMw=
github.com/go-chi/chi/v5 v5.1.0/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/pprof v0.0.0-20240409012703-83162a5b38cd h1:gbpYu9NMq8jhDVbvlGkMFWCjLFlqqEZjEmObmhUy6Vo=
github.com/google/pprof v0.0.0-20240409012703-83162a5b38cd/go.mod h1:kf6iHlnVGwgKolg33glAes7Yg/8iWP8ukqeldJSO7jw=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4=
github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
golang.org/x/crypto v0.28.0 h1:GBDwsMXVQi34v5CCYUm2jkJvu4cbtru2U4TN2PSyQnw=
golang.org/x/crypto v0.28.0/go.mod h1:rmgy+3RHxRZMyY0jjAJShp2zgEdOqj2AO7U0pYmeQ7U=
golang.org/x/mod v0.26.0 h1:EGMPT//Ezu+ylkCijjPc+f4Aih7sZvaAr+O3EHBxvZg=
golang.org/x/mod v0.26.0/go.mod h1:/j6NAhSk8iQ723BGAUyoAcn7SlD7s15Dp9Nd/SfeaFQ=
golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw=
golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA=
golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0=
golang.org/x/tools v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
modernc.org/cc/v4 v4.21.4 h1:3Be/Rdo1fpr8GrQ7IVw9OHtplU4gWbb+wNgeoBMmGLQ=
modernc.org/cc/v4 v4.21.4/go.mod h1:HM7VJTZbUCR3rV8EYBi9wxnJ0ZBRiGE5OeGXNA0IsLQ=
modernc.org/ccgo/v4 v4.19.2 h1:lwQZgvboKD0jBwdaeVCTouxhxAyN6iawF3STraAal8Y=
modernc.org/ccgo/v4 v4.19.2/go.mod h1:ysS3mxiMV38XGRTTcgo0DQTeTmAO4oCmJl1nX9VFI3s=
modernc.org/fileutil v1.3.0 h1:gQ5SIzK3H9kdfai/5x41oQiKValumqNTDXMvKo62HvE=
modernc.org/fileutil v1.3.0/go.mod h1:XatxS8fZi3pS8/hKG2GH/ArUogfxjpEKs3Ku3aK4JyQ=
modernc.org/gc/v2 v2.4.1 h1:9cNzOqPyMJBvrUipmynX0ZohMhcxPtMccYgGOJdOiBw=
modernc.org/gc/v2 v2.4.1/go.mod h1:wzN5dK1AzVGoH6XOzc3YZ+ey/jPgYHLuVckd62P0GYU=
modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 h1:5D53IMaUuA5InSeMu9eJtlQXS2NxAhyWQvkKEgXZhHI=
modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6/go.mod h1:Qz0X07sNOR1jWYCrJMEnbW/X55x206Q7Vt4mz6/wHp4=
modernc.org/libc v1.55.3 h1:AzcW1mhlPNrRtjS5sS+eW2ISCgSOLLNyFzRh/V3Qj/U=
modernc.org/libc v1.55.3/go.mod h1:qFXepLhz+JjFThQ4kzwzOjA/y/artDeg+pcYnY+Q83w=
modernc.org/mathutil v1.6.0 h1:fRe9+AmYlaej+64JsEEhoWuAYBkOtQiMEU7n/XgfYi4=
modernc.org/mathutil v1.6.0/go.mod h1:Ui5Q9q1TR2gFm0AQRqQUaBWFLAhQpCwNcuhBOSedWPo=
modernc.org/memory v1.8.0 h1:IqGTL6eFMaDZZhEWwcREgeMXYwmW83LYW8cROZYkg+E=
modernc.org/memory v1.8.0/go.mod h1:XPZ936zp5OMKGWPqbD3JShgd/ZoQ7899TUuQqxY+peU=
modernc.org/opt v0.1.3 h1:3XOZf2yznlhC+ibLltsDGzABUGVx8J6pnFMS3E4dcq4=
modernc.org/opt v0.1.3/go.mod h1:WdSiB5evDcignE70guQKxYUl14mgWtbClRi5wmkkTX0=
modernc.org/sortutil v1.2.0 h1:jQiD3PfS2REGJNzNCMMaLSp/wdMNieTbKX920Cqdgqc=
modernc.org/sortutil v1.2.0/go.mod h1:TKU2s7kJMf1AE84OoiGppNHJwvB753OYfNl2WRb++Ss=
modernc.org/sqlite v1.33.1 h1:trb6Z3YYoeM9eDL1O8do81kP+0ejv+YzgyFo+Gwy0nM=
modernc.org/sqlite v1.33.1/go.mod h1:pXV2xHxhzXZsgT/RtTFAPY6JJDEvOTcTdwADQCCWD4k=
modernc.org/strutil v1.2.0 h1:agBi9dp1I+eOnxXeiZawM8F4LawKv4NzGWSaLfyeNZA=
modernc.org/strutil v1.2.0/go.mod h1:/mdcBmfOibveCTBxUl5B5l6W+TTH1FXPLHZE6bTosX0=
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
+918
View File
@@ -0,0 +1,918 @@
package api
import (
"context"
"crypto/sha256"
"crypto/subtle"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"log"
"net"
"net/http"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/go-chi/chi/v5"
"vetting/internal/events"
"vetting/internal/hold"
"vetting/internal/logs"
"vetting/internal/model"
"vetting/internal/notify"
"vetting/internal/orchestrator"
"vetting/internal/pxe"
"vetting/internal/report"
"vetting/internal/spec"
"vetting/internal/store"
)
// Agent collects the collaborators used by agent-facing HTTP routes:
// the iPXE chainload endpoint and the /api/v1/runs/:id/* endpoints.
// Every handler method on Agent reads these fields; none of them is
// mutated by the handlers visible in this file.
type Agent struct {
// Hosts loads the host record for a run (name, expected spec YAML).
Hosts *store.Hosts
// Runs looks up runs, rotates the agent token hash, and records the
// failed stage and hold IP.
Runs *store.Runs
// Stages seeds, starts and completes the per-run stage rows.
Stages *store.Stages
// Artifacts records on-disk files (inventory, hold key) as rows.
Artifacts *store.Artifacts
// SpecDiffs stores the expected-vs-actual diff rows for a run.
SpecDiffs *store.SpecDiffs
Measurements *store.Measurements
// Runner drives state-machine transitions and heartbeat bookkeeping.
Runner *orchestrator.Runner
EventHub *events.Hub
// Logs hands out per-run log writers. appendLog tolerates a nil hub;
// the Log handler does not.
Logs *logs.Hub
Notify *notify.Registry
// ArtifactsDir is the root under which per-run "run-<id>" directories
// are created.
ArtifactsDir string // ./var/artifacts
OrchestratorURL string // baked into iPXE cmdline
PublicURL string // user-visible URL base for notification click-throughs
// LiveKernelURL and LiveInitrdURL are passed through to the generated
// iPXE boot script.
LiveKernelURL string
LiveInitrdURL string
TLSCertFPR string // optional; empty = skip pinning
// IperfPort is reported to the agent in the /claim response.
IperfPort int // orchestrator-supervised iperf3 port; 0 = 5201
}
// IPXEScript serves a per-MAC iPXE script. It is called by iPXE itself
// after dnsmasq hands it the chainload URL.
//
// Outcomes, in order:
//   - MAC does not match macRe → pxe.NotRegisteredScript (request logged).
//   - Lookup of an active run fails → 500.
//   - No active run for the MAC → pxe.NoActiveRunScript.
//   - Active run → a fresh agent token is issued, its hash replaces the
//     one stored on the run, and the real boot script (carrying the
//     plaintext token) is returned. The fetch then triggers PXEObserved.
//
// The response is always text/plain and marked no-store so a stale
// script (and stale token) is never replayed from a cache.
func (a *Agent) IPXEScript(w http.ResponseWriter, r *http.Request) {
	mac := strings.ToLower(strings.TrimSpace(chi.URLParam(r, "mac")))
	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
	w.Header().Set("Cache-Control", "no-store")

	if !macRe.MatchString(mac) {
		log.Printf("ipxe: rejected malformed mac %q from %s", mac, r.RemoteAddr)
		// Write errors mean the client went away; nothing to recover.
		_, _ = w.Write([]byte(pxe.NotRegisteredScript(mac)))
		return
	}

	run, err := a.Runs.FindActiveByMAC(r.Context(), mac)
	if err != nil {
		log.Printf("ipxe: find run by mac %s: %v", mac, err)
		http.Error(w, "internal error", http.StatusInternalServerError)
		return
	}
	if run == nil {
		_, _ = w.Write([]byte(pxe.NoActiveRunScript(mac)))
		return
	}

	// The token hash in the DB is the sha256 of the plaintext. The
	// plaintext itself cannot be recovered from the hash — we issued it
	// once when the run was created. For iPXE we re-issue a fresh token
	// on every PXE fetch: this is safe because the hash in the DB is
	// rewritten to match and only the most recent PXE can be claimed.
	plain, hash, err := orchestrator.IssueRunToken()
	if err != nil {
		// Log the cause like every other failure path in this handler;
		// the client only ever sees the opaque "token" message.
		log.Printf("ipxe: issue token run %d: %v", run.ID, err)
		http.Error(w, "token", http.StatusInternalServerError)
		return
	}
	if err := a.Runs.RotateTokenHash(r.Context(), run.ID, hash); err != nil {
		log.Printf("ipxe: rotate token run %d: %v", run.ID, err)
		http.Error(w, "token", http.StatusInternalServerError)
		return
	}

	script := pxe.BuildScript(pxe.IPXEParams{
		OrchestratorURL: a.OrchestratorURL,
		LiveKernelURL:   a.LiveKernelURL,
		LiveInitrdURL:   a.LiveInitrdURL,
		TLSCertFPR:      a.TLSCertFPR,
		RunID:           run.ID,
		MAC:             mac,
		Token:           plain,
	})
	_, _ = w.Write([]byte(script))

	// iPXE has now fetched the script — treat this as PXEObserved. If we
	// were already in Booting the transition table allows staying.
	if _, err := a.Runner.Transition(r.Context(), run.ID, orchestrator.TriggerPXEObserved); err != nil {
		// Non-fatal: the agent may still claim via /claim.
		log.Printf("ipxe: PXEObserved for run %d: %v", run.ID, err)
	}
}
// Hello is the agent's first call once userspace is up. It validates
// the run id and bearer token, writes a single log line, and replies
// {"ok": true, "run_id": <id>}. It changes no run state, so repeating
// it is harmless; the binding transition happens in /claim. Sending it
// early gives operators a signal before the claim is processed.
func (a *Agent) Hello(w http.ResponseWriter, r *http.Request) {
	id, valid := runIDFromURL(w, r)
	if !valid {
		return
	}
	_, authed := a.authenticate(w, r, id)
	if !authed {
		return
	}

	log.Printf("agent hello: run=%d remote=%s", id, r.RemoteAddr)
	reply := map[string]any{"ok": true, "run_id": id}
	writeJSON(w, http.StatusOK, reply)
}
// Claim is the binding call: the agent proves it holds the plaintext
// token for this run, and in return the orchestrator transitions to
// InventoryCheck and seeds the stage rows. All destructive actions the
// agent takes later require a prior successful claim.
//
// Request body (optional JSON): {"agent_ip": "<addr>"}. A missing or
// malformed body is tolerated; the address then falls back to the
// connection's remote address.
//
// Response: {"ok", "run_id", "stages", "expected_disks", "iperf_port"}.
// A failed seed yields 500; a rejected state transition yields 409.
func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
	runID, ok := runIDFromURL(w, r)
	if !ok {
		return
	}
	run, ok := a.authenticate(w, r, runID)
	if !ok {
		return
	}

	var body struct {
		AgentIP string `json:"agent_ip"`
	}
	if r.Body != nil {
		// agent_ip is informational; if missing fall back to RemoteAddr.
		// The decode error is deliberately ignored for the same reason.
		_ = json.NewDecoder(r.Body).Decode(&body)
	}
	agentIP := strings.TrimSpace(body.AgentIP)
	if agentIP == "" {
		if host, _, err := net.SplitHostPort(r.RemoteAddr); err == nil {
			agentIP = host
		} else {
			agentIP = r.RemoteAddr
		}
	}

	// First claim seeds the stage rows; subsequent claims are a no-op
	// so agent retries after transient network failures stay safe.
	if len(mustListStages(a.Stages, r, runID)) == 0 {
		if err := a.Stages.Seed(r.Context(), runID); err != nil {
			log.Printf("claim: seed stages run %d: %v", runID, err)
			http.Error(w, "seed stages", http.StatusInternalServerError)
			return
		}
	}

	// Only drive the transition from the pre-claim states. A run that is
	// already past Booting has been claimed before, so a repeat claim
	// skips the transition and still reports OK. A transition that is
	// attempted and rejected is surfaced as 409.
	if run.State == model.StateWaitingWoL || run.State == model.StateBooting {
		if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerAgentClaimed); err != nil {
			log.Printf("claim: transition run %d: %v", runID, err)
			http.Error(w, "transition", http.StatusConflict)
			return
		}
	}
	log.Printf("agent claimed: run=%d agent_ip=%s", runID, agentIP)

	// Stage-driven agent needs a bit of per-run config: the device
	// allowlist (serial + expected size) for Storage, and the iperf3
	// server port for Network. Parse the host's expected spec here so
	// the agent doesn't need to read YAML.
	//
	// The slice starts non-nil so it encodes as [] rather than null.
	// Lookup/parse failures still produce an empty allowlist (unchanged
	// behaviour) but are now logged, since an empty list is otherwise
	// indistinguishable from a host with no expected disks.
	expectedDisks := []map[string]any{}
	host, err := a.Hosts.Get(r.Context(), run.HostID)
	switch {
	case err != nil:
		log.Printf("claim: get host %v for run %d: %v", run.HostID, runID, err)
	case host != nil:
		parsed, perr := spec.Parse(host.ExpectedSpecYAML)
		if perr != nil {
			log.Printf("claim: parse expected spec for run %d: %v", runID, perr)
		} else if parsed != nil {
			for _, dd := range parsed.Disks {
				expectedDisks = append(expectedDisks, map[string]any{
					"serial":  dd.Serial,
					"size_gb": dd.SizeGB,
				})
			}
		}
	}

	iperfPort := a.IperfPort
	if iperfPort == 0 {
		iperfPort = 5201
	}
	writeJSON(w, http.StatusOK, map[string]any{
		"ok":             true,
		"run_id":         runID,
		"stages":         store.DefaultStageOrder,
		"expected_disks": expectedDisks,
		"iperf_port":     iperfPort,
	})
}
// Heartbeat is the agent's periodic liveness ping. After authenticating
// it records the heartbeat with the runner and answers with the run's
// current state plus a control command:
//
//   - cmd=shutdown    — run is Completed; the agent should power down.
//   - cmd=abort       — run is FailedHolding or Released.
//   - cmd=retry_stage — the failed stage is Storage and the operator's
//     wipe override flag is set; the reply also carries "stage" and the
//     raw "override_flags" JSON.
//   - cmd=continue    — everything else.
func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) {
	id, valid := runIDFromURL(w, r)
	if !valid {
		return
	}
	run, authed := a.authenticate(w, r, id)
	if !authed {
		return
	}
	a.Runner.TouchHeartbeat(id)

	reply := map[string]any{
		"state": run.State,
		"cmd":   "continue",
	}
	switch run.State {
	case model.StateCompleted:
		// Pipeline succeeded — agent should power the host down.
		reply["cmd"] = "shutdown"
	case model.StateFailedHolding, model.StateReleased:
		reply["cmd"] = "abort"
	default:
		if run.FailedStage == "Storage" && overrideWipeSet(run.OverrideFlagsJSON) {
			// Operator pressed "Override wipe & retry". Agent should
			// re-enter Storage with the wipe-probe bypass armed.
			reply["cmd"] = "retry_stage"
			reply["stage"] = "Storage"
			reply["override_flags"] = json.RawMessage(run.OverrideFlagsJSON)
		}
	}
	writeJSON(w, http.StatusOK, reply)
}
// overrideWipeSet reports whether the "wipe" flag is true in a
// Run.OverrideFlagsJSON blob. An empty blob yields false. Decode errors
// are deliberately ignored: a blob that fails to parse normally leaves
// the flag at false, so the operator has to reapply the override if it
// didn't round-trip correctly.
func overrideWipeSet(blob string) bool {
	var parsed struct {
		Wipe bool `json:"wipe"`
	}
	if len(blob) > 0 {
		_ = json.Unmarshal([]byte(blob), &parsed)
	}
	return parsed.Wipe
}
// authenticate loads the run and checks the request's Bearer token
// against the run's stored token hash using a constant-time compare.
// On success it returns the run and true. On failure it has already
// written the HTTP error (404 unknown run, 500 lookup failure, 401
// missing or wrong token) and returns (nil, false) so the caller can
// simply bail out.
func (a *Agent) authenticate(w http.ResponseWriter, r *http.Request, runID int64) (*model.Run, bool) {
	run, err := a.Runs.Get(r.Context(), runID)
	switch {
	case err == nil:
		// Run found; continue to the token check below.
	case errors.Is(err, store.ErrNotFound):
		http.Error(w, "run not found", http.StatusNotFound)
		return nil, false
	default:
		http.Error(w, "internal error", http.StatusInternalServerError)
		return nil, false
	}

	supplied := bearerToken(r)
	if supplied == "" {
		http.Error(w, "missing bearer", http.StatusUnauthorized)
		return nil, false
	}

	got := []byte(orchestrator.HashRunToken(supplied))
	want := []byte(run.AgentTokenHash)
	if subtle.ConstantTimeCompare(got, want) != 1 {
		http.Error(w, "bad token", http.StatusUnauthorized)
		return nil, false
	}
	return run, true
}
// bearerToken extracts the credential from an "Authorization: Bearer
// <token>" header. It returns "" when the header is absent, uses a
// different scheme, or carries no token. The scheme name is matched
// case-insensitively (HTTP auth schemes are case-insensitive), and
// surrounding whitespace is trimmed from the token.
func bearerToken(r *http.Request) string {
	scheme, rest, found := strings.Cut(r.Header.Get("Authorization"), " ")
	if !found || !strings.EqualFold(scheme, "Bearer") {
		return ""
	}
	return strings.TrimSpace(rest)
}
// runIDFromURL parses the "id" URL parameter as a positive base-10
// int64. On success it returns (id, true). Anything else — non-numeric,
// out of range, zero or negative — writes a 400 "bad run id" response
// and returns (0, false).
func runIDFromURL(w http.ResponseWriter, r *http.Request) (int64, bool) {
	parsed, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
	if err == nil && parsed > 0 {
		return parsed, true
	}
	http.Error(w, "bad run id", http.StatusBadRequest)
	return 0, false
}
// writeJSON sends body as a JSON response with the given status code.
// The Content-Type header is set before the status line is written.
// An encode error is ignored: by then the header is already on the
// wire, so there is nothing useful left to report to the client.
func writeJSON(w http.ResponseWriter, status int, body any) {
	hdr := w.Header()
	hdr.Set("Content-Type", "application/json")
	w.WriteHeader(status)

	enc := json.NewEncoder(w)
	_ = enc.Encode(body)
}
// mustListStages returns the stage rows for a run, or nil when the
// lookup fails. Despite the name it never panics: /claim uses it purely
// as an "are there any stages yet?" probe, so a read error is reported
// as "no stages" and the follow-up Seed call is left to surface the
// underlying failure.
func mustListStages(s *store.Stages, r *http.Request, runID int64) []model.Stage {
	if stages, err := s.ListForRun(r.Context(), runID); err == nil {
		return stages
	}
	return nil
}
// ===== Phase 3 endpoints =================================================
// LogBatch is what the agent POSTs to /log: zero or more lines with
// timestamp + level + text. Lines are written in order to the per-run
// file and fanned out on the SSE hub.
//
// The Log handler decodes the request body into this type and appends
// each entry of Lines, in slice order, to the run's log writer.
type LogBatch struct {
// Lines holds the batch in the order the agent produced it; it may be
// empty.
Lines []LogLine `json:"lines"`
}
// LogLine is a single entry inside a LogBatch. The Log handler parses
// TS as RFC3339Nano and forwards Level and Text unchanged. An empty or
// unparseable TS reaches the log writer as the zero time; the writer is
// expected to substitute the server clock (not visible in this file).
type LogLine struct {
TS string `json:"ts,omitempty"` // RFC3339Nano; server clock used if empty
Level string `json:"level,omitempty"` // info|warn|error|debug
Text string `json:"text"`
}
// Log ingests a batch of agent log lines for a run. The body must be a
// JSON LogBatch (400 otherwise); a batch with no lines is accepted and
// is useful as an agent-side flush ping. Each line is appended to the
// run's log writer in order. A timestamp that is empty or fails to
// parse is passed on as the zero time. The reply reports how many lines
// were handed to the writer.
func (a *Agent) Log(w http.ResponseWriter, r *http.Request) {
	id, valid := runIDFromURL(w, r)
	if !valid {
		return
	}
	if _, authed := a.authenticate(w, r, id); !authed {
		return
	}

	var payload LogBatch
	if decodeErr := json.NewDecoder(r.Body).Decode(&payload); decodeErr != nil {
		http.Error(w, "bad json", http.StatusBadRequest)
		return
	}

	sink, err := a.Logs.WriterFor(id)
	if err != nil {
		http.Error(w, "open log: "+err.Error(), http.StatusInternalServerError)
		return
	}
	for _, entry := range payload.Lines {
		// Parse failure leaves stamp at the zero time on purpose.
		stamp, _ := time.Parse(time.RFC3339Nano, entry.TS)
		sink.Append(logs.Line{
			TS:    stamp,
			Level: entry.Level,
			Text:  entry.Text,
		})
	}

	writeJSON(w, http.StatusOK, map[string]any{
		"ok":      true,
		"written": len(payload.Lines),
	})
}
// StageResult is the body of /result. Stage is the stage name (from
// DefaultStageOrder); Passed drives StageCompleted vs StageFailed.
// Inventory is optional and only acted on when Stage == "Inventory" —
// the orchestrator persists it as an artifact and feeds it to spec.Diff.
type StageResult struct {
// Stage is trimmed and validated against orchestrator.StateForStage
// by the Result handler; unknown names are rejected with 400.
Stage string `json:"stage"`
Passed bool `json:"passed"`
// Summary is stored verbatim as the stage row's summary JSON.
Summary json.RawMessage `json:"summary,omitempty"`
Inventory *spec.Inventory `json:"inventory,omitempty"`
// Message is used as the detail text of the failure notification;
// when empty a generic "stage reported failure" is substituted.
Message string `json:"message,omitempty"`
}
// Result receives a stage's outcome. Flow:
// 1. Authenticate, decode the StageResult body, and reject stage names
// that orchestrator.StateForStage does not know (400).
// 2. Mark the stage row passed/failed + record summary JSON.
// 3. For Inventory with an inventory payload: persist the inventory
// artifact. This happens whether or not the stage passed; a persist
// failure is only logged.
// 4. On failure: record the failed stage, transition via StageFailed,
// dispatch a StageFailed notification, reply next_state=FailedHolding.
// 5. On pass: transition via StageCompleted. If the run lands in
// SpecValidate or Reporting, those orchestrator-owned stages are
// resolved inline — the agent isn't involved in them at all — and the
// reply carries the state the run ends up in.
//
// NOTE(review): the reported stage is checked for being a known name
// but is not compared against the run's current state here. Confirm the
// state machine rejects out-of-order or duplicate results; otherwise a
// retried POST could advance the run twice.
func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
runID, ok := runIDFromURL(w, r)
if !ok {
return
}
run, ok := a.authenticate(w, r, runID)
if !ok {
return
}
var body StageResult
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
http.Error(w, "bad json", http.StatusBadRequest)
return
}
body.Stage = strings.TrimSpace(body.Stage)
if _, ok := orchestrator.StateForStage(body.Stage); !ok {
http.Error(w, "unknown stage: "+body.Stage, http.StatusBadRequest)
return
}
stageState := model.StagePassed
if !body.Passed {
stageState = model.StageFailed
}
summaryJSON := ""
if len(body.Summary) > 0 {
summaryJSON = string(body.Summary)
}
if err := a.Stages.CompleteByName(r.Context(), runID, body.Stage, stageState, summaryJSON); err != nil {
http.Error(w, "complete stage: "+err.Error(), http.StatusInternalServerError)
return
}
// Inventory-specific: persist the artifact now. The spec diff itself
// is computed later, in resolveSpecValidate, from this stored file.
if body.Stage == "Inventory" && body.Inventory != nil {
if err := a.persistInventory(r, run, body.Inventory); err != nil {
log.Printf("persist inventory run %d: %v", runID, err)
}
}
if !body.Passed {
// Best-effort: a failure to record the failed stage is logged but
// does not stop the transition below.
if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
log.Printf("set failed stage: %v", err)
}
if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
log.Printf("result: failed-transition run %d: %v", runID, err)
http.Error(w, "transition", http.StatusConflict)
return
}
hostName := a.hostNameFor(r.Context(), run.HostID)
detail := body.Message
if detail == "" {
detail = "stage reported failure"
}
a.dispatchEvent(notify.Event{
Kind: notify.KindStageFailed,
Severity: notify.SeverityCritical,
RunID: runID,
HostName: hostName,
Title: fmt.Sprintf("[vetting] %s FAILED: %s", hostName, body.Stage),
Body: fmt.Sprintf("Run %d on %s failed at stage %s.\n%s", runID, hostName, body.Stage, detail),
URL: a.runLinkURL(runID),
})
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": "FailedHolding"})
return
}
// Passed: advance to the next stage in the pipeline.
next, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageCompleted)
if err != nil {
http.Error(w, "advance: "+err.Error(), http.StatusConflict)
return
}
log.Printf("result: run %d stage %s passed → %s", runID, body.Stage, next)
// If the just-advanced-into state is SpecValidate or Reporting, the
// orchestrator owns those stages entirely. The resolve function may
// transition further (→ next stage on pass, → FailedHolding on fail,
// → Completed for Reporting), so we re-read the run after each.
// If the re-read fails, next keeps its previous value.
if next == model.StateSpecValidate {
a.resolveSpecValidate(r, runID)
if after, err := a.Runs.Get(r.Context(), runID); err == nil {
next = after.State
}
}
if next == model.StateReporting {
a.resolveReporting(r, runID)
if after, err := a.Runs.Get(r.Context(), runID); err == nil {
next = after.State
}
}
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": string(next)})
}
// persistInventory writes the agent-submitted inventory to
// <ArtifactsDir>/run-<id>/inventory.json (creating the directory if
// needed) and records it as an "inventory" artifact row together with
// its SHA-256 and size. The first error encountered — mkdir, marshal,
// write, or DB insert — is returned as-is.
func (a *Agent) persistInventory(r *http.Request, run *model.Run, inv *spec.Inventory) error {
	runDir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", run.ID))
	if mkErr := os.MkdirAll(runDir, 0o755); mkErr != nil {
		return mkErr
	}

	target := filepath.Join(runDir, "inventory.json")
	payload, err := json.MarshalIndent(inv, "", " ")
	if err != nil {
		return err
	}
	if writeErr := os.WriteFile(target, payload, 0o644); writeErr != nil {
		return writeErr
	}

	// Hash exactly the bytes that went to disk so the row can be used
	// to verify the file later.
	digest := sha256.Sum256(payload)
	record := store.Artifact{
		RunID:     run.ID,
		Kind:      "inventory",
		Path:      target,
		SHA256:    hex.EncodeToString(digest[:]),
		SizeBytes: int64(len(payload)),
	}
	_, err = a.Artifacts.Create(r.Context(), record)
	return err
}
// resolveSpecValidate performs the SpecValidate stage entirely on the
// server: it diffs the host's expected spec against the stored
// inventory artifact, replaces the run's spec_diffs rows, and drives
// the state machine. The agent does nothing for this stage.
//
// Outcomes:
//   - run or host lookup fails → logged, nothing else happens.
//   - expected spec unparseable or inventory artifact unreadable → the
//     stage is failed via failStage.
//   - one or more non-ignored "critical" diffs → stage failed, run
//     transitioned via StageFailed, SpecMismatch notification sent.
//   - otherwise → stage passed, run transitioned via StageCompleted.
func (a *Agent) resolveSpecValidate(r *http.Request, runID int64) {
	const stageName = "SpecValidate"
	ctx := r.Context()

	run, err := a.Runs.Get(ctx, runID)
	if err != nil {
		log.Printf("specvalidate: get run: %v", err)
		return
	}
	host, err := a.Hosts.Get(ctx, run.HostID)
	if err != nil {
		log.Printf("specvalidate: get host: %v", err)
		return
	}

	want, err := spec.Parse(host.ExpectedSpecYAML)
	if err != nil {
		log.Printf("specvalidate: parse expected yaml: %v", err)
		a.failStage(r, runID, stageName, "malformed expected spec: "+err.Error())
		return
	}
	got, err := a.readInventoryArtifact(r, runID)
	if err != nil {
		log.Printf("specvalidate: read inventory: %v", err)
		a.failStage(r, runID, stageName, "missing inventory artifact")
		return
	}

	diffs := spec.Diff(want, got)
	// Persisting the diff rows and marking the stage started are both
	// best-effort: failures are logged and the verdict is still applied.
	if err := a.SpecDiffs.ReplaceForRun(ctx, runID, diffs); err != nil {
		log.Printf("specvalidate: write diffs: %v", err)
	}
	if err := a.Stages.StartByName(ctx, runID, stageName); err != nil {
		log.Printf("specvalidate: start stage: %v", err)
	}

	// Only critical diffs that have not been marked ignored block the run.
	blocking := 0
	for _, d := range diffs {
		if d.Ignored || d.Severity != "critical" {
			continue
		}
		blocking++
	}
	summary, _ := json.Marshal(map[string]any{
		"diffs":    len(diffs),
		"critical": blocking,
	})

	if blocking == 0 {
		_ = a.Stages.CompleteByName(ctx, runID, stageName, model.StagePassed, string(summary))
		if _, err := a.Runner.Transition(ctx, runID, orchestrator.TriggerStageCompleted); err != nil {
			log.Printf("specvalidate: advance: %v", err)
		}
		a.appendLog(runID, "info", "SpecValidate: all fields match expected spec")
		return
	}

	_ = a.Stages.CompleteByName(ctx, runID, stageName, model.StageFailed, string(summary))
	_ = a.Runs.SetFailedStage(ctx, runID, stageName)
	if _, err := a.Runner.Transition(ctx, runID, orchestrator.TriggerStageFailed); err != nil {
		log.Printf("specvalidate: failed-transition: %v", err)
	}
	a.appendLog(runID, "error", fmt.Sprintf("SpecValidate: %d critical diff(s) — holding host", blocking))

	hostName := a.hostNameFor(ctx, run.HostID)
	a.dispatchEvent(notify.Event{
		Kind:     notify.KindSpecMismatch,
		Severity: notify.SeverityCritical,
		RunID:    runID,
		HostName: hostName,
		Title:    fmt.Sprintf("[vetting] %s spec mismatch (%d critical)", hostName, blocking),
		Body:     fmt.Sprintf("SpecValidate found %d critical diff(s) on %s. Host is held for inspection.", blocking, hostName),
		URL:      a.runLinkURL(runID),
	})
}
// readInventoryArtifact loads and decodes the run's stored inventory.
// It picks the last "inventory" entry in the artifact list (presumably
// the most recent one — depends on ListForRun's ordering, which is not
// visible here), reads the file at its recorded path, and unmarshals
// it. It returns an error when the list cannot be fetched, no inventory
// artifact exists, or the file cannot be read or parsed.
func (a *Agent) readInventoryArtifact(r *http.Request, runID int64) (*spec.Inventory, error) {
	arts, err := a.Artifacts.ListForRun(r.Context(), runID)
	if err != nil {
		return nil, err
	}

	// Scan from the tail so that the last matching entry wins.
	pick := -1
	for i := len(arts) - 1; i >= 0; i-- {
		if arts[i].Kind == "inventory" {
			pick = i
			break
		}
	}
	if pick < 0 {
		return nil, errors.New("no inventory artifact")
	}

	raw, err := os.ReadFile(arts[pick].Path)
	if err != nil {
		return nil, err
	}
	inv := new(spec.Inventory)
	if err := json.Unmarshal(raw, inv); err != nil {
		return nil, err
	}
	return inv, nil
}
// failStage marks an orchestrator-owned stage as failed: it completes
// the stage row with an {"error": message} summary, records the stage
// as the run's failed stage, fires the StageFailed transition, and
// appends an error line to the run log. Every step is best-effort — a
// failure in one is logged and the remaining steps still run.
//
// The summary is built with encoding/json rather than fmt's %q: %q
// emits Go string syntax, whose escapes for control bytes and invalid
// UTF-8 (\x00, \a, \U0001F600, ...) are not valid JSON.
func (a *Agent) failStage(r *http.Request, runID int64, stage, message string) {
	summary, err := json.Marshal(map[string]string{"error": message})
	if err != nil {
		// Not expected for a string map; keep the row's summary valid JSON.
		summary = []byte(`{"error":"failure message could not be encoded"}`)
	}

	if err := a.Stages.CompleteByName(r.Context(), runID, stage, model.StageFailed, string(summary)); err != nil {
		log.Printf("failStage: complete stage %s run %d: %v", stage, runID, err)
	}
	if err := a.Runs.SetFailedStage(r.Context(), runID, stage); err != nil {
		log.Printf("failStage: set failed stage run %d: %v", runID, err)
	}
	if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
		log.Printf("failStage: transition run %d: %v", runID, err)
	}
	a.appendLog(runID, "error", stage+": "+message)
}
// appendLog writes one orchestrator-generated line to the run's log.
// It is a no-op when no log hub is configured, and a failure to obtain
// the run's writer is logged rather than returned. No timestamp is set
// on the line, so it reaches the writer with the zero time.
func (a *Agent) appendLog(runID int64, level, text string) {
	hub := a.Logs
	if hub == nil {
		return
	}
	if sink, err := hub.WriterFor(runID); err != nil {
		log.Printf("appendLog: %v", err)
	} else {
		sink.Append(logs.Line{Level: level, Text: text})
	}
}
// HoldRequest is the JSON body an agent sends to the Hold endpoint.
//
// Hold issues the per-run ephemeral ed25519 keypair: the agent gets
// the authorized_keys line, the orchestrator keeps the privkey on disk.
// Hold also records the agent's reported IP so the tile can print the
// ssh invocation.
type HoldRequest struct {
// AgentIP is the address the agent reports for itself. It is optional:
// when it is blank after trimming, Hold falls back to the host part of
// the request's remote address.
AgentIP string `json:"agent_ip"`
}
// HoldResponse is the JSON body Hold returns to the agent on success.
type HoldResponse struct {
// AuthorizedKey is the AuthorizedKey value of the keypair issued for
// this run (the public half, in authorized_keys form).
AuthorizedKey string `json:"authorized_key"`
// RunID echoes the run the key was issued for.
RunID int64 `json:"run_id"`
}
// Hold is the handler an agent calls when it is holding the host for
// inspection. After resolving and authenticating the run it:
//
//   - records the agent's IP (from the request body, or the request's remote
//     address when the body carries none),
//   - issues a keypair for the run and writes the private key to
//     <ArtifactsDir>/run-<id>/hold.key, recording it as a "hold_key" artifact,
//   - logs the ssh invocation, dispatches a KindHoldingOpened notification
//     and republishes the host tile, and
//   - responds 200 with a HoldResponse carrying the authorized key.
//
// Key generation or key write failures respond 500. Everything else (IP
// persistence, artifact row, notification, tile refresh) is best-effort and
// never prevents the key from being returned.
func (a *Agent) Hold(w http.ResponseWriter, r *http.Request) {
	runID, ok := runIDFromURL(w, r)
	if !ok {
		return
	}
	if _, ok := a.authenticate(w, r, runID); !ok {
		return
	}
	ctx := r.Context()

	// The body is optional, so a decode failure is deliberately ignored and
	// the remote address is used instead.
	var body HoldRequest
	_ = json.NewDecoder(r.Body).Decode(&body)
	agentIP := strings.TrimSpace(body.AgentIP)
	if agentIP == "" {
		if remoteHost, _, err := net.SplitHostPort(r.RemoteAddr); err == nil {
			agentIP = remoteHost
		}
	}
	if agentIP != "" {
		if err := a.Runs.SetHoldIP(ctx, runID, agentIP); err != nil {
			log.Printf("hold: set hold_ip: %v", err)
		}
	}

	kp, err := hold.Issue(runID)
	if err != nil {
		http.Error(w, "generate key: "+err.Error(), http.StatusInternalServerError)
		return
	}
	keyPath := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", runID), "hold.key")
	abs, err := kp.WritePrivateTo(keyPath)
	if err != nil {
		http.Error(w, "write key: "+err.Error(), http.StatusInternalServerError)
		return
	}

	sum := sha256.Sum256(kp.PrivatePEM)
	if _, err := a.Artifacts.Create(ctx, store.Artifact{
		RunID:     runID,
		Kind:      "hold_key",
		Path:      abs,
		SHA256:    hex.EncodeToString(sum[:]),
		SizeBytes: int64(len(kp.PrivatePEM)),
	}); err != nil {
		log.Printf("hold: record artifact: %v", err)
	}
	a.appendLog(runID, "info", fmt.Sprintf("Hold key issued. SSH in with: ssh -i %s root@%s", abs, agentIP))

	// Resolve the run and its host once and reuse them for both the
	// notification and the tile refresh. The lookup happens after SetHoldIP
	// so the tile sees the stored hold IP.
	run, err := a.Runs.Get(ctx, runID)
	switch {
	case err != nil:
		log.Printf("hold: get run %d: %v", runID, err)
	case run == nil || run.HostID == 0:
		// No host to notify about or to refresh a tile for.
	default:
		host, hostErr := a.Hosts.Get(ctx, run.HostID)
		// Fall back to a synthetic name so the notification is still sent
		// when the host lookup fails.
		hostName := fmt.Sprintf("host-%d", run.HostID)
		if hostErr == nil && host != nil {
			hostName = host.Name
		}
		a.dispatchEvent(notify.Event{
			Kind:     notify.KindHoldingOpened,
			Severity: notify.SeverityCritical,
			RunID:    runID,
			HostName: hostName,
			Title:    fmt.Sprintf("[vetting] %s holding — SSH ready", hostName),
			Body:     fmt.Sprintf("Host %s is holding at %s.\nssh -i %s root@%s", hostName, agentIP, abs, agentIP),
			URL:      a.runLinkURL(runID),
		})
		// Refresh the tile so the operator sees the ssh command.
		if host != nil && orchestrator.TileRenderer != nil {
			payload := orchestrator.TileRenderer(ctx, *host, run)
			a.EventHub.Publish(events.Event{Name: fmt.Sprintf("tile-%d", host.ID), Payload: payload})
		}
	}

	writeJSON(w, http.StatusOK, HoldResponse{AuthorizedKey: kp.AuthorizedKey, RunID: runID})
}
// dispatchEvent forwards a fully populated Event to the notify registry.
// When no registry is wired (a.Notify is nil) the event is dropped, which
// lets handlers call it unconditionally.
func (a *Agent) dispatchEvent(ev notify.Event) {
	if a.Notify != nil {
		a.Notify.Dispatch(ev)
	}
}
// hostNameFor returns the stored name of the host with the given ID. When
// the lookup fails or yields no host it returns the placeholder "host-N"
// (N being hostID) so callers always have something to put in a
// notification.
func (a *Agent) hostNameFor(ctx context.Context, hostID int64) string {
	host, err := a.Hosts.Get(ctx, hostID)
	if err != nil || host == nil {
		return fmt.Sprintf("host-%d", hostID)
	}
	return host.Name
}
// runLinkURL builds the link to a run's report page:
// <PublicURL without trailing slashes>/reports/<runID>. It returns "" when
// no PublicURL is configured.
func (a *Agent) runLinkURL(runID int64) string {
	base := a.PublicURL
	if base == "" {
		return ""
	}
	return fmt.Sprintf("%s/reports/%d", strings.TrimRight(base, "/"), runID)
}
// mustHostID returns the HostID of run runID, or 0 when the run cannot be
// loaded. Despite the "must" prefix it never panics; callers treat 0 as
// "unknown host".
func mustHostID(a *Agent, r *http.Request, runID int64) int64 {
	if run, err := a.Runs.Get(r.Context(), runID); err == nil && run != nil {
		return run.HostID
	}
	return 0
}
// ===== Phase 4 endpoints =================================================
// SensorBatch is what the agent POSTs to /sensor: a stream of numeric
// samples (temps, fan rpm, PSU rails, iperf throughput). Each sample is
// (kind, key, value, unit). Timestamps default to server-now when empty
// so the thermal sidecar doesn't have to carry a clock.
// NOTE(review): the Sensor handler passes a zero time.Time for an empty or
// unparseable timestamp; the server-now default is presumably applied by the
// measurements store — confirm there.
type SensorBatch struct {
// Samples holds the samples of this batch, in the order the agent sent them.
Samples []SensorSample `json:"samples"`
}
// SensorSample is one numeric reading inside a SensorBatch.
type SensorSample struct {
// TS is the sample time; the Sensor handler parses it as RFC 3339 with
// optional fractional seconds. May be omitted.
TS string `json:"ts,omitempty"`
// Kind groups samples by source. The list in the trailing comment is
// documentation only: the Sensor handler stores whatever string it is given.
Kind string `json:"kind"` // temp|fan|psu_volt|iperf|fio|smart_attr
// Key names the individual series within Kind (for example "cpu").
Key string `json:"key"`
// Value is the reading itself.
Value float64 `json:"value"`
// Unit is an optional unit label for Value (for example "C" or "Mbps").
Unit string `json:"unit,omitempty"`
}
// Sensor stores a SensorBatch posted by the agent for the run named in the
// URL. Per the callers' description the thermal sidecar posts on a tick and
// stage executors (iperf, fio) post their results here too.
//
// Responses: 500 when no measurements store is wired or the write fails,
// 400 when the body is not valid JSON, otherwise 200 with
// {"ok": true, "written": <sample count>}. URL and token problems are
// answered by runIDFromURL / authenticate.
func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
	runID, ok := runIDFromURL(w, r)
	if !ok {
		return
	}
	if _, authed := a.authenticate(w, r, runID); !authed {
		return
	}
	if a.Measurements == nil {
		http.Error(w, "measurements store not wired", http.StatusInternalServerError)
		return
	}

	var batch SensorBatch
	if err := json.NewDecoder(r.Body).Decode(&batch); err != nil {
		http.Error(w, "bad json", http.StatusBadRequest)
		return
	}

	measurements := make([]model.Measurement, len(batch.Samples))
	for i := range batch.Samples {
		sample := &batch.Samples[i]
		// A missing or malformed timestamp is tolerated: the parse error is
		// dropped and the zero time is stored in the row.
		stamp, _ := time.Parse(time.RFC3339Nano, sample.TS)
		measurements[i] = model.Measurement{
			RunID: runID,
			TS:    stamp,
			Kind:  sample.Kind,
			Key:   sample.Key,
			Value: sample.Value,
			Unit:  sample.Unit,
		}
	}

	if err := a.Measurements.CreateBatch(r.Context(), measurements); err != nil {
		http.Error(w, "write samples: "+err.Error(), http.StatusInternalServerError)
		return
	}
	writeJSON(w, http.StatusOK, map[string]any{"ok": true, "written": len(measurements)})
}
// resolveReporting runs when the pipeline advances into StateReporting.
// It's an orchestrator-owned stage like SpecValidate: no agent action.
//
// It writes <ArtifactsDir>/run-<id>/report.json bundling run + host +
// stages + spec diffs + measurements, renders report.html next to it,
// records both as artifacts, completes the "Reporting" stage, marks the run
// completed with the JSON path, republishes the host tile and dispatches a
// KindRunCompleted notification.
//
// Failure handling:
//   - If the run or host cannot be loaded, or the JSON report cannot be
//     marshalled or written, the stage is failed via failStage so the run
//     does not sit in Reporting forever.
//   - Listing stages/diffs/measurements, recording artifacts and everything
//     about the HTML report are best-effort: errors are logged and the report
//     is produced with whatever data was available.
//
// NOTE(review): once the run is Completed the agent's next heartbeat is
// expected to tell it to power off — confirm against the Heartbeat handler.
func (a *Agent) resolveReporting(r *http.Request, runID int64) {
	ctx := r.Context()
	if err := a.Stages.StartByName(ctx, runID, "Reporting"); err != nil {
		log.Printf("reporting: start stage: %v", err)
	}

	run, err := a.Runs.Get(ctx, runID)
	if err != nil {
		log.Printf("reporting: get run: %v", err)
		a.failStage(r, runID, "Reporting", "get run: "+err.Error())
		return
	}
	if run == nil {
		// Guard the run.HostID / *run dereferences below.
		log.Printf("reporting: run %d not found", runID)
		a.failStage(r, runID, "Reporting", "run not found")
		return
	}
	host, err := a.Hosts.Get(ctx, run.HostID)
	if err != nil {
		log.Printf("reporting: get host: %v", err)
		a.failStage(r, runID, "Reporting", "get host: "+err.Error())
		return
	}

	// The remaining inputs are optional: a listing error only makes the
	// report less complete.
	stages, err := a.Stages.ListForRun(ctx, runID)
	if err != nil {
		log.Printf("reporting: list stages: %v", err)
	}
	diffs, err := a.SpecDiffs.ListForRun(ctx, runID)
	if err != nil {
		log.Printf("reporting: list diffs: %v", err)
	}
	var measurements []model.Measurement
	if a.Measurements != nil {
		measurements, err = a.Measurements.ListForRun(ctx, runID)
		if err != nil {
			log.Printf("reporting: list measurements: %v", err)
		}
	}

	bundle := map[string]any{
		"run":          run,
		"host":         host,
		"stages":       stages,
		"spec_diffs":   diffs,
		"measurements": measurements,
		"generated_at": time.Now().UTC().Format(time.RFC3339),
	}
	buf, err := json.MarshalIndent(bundle, "", " ")
	if err != nil {
		log.Printf("reporting: marshal: %v", err)
		a.failStage(r, runID, "Reporting", "marshal report: "+err.Error())
		return
	}

	dir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", runID))
	if err := os.MkdirAll(dir, 0o755); err != nil {
		a.failStage(r, runID, "Reporting", "mkdir: "+err.Error())
		return
	}
	path := filepath.Join(dir, "report.json")
	if err := os.WriteFile(path, buf, 0o644); err != nil {
		a.failStage(r, runID, "Reporting", "write: "+err.Error())
		return
	}
	sum := sha256.Sum256(buf)
	if _, err := a.Artifacts.Create(ctx, store.Artifact{
		RunID:     runID,
		Kind:      "report",
		Path:      path,
		SHA256:    hex.EncodeToString(sum[:]),
		SizeBytes: int64(len(buf)),
	}); err != nil {
		log.Printf("reporting: record artifact: %v", err)
	}

	// Also render the operator-facing HTML summary alongside the JSON.
	// Failures here are non-fatal — the JSON is the source of truth.
	if host != nil {
		htmlData := report.Data{
			GeneratedAt: time.Now().UTC(),
			Run:         *run,
			Host:        *host,
			Stages:      stages,
			SpecDiffs:   diffs,
			Aggregates:  report.AggregateMeasurements(measurements),
		}
		if htmlBuf, err := report.RenderHTML(htmlData); err != nil {
			log.Printf("reporting: render html: %v", err)
		} else {
			htmlPath := filepath.Join(dir, "report.html")
			if err := os.WriteFile(htmlPath, htmlBuf, 0o644); err != nil {
				log.Printf("reporting: write html: %v", err)
			} else {
				htmlSum := sha256.Sum256(htmlBuf)
				if _, err := a.Artifacts.Create(ctx, store.Artifact{
					RunID:     runID,
					Kind:      "report_html",
					Path:      htmlPath,
					SHA256:    hex.EncodeToString(htmlSum[:]),
					SizeBytes: int64(len(htmlBuf)),
				}); err != nil {
					log.Printf("reporting: record html artifact: %v", err)
				}
			}
		}
	}

	// json.Marshal cannot fail for this map of string and int values.
	summaryBuf, _ := json.Marshal(map[string]any{
		"report_path": path,
		"stages":      len(stages),
		"diffs":       len(diffs),
	})
	if err := a.Stages.CompleteByName(ctx, runID, "Reporting", model.StagePassed, string(summaryBuf)); err != nil {
		log.Printf("reporting: complete stage: %v", err)
	}
	if err := a.Runs.MarkCompleted(ctx, runID, path); err != nil {
		log.Printf("reporting: mark completed: %v", err)
	}
	a.appendLog(runID, "info", "Reporting: wrote "+path+"; run completed.")

	// Publish a final tile update so the dashboard flips to pass mood. The
	// run is re-read because MarkCompleted just changed it.
	if host != nil && orchestrator.TileRenderer != nil {
		latest, _ := a.Runs.Get(ctx, runID)
		payload := orchestrator.TileRenderer(ctx, *host, latest)
		a.EventHub.Publish(events.Event{Name: fmt.Sprintf("tile-%d", host.ID), Payload: payload})
	}

	hostName := "host"
	if host != nil {
		hostName = host.Name
	}
	a.dispatchEvent(notify.Event{
		Kind:     notify.KindRunCompleted,
		Severity: notify.SeverityInfo,
		RunID:    runID,
		HostName: hostName,
		Title:    fmt.Sprintf("[vetting] %s passed vetting", hostName),
		Body:     fmt.Sprintf("Run %d on %s completed all stages. Report: %s", runID, hostName, path),
		URL:      a.runLinkURL(runID),
	})
}
+128
View File
@@ -0,0 +1,128 @@
package api_test
import (
"bytes"
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"path/filepath"
"strconv"
"testing"
"github.com/go-chi/chi/v5"
"vetting/internal/api"
"vetting/internal/db"
"vetting/internal/model"
"vetting/internal/orchestrator"
"vetting/internal/store"
)
// setupAgent creates a fresh database in a per-test temp directory,
// registers one host ("t-host") and one run for it, and returns a minimally
// wired Agent (Hosts, Runs and Measurements only) together with the run ID
// and the plaintext bearer token for that run. The database is closed on
// test cleanup.
func setupAgent(t *testing.T) (*api.Agent, int64, string) {
	t.Helper()
	ctx := context.Background()

	conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
	if err != nil {
		t.Fatalf("open db: %v", err)
	}
	t.Cleanup(func() { _ = conn.Close() })

	hostStore := &store.Hosts{DB: conn}
	runStore := &store.Runs{DB: conn}
	measurementStore := &store.Measurements{DB: conn}

	testHost := model.Host{
		Name:             "t-host",
		MAC:              "aa:bb:cc:dd:ee:01",
		WoLBroadcastIP:   "10.0.0.255",
		WoLPort:          9,
		ExpectedSpecYAML: "memory:\n total_gib: 16\n",
	}
	hostID, err := hostStore.Create(ctx, testHost)
	if err != nil {
		t.Fatalf("create host: %v", err)
	}

	// Only the hash is stored; the plaintext is handed back for the
	// Authorization header.
	token, tokenHash, err := orchestrator.IssueRunToken()
	if err != nil {
		t.Fatalf("issue token: %v", err)
	}
	runID, err := runStore.Create(ctx, hostID, tokenHash)
	if err != nil {
		t.Fatalf("create run: %v", err)
	}

	agent := &api.Agent{
		Hosts:        hostStore,
		Runs:         runStore,
		Measurements: measurementStore,
	}
	return agent, runID, token
}
// routedRequest builds a test request whose context carries a chi route
// context with the "id" URL parameter set to runID, so handlers that read
// the parameter via chi work without a real router.
func routedRequest(runID int64, method, path string, body []byte) *http.Request {
	// chi.URLParam is read from chi's context routing; fake that here.
	routeCtx := chi.NewRouteContext()
	routeCtx.URLParams.Add("id", strconv.FormatInt(runID, 10))

	base := httptest.NewRequest(method, path, bytes.NewReader(body))
	ctx := context.WithValue(base.Context(), chi.RouteCtxKey, routeCtx)
	return base.WithContext(ctx)
}
// TestSensorPersistsBatch posts a two-sample batch with a valid token and
// checks that Sensor answers 200 and that both samples can be listed back
// for the run.
func TestSensorPersistsBatch(t *testing.T) {
	agent, runID, token := setupAgent(t)

	samples := []api.SensorSample{
		{Kind: "thermal", Key: "cpu", Value: 47.5, Unit: "C"},
		{Kind: "iperf", Key: "throughput_mbps", Value: 938.2, Unit: "Mbps"},
	}
	payload, _ := json.Marshal(api.SensorBatch{Samples: samples})

	target := "/api/v1/runs/" + strconv.FormatInt(runID, 10) + "/sensor"
	req := routedRequest(runID, http.MethodPost, target, payload)
	req.Header.Set("Authorization", "Bearer "+token)
	req.Header.Set("Content-Type", "application/json")

	recorder := httptest.NewRecorder()
	agent.Sensor(recorder, req)
	if recorder.Code != http.StatusOK {
		t.Fatalf("status = %d, body = %q", recorder.Code, recorder.Body.String())
	}

	stored, err := agent.Measurements.ListForRun(context.Background(), runID)
	if err != nil {
		t.Fatalf("ListForRun: %v", err)
	}
	if got := len(stored); got != 2 {
		t.Fatalf("expected 2 measurements, got %d", got)
	}
}
// TestSensorRejectsBadToken checks that Sensor answers 401 when the bearer
// token does not belong to the run.
func TestSensorRejectsBadToken(t *testing.T) {
	agent, runID, _ := setupAgent(t)

	payload, _ := json.Marshal(api.SensorBatch{})
	target := "/api/v1/runs/" + strconv.FormatInt(runID, 10) + "/sensor"
	req := routedRequest(runID, http.MethodPost, target, payload)
	req.Header.Set("Authorization", "Bearer wrong-token")

	recorder := httptest.NewRecorder()
	agent.Sensor(recorder, req)
	if got := recorder.Code; got != http.StatusUnauthorized {
		t.Fatalf("status = %d, want 401", got)
	}
}
// TestHeartbeatShutdownWhenCompleted verifies the power-off handshake: with
// the run already in StateCompleted, a heartbeat must answer 200 with a JSON
// body whose "cmd" field is "shutdown", so the agent powers the host down.
func TestHeartbeatShutdownWhenCompleted(t *testing.T) {
	agent, runID, token := setupAgent(t)
	ctx := context.Background()

	// Wire a runner so Heartbeat's TouchHeartbeat call doesn't nil-panic.
	agent.Runner = &orchestrator.Runner{
		Runs:   agent.Runs,
		Hosts:  agent.Hosts,
		Stages: &store.Stages{DB: agent.Runs.DB},
	}
	if err := agent.Runs.SetState(ctx, runID, model.StateCompleted); err != nil {
		t.Fatalf("set state: %v", err)
	}

	target := "/api/v1/runs/" + strconv.FormatInt(runID, 10) + "/heartbeat"
	req := routedRequest(runID, http.MethodPost, target, nil)
	req.Header.Set("Authorization", "Bearer "+token)

	recorder := httptest.NewRecorder()
	agent.Heartbeat(recorder, req)
	if recorder.Code != http.StatusOK {
		t.Fatalf("status = %d, body = %s", recorder.Code, recorder.Body.String())
	}

	var reply map[string]any
	if err := json.Unmarshal(recorder.Body.Bytes(), &reply); err != nil {
		t.Fatalf("decode: %v", err)
	}
	if cmd := reply["cmd"]; cmd != "shutdown" {
		t.Fatalf("cmd = %v, want shutdown", cmd)
	}
}
+318
View File
@@ -0,0 +1,318 @@
package api_test
import (
"bytes"
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"testing"
"time"
"github.com/go-chi/chi/v5"
"vetting/internal/api"
"vetting/internal/db"
"vetting/internal/events"
"vetting/internal/logs"
"vetting/internal/model"
"vetting/internal/notify"
"vetting/internal/orchestrator"
"vetting/internal/spec"
"vetting/internal/store"
)
// captureNotifier is a testing-only Notifier that records every Event
// sent to it, under a mutex so concurrent Dispatch goroutines are safe.
type captureNotifier struct {
// mu guards evs.
mu sync.Mutex
// name is the value returned by Name and used as the route target.
name string
// evs holds every event passed to Send, in arrival order.
evs []notify.Event
}
// Name returns the name this notifier was constructed with.
func (c *captureNotifier) Name() string {
	return c.name
}
// Send records ev under the mutex and always reports success.
func (c *captureNotifier) Send(_ context.Context, ev notify.Event) error {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.evs = append(c.evs, ev)
	return nil
}
// awaitKind polls the recorded events every 5ms until one of kind k has
// arrived and returns the first such event. It fails the test if none shows
// up within two seconds.
func (c *captureNotifier) awaitKind(t *testing.T, k notify.Kind) notify.Event {
	t.Helper()

	// firstMatch takes the lock only for the duration of one scan.
	firstMatch := func() (notify.Event, bool) {
		c.mu.Lock()
		defer c.mu.Unlock()
		for i := range c.evs {
			if c.evs[i].Kind == k {
				return c.evs[i], true
			}
		}
		return notify.Event{}, false
	}

	deadline := time.Now().Add(2 * time.Second)
	for {
		if found, ok := firstMatch(); ok {
			return found
		}
		if time.Now().After(deadline) {
			t.Fatalf("no %q event received within timeout", k)
		}
		time.Sleep(5 * time.Millisecond)
	}
}
// newCaptureRegistry returns a notify.Registry (constructed with a
// one-second duration) that has c registered and a single route naming only
// c as notifier, which the registry treats as a wildcard route.
func newCaptureRegistry(c *captureNotifier) *notify.Registry {
	registry := notify.NewRegistry(time.Second)
	registry.Register(c)

	wildcard := notify.Route{Notifier: c.name} // wildcard
	registry.AddRoute(wildcard)
	return registry
}
// fullAgent builds a fully wired Agent against a fresh sqlite DB in a
// per-test temp directory and returns (agent, runID, plainTokenForBearer).
// It registers one host ("smoke-host") with an empty expected spec, creates
// a run for it and seeds the run's stages. The caller is responsible for
// transitioning the run out of Queued.
func fullAgent(t *testing.T) (*api.Agent, int64, string) {
	t.Helper()
	ctx := context.Background()
	root := t.TempDir()

	conn, err := db.Open(filepath.Join(root, "vetting.db"))
	if err != nil {
		t.Fatalf("open db: %v", err)
	}
	t.Cleanup(func() { _ = conn.Close() })

	var (
		hosts        = &store.Hosts{DB: conn}
		runs         = &store.Runs{DB: conn}
		stages       = &store.Stages{DB: conn}
		artifacts    = &store.Artifacts{DB: conn}
		specDiffs    = &store.SpecDiffs{DB: conn}
		measurements = &store.Measurements{DB: conn}
	)

	eventHub := events.NewHub()
	logHub, err := logs.NewHub(filepath.Join(root, "logs"), eventHub)
	if err != nil {
		t.Fatalf("logs hub: %v", err)
	}
	t.Cleanup(func() { logHub.Close() })

	runner := &orchestrator.Runner{
		Runs:     runs,
		Hosts:    hosts,
		Stages:   stages,
		EventHub: eventHub,
	}

	smokeHost := model.Host{
		Name:             "smoke-host",
		MAC:              "aa:bb:cc:dd:ee:10",
		WoLBroadcastIP:   "10.0.0.255",
		WoLPort:          9,
		ExpectedSpecYAML: "", // empty spec → no diffs
	}
	hostID, err := hosts.Create(ctx, smokeHost)
	if err != nil {
		t.Fatalf("create host: %v", err)
	}

	token, tokenHash, err := orchestrator.IssueRunToken()
	if err != nil {
		t.Fatalf("issue token: %v", err)
	}
	runID, err := runs.Create(ctx, hostID, tokenHash)
	if err != nil {
		t.Fatalf("create run: %v", err)
	}
	if err := stages.Seed(ctx, runID); err != nil {
		t.Fatalf("seed stages: %v", err)
	}

	agent := &api.Agent{
		Hosts:        hosts,
		Runs:         runs,
		Stages:       stages,
		Artifacts:    artifacts,
		SpecDiffs:    specDiffs,
		Measurements: measurements,
		Runner:       runner,
		EventHub:     eventHub,
		Logs:         logHub,
		ArtifactsDir: filepath.Join(root, "artifacts"),
		PublicURL:    "https://vetting.example",
	}
	return agent, runID, token
}
// walkStage simulates the agent reporting one stage's outcome: it POSTs
// {"stage": stage, "passed": passed, ...extras} to the Result handler with
// the run's bearer token, requires a 200, and returns the next_state the
// orchestrator reported. Keys in extras are merged into the body and
// override the defaults on collision.
func walkStage(t *testing.T, a *api.Agent, runID int64, token, stage string, passed bool, extras map[string]any) string {
	t.Helper()

	payload := map[string]any{"stage": stage, "passed": passed}
	for key, val := range extras { // ranging over a nil map is a no-op
		payload[key] = val
	}
	encoded, _ := json.Marshal(payload)

	id := strconv.FormatInt(runID, 10)
	req := httptest.NewRequest(http.MethodPost, "/api/v1/runs/"+id+"/result", bytes.NewReader(encoded))

	// Fake chi's routing context so the handler can read the "id" parameter.
	routeCtx := chi.NewRouteContext()
	routeCtx.URLParams.Add("id", id)
	req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, routeCtx))
	req.Header.Set("Authorization", "Bearer "+token)
	req.Header.Set("Content-Type", "application/json")

	recorder := httptest.NewRecorder()
	a.Result(recorder, req)
	if recorder.Code != http.StatusOK {
		t.Fatalf("stage %s: status %d body=%q", stage, recorder.Code, recorder.Body.String())
	}

	var reply struct {
		OK        bool   `json:"ok"`
		NextState string `json:"next_state"`
	}
	if err := json.NewDecoder(recorder.Body).Decode(&reply); err != nil {
		t.Fatalf("stage %s: decode resp: %v", stage, err)
	}
	return reply.NextState
}
// TestFullPipelineToCompleted drives a run through every stage of a
// successful pipeline and asserts that it ends in Completed with a report
// path, an HTML report artifact on disk and a RunCompleted notification.
// The inventory is minimal; because the host's expected spec is empty,
// SpecValidate yields no critical diffs and the orchestrator advances past
// it on its own.
func TestFullPipelineToCompleted(t *testing.T) {
	agent, runID, token := fullAgent(t)
	ctx := context.Background()

	capture := &captureNotifier{name: "capture"}
	agent.Notify = newCaptureRegistry(capture)

	// Claim would normally transition Booting → InventoryCheck; set it
	// directly here since we're not exercising the claim path.
	if err := agent.Runs.SetState(ctx, runID, model.StateInventoryCheck); err != nil {
		t.Fatalf("set state: %v", err)
	}

	// Stage 1: Inventory — provide a concrete inventory so SpecValidate
	// has something to compare against.
	inventory := spec.Inventory{
		CPU:    spec.CPUSpec{Model: "Xeon Gold", LogicalCores: 8},
		Memory: spec.MemorySpec{TotalGiB: 16},
	}
	// After Inventory → SpecValidate resolves inline → SMART
	if next := walkStage(t, agent, runID, token, "Inventory", true, map[string]any{"inventory": inventory}); next != "SMART" {
		t.Fatalf("after Inventory, next_state = %q, want SMART", next)
	}

	// The remaining stages advance one-for-one in order.
	for _, step := range []struct{ stage, want string }{
		{"SMART", "CPUStress"},
		{"CPUStress", "Storage"},
		{"Storage", "Network"},
		{"Network", "GPU"},
		{"GPU", "PSU"},
		{"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed
	} {
		if got := walkStage(t, agent, runID, token, step.stage, true, nil); got != step.want {
			t.Fatalf("after %s, next_state = %q, want %q", step.stage, got, step.want)
		}
	}

	run, err := agent.Runs.Get(ctx, runID)
	if err != nil {
		t.Fatalf("Get run: %v", err)
	}
	if run.State != model.StateCompleted {
		t.Fatalf("run.State = %q, want Completed", run.State)
	}
	if run.ReportPath == "" {
		t.Fatalf("run.ReportPath not set")
	}

	// Phase 5 assertions: an HTML report artifact exists on disk, and
	// the capture notifier saw a RunCompleted event.
	artifacts, err := agent.Artifacts.ListForRun(ctx, runID)
	if err != nil {
		t.Fatalf("ListForRun: %v", err)
	}
	htmlPath := ""
	for i := range artifacts {
		if artifacts[i].Kind == "report_html" {
			htmlPath = artifacts[i].Path
		}
	}
	if htmlPath == "" {
		t.Fatalf("no report_html artifact recorded (kinds seen: %v)", artifactKinds(artifacts))
	}
	contents, err := os.ReadFile(htmlPath)
	if err != nil {
		t.Fatalf("read report.html: %v", err)
	}
	if !strings.Contains(string(contents), "<html") {
		t.Fatalf("report.html missing <html tag: %s", string(contents[:min(200, len(contents))]))
	}

	completed := capture.awaitKind(t, notify.KindRunCompleted)
	if completed.HostName != "smoke-host" {
		t.Errorf("RunCompleted host = %q, want smoke-host", completed.HostName)
	}
	if completed.URL == "" || !strings.Contains(completed.URL, "/reports/") {
		t.Errorf("RunCompleted URL = %q, want non-empty with /reports/", completed.URL)
	}
}
// artifactKinds returns the Kind of every artifact, in order. It is used to
// make test failure messages readable.
func artifactKinds(arts []store.Artifact) []string {
	kinds := make([]string, len(arts))
	for i := range arts {
		kinds[i] = arts[i].Kind
	}
	return kinds
}
// min returns the smaller of a and b.
// NOTE(review): this shadows the min builtin available since Go 1.21; it can
// be deleted once nothing needs to build with an older toolchain.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
// TestFaultInjectionSMART reports a failing SMART stage and checks that the
// pipeline halts in FailedHolding with failed_stage set to "SMART", and that
// a critical StageFailed notification mentioning SMART is sent.
func TestFaultInjectionSMART(t *testing.T) {
	agent, runID, token := fullAgent(t)
	ctx := context.Background()

	capture := &captureNotifier{name: "capture"}
	agent.Notify = newCaptureRegistry(capture)

	if err := agent.Runs.SetState(ctx, runID, model.StateInventoryCheck); err != nil {
		t.Fatalf("set state: %v", err)
	}

	inventory := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
	afterInventory := walkStage(t, agent, runID, token, "Inventory", true, map[string]any{"inventory": inventory})
	if afterInventory != "SMART" {
		t.Fatalf("after Inventory, next = %q want SMART", afterInventory)
	}

	// Fake SMART failure → expect FailedHolding.
	afterSMART := walkStage(t, agent, runID, token, "SMART", false, nil)
	if afterSMART != "FailedHolding" {
		t.Fatalf("after SMART fail, next = %q want FailedHolding", afterSMART)
	}

	run, err := agent.Runs.Get(ctx, runID)
	if err != nil {
		t.Fatalf("Get run: %v", err)
	}
	if run.State != model.StateFailedHolding {
		t.Fatalf("run.State = %q, want FailedHolding", run.State)
	}
	if run.FailedStage != "SMART" {
		t.Fatalf("run.FailedStage = %q, want SMART", run.FailedStage)
	}

	// Phase 5 assertion: the fault fires a StageFailed notification.
	failed := capture.awaitKind(t, notify.KindStageFailed)
	if !strings.Contains(failed.Title, "SMART") {
		t.Errorf("StageFailed title = %q, want to mention SMART", failed.Title)
	}
	if failed.Severity != notify.SeverityCritical {
		t.Errorf("StageFailed severity = %q, want critical", failed.Severity)
	}
}
+69
View File
@@ -0,0 +1,69 @@
package api
import (
"context"
"log"
"vetting/internal/model"
"vetting/internal/store"
"vetting/internal/web/templates"
)
// TileEnricher builds a fully-populated TileData for a host. It looks
// up the latest run's spec-diff count and hold-key artifact path so the
// tile can render the "n critical diffs" badge and the ssh invocation
// without the template package needing DB access.
//
// Used by both the Dashboard handler (initial render) and the SSE tile-
// refresh path (agent_handlers.Hold, orchestrator runner) so every
// place that renders a tile shows the same data.
//
// Every store is optional: Build and BuildByHost skip the lookup that
// depends on a nil store.
type TileEnricher struct {
// Runs is used by BuildByHost to find a host's latest run.
Runs *store.Runs
// Artifacts is used by Build to find the run's "hold_key" artifact.
Artifacts *store.Artifacts
// SpecDiffs is used by Build to count critical, non-ignored diffs.
SpecDiffs *store.SpecDiffs
}
// Build assembles the TileData for host and its latest run. With a nil
// latest run the tile carries only the host. Otherwise it adds:
//
//   - SpecDiffCritical: the number of the run's spec diffs whose severity is
//     "critical" and which are not ignored, and
//   - HoldKeyPath: the path of the run's last listed "hold_key" artifact.
//
// It fails soft: a store error is logged and the corresponding field is left
// at its zero value rather than breaking the whole dashboard.
func (e *TileEnricher) Build(ctx context.Context, host model.Host, latest *model.Run) templates.TileData {
	tile := templates.TileData{Host: host, Latest: latest}
	if latest == nil {
		return tile
	}
	runID := latest.ID

	if e.SpecDiffs != nil {
		diffs, err := e.SpecDiffs.ListForRun(ctx, runID)
		if err != nil {
			log.Printf("tile: list spec_diffs run %d: %v", runID, err)
		} else {
			for i := range diffs {
				if diffs[i].Severity == "critical" && !diffs[i].Ignored {
					tile.SpecDiffCritical++
				}
			}
		}
	}

	if e.Artifacts != nil {
		artifacts, err := e.Artifacts.ListForRun(ctx, runID)
		if err != nil {
			log.Printf("tile: list artifacts run %d: %v", runID, err)
		} else {
			for i := range artifacts {
				if artifacts[i].Kind == "hold_key" {
					tile.HoldKeyPath = artifacts[i].Path
				}
			}
		}
	}
	return tile
}
// BuildByHost looks up the host's latest run itself and then delegates to
// Build — convenient for SSE tile publishers that only know the host.
//
// Like Build it fails soft: when no Runs store is wired or the lookup
// fails, the tile is built without a run. A lookup error is logged so the
// soft failure is visible, matching how Build reports its own.
func (e *TileEnricher) BuildByHost(ctx context.Context, host model.Host) templates.TileData {
	var latest *model.Run
	if e.Runs != nil {
		run, err := e.Runs.LatestForHost(ctx, host.ID)
		if err != nil {
			log.Printf("tile: latest run for host %d: %v", host.ID, err)
		} else {
			latest = run
		}
	}
	return e.Build(ctx, host, latest)
}
+295
View File
@@ -0,0 +1,295 @@
package api
import (
"errors"
"log"
"net/http"
"regexp"
"strconv"
"strings"
"github.com/go-chi/chi/v5"
"gopkg.in/yaml.v3"
"vetting/internal/auth"
"vetting/internal/events"
"vetting/internal/model"
"vetting/internal/orchestrator"
"vetting/internal/store"
"vetting/internal/web/templates"
)
// UI groups the dependencies of the operator-facing HTTP handlers
// (dashboard, host registration, login, run control, reports, SSE).
type UI struct {
// Hosts stores the registered hosts.
Hosts *store.Hosts
// Runs stores the runs; handlers mostly ask it for a host's latest run.
Runs *store.Runs
// Artifacts is used by Report to locate a run's HTML report.
Artifacts *store.Artifacts
// Auth verifies the login password and issues/clears the session.
Auth *auth.Manager
// EventHub serves the server-sent-events stream.
EventHub *events.Hub
// Runner applies operator overrides to runs.
Runner *orchestrator.Runner
// Tiles builds the per-host tile data shown on the dashboard.
Tiles *TileEnricher
}
// macRe matches a MAC address written as six colon-separated pairs of
// lowercase hex digits (aa:bb:cc:dd:ee:ff). Uppercase is not accepted here;
// CreateHost lowercases the submitted value before it is validated.
var macRe = regexp.MustCompile(`^[0-9a-f]{2}(:[0-9a-f]{2}){5}$`)
// Dashboard renders the main page: one tile per registered host, each built
// from the host and its latest run. Any store error while listing hosts or
// looking up a latest run answers 500 with the error text. A template
// rendering error is ignored because the response has already started.
func (u *UI) Dashboard(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	hosts, err := u.Hosts.List(ctx)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	tiles := make([]templates.TileData, len(hosts))
	for i, host := range hosts {
		latest, err := u.Runs.LatestForHost(ctx, host.ID)
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}
		tiles[i] = u.Tiles.Build(ctx, host, latest)
	}
	_ = templates.Dashboard(tiles).Render(ctx, w)
}
// StartRun creates a new run for the host named by the "id" URL parameter
// and redirects back to the dashboard. Per the original design notes the
// new run starts in Queued and a dispatcher goroutine picks it up and fires
// WoL; neither is visible in this handler.
//
// Responses:
//   - 400 when the host id is not an integer,
//   - 404 when the host does not exist,
//   - 409 when the host's latest run is still active (anything other than
//     Completed, Released or FailedHolding),
//   - 500 on any store or token error,
//   - 303 to "/" on success.
//
// NOTE(review): only the token hash is stored and the plaintext returned by
// IssueRunToken is discarded here — confirm how the agent obtains its
// bearer token.
func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
	idStr := chi.URLParam(r, "id")
	hostID, err := strconv.ParseInt(idStr, 10, 64)
	if err != nil {
		http.Error(w, "bad host id", http.StatusBadRequest)
		return
	}
	if _, err := u.Hosts.Get(r.Context(), hostID); err != nil {
		if errors.Is(err, store.ErrNotFound) {
			http.NotFound(w, r)
			return
		}
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Guard: refuse to start a second run while one is still active. A
	// lookup error must stop the request: treating it as "no run" would let
	// a second run be created next to an active one.
	latest, err := u.Runs.LatestForHost(r.Context(), hostID)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	if latest != nil {
		switch latest.State {
		case model.StateCompleted, model.StateReleased, model.StateFailedHolding:
			// ok to start fresh
		default:
			http.Error(w, "host already has an active run", http.StatusConflict)
			return
		}
	}

	_, hash, err := orchestrator.IssueRunToken()
	if err != nil {
		http.Error(w, "token: "+err.Error(), http.StatusInternalServerError)
		return
	}
	runID, err := u.Runs.Create(r.Context(), hostID, hash)
	if err != nil {
		http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError)
		return
	}
	log.Printf("ui: created run %d for host %d (state=Queued)", runID, hostID)
	http.Redirect(w, r, "/", http.StatusSeeOther)
}
// LoginForm renders the login page with no error message. The "next" query
// parameter (default "/") is handed to the template so the form can carry
// it along for the redirect after login.
func (u *UI) LoginForm(w http.ResponseWriter, r *http.Request) {
	target := r.URL.Query().Get("next")
	if target == "" {
		target = "/"
	}
	page := templates.Login("", target)
	_ = page.Render(r.Context(), w)
}
// LoginSubmit handles the posted login form. A wrong password re-renders
// the form with status 401; a correct one issues the session and redirects
// (303) to the posted "next" target.
//
// The target is attacker-controllable (it arrives via the login link), so
// only same-site absolute paths are honoured; anything else falls back to
// "/". Besides values that do not start with "/", this rejects "//host" and
// "/\host", which browsers resolve as scheme-relative URLs pointing at
// another site.
func (u *UI) LoginSubmit(w http.ResponseWriter, r *http.Request) {
	if err := r.ParseForm(); err != nil {
		http.Error(w, "bad form", http.StatusBadRequest)
		return
	}
	password := r.PostForm.Get("password")
	next := r.PostForm.Get("next")
	if !strings.HasPrefix(next, "/") || strings.HasPrefix(next, "//") || strings.HasPrefix(next, `/\`) {
		next = "/"
	}
	if !u.Auth.VerifyPassword(password) {
		w.WriteHeader(http.StatusUnauthorized)
		_ = templates.Login("Invalid password.", next).Render(r.Context(), w)
		return
	}
	u.Auth.Issue(w, r)
	http.Redirect(w, r, next, http.StatusSeeOther)
}
// Logout clears the session via the auth manager and redirects (303) to
// the login page.
func (u *UI) Logout(w http.ResponseWriter, r *http.Request) {
	const loginPath = "/login"
	u.Auth.Clear(w)
	http.Redirect(w, r, loginPath, http.StatusSeeOther)
}
// NewHostForm renders an empty host registration form.
func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) {
	var blank templates.RegistrationForm
	_ = templates.Registration(blank).Render(r.Context(), w)
}
// CreateHost handles the posted registration form. Text fields are trimmed
// and the MAC is lowercased; the WoL port and the expected-spec YAML are
// taken verbatim. An invalid form re-renders the page with status 400 and
// the validation message; a store error re-renders it with status 409 and a
// friendly message. On success it redirects (303) to the dashboard. A blank
// WoL port defaults to 9.
func (u *UI) CreateHost(w http.ResponseWriter, r *http.Request) {
	if err := r.ParseForm(); err != nil {
		http.Error(w, "bad form", http.StatusBadRequest)
		return
	}
	trimmed := func(name string) string {
		return strings.TrimSpace(r.PostForm.Get(name))
	}
	form := templates.RegistrationForm{
		Name:             trimmed("name"),
		MAC:              strings.ToLower(trimmed("mac")),
		WoLBroadcastIP:   trimmed("wol_broadcast_ip"),
		WoLPort:          r.PostForm.Get("wol_port"),
		ExpectedSpecYAML: r.PostForm.Get("expected_spec_yaml"),
		Notes:            trimmed("notes"),
	}

	// reject re-renders the form with the given status and message.
	reject := func(status int, message string) {
		form.Error = message
		w.WriteHeader(status)
		_ = templates.Registration(form).Render(r.Context(), w)
	}

	if problem := validateHostForm(&form); problem != "" {
		reject(http.StatusBadRequest, problem)
		return
	}

	// The port was validated above, so a parse error here only means the
	// field was left blank.
	port, _ := strconv.Atoi(form.WoLPort)
	if port == 0 {
		port = 9
	}

	newHost := model.Host{
		Name:             form.Name,
		MAC:              form.MAC,
		WoLBroadcastIP:   form.WoLBroadcastIP,
		WoLPort:          port,
		ExpectedSpecYAML: form.ExpectedSpecYAML,
		Notes:            form.Notes,
	}
	if _, err := u.Hosts.Create(r.Context(), newHost); err != nil {
		reject(http.StatusConflict, friendlyDBError(err))
		return
	}
	http.Redirect(w, r, "/", http.StatusSeeOther)
}
// OverrideWipeStorage is the operator's explicit "yes, wipe the disk even
// though we found filesystem signatures" button. It is only accepted while
// the host's latest run is FailedHolding with failed_stage=Storage; it then
// hands the runner the override payload {"wipe":true}. Per the design notes
// the agent's next heartbeat receives retry_stage with wipe=true and
// re-enters Storage bypassing the wipe-probe guard.
//
// Responses: 400 for a non-numeric host id, 409 when there is no run or the
// run is not holding on Storage, 500 on store/runner errors, 303 to "/" on
// success.
func (u *UI) OverrideWipeStorage(w http.ResponseWriter, r *http.Request) {
	hostID, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
	if err != nil {
		http.Error(w, "bad host id", http.StatusBadRequest)
		return
	}

	ctx := r.Context()
	latest, err := u.Runs.LatestForHost(ctx, hostID)
	switch {
	case err != nil:
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	case latest == nil:
		http.Error(w, "no run for host", http.StatusConflict)
		return
	case latest.State != model.StateFailedHolding || latest.FailedStage != "Storage":
		http.Error(w, "override only valid when holding on Storage", http.StatusConflict)
		return
	}

	if _, err := u.Runner.Override(ctx, latest.ID, `{"wipe":true}`); err != nil {
		http.Error(w, "override: "+err.Error(), http.StatusInternalServerError)
		return
	}
	http.Redirect(w, r, "/", http.StatusSeeOther)
}
// DeleteHost removes the host named by the "id" URL parameter and redirects
// (303) to the dashboard. It answers 400 for a non-numeric id, 404 when the
// store reports store.ErrNotFound and 500 for any other store error.
func (u *UI) DeleteHost(w http.ResponseWriter, r *http.Request) {
	hostID, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
	if err != nil {
		http.Error(w, "bad id", http.StatusBadRequest)
		return
	}

	switch err := u.Hosts.Delete(r.Context(), hostID); {
	case err == nil:
		http.Redirect(w, r, "/", http.StatusSeeOther)
	case errors.Is(err, store.ErrNotFound):
		http.NotFound(w, r)
	default:
		http.Error(w, err.Error(), http.StatusInternalServerError)
	}
}
// SSE hands the request to the event hub, which serves the
// server-sent-events stream.
func (u *UI) SSE(w http.ResponseWriter, r *http.Request) {
	hub := u.EventHub
	hub.ServeSSE(w, r)
}
// Report serves the HTML report of the run named by the "runID" URL
// parameter. It looks up the run's artifacts, takes the path of the last
// listed "report_html" artifact and serves that file as text/html.
//
// Responses: 400 for a non-numeric run id, 500 when the artifact listing
// fails, 404 when the run has not produced an HTML report yet.
//
// NOTE(review): the path comes straight from the artifact row and is not
// checked against the artifacts directory here — UI has no such setting —
// so the only protection against serving an arbitrary file is that artifact
// rows are written by the orchestrator itself.
func (u *UI) Report(w http.ResponseWriter, r *http.Request) {
	runID, err := strconv.ParseInt(chi.URLParam(r, "runID"), 10, 64)
	if err != nil {
		http.Error(w, "bad run id", http.StatusBadRequest)
		return
	}

	artifacts, err := u.Artifacts.ListForRun(r.Context(), runID)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	reportPath := ""
	for i := range artifacts {
		if artifacts[i].Kind == "report_html" {
			reportPath = artifacts[i].Path // last one wins
		}
	}
	if reportPath == "" {
		http.NotFound(w, r)
		return
	}

	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	http.ServeFile(w, r, reportPath)
}
// validateHostForm checks a submitted registration form and returns a
// user-facing message for the first problem found, or "" when the form is
// acceptable. The checks, in order:
//
//   - Name is non-empty,
//   - MAC matches macRe (lowercase aa:bb:cc:dd:ee:ff),
//   - WoL broadcast IP is non-empty,
//   - expected spec YAML is non-empty and parses as YAML,
//   - WoL port, when given, is an integer in 1-65535.
//
// The form is only read, never modified.
func validateHostForm(form *templates.RegistrationForm) string {
	if form.Name == "" {
		return "Name is required."
	}
	if !macRe.MatchString(form.MAC) {
		return "MAC address must be in the form aa:bb:cc:dd:ee:ff."
	}
	if form.WoLBroadcastIP == "" {
		return "WoL broadcast IP is required."
	}
	if form.ExpectedSpecYAML == "" {
		return "Expected spec YAML is required."
	}
	// Only well-formedness is checked here; the decoded value is discarded.
	var anything any
	if err := yaml.Unmarshal([]byte(form.ExpectedSpecYAML), &anything); err != nil {
		return "Expected spec YAML is not valid YAML: " + err.Error()
	}
	// A blank port is allowed.
	if form.WoLPort != "" {
		port, err := strconv.Atoi(form.WoLPort)
		if err != nil || port < 1 || port > 65535 {
			return "WoL port must be 1-65535."
		}
	}
	return ""
}
// friendlyDBError maps the UNIQUE-constraint failures on hosts.name and
// hosts.mac to operator-readable messages. Any other error is returned
// as its raw Error() text.
func friendlyDBError(err error) string {
	msg := err.Error()
	if strings.Contains(msg, "UNIQUE constraint failed: hosts.name") {
		return "A host with that name already exists."
	}
	if strings.Contains(msg, "UNIQUE constraint failed: hosts.mac") {
		return "A host with that MAC already exists."
	}
	return msg
}
+64
View File
@@ -0,0 +1,64 @@
package auth
import (
	"net/http"
	"net/url"
)
// RequireSession is middleware that passes requests with a valid
// session (per m.Validate) through to next. Requests without one are
// redirected to /login when acceptsHTML reports the client takes HTML,
// and otherwise receive a plain 401.
//
// The original request URI is carried in the "next" query parameter.
func (m *Manager) RequireSession(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if err := m.Validate(r); err != nil {
			if acceptsHTML(r) {
				// Escape the URI: without this, a '&' or '#' in the
				// original query string would be parsed as part of the
				// /login URL and the tail of the target would be lost.
				target := "/login?next=" + url.QueryEscape(r.URL.RequestURI())
				http.Redirect(w, r, target, http.StatusSeeOther)
				return
			}
			http.Error(w, "unauthorized", http.StatusUnauthorized)
			return
		}
		next.ServeHTTP(w, r)
	})
}
// acceptsHTML reports whether the request should get an HTML response:
// true when the Accept header is absent, or when one of its media
// types (parameters stripped by splitComma) is exactly "text/html" or
// "*/*".
func acceptsHTML(r *http.Request) bool {
	header := r.Header.Get("Accept")
	if header == "" {
		return true
	}
	for _, mediaType := range splitComma(header) {
		switch mediaType {
		case "text/html", "*/*":
			return true
		}
	}
	return false
}
// splitComma splits a comma-separated header value into its elements.
// For each element only the text before the first ';' is kept (so
// parameters such as ";q=0.8" are dropped), and that text is trimmed
// with trimSpace. A trailing element is emitted only if at least one
// byte follows the last separator.
func splitComma(s string) []string {
	var fields []string
	pos := 0
	for pos < len(s) {
		// Find where this element's value ends: the first ',' or ';'.
		end := pos
		for end < len(s) && s[end] != ',' && s[end] != ';' {
			end++
		}
		if end == len(s) {
			fields = append(fields, trimSpace(s[pos:]))
			break
		}
		fields = append(fields, trimSpace(s[pos:end]))
		// Skip any parameters up to the next comma (or end of input).
		for end < len(s) && s[end] != ',' {
			end++
		}
		pos = end + 1
	}
	return fields
}

// trimSpace strips leading and trailing spaces and tabs from s. Other
// whitespace (e.g. newlines) is left untouched.
func trimSpace(s string) string {
	isBlank := func(c byte) bool { return c == ' ' || c == '\t' }
	lo, hi := 0, len(s)
	for lo < hi && isBlank(s[lo]) {
		lo++
	}
	for hi > lo && isBlank(s[hi-1]) {
		hi--
	}
	return s[lo:hi]
}
+100
View File
@@ -0,0 +1,100 @@
package auth
import (
"crypto/hmac"
"crypto/sha256"
"encoding/base64"
"errors"
"fmt"
"net/http"
"strconv"
"strings"
"time"
"golang.org/x/crypto/bcrypt"
)
// cookieName is the name of the session cookie written by Issue,
// expired by Clear, and read by Validate.
const cookieName = "vetting_session"
// Manager checks the admin password and issues/validates the signed
// session cookie.
type Manager struct {
	// PasswordHash is the bcrypt hash VerifyPassword compares against;
	// when empty, VerifyPassword rejects every password.
	PasswordHash string
	// Secret is the HMAC-SHA256 key sign uses for cookie signatures.
	Secret []byte
	// TTL is how long a cookie written by Issue remains valid.
	TTL time.Duration
}
// VerifyPassword reports whether password matches the configured
// bcrypt hash. With no hash configured it always returns false.
func (m *Manager) VerifyPassword(password string) bool {
	hash := m.PasswordHash
	if hash == "" {
		return false
	}
	err := bcrypt.CompareHashAndPassword([]byte(hash), []byte(password))
	return err == nil
}
// Issue sets the session cookie on w. The cookie value is
// "<unix-expiry>.<signature>", where the expiry is now + m.TTL and the
// signature comes from m.sign. The Secure flag is set only when the
// request itself arrived over TLS (r.TLS != nil).
func (m *Manager) Issue(w http.ResponseWriter, r *http.Request) {
	expiry := time.Now().Add(m.TTL).Unix()
	payload := strconv.FormatInt(expiry, 10)
	cookie := http.Cookie{
		Name:     cookieName,
		Value:    payload + "." + m.sign(payload),
		Path:     "/",
		HttpOnly: true,
		Secure:   r.TLS != nil,
		SameSite: http.SameSiteLaxMode,
		Expires:  time.Unix(expiry, 0),
	}
	http.SetCookie(w, &cookie)
}
// Clear expires the session cookie by overwriting it with an empty
// value and a negative MaxAge.
func (m *Manager) Clear(w http.ResponseWriter) {
	expired := http.Cookie{
		Name:     cookieName,
		Value:    "",
		Path:     "/",
		HttpOnly: true,
		MaxAge:   -1,
	}
	http.SetCookie(w, &expired)
}
// errInvalidSession is the single error Validate returns for every
// failure mode (missing, malformed, badly signed, or expired cookie).
var errInvalidSession = errors.New("invalid session")

// Validate returns nil when the request carries a session cookie of the
// form "<unix-expiry>.<signature>" whose signature matches m.sign and
// whose expiry is still in the future; otherwise errInvalidSession.
func (m *Manager) Validate(r *http.Request) error {
	cookie, err := r.Cookie(cookieName)
	if err != nil {
		return errInvalidSession
	}
	fields := strings.SplitN(cookie.Value, ".", 2)
	if len(fields) != 2 {
		return errInvalidSession
	}
	payload, gotSig := fields[0], fields[1]
	// The signature is checked before the payload is parsed, using a
	// constant-time comparison.
	wantSig := m.sign(payload)
	if !hmac.Equal([]byte(gotSig), []byte(wantSig)) {
		return errInvalidSession
	}
	expiresAt, err := strconv.ParseInt(payload, 10, 64)
	if err != nil || time.Now().Unix() >= expiresAt {
		return errInvalidSession
	}
	return nil
}
// sign returns the HMAC-SHA256 of payload keyed with m.Secret, encoded
// as unpadded URL-safe base64.
func (m *Manager) sign(payload string) string {
	digest := hmac.New(sha256.New, m.Secret)
	_, _ = digest.Write([]byte(payload))
	sum := digest.Sum(nil)
	return base64.RawURLEncoding.EncodeToString(sum)
}
// BcryptHash hashes password with bcrypt at bcrypt.DefaultCost and
// returns the encoded hash. It is a helper for the gen-admin-password
// tool.
func BcryptHash(password string) (string, error) {
	hashed, err := bcrypt.GenerateFromPassword([]byte(password), bcrypt.DefaultCost)
	if err == nil {
		return string(hashed), nil
	}
	return "", fmt.Errorf("bcrypt: %w", err)
}
+142
View File
@@ -0,0 +1,142 @@
package config
import (
"encoding/hex"
"fmt"
"os"
"gopkg.in/yaml.v3"
)
// Config is the root of the YAML configuration file parsed by Load.
// Each field maps to the top-level YAML key named in its tag.
type Config struct {
	Server     Server     `yaml:"server"`
	Database   Database   `yaml:"database"`
	Artifacts  Artifacts  `yaml:"artifacts"`
	Logs       Logs       `yaml:"logs"`
	Auth       Auth       `yaml:"auth"`
	Dispatcher Dispatcher `yaml:"dispatcher"`
	Janitor    Janitor    `yaml:"janitor"`
	PXE        PXE        `yaml:"pxe"`
	Network    Network    `yaml:"network"`
	Notifiers  []Notifier `yaml:"notifiers"`
	Routes     []Route    `yaml:"routes"`
}
// Server holds the HTTP listener settings. Load defaults Bind to
// "127.0.0.1:8080" when it is left empty.
type Server struct {
	Bind      string `yaml:"bind"`
	PublicURL string `yaml:"public_url"` // user-visible base URL, e.g. https://vetting.lan:8443; used in notification click-throughs
	TLS       TLS    `yaml:"tls"`
}

// TLS holds the listener's TLS switch and certificate/key file paths.
type TLS struct {
	Enabled  bool   `yaml:"enabled"`
	CertFile string `yaml:"cert_file"`
	KeyFile  string `yaml:"key_file"`
}
// Database locates the database file. Load defaults Path to
// "./var/vetting.db" when it is left empty.
type Database struct {
	Path string `yaml:"path"`
}

// Artifacts configures where run artifacts live and how long they are
// kept. Load defaults Dir to "./var/artifacts" when it is left empty.
type Artifacts struct {
	Dir           string `yaml:"dir"`
	RetentionDays int    `yaml:"retention_days"` // 0 = keep forever
}

// Logs configures where per-run logs live and how long they are kept.
// Load defaults Dir to "./var/logs" when it is left empty.
type Logs struct {
	Dir           string `yaml:"dir"`
	RetentionDays int    `yaml:"retention_days"` // 0 = keep forever
}

// Janitor configures the cleanup interval. Load applies no default
// here; the "0 = 60" fallback noted below is applied by the consumer.
type Janitor struct {
	IntervalMinutes int `yaml:"interval_minutes"` // 0 = 60
}
// Auth holds the admin credential and session-cookie settings. Load
// defaults SessionTTLHours to 24 when it is left at zero.
type Auth struct {
	AdminPasswordBcrypt string `yaml:"admin_password_bcrypt"`
	SessionSecretHex    string `yaml:"session_secret_hex"`
	SessionTTLHours     int    `yaml:"session_ttl_hours"`
}

// SessionSecret hex-decodes SessionSecretHex and returns the raw key.
// It fails when the value is not valid hex or decodes to fewer than 32
// bytes.
func (a Auth) SessionSecret() ([]byte, error) {
	secret, err := hex.DecodeString(a.SessionSecretHex)
	switch {
	case err != nil:
		return nil, fmt.Errorf("session_secret_hex: %w", err)
	case len(secret) < 32:
		return nil, fmt.Errorf("session_secret_hex must decode to at least 32 bytes, got %d", len(secret))
	}
	return secret, nil
}
// Dispatcher holds the run-concurrency limit. Load defaults
// MaxConcurrentRuns to 3 when it is left at zero.
type Dispatcher struct {
	MaxConcurrentRuns int `yaml:"max_concurrent_runs"`
}

// Network holds network-test settings. Load applies no default here.
// NOTE(review): IperfPort is presumably the port used for iperf
// throughput tests — confirm against its consumer.
type Network struct {
	IperfPort int `yaml:"iperf_port"`
}
// PXE / Notifier / Route are declared up front so the config file is
// forward-compatible across phases. Phase 1 does not act on these.

// PXE holds the network-boot settings. Load applies no defaults to any
// of these fields.
type PXE struct {
	Enabled         bool   `yaml:"enabled"`
	Interface       string `yaml:"interface"`
	DHCPRange       string `yaml:"dhcp_range"`
	OrchestratorURL string `yaml:"orchestrator_url"`
	TFTPRoot        string `yaml:"tftp_root"` // holds ipxe.efi + undionly.kpxe
	LiveDir         string `yaml:"live_dir"`  // holds vmlinuz + initrd.img; served at /live
}
// Notifier is one entry of the top-level "notifiers" list. Name
// identifies the entry and Type selects its kind; the remaining fields
// are optional in the YAML.
// NOTE(review): which optional fields apply to which Type is not
// visible here — confirm against the notifier implementations.
type Notifier struct {
	Name       string `yaml:"name"`
	Type       string `yaml:"type"`
	Topic      string `yaml:"topic,omitempty"`
	Server     string `yaml:"server,omitempty"`
	WebhookURL string `yaml:"webhook_url,omitempty"`
	SMTP       SMTP   `yaml:"smtp,omitempty"`
}

// SMTP holds the mail settings nested under a Notifier.
type SMTP struct {
	Host string   `yaml:"host,omitempty"`
	Port int      `yaml:"port,omitempty"`
	From string   `yaml:"from,omitempty"`
	To   []string `yaml:"to,omitempty"`
}

// Route is one entry of the top-level "routes" list: match criteria
// plus the name of a notifier.
// NOTE(review): Notifier presumably refers to a Notifier.Name — confirm
// against the routing code.
type Route struct {
	MatchKind     []string `yaml:"match_kind"`
	MatchSeverity []string `yaml:"match_severity,omitempty"`
	Notifier      string   `yaml:"notifier"`
}
// Load reads the YAML file at path, decodes it into a Config, and
// fills in defaults for the bind address, database path, artifact and
// log directories, session TTL, and max concurrent runs wherever those
// were left at their zero value.
func Load(path string) (*Config, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("read config: %w", err)
	}
	cfg := &Config{}
	if err := yaml.Unmarshal(raw, cfg); err != nil {
		return nil, fmt.Errorf("parse config: %w", err)
	}

	// Defaults apply only to fields still at their zero value.
	stringDefaults := []struct {
		field    *string
		fallback string
	}{
		{&cfg.Server.Bind, "127.0.0.1:8080"},
		{&cfg.Database.Path, "./var/vetting.db"},
		{&cfg.Artifacts.Dir, "./var/artifacts"},
		{&cfg.Logs.Dir, "./var/logs"},
	}
	for _, d := range stringDefaults {
		if *d.field == "" {
			*d.field = d.fallback
		}
	}
	intDefaults := []struct {
		field    *int
		fallback int
	}{
		{&cfg.Auth.SessionTTLHours, 24},
		{&cfg.Dispatcher.MaxConcurrentRuns, 3},
	}
	for _, d := range intDefaults {
		if *d.field == 0 {
			*d.field = d.fallback
		}
	}
	return cfg, nil
}
+83
View File
@@ -0,0 +1,83 @@
package db
import (
"database/sql"
"embed"
"fmt"
"io/fs"
"path/filepath"
"sort"
"strings"
_ "modernc.org/sqlite"
)
// migrationsFS holds the migrations/*.sql files embedded at build
// time; migrate reads them from its "migrations" directory.
//
//go:embed migrations/*.sql
var migrationsFS embed.FS
// Open opens the SQLite database at path with foreign keys on, WAL
// journaling, and a 5000 ms busy timeout, pings it, and then applies
// every embedded migration that has not been recorded yet. The handle
// is closed again if the ping or the migrations fail.
func Open(path string) (*sql.DB, error) {
	dsn := fmt.Sprintf("file:%s?_pragma=foreign_keys(1)&_pragma=journal_mode(WAL)&_pragma=busy_timeout(5000)", filepath.ToSlash(path))
	conn, err := sql.Open("sqlite", dsn)
	if err != nil {
		return nil, fmt.Errorf("open sqlite: %w", err)
	}
	if pingErr := conn.Ping(); pingErr != nil {
		_ = conn.Close()
		return nil, fmt.Errorf("ping sqlite: %w", pingErr)
	}
	if migrateErr := migrate(conn); migrateErr != nil {
		_ = conn.Close()
		return nil, migrateErr
	}
	return conn, nil
}
// migrate applies the embedded .sql migrations in sorted filename
// order. Applied migrations are tracked by name in schema_migrations
// (created here if missing) and skipped on later runs. Each pending
// migration and its bookkeeping row are written in one transaction;
// the first failure rolls that transaction back and aborts.
func migrate(db *sql.DB) error {
	dirEntries, err := fs.ReadDir(migrationsFS, "migrations")
	if err != nil {
		return fmt.Errorf("read migrations: %w", err)
	}
	files := make([]string, 0, len(dirEntries))
	for _, entry := range dirEntries {
		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".sql") {
			continue
		}
		files = append(files, entry.Name())
	}
	sort.Strings(files)

	_, err = db.Exec(`CREATE TABLE IF NOT EXISTS schema_migrations (name TEXT PRIMARY KEY, applied_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP)`)
	if err != nil {
		return fmt.Errorf("ensure schema_migrations: %w", err)
	}

	for _, file := range files {
		var count int
		row := db.QueryRow(`SELECT COUNT(1) FROM schema_migrations WHERE name = ?`, file)
		if err := row.Scan(&count); err != nil {
			return fmt.Errorf("check migration %s: %w", file, err)
		}
		if count > 0 {
			continue
		}
		sqlText, err := migrationsFS.ReadFile("migrations/" + file)
		if err != nil {
			return fmt.Errorf("read migration %s: %w", file, err)
		}
		tx, err := db.Begin()
		if err != nil {
			return fmt.Errorf("begin migration %s: %w", file, err)
		}
		if _, err := tx.Exec(string(sqlText)); err != nil {
			_ = tx.Rollback()
			return fmt.Errorf("apply migration %s: %w", file, err)
		}
		if _, err := tx.Exec(`INSERT INTO schema_migrations(name) VALUES(?)`, file); err != nil {
			_ = tx.Rollback()
			return fmt.Errorf("record migration %s: %w", file, err)
		}
		if err := tx.Commit(); err != nil {
			return fmt.Errorf("commit migration %s: %w", file, err)
		}
	}
	return nil
}
+93
View File
@@ -0,0 +1,93 @@
-- Phase 1 schema covers the full Vetting domain so future phases
-- only add data, never restructure.

-- hosts: one row per registered machine. name and mac are each UNIQUE;
-- wol_port defaults to 9 and notes to the empty string.
CREATE TABLE IF NOT EXISTS hosts (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL UNIQUE,
    mac TEXT NOT NULL UNIQUE, -- lowercase colon form
    wol_broadcast_ip TEXT NOT NULL,
    wol_port INTEGER NOT NULL DEFAULT 9,
    expected_spec_yaml TEXT NOT NULL,
    pdu_config_json TEXT,
    ipmi_config_json TEXT,
    notes TEXT NOT NULL DEFAULT '',
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- runs: one row per vetting run of a host. Deleting the host deletes
-- its runs (ON DELETE CASCADE). Indexed by host_id and by state.
CREATE TABLE IF NOT EXISTS runs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    host_id INTEGER NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
    state TEXT NOT NULL,
    result TEXT, -- pass|fail|null
    failed_stage TEXT,
    next_boot_target TEXT, -- linux|memtest|linux-post-memtest (Phase 2+)
    agent_token_hash TEXT NOT NULL,
    started_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    completed_at TIMESTAMP,
    report_path TEXT,
    hold_ip TEXT,
    override_flags_json TEXT
);
CREATE INDEX IF NOT EXISTS idx_runs_host ON runs(host_id);
CREATE INDEX IF NOT EXISTS idx_runs_state ON runs(state);
-- stages: the ordered steps of a run (ordinal gives the position).
-- Rows are removed with their run. Indexed by (run_id, ordinal).
CREATE TABLE IF NOT EXISTS stages (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
    name TEXT NOT NULL,
    ordinal INTEGER NOT NULL,
    state TEXT NOT NULL, -- pending|running|passed|failed|skipped
    started_at TIMESTAMP,
    completed_at TIMESTAMP,
    summary_json TEXT
);
CREATE INDEX IF NOT EXISTS idx_stages_run_ordinal ON stages(run_id, ordinal);
-- measurements: timestamped numeric samples for a run. Rows are removed
-- with their run; deleting the stage only nulls stage_id.
-- Indexed by (run_id, kind, ts).
CREATE TABLE IF NOT EXISTS measurements (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
    stage_id INTEGER REFERENCES stages(id) ON DELETE SET NULL,
    ts TIMESTAMP NOT NULL,
    kind TEXT NOT NULL, -- temp|power|iperf|fio|smart_attr
    key TEXT NOT NULL,
    value REAL,
    unit TEXT
);
CREATE INDEX IF NOT EXISTS idx_measurements_run_kind_ts ON measurements(run_id, kind, ts);
-- artifacts: files produced by a run, recorded by path with their
-- sha256 and size. Rows are removed with their run; deleting the stage
-- only nulls stage_id. No index on run_id is declared here.
CREATE TABLE IF NOT EXISTS artifacts (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
    stage_id INTEGER REFERENCES stages(id) ON DELETE SET NULL,
    kind TEXT NOT NULL,
    path TEXT NOT NULL,
    sha256 TEXT NOT NULL,
    size_bytes INTEGER NOT NULL
);
-- spec_diffs: per-field expected/actual differences recorded for a run,
-- each with a severity and an "ignored" flag (default 0). Rows are
-- removed with their run.
CREATE TABLE IF NOT EXISTS spec_diffs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
    field TEXT NOT NULL,
    expected TEXT,
    actual TEXT,
    severity TEXT NOT NULL, -- critical|warning|info
    ignored INTEGER NOT NULL DEFAULT 0
);
-- events: timestamped log of things that happened. run_id and host_id
-- are both nullable, and a row is removed when either its run or its
-- host is deleted.
CREATE TABLE IF NOT EXISTS events (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id INTEGER REFERENCES runs(id) ON DELETE CASCADE,
    host_id INTEGER REFERENCES hosts(id) ON DELETE CASCADE,
    ts TIMESTAMP NOT NULL,
    level TEXT NOT NULL,
    kind TEXT NOT NULL,
    message TEXT NOT NULL,
    data_json TEXT
);
-- settings: flat key/value store; key is the primary key.
CREATE TABLE IF NOT EXISTS settings (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);
+144
View File
@@ -0,0 +1,144 @@
package events
import (
"context"
"fmt"
"net/http"
"sync"
"sync/atomic"
"time"
)
// Event is a typed event published on the internal bus. In Phase 1 the
// payload is an already-rendered HTML fragment; later phases will wrap
// structured run state in this same Event envelope.
type Event struct {
	Name    string // SSE event name (e.g. "heartbeat", "tile-update", "log-line")
	Payload string // pre-rendered HTML, ready to write as SSE data
}

// subscriber is one registered listener: its id is the key in Hub.subs
// and ch is the buffered channel Publish delivers into.
type subscriber struct {
	id int64
	ch chan Event
}
// Hub is an in-process fan-out for SSE subscribers.
type Hub struct {
	mu        sync.RWMutex          // guards subs
	nextID    int64                 // last issued subscriber id; incremented atomically in Subscribe
	subs      map[int64]*subscriber // live subscribers keyed by id
	buffer    int                   // capacity of each subscriber's channel
	heartbeat time.Duration         // interval between heartbeat events
}
// NewHub returns a Hub with a 32-event buffer per subscriber and a 15 s
// heartbeat, and starts the heartbeat goroutine. That goroutine has no
// stop signal: it runs for the life of the process.
func NewHub() *Hub {
	hub := new(Hub)
	hub.subs = make(map[int64]*subscriber)
	hub.buffer = 32
	hub.heartbeat = 15 * time.Second
	go hub.heartbeatLoop()
	return hub
}
// Publish delivers ev to every current subscriber without blocking. A
// subscriber whose buffer is full simply misses this event.
func (h *Hub) Publish(ev Event) {
	h.mu.RLock()
	defer h.mu.RUnlock()
	for id := range h.subs {
		select {
		case h.subs[id].ch <- ev:
		default:
			// Slow subscriber: drop the event rather than stall other clients.
		}
	}
}
// Subscribe registers a new subscriber and returns its id, the
// receive-only channel events arrive on (buffered to h.buffer), and a
// cancel function that unregisters the subscriber and closes the
// channel.
//
// cancel is safe to call more than once; only the first call has any
// effect. The subscriber is removed from the map under the write lock
// before its channel is closed, so Publish (which sends under the read
// lock) can never send on a closed channel.
func (h *Hub) Subscribe() (id int64, ch <-chan Event, cancel func()) {
	id = atomic.AddInt64(&h.nextID, 1)
	s := &subscriber{id: id, ch: make(chan Event, h.buffer)}
	h.mu.Lock()
	h.subs[id] = s
	h.mu.Unlock()

	var once sync.Once
	cancel = func() {
		// Guard against a double close, which would panic.
		once.Do(func() {
			h.mu.Lock()
			delete(h.subs, id)
			h.mu.Unlock()
			close(s.ch)
		})
	}
	return id, s.ch, cancel
}
// heartbeatLoop publishes a "heartbeat" event every h.heartbeat. The
// payload is an empty span whose data-heartbeat attribute carries the
// current Unix time. The loop has no exit condition.
func (h *Hub) heartbeatLoop() {
	ticker := time.NewTicker(h.heartbeat)
	defer ticker.Stop()
	for {
		<-ticker.C
		stamp := time.Now().Unix()
		h.Publish(Event{
			Name:    "heartbeat",
			Payload: fmt.Sprintf(`<span data-heartbeat="%d"></span>`, stamp),
		})
	}
}
// ServeSSE streams events to one client for as long as the request
// lives. It rejects writers that cannot flush with a 500, sets the
// event-stream headers, subscribes to the hub, sends an initial
// "hello" message, and then forwards each Event as one SSE message
// until the request context ends or the subscription channel closes.
func (h *Hub) ServeSSE(w http.ResponseWriter, r *http.Request) {
	flusher, canFlush := w.(http.Flusher)
	if !canFlush {
		http.Error(w, "streaming not supported", http.StatusInternalServerError)
		return
	}

	hdr := w.Header()
	hdr.Set("Content-Type", "text/event-stream")
	hdr.Set("Cache-Control", "no-cache")
	hdr.Set("Connection", "keep-alive")
	hdr.Set("X-Accel-Buffering", "no")

	_, stream, unsubscribe := h.Subscribe()
	defer unsubscribe()

	fmt.Fprintf(w, "event: hello\ndata: ok\n\n")
	flusher.Flush()

	done := r.Context().Done()
	for {
		select {
		case ev, open := <-stream:
			if !open {
				return
			}
			writeSSE(w, ev)
			flusher.Flush()
		case <-done:
			return
		}
	}
}
// writeSSE writes ev as one SSE message: an optional "event:" line
// when ev.Name is set, one "data:" line per line of ev.Payload, and a
// terminating blank line.
func writeSSE(w http.ResponseWriter, ev Event) {
	if name := ev.Name; name != "" {
		fmt.Fprintf(w, "event: %s\n", name)
	}
	lines := splitLines(ev.Payload)
	for i := range lines {
		fmt.Fprintf(w, "data: %s\n", lines[i])
	}
	fmt.Fprint(w, "\n")
}
// splitLines splits s on '\n'. The result always has at least one
// element: the empty string yields [""], and input ending in a newline
// yields a trailing empty element.
func splitLines(s string) []string {
	lines := make([]string, 0, 1)
	begin := 0
	for i, n := 0, len(s); i < n; i++ {
		if s[i] != '\n' {
			continue
		}
		lines = append(lines, s[begin:i])
		begin = i + 1
	}
	// Whatever follows the last newline (possibly nothing) is the final line.
	return append(lines, s[begin:])
}
// Shutdown is a no-op placeholder wired into graceful shutdown. It
// ignores its context, always returns nil, and does not stop the
// heartbeat goroutine or close subscriber channels.
func (h *Hub) Shutdown(_ context.Context) error { return nil }
+65
View File
@@ -0,0 +1,65 @@
// Package hold generates per-run ephemeral ed25519 keypairs for the
// FailedHolding flow. When a run fails, the agent asks the orchestrator
// for a pubkey, drops it into /root/.ssh/authorized_keys, and reports
// its LAN IP. The orchestrator stores the private key next to the run's
// artifacts and surfaces `ssh -i <path> root@<ip>` on the tile.
package hold
import (
"crypto/ed25519"
"crypto/rand"
"encoding/pem"
"fmt"
"os"
"path/filepath"
"strings"
"golang.org/x/crypto/ssh"
)
// Keypair bundles the PEM-encoded private key and the
// authorized_keys-style public key line, as produced by Issue.
type Keypair struct {
	PrivatePEM    []byte // PEM encoding of the private key block; written to disk by WritePrivateTo
	AuthorizedKey string // "ssh-ed25519 AAAA... vetting-hold-N"
}
// Issue generates a fresh ed25519 keypair for the given run. Both the
// authorized_keys line and the private key carry the comment
// "vetting-hold-<runID>".
func Issue(runID int64) (*Keypair, error) {
	tag := fmt.Sprintf("vetting-hold-%d", runID)

	publicKey, privateKey, err := ed25519.GenerateKey(rand.Reader)
	if err != nil {
		return nil, fmt.Errorf("generate ed25519: %w", err)
	}
	sshKey, err := ssh.NewPublicKey(publicKey)
	if err != nil {
		return nil, fmt.Errorf("ssh public key: %w", err)
	}

	// MarshalAuthorizedKey yields "ssh-ed25519 AAAA...\n"; strip the
	// newline and append our tag as the comment field if not present.
	authLine := strings.TrimRight(string(ssh.MarshalAuthorizedKey(sshKey)), "\n")
	if suffix := " " + tag; !strings.HasSuffix(authLine, suffix) {
		authLine += suffix
	}

	pemBlock, err := ssh.MarshalPrivateKey(privateKey, tag)
	if err != nil {
		return nil, fmt.Errorf("marshal private key: %w", err)
	}
	return &Keypair{
		PrivatePEM:    pem.EncodeToMemory(pemBlock),
		AuthorizedKey: authLine,
	}, nil
}
// WritePrivateTo persists the PEM to the given path with 0600 perms
// and returns the absolute path. The operator's shell reads this file
// by path, so we keep it on disk per-run.
//
// Missing parent directories are created (0755). If the absolute path
// cannot be resolved, the path is returned as given; the file has been
// written by then, so this is not treated as a failure.
func (kp *Keypair) WritePrivateTo(path string) (string, error) {
	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
		return "", fmt.Errorf("create hold key dir: %w", err)
	}
	if err := os.WriteFile(path, kp.PrivatePEM, 0o600); err != nil {
		return "", fmt.Errorf("write hold key: %w", err)
	}
	// WriteFile applies the mode only when it creates the file; a file
	// that already existed keeps its old permissions. Force 0600 so a
	// private key is never left readable by group/other.
	if err := os.Chmod(path, 0o600); err != nil {
		return "", fmt.Errorf("chmod hold key: %w", err)
	}
	abs, err := filepath.Abs(path)
	if err != nil {
		return path, nil
	}
	return abs, nil
}
+99
View File
@@ -0,0 +1,99 @@
package hold
import (
"bytes"
"crypto/ed25519"
"os"
"path/filepath"
"strings"
"testing"
"golang.org/x/crypto/ssh"
)
// TestIssueRoundTrip verifies that the issued private key parses with
// the x/crypto/ssh library and that the public key derived from it has
// the same key type and base64 blob as the authorized_keys line handed
// to the agent. It also checks that the line carries the run tag. If
// any of this drifts, the operator's `ssh -i path root@ip` stops
// working without any other warning.
func TestIssueRoundTrip(t *testing.T) {
	pair, err := Issue(42)
	if err != nil {
		t.Fatalf("Issue: %v", err)
	}

	signer, err := ssh.ParsePrivateKey(pair.PrivatePEM)
	if err != nil {
		t.Fatalf("ParsePrivateKey: %v", err)
	}

	derived := strings.TrimRight(string(ssh.MarshalAuthorizedKey(signer.PublicKey())), "\n")
	issued := pair.AuthorizedKey

	// The comment field is ours, so only type and base64 blob are compared.
	derivedFields := strings.SplitN(derived, " ", 3)
	issuedFields := strings.SplitN(issued, " ", 3)
	if len(derivedFields) < 2 || len(issuedFields) < 2 {
		t.Fatalf("unexpected authorized_key shape got=%q want=%q", derived, issued)
	}
	sameType := derivedFields[0] == issuedFields[0]
	sameBlob := derivedFields[1] == issuedFields[1]
	if !sameType || !sameBlob {
		t.Fatalf("public key mismatch:\n got %s\n want %s", derived, issued)
	}
	if !strings.Contains(issued, "vetting-hold-42") {
		t.Fatalf("authorized_key line missing run tag: %q", issued)
	}
}
// TestIssueKeysAreEd25519 pins the key algorithm: the parsed signer
// must report ssh-ed25519 and expose a 32-byte ed25519.PublicKey.
// Operators are told their hold key is ed25519, so a silent change of
// algorithm would surprise them.
func TestIssueKeysAreEd25519(t *testing.T) {
	pair, err := Issue(1)
	if err != nil {
		t.Fatalf("Issue: %v", err)
	}
	signer, err := ssh.ParsePrivateKey(pair.PrivatePEM)
	if err != nil {
		t.Fatalf("ParsePrivateKey: %v", err)
	}
	sshPub := signer.PublicKey()
	if algo := sshPub.Type(); algo != ssh.KeyAlgoED25519 {
		t.Fatalf("key algorithm: got %s, want ssh-ed25519", algo)
	}

	// Paranoia: unwrap down to the raw key and check its size.
	wrapped, ok := sshPub.(ssh.CryptoPublicKey)
	if !ok {
		t.Fatalf("public key does not expose CryptoPublicKey")
	}
	rawKey, ok := wrapped.CryptoPublicKey().(ed25519.PublicKey)
	if !ok {
		t.Fatalf("public key is not ed25519.PublicKey")
	}
	if size := len(rawKey); size != ed25519.PublicKeySize {
		t.Fatalf("ed25519 pubkey size = %d, want %d", size, ed25519.PublicKeySize)
	}
}
// TestWritePrivateToSetsPerms checks that WritePrivateTo creates the
// missing parent directory, returns an absolute path, writes exactly
// the in-memory PEM, and leaves the key file with 0600 permissions.
func TestWritePrivateToSetsPerms(t *testing.T) {
	kp, err := Issue(7)
	if err != nil {
		t.Fatalf("Issue: %v", err)
	}
	dir := t.TempDir()
	path := filepath.Join(dir, "nested", "hold.key")
	abs, err := kp.WritePrivateTo(path)
	if err != nil {
		t.Fatalf("WritePrivateTo: %v", err)
	}
	if !filepath.IsAbs(abs) {
		t.Fatalf("expected absolute path, got %q", abs)
	}
	buf, err := os.ReadFile(abs)
	if err != nil {
		t.Fatalf("ReadFile: %v", err)
	}
	if !bytes.Equal(buf, kp.PrivatePEM) {
		t.Fatalf("on-disk bytes differ from in-memory PEM")
	}
	// The check the test name promises: the private key must not be
	// readable by group or other.
	info, err := os.Stat(abs)
	if err != nil {
		t.Fatalf("Stat: %v", err)
	}
	if got := info.Mode().Perm(); got != 0o600 {
		t.Fatalf("hold key perms = %o, want 600", got)
	}
}
+75
View File
@@ -0,0 +1,75 @@
// Package httpserver assembles the chi router. It lives in its own
// package because it depends on both `api` and `orchestrator`, and
// those two packages must stay import-independent.
package httpserver
import (
"io/fs"
"net/http"
"github.com/go-chi/chi/v5"
"github.com/go-chi/chi/v5/middleware"
"vetting/internal/api"
"vetting/internal/auth"
"vetting/internal/web"
)
// Deps carries everything NewRouter needs to wire up the routes.
type Deps struct {
	Auth    *auth.Manager // supplies the RequireSession middleware for the browser UI group
	UI      *api.UI       // handlers for login/logout and the session-gated UI routes
	Agent   *api.Agent    // handlers for the iPXE script and /api/v1/runs/{id} endpoints
	LiveDir string        // directory containing vmlinuz + initrd.img; "" disables /live
}
// NewRouter builds the chi router: global middleware, the embedded
// static assets under /static/, the optional live-image file server
// under /live/, the public login/logout routes, the agent and PXE
// endpoints, and the session-gated browser UI.
//
// It panics if the embedded "static" sub-tree cannot be opened, which
// can only happen if the binary was built without it.
func NewRouter(d Deps) http.Handler {
	r := chi.NewRouter()
	// Order matters: RealIP first so the logger sees the real client
	// address, and Logger before Recoverer, as chi's middleware docs
	// require for Logger to see the response Recoverer writes.
	r.Use(middleware.RealIP)
	r.Use(middleware.Logger)
	r.Use(middleware.Recoverer)

	staticFS, err := fs.Sub(web.Static, "static")
	if err != nil {
		panic(err)
	}
	r.Handle("/static/*", http.StripPrefix("/static/", http.FileServer(http.FS(staticFS))))

	// The live image (kernel + initrd) is served without a session.
	if d.LiveDir != "" {
		r.Handle("/live/*", http.StripPrefix("/live/", http.FileServer(http.Dir(d.LiveDir))))
	}

	// Public (no session required) endpoints.
	r.Get("/login", d.UI.LoginForm)
	r.Post("/login", d.UI.LoginSubmit)
	r.Post("/logout", d.UI.Logout)

	// Agent / PXE endpoints — never behind the UI session. The run
	// endpoints are authenticated per-request by bearer token inside
	// the handlers; the iPXE script is keyed by the MAC path parameter.
	r.Get("/ipxe/{mac}", d.Agent.IPXEScript)
	r.Route("/api/v1/runs/{id}", func(r chi.Router) {
		r.Post("/hello", d.Agent.Hello)
		r.Post("/claim", d.Agent.Claim)
		r.Post("/heartbeat", d.Agent.Heartbeat)
		r.Post("/log", d.Agent.Log)
		r.Post("/result", d.Agent.Result)
		r.Post("/hold", d.Agent.Hold)
		r.Post("/sensor", d.Agent.Sensor)
	})

	// Session-gated browser UI.
	r.Group(func(r chi.Router) {
		r.Use(d.Auth.RequireSession)
		r.Get("/", d.UI.Dashboard)
		r.Get("/hosts/new", d.UI.NewHostForm)
		r.Post("/hosts", d.UI.CreateHost)
		r.Post("/hosts/{id}/delete", d.UI.DeleteHost)
		r.Post("/hosts/{id}/start", d.UI.StartRun)
		r.Post("/hosts/{id}/override-wipe", d.UI.OverrideWipeStorage)
		r.Get("/reports/{runID}", d.UI.Report)
		r.Get("/events", d.UI.SSE)
	})

	return r
}
+33
View File
@@ -0,0 +1,33 @@
package janitor
import (
"context"
"time"
"vetting/internal/logs"
"vetting/internal/store"
)
// StoreAdapter bridges the concrete orchestrator stores to the Janitor's
// dependency interface. Kept in the janitor package so the orchestrator
// wire-up stays a single-line: janitor.New(cfg, &janitor.StoreAdapter{...}).
type StoreAdapter struct {
	Runs      *store.Runs      // backs CompletedOlderThan
	Artifacts *store.Artifacts // backs DeleteArtifactsForRun
	Logs      *logs.Hub        // backs LogPathFor; may be nil
}
// CompletedOlderThan delegates to a.Runs.CompletedOlderThan and returns
// its run ids and error unchanged.
func (a *StoreAdapter) CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error) {
	return a.Runs.CompletedOlderThan(ctx, cutoff)
}

// DeleteArtifactsForRun delegates to a.Artifacts.DeleteForRun and
// returns its artifacts and error unchanged.
func (a *StoreAdapter) DeleteArtifactsForRun(ctx context.Context, runID int64) ([]store.Artifact, error) {
	return a.Artifacts.DeleteForRun(ctx, runID)
}
// LogPathFor returns the log hub's path for the run, or "" when no log
// hub is configured.
func (a *StoreAdapter) LogPathFor(runID int64) string {
	if hub := a.Logs; hub != nil {
		return hub.PathFor(runID)
	}
	return ""
}
+171
View File
@@ -0,0 +1,171 @@
// Package janitor garbage-collects on-disk run data. A completed or
// released run produces an HTML report, a JSON report, a log file, and
// potentially several artifact blobs (fio output, iperf output, hold
// pubkey, inventory JSON). None of these need to stay on disk
// indefinitely — once the operator's looked at the report and closed
// the tile, disk pressure is the only cost.
//
// The DB row for the run is kept (so historical counts and host
// histories survive); only the on-disk files and their artifact rows
// are pruned. The janitor ticks on a fixed interval and is safe to
// run concurrently with live runs — it only touches runs in terminal
// states past a cutoff, which by definition are not being written to.
package janitor
import (
"context"
"errors"
"fmt"
"log"
"os"
"sync"
"time"
"vetting/internal/store"
)
// Config carries the retention knobs. Zero values mean "keep forever"
// for that class of data; a zero Interval defaults to 1h.
type Config struct {
	ArtifactRetention time.Duration // <= 0 disables artifact cleanup
	LogRetention      time.Duration // <= 0 disables log cleanup
	Interval          time.Duration // time between sweeps; New replaces <= 0 with one hour
}
// Stores is the subset of the store layer the janitor needs. Defined as
// an interface so tests can fake it without spinning up SQLite.
type Stores interface {
	// CompletedOlderThan returns the ids of runs selected for cleanup
	// relative to cutoff.
	CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error)
	// DeleteArtifactsForRun removes the run's artifact rows and returns
	// them so the caller can unlink the files at their Path.
	DeleteArtifactsForRun(ctx context.Context, runID int64) ([]store.Artifact, error)
	// LogPathFor returns the run's log file path; "" means there is
	// nothing to delete.
	LogPathFor(runID int64) string
}
// Janitor owns the ticker goroutine. Start/Stop are idempotent; Stop
// waits for the in-flight pass to finish so tests can assert post-state.
//
// NOTE(review): stop is created once in New and closed by Stop; Start
// does not recreate it, so a Janitor is not restartable after Stop.
type Janitor struct {
	cfg     Config
	s       Stores
	stop    chan struct{}  // closed by Stop to end the loop
	wg      sync.WaitGroup // tracks the loop goroutine
	mu      sync.Mutex     // guards running
	running bool           // true between Start and Stop
}
// New builds a Janitor over the given stores. A non-positive
// cfg.Interval is replaced with one hour. The returned Janitor is idle
// until Start is called.
func New(cfg Config, s Stores) *Janitor {
	j := &Janitor{s: s, stop: make(chan struct{})}
	if cfg.Interval <= 0 {
		cfg.Interval = time.Hour
	}
	j.cfg = cfg
	return j
}
// Start launches the sweep goroutine unless one is already running.
// With both retentions at zero the goroutine still ticks, but every
// Sweep returns immediately.
func (j *Janitor) Start(ctx context.Context) {
	j.mu.Lock()
	alreadyRunning := j.running
	j.running = true
	j.mu.Unlock()

	if alreadyRunning {
		return
	}
	j.wg.Add(1)
	go j.loop(ctx)
}
// Stop signals the sweep goroutine to exit and waits for it to finish,
// including any sweep in flight. It does nothing if the janitor is not
// running.
//
// The stop channel is created once in New and never replaced, so it
// may already be closed when Stop runs after a Start/Stop/Start
// sequence; closing it again would panic, hence the guard below. A
// janitor restarted that way performs only its initial sweep before
// its loop exits.
func (j *Janitor) Stop() {
	j.mu.Lock()
	if !j.running {
		j.mu.Unlock()
		return
	}
	j.running = false
	select {
	case <-j.stop:
		// Already closed by an earlier Stop.
	default:
		close(j.stop)
	}
	j.mu.Unlock()
	j.wg.Wait()
}
// loop is the body of the janitor goroutine. It sweeps once right away
// (to catch anything that aged out while the orchestrator was down)
// and then once per cfg.Interval until ctx is cancelled or Stop closes
// the stop channel. Sweep errors are logged, never fatal.
func (j *Janitor) loop(ctx context.Context) {
	defer j.wg.Done()

	err := j.Sweep(ctx, time.Now().UTC())
	if err != nil {
		log.Printf("janitor: initial sweep: %v", err)
	}

	ticker := time.NewTicker(j.cfg.Interval)
	defer ticker.Stop()
	for {
		select {
		case tick := <-ticker.C:
			if sweepErr := j.Sweep(ctx, tick.UTC()); sweepErr != nil {
				log.Printf("janitor: sweep: %v", sweepErr)
			}
		case <-j.stop:
			return
		case <-ctx.Done():
			return
		}
	}
}
// Sweep is exported so tests can drive a single pass deterministically.
//
// It is a no-op when both retentions are disabled (<= 0). Otherwise it
// issues one CompletedOlderThan query using the *longer* of the two
// retentions as the cutoff — the conservative choice — and, for every
// run returned, cleans each class of data whose retention is enabled.
// The effective retention for both classes is therefore the longer of
// the two configured values; a shorter one is not honoured on its own.
//
// Per-run cleanup failures are logged by the helpers and do not stop
// the sweep. Sweep returns an error if the listing query fails or if
// ctx is cancelled between runs.
func (j *Janitor) Sweep(ctx context.Context, now time.Time) error {
	cleanArtifacts := j.cfg.ArtifactRetention > 0
	cleanLogs := j.cfg.LogRetention > 0
	if !cleanArtifacts && !cleanLogs {
		return nil
	}

	cutoff := now.Add(-longer(j.cfg.ArtifactRetention, j.cfg.LogRetention))
	runs, err := j.s.CompletedOlderThan(ctx, cutoff)
	if err != nil {
		return fmt.Errorf("list old runs: %w", err)
	}

	for _, runID := range runs {
		// Stop promptly on shutdown instead of walking the whole list.
		if err := ctx.Err(); err != nil {
			return fmt.Errorf("sweep interrupted: %w", err)
		}
		if cleanArtifacts {
			j.cleanArtifacts(ctx, runID)
		}
		if cleanLogs {
			j.cleanLog(runID)
		}
	}
	return nil
}
// cleanArtifacts deletes the run's artifact rows via the store and
// then unlinks each returned file. Failures are logged and skipped;
// rows with an empty path and files that are already gone are ignored.
func (j *Janitor) cleanArtifacts(ctx context.Context, runID int64) {
	removed, err := j.s.DeleteArtifactsForRun(ctx, runID)
	if err != nil {
		log.Printf("janitor: delete artifacts for run %d: %v", runID, err)
		return
	}
	for i := range removed {
		file := removed[i].Path
		if file == "" {
			continue
		}
		err := os.Remove(file)
		if err == nil || errors.Is(err, os.ErrNotExist) {
			continue
		}
		log.Printf("janitor: unlink %s: %v", file, err)
	}
}
// cleanLog unlinks the run's log file. An empty path means there is
// nothing to delete, and a file that is already gone is not an error;
// any other failure is logged.
func (j *Janitor) cleanLog(runID int64) {
	logPath := j.s.LogPathFor(runID)
	if logPath == "" {
		return
	}
	switch err := os.Remove(logPath); {
	case err == nil, errors.Is(err, os.ErrNotExist):
		// Removed, or never there.
	default:
		log.Printf("janitor: unlink log %s: %v", logPath, err)
	}
}
// longer returns the greater of two durations.
func longer(a, b time.Duration) time.Duration {
	if b >= a {
		return b
	}
	return a
}
+133
View File
@@ -0,0 +1,133 @@
package janitor
import (
"context"
"os"
"path/filepath"
"testing"
"time"
"vetting/internal/store"
)
// fakeStores is an in-memory Stores implementation for tests. It hands
// back canned run ids, artifacts, and log paths, and records the cutoff
// it was queried with and which runs had their artifacts deleted.
type fakeStores struct {
	cutoffSeen    time.Time                  // cutoff from the most recent CompletedOlderThan call
	runsOlder     []int64                    // canned CompletedOlderThan result
	artifactsByID map[int64][]store.Artifact // canned DeleteArtifactsForRun results by run id
	deleted       map[int64]bool             // run ids passed to DeleteArtifactsForRun
	logs          map[int64]string           // canned log paths by run id
}

// CompletedOlderThan records the cutoff and returns the canned run ids.
func (f *fakeStores) CompletedOlderThan(_ context.Context, cutoff time.Time) ([]int64, error) {
	f.cutoffSeen = cutoff
	return f.runsOlder, nil
}

// DeleteArtifactsForRun marks the run as deleted and returns its canned
// artifacts.
func (f *fakeStores) DeleteArtifactsForRun(_ context.Context, runID int64) ([]store.Artifact, error) {
	if f.deleted == nil {
		f.deleted = make(map[int64]bool)
	}
	f.deleted[runID] = true
	canned := f.artifactsByID[runID]
	return canned, nil
}

// LogPathFor returns the canned log path, or "" for an unknown run.
func (f *fakeStores) LogPathFor(runID int64) string {
	return f.logs[runID]
}
// writeTempFile creates dir/name containing a single byte and returns
// its path, failing the test immediately if the write fails.
func writeTempFile(t *testing.T, dir, name string) string {
	t.Helper()
	full := filepath.Join(dir, name)
	err := os.WriteFile(full, []byte("x"), 0o644)
	if err != nil {
		t.Fatalf("write %s: %v", full, err)
	}
	return full
}
// TestSweepDeletesArtifactsAndLogs checks the happy path: with both
// retentions enabled, a run returned by the store has its artifact
// rows deleted and its artifact and log files removed from disk.
func TestSweepDeletesArtifactsAndLogs(t *testing.T) {
	tmp := t.TempDir()
	binPath := writeTempFile(t, tmp, "artifact-1.bin")
	jsonPath := writeTempFile(t, tmp, "artifact-2.json")
	logPath := writeTempFile(t, tmp, "run-1.log")

	fake := &fakeStores{
		runsOlder: []int64{1},
		artifactsByID: map[int64][]store.Artifact{
			1: {
				{ID: 10, RunID: 1, Path: binPath},
				{ID: 11, RunID: 1, Path: jsonPath},
			},
		},
		logs: map[int64]string{1: logPath},
	}
	cfg := Config{
		ArtifactRetention: 24 * time.Hour,
		LogRetention:      24 * time.Hour,
		Interval:          time.Minute,
	}
	if err := New(cfg, fake).Sweep(context.Background(), time.Now().UTC()); err != nil {
		t.Fatalf("sweep: %v", err)
	}

	if !fake.deleted[1] {
		t.Fatalf("run 1 not passed to DeleteArtifactsForRun")
	}
	for _, file := range [...]string{binPath, jsonPath, logPath} {
		_, err := os.Stat(file)
		if os.IsNotExist(err) {
			continue
		}
		t.Errorf("file %s still exists (err=%v)", file, err)
	}
}
// TestSweepIsNoopWhenRetentionsAreZero checks that a zero-valued Config
// ("keep forever") deletes neither rows nor files, even when the store
// would report an old run.
func TestSweepIsNoopWhenRetentionsAreZero(t *testing.T) {
	keep := writeTempFile(t, t.TempDir(), "keep.bin")

	fake := &fakeStores{
		runsOlder: []int64{1},
		artifactsByID: map[int64][]store.Artifact{
			1: {{ID: 10, RunID: 1, Path: keep}},
		},
		logs: map[int64]string{1: keep},
	}
	janitor := New(Config{}, fake) // all zero
	err := janitor.Sweep(context.Background(), time.Now().UTC())
	if err != nil {
		t.Fatalf("sweep: %v", err)
	}

	if fake.deleted[1] {
		t.Fatalf("expected no deletion for zero retention")
	}
	if _, statErr := os.Stat(keep); statErr != nil {
		t.Fatalf("file should still exist: %v", statErr)
	}
}
// TestSweepSkipsMissingFilesGracefully checks that artifact and log
// paths that do not exist on disk do not make Sweep fail, and that the
// run is still handed to DeleteArtifactsForRun.
func TestSweepSkipsMissingFilesGracefully(t *testing.T) {
	const runID int64 = 7
	fake := &fakeStores{
		runsOlder: []int64{runID},
		artifactsByID: map[int64][]store.Artifact{
			runID: {{ID: 99, RunID: runID, Path: "/nonexistent/path.bin"}},
		},
		logs: map[int64]string{runID: "/nonexistent/run-7.log"},
	}
	cfg := Config{ArtifactRetention: time.Hour, LogRetention: time.Hour}

	err := New(cfg, fake).Sweep(context.Background(), time.Now().UTC())
	if err != nil {
		t.Fatalf("sweep: %v", err)
	}
	if !fake.deleted[runID] {
		t.Fatalf("run 7 should have been processed")
	}
}
// TestSweepUsesTheLongerCutoff pins the cutoff handed to the store:
// with 72h and 24h retentions, the query must use now minus 72h.
func TestSweepUsesTheLongerCutoff(t *testing.T) {
	fake := &fakeStores{}
	cfg := Config{
		ArtifactRetention: 72 * time.Hour,
		LogRetention:      24 * time.Hour,
	}
	fixedNow := time.Date(2026, 4, 17, 12, 0, 0, 0, time.UTC)

	if err := New(cfg, fake).Sweep(context.Background(), fixedNow); err != nil {
		t.Fatalf("sweep: %v", err)
	}

	expected := fixedNow.Add(-72 * time.Hour)
	if got := fake.cutoffSeen; !got.Equal(expected) {
		t.Fatalf("cutoff = %v, want %v (the longer of the two retentions)", got, expected)
	}
}
+134
View File
@@ -0,0 +1,134 @@
// Package logs owns per-run flat-file logs and their live SSE fan-out.
// A single Writer serialises writes for one run; a Hub keeps a cache
// per run so handlers can open/close freely without stepping on each
// other. Lines go to disk for persistence (reload + replay) and onto
// the events.Hub so the UI tile can tail live.
package logs
import (
"fmt"
"html"
"log"
"os"
"path/filepath"
"strings"
"sync"
"time"
"vetting/internal/events"
)
// Line is one log entry for a run.
type Line struct {
	TS    time.Time
	Level string // info|warn|error|debug
	Text  string
}

// Writer appends lines to a single run's log file and publishes them
// on the events hub. mu serialises its operations.
type Writer struct {
	runID int64
	mu    sync.Mutex
	f     *os.File
	hub   *events.Hub
}
// Hub owns the per-run Writers. The orchestrator creates one Hub at
// startup and hands it to the api package.
type Hub struct {
	dir     string            // directory holding the run-<id>.log files
	events  *events.Hub       // passed to every Writer for live fan-out
	mu      sync.Mutex        // guards writers
	writers map[int64]*Writer // open Writers keyed by run id
}
// NewHub ensures dir exists (creating it and any missing parents with
// mode 0755) and returns a Hub with an empty writer cache. ev is stored
// and passed on to every Writer the Hub creates.
func NewHub(dir string, ev *events.Hub) (*Hub, error) {
	err := os.MkdirAll(dir, 0o755)
	if err != nil {
		return nil, fmt.Errorf("mkdir log dir: %w", err)
	}
	h := &Hub{
		dir:     dir,
		events:  ev,
		writers: make(map[int64]*Writer),
	}
	return h, nil
}
// WriterFor returns the cached Writer for runID, opening the log file
// lazily on first use. The file is append-only; if an existing run's log
// is reopened (e.g. after a restart) we append rather than truncate so
// nothing is lost.
//
// It returns an error only when the log file cannot be opened. Calling
// WriterFor after Close is safe: Close drops the cache by setting it to
// nil, so the map is re-created here instead of panicking on a write to
// a nil map.
func (h *Hub) WriterFor(runID int64) (*Writer, error) {
	h.mu.Lock()
	defer h.mu.Unlock()

	// Reading from a nil map is fine, so the lookup needs no guard.
	if w, ok := h.writers[runID]; ok {
		return w, nil
	}

	path := filepath.Join(h.dir, fmt.Sprintf("run-%d.log", runID))
	f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
	if err != nil {
		return nil, fmt.Errorf("open %s: %w", path, err)
	}

	// Close leaves h.writers nil; assigning into a nil map panics.
	if h.writers == nil {
		h.writers = map[int64]*Writer{}
	}
	w := &Writer{runID: runID, f: f, hub: h.events}
	h.writers[runID] = w
	return w, nil
}
// Close closes every cached Writer and then drops the cache. A failure
// to close one run's file is logged and does not stop the remaining
// writers from being closed. Intended to be called from main on shutdown.
func (h *Hub) Close() {
	h.mu.Lock()
	defer h.mu.Unlock()

	for runID, wr := range h.writers {
		err := wr.Close()
		if err == nil {
			continue
		}
		log.Printf("logs: close run-%d: %v", runID, err)
	}
	h.writers = nil
}
// PathFor builds the on-disk location of a run's log file,
// <dir>/run-<runID>.log. It does not check that the file exists. Used
// by replay handlers and the report generator.
func (h *Hub) PathFor(runID int64) string {
	name := fmt.Sprintf("run-%d.log", runID)
	return filepath.Join(h.dir, name)
}
// Append persists one line to the run's file and then publishes it as an
// SSE event named log-<runID>. A zero TS becomes the current UTC time and
// an empty Level becomes "info" before either happens. A disk write
// failure is only logged: the live tail must keep working even when disk
// IO is degraded. The writer's mutex is held for the whole call.
func (w *Writer) Append(line Line) {
	w.mu.Lock()
	defer w.mu.Unlock()

	if line.Level == "" {
		line.Level = "info"
	}
	if line.TS.IsZero() {
		line.TS = time.Now().UTC()
	}

	// On-disk format: RFC3339Nano timestamp, upper-cased level
	// right-aligned to 5 columns, then the raw (unescaped) text.
	record := fmt.Sprintf("%s %5s %s\n", line.TS.Format(time.RFC3339Nano), strings.ToUpper(line.Level), line.Text)
	_, err := w.f.WriteString(record)
	if err != nil {
		log.Printf("logs: write run-%d: %v", w.runID, err)
	}

	if w.hub == nil {
		return
	}
	w.hub.Publish(events.Event{
		Name:    fmt.Sprintf("log-%d", w.runID),
		Payload: renderLogSSE(line),
	})
}
// Close closes the underlying file and forgets it, returning the error
// from the file close. Calling Close again afterwards is a no-op that
// returns nil.
func (w *Writer) Close() error {
	w.mu.Lock()
	defer w.mu.Unlock()

	f := w.f
	if f == nil {
		return nil
	}
	w.f = nil
	return f.Close()
}
// renderLogSSE renders one line as the HTML fragment carried in the SSE
// payload. The tile contains a <div id="log-N" hx-swap-oob="beforeend">,
// and each event appends one <div class="log-line log-LEVEL"> to it.
// The lower-cased level, the HH:MM:SS timestamp and the text are all
// HTML-escaped before being interpolated.
func renderLogSSE(l Line) string {
	cls := html.EscapeString(strings.ToLower(l.Level))
	clock := html.EscapeString(l.TS.Format("15:04:05"))
	msg := html.EscapeString(l.Text)
	return fmt.Sprintf(`<div class="log-line log-%s">%s %s</div>`, cls, clock, msg)
}
+120
View File
@@ -0,0 +1,120 @@
package logs_test
import (
"os"
"path/filepath"
"strings"
"testing"
"time"
"vetting/internal/events"
"vetting/internal/logs"
)
// TestAppendFansOutToSSE verifies the two guarantees of the log hub:
// (a) every line is persisted to the per-run file, and (b) every line
// is published as an SSE event with name log-<runID>. The UI relies on
// both — the file for reload replay, the event for live tail.
//
// It also pins the escaping contract: the SSE payload is HTML-escaped
// while the on-disk copy keeps the raw text.
func TestAppendFansOutToSSE(t *testing.T) {
dir := t.TempDir()
hub := events.NewHub()
lh, err := logs.NewHub(dir, hub)
if err != nil {
t.Fatalf("NewHub: %v", err)
}
defer lh.Close()
// Subscribe before appending so neither log event is missed.
_, ch, cancel := hub.Subscribe()
defer cancel()
w, err := lh.WriterFor(77)
if err != nil {
t.Fatalf("WriterFor: %v", err)
}
w.Append(logs.Line{Level: "info", Text: "hello from agent"})
w.Append(logs.Line{Level: "error", Text: "<script>pwn</script>"})
// Ask for up to 3 events although only 2 lines were appended, leaving
// room for one unrelated event. NOTE(review): if no third event
// arrives, collect returns only when the 500ms deadline fires.
got := collect(ch, 3, 500*time.Millisecond)
// Filter out heartbeats that may sneak in.
var logEvents []events.Event
for _, ev := range got {
if strings.HasPrefix(ev.Name, "log-") {
logEvents = append(logEvents, ev)
}
}
if len(logEvents) < 2 {
t.Fatalf("expected 2 log events, got %d (all=%+v)", len(logEvents), got)
}
for _, ev := range logEvents {
if ev.Name != "log-77" {
t.Fatalf("unexpected event name %q", ev.Name)
}
}
// XSS protection: raw <script> must not appear — it's HTML-escaped.
// logEvents[1] is the second Append, i.e. the <script> line.
if strings.Contains(logEvents[1].Payload, "<script>") {
t.Fatalf("log payload not escaped: %q", logEvents[1].Payload)
}
if !strings.Contains(logEvents[1].Payload, "&lt;script&gt;") {
t.Fatalf("expected escaped <script>, got %q", logEvents[1].Payload)
}
// On disk: the file must contain both lines.
path := filepath.Join(dir, "run-77.log")
body, err := os.ReadFile(path)
if err != nil {
t.Fatalf("read log file: %v", err)
}
text := string(body)
if !strings.Contains(text, "hello from agent") {
t.Fatalf("disk log missing info line: %q", text)
}
if !strings.Contains(text, "<script>pwn</script>") {
t.Fatalf("disk log should keep raw text (unescaped): %q", text)
}
// Levels are upper-cased in the on-disk format.
if !strings.Contains(text, "INFO") || !strings.Contains(text, "ERROR") {
t.Fatalf("disk log missing level prefix: %q", text)
}
}
// TestWriterForIsCached verifies a second call returns the same Writer
// — otherwise parallel /log POSTs would race on file opens and possibly
// stomp on in-flight writes.
func TestWriterForIsCached(t *testing.T) {
hub := events.NewHub()
lh, err := logs.NewHub(t.TempDir(), hub)
if err != nil {
t.Fatalf("NewHub: %v", err)
}
defer lh.Close()
w1, err := lh.WriterFor(1)
if err != nil {
t.Fatalf("WriterFor: %v", err)
}
w2, err := lh.WriterFor(1)
if err != nil {
t.Fatalf("WriterFor: %v", err)
}
if w1 != w2 {
t.Fatalf("Writer not cached: %p vs %p", w1, w2)
}
}
// collect reads events from ch until it has max of them, the channel is
// closed, or deadline has elapsed, whichever comes first, and returns
// what it gathered (never nil).
func collect(ch <-chan events.Event, max int, deadline time.Duration) []events.Event {
	got := []events.Event{}
	expiry := time.NewTimer(deadline)
	defer expiry.Stop()

	for {
		if len(got) >= max {
			return got
		}
		select {
		case <-expiry.C:
			return got
		case ev, open := <-ch:
			if !open {
				return got
			}
			got = append(got, ev)
		}
	}
}
+96
View File
@@ -0,0 +1,96 @@
package model
import "time"
// Host is one machine registered for vetting.
type Host struct {
ID int64
Name string
// MAC, WoLBroadcastIP and WoLPort are the values the dispatcher passes
// to SendWoL when it wakes the host.
MAC string
WoLBroadcastIP string
WoLPort int
// The following three hold serialized documents, going by their
// names (YAML / JSON); their schemas are not visible from this file.
ExpectedSpecYAML string
PDUConfigJSON string
IPMIConfigJSON string
Notes string
CreatedAt time.Time
UpdatedAt time.Time
}
// RunState is the lifecycle state of a Run, stored as its string name.
type RunState string
// Run states. The dispatcher treats Queued as "waiting to start" and
// counts WaitingWoL through Reporting as in flight; the remaining states
// are not counted against its concurrency limit.
const (
StateRegistered RunState = "Registered"
StateQueued RunState = "Queued"
StateWaitingWoL RunState = "WaitingWoL"
StateBooting RunState = "Booting"
StateInventoryCheck RunState = "InventoryCheck"
StateSpecValidate RunState = "SpecValidate"
StateSMART RunState = "SMART"
StateCPUStress RunState = "CPUStress"
StateStorage RunState = "Storage"
StateNetwork RunState = "Network"
StateGPU RunState = "GPU"
StatePSU RunState = "PSU"
StateReporting RunState = "Reporting"
StateCompleted RunState = "Completed"
StateFailed RunState = "Failed"
StateFailedHolding RunState = "FailedHolding"
StateReleased RunState = "Released"
)
// Run is one vetting pass over a Host.
type Run struct {
ID int64
HostID int64 // the Host this run belongs to
State RunState // current lifecycle state
Result string
FailedStage string
NextBootTarget string
AgentTokenHash string // presumably a hash of the agent's auth token, not the token itself — confirm
StartedAt time.Time
CompletedAt *time.Time // nil while the run has no completion time
ReportPath string
HoldIP string
OverrideFlagsJSON string
}
// StageState is the status of a single Stage within a run, stored as
// its lower-case string name.
type StageState string
// Stage states.
const (
StagePending StageState = "pending"
StageRunning StageState = "running"
StagePassed StageState = "passed"
StageFailed StageState = "failed"
StageSkipped StageState = "skipped"
)
// Stage is one named step of a Run.
type Stage struct {
ID int64
RunID int64 // the Run this stage belongs to
Name string
Ordinal int // presumably the stage's position within the run — confirm
State StageState
StartedAt *time.Time // nil when no start time is recorded
CompletedAt *time.Time // nil when no completion time is recorded
SummaryJSON string
}
// Measurement is a single timestamped numeric sample recorded for a run.
type Measurement struct {
ID int64
RunID int64
StageID *int64 // optional; nil when the sample is not tied to a stage
TS time.Time
Kind string
Key string
Value float64
Unit string
}
// SpecDiff records one field where a run's expected and actual values
// were compared.
type SpecDiff struct {
ID int64
RunID int64
Field string
Expected string
Actual string
Severity string // critical|warning|info
Ignored bool
}
+56
View File
@@ -0,0 +1,56 @@
package notify
import (
"fmt"
"time"
"vetting/internal/config"
)
// BuildRegistry turns the configured notifiers and routes into a live
// Registry with a 10s per-delivery timeout.
//
// Notifier entries with an empty Type are skipped (handy for
// commented-out examples). An unrecognised Type is an error, so a typo
// fails startup loudly instead of silently dropping events. A route
// with an empty Notifier name is also an error. Routes are added in the
// order given.
func BuildRegistry(notifiers []config.Notifier, routes []config.Route) (*Registry, error) {
	reg := NewRegistry(10 * time.Second)

	for _, cfg := range notifiers {
		var target Notifier
		switch cfg.Type {
		case "":
			continue // skip blank entries; useful for commented-out examples
		case "ntfy":
			target = NewNtfy(cfg.Name, cfg.Server, cfg.Topic)
		case "discord":
			target = NewDiscord(cfg.Name, cfg.WebhookURL)
		case "smtp":
			target = NewSMTP(cfg.Name, cfg.SMTP.Host, cfg.SMTP.Port, cfg.SMTP.From, cfg.SMTP.To)
		default:
			return nil, fmt.Errorf("notify: unknown notifier type %q (name=%q)", cfg.Type, cfg.Name)
		}
		reg.Register(target)
	}

	for _, rc := range routes {
		if rc.Notifier == "" {
			return nil, fmt.Errorf("notify: route has no notifier name")
		}
		route := Route{
			MatchKind:     toKinds(rc.MatchKind),
			MatchSeverity: toSeverities(rc.MatchSeverity),
			Notifier:      rc.Notifier,
		}
		reg.AddRoute(route)
	}
	return reg, nil
}
// toKinds converts config strings to Kind values one-for-one, without
// validating them. The result is always non-nil.
func toKinds(ss []string) []Kind {
	kinds := make([]Kind, len(ss))
	for i, raw := range ss {
		kinds[i] = Kind(raw)
	}
	return kinds
}
// toSeverities converts config strings to Severity values one-for-one,
// without validating them. The result is always non-nil.
func toSeverities(ss []string) []Severity {
	sevs := make([]Severity, len(ss))
	for i, raw := range ss {
		sevs[i] = Severity(raw)
	}
	return sevs
}
+87
View File
@@ -0,0 +1,87 @@
package notify
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"
)
// DiscordNotifier delivers events to a Discord incoming webhook. Each
// event is rendered as a single embed so Discord shows a coloured
// sidebar matching the event severity. Discord rejects a message with
// neither content nor embeds; the embed is always included so that
// cannot happen.
type DiscordNotifier struct {
	NameStr    string       // name under which the notifier is registered
	WebhookURL string       // full webhook URL; Send fails when empty
	HTTP       *http.Client // client used for the POST
}

// NewDiscord returns a DiscordNotifier for webhookURL whose HTTP client
// has a 10 second timeout.
func NewDiscord(name, webhookURL string) *DiscordNotifier {
	d := new(DiscordNotifier)
	d.NameStr = name
	d.WebhookURL = webhookURL
	d.HTTP = &http.Client{Timeout: 10 * time.Second}
	return d
}

// Name returns the registered name of the notifier.
func (d *DiscordNotifier) Name() string { return d.NameStr }
// discordPayload is the JSON body POSTed to the webhook. Only the
// embeds array is sent.
type discordPayload struct {
Embeds []discordEmbed `json:"embeds"`
}
// discordEmbed is one embed object inside discordPayload. Every field is
// omitted from the JSON when it holds its zero value.
type discordEmbed struct {
Title string `json:"title,omitempty"`
Description string `json:"description,omitempty"`
URL string `json:"url,omitempty"`
Color int `json:"color,omitempty"`
}
// Send POSTs ev to the webhook as a single-embed JSON payload. It
// returns an error when no webhook URL is configured, when the request
// cannot be built or sent, or when Discord answers with a status of 300
// or above (the trimmed response body is included in the error).
func (d *DiscordNotifier) Send(ctx context.Context, ev Event) error {
	if d.WebhookURL == "" {
		return fmt.Errorf("discord: no webhook_url configured")
	}

	embed := discordEmbed{
		Title:       ev.Title,
		Description: ev.Body,
		URL:         ev.URL,
		Color:       discordColor(ev.Severity),
	}
	body, err := json.Marshal(discordPayload{Embeds: []discordEmbed{embed}})
	if err != nil {
		return err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, d.WebhookURL, bytes.NewReader(body))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := d.HTTP.Do(req)
	if err != nil {
		return err
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode < 300 {
		return nil
	}
	// Best effort: the body is only used to enrich the error message.
	raw, _ := io.ReadAll(resp.Body)
	return fmt.Errorf("discord: %d: %s", resp.StatusCode, strings.TrimSpace(string(raw)))
}
// discordColor picks the embed sidebar colour for a severity, as a
// 0xRRGGBB integer: red for critical, yellow for warning, and green for
// everything else (including info and unknown severities).
func discordColor(s Severity) int {
	if s == SeverityCritical {
		return 0xE74C3C // red
	}
	if s == SeverityWarning {
		return 0xF1C40F // yellow
	}
	return 0x2ECC71 // green
}
+179
View File
@@ -0,0 +1,179 @@
// Package notify owns outbound operator notifications. The orchestrator
// fires Events at well-known points (stage failure, hold opened, run
// completed, spec mismatch); a Registry matches each Event against
// config-declared routes and dispatches to the matching Notifiers.
//
// Delivery is fire-and-forget: a single HTTP/SMTP attempt per notifier
// with a bounded timeout. Failures are logged and nothing is persisted
// — on a solo LAN deployment the orchestrator UI is the source of truth
// and we don't want to build a durable queue for a convenience feature.
package notify
import (
"context"
"log"
"sync"
"time"
)
// Kind enumerates the event types the orchestrator can fire. Names are
// stable: they appear in config files' match_kind lists.
type Kind string
// Event kinds: a stage failed, the spec did not match, a hold was
// opened, a run completed.
const (
KindStageFailed Kind = "StageFailed"
KindSpecMismatch Kind = "SpecMismatch"
KindHoldingOpened Kind = "HoldingOpened"
KindRunCompleted Kind = "RunCompleted"
)
// Severity classifies an event so routes can filter on it. "critical"
// pairs with StageFailed/SpecMismatch/HoldingOpened; RunCompleted uses
// "info".
type Severity string
// Severity levels, in increasing order of urgency.
const (
SeverityInfo Severity = "info"
SeverityWarning Severity = "warning"
SeverityCritical Severity = "critical"
)
// Event is the payload passed to each Notifier's Send method. Title and
// Body are pre-rendered; notifiers shape them for their own transport
// (e.g. Discord embed vs SMTP body) but shouldn't re-compose semantics.
//
// URL links back to the orchestrator UI so a push notification can be
// clicked through for full context.
type Event struct {
Kind Kind // matched against Route.MatchKind
Severity Severity // matched against Route.MatchSeverity
RunID int64 // included in the Registry's error log line
HostName string
Title string
Body string
URL string // optional; UI link for this run/host
}
// Notifier is one delivery target. Implementations must not block on
// remote-side failure any longer than their own timeout — the Registry
// calls Send from a goroutine but still wants the goroutine to exit.
type Notifier interface {
// Name returns the key under which the notifier is registered and by
// which routes refer to it.
Name() string
// Send delivers ev. The Registry passes a ctx that carries its
// per-notification timeout and logs any returned error.
Send(ctx context.Context, ev Event) error
}
// Route binds an event selector to a notifier name. A route matches an
// event when every non-empty field is satisfied; empty fields are wildcards.
type Route struct {
MatchKind []Kind // event kinds accepted; empty or nil accepts any kind
MatchSeverity []Severity // severities accepted; empty or nil accepts any severity
Notifier string // name of a registered Notifier
}
// Registry holds notifiers + routes and fans events out. Safe for
// concurrent Dispatch. It's built once at startup from config.
//
// NOTE(review): Register and AddRoute mutate notifiers and routes
// without taking a lock, so they should finish before Dispatch is first
// called — confirm callers respect this.
type Registry struct {
notifiers map[string]Notifier // keyed by Notifier.Name()
routes []Route // evaluated in insertion order
timeout time.Duration // budget given to each Send call
mu sync.Mutex // guards in-flight goroutine count (future-use metrics)
}
// NewRegistry returns an empty Registry whose Send calls are each given
// the supplied timeout. A zero or negative timeout is replaced by 10s so
// tests and production both get a sane default.
func NewRegistry(timeout time.Duration) *Registry {
	reg := &Registry{
		notifiers: make(map[string]Notifier),
		timeout:   10 * time.Second,
	}
	if timeout > 0 {
		reg.timeout = timeout
	}
	return reg
}
// Register stores n under n.Name(). A nil interface value is ignored.
// Registering a name a second time silently replaces the earlier
// notifier, so a config can shadow an entry by listing the same name
// twice.
func (r *Registry) Register(n Notifier) {
	if n != nil {
		r.notifiers[n.Name()] = n
	}
}
// AddRoute appends a route rule. Order is preserved for deterministic
// multi-match dispatch. The route's notifier name is not checked here; a
// name with no registered notifier is only logged when an event matches
// the route.
func (r *Registry) AddRoute(rt Route) {
r.routes = append(r.routes, rt)
}
// Dispatch fires every notifier selected by the routes that match ev,
// each on its own goroutine with a context limited to the Registry's
// timeout. It returns immediately without waiting for delivery; a
// failed Send is logged and otherwise ignored.
func (r *Registry) Dispatch(ev Event) {
	for _, target := range r.match(ev) {
		go func(n Notifier) {
			ctx, cancel := context.WithTimeout(context.Background(), r.timeout)
			defer cancel()
			err := n.Send(ctx, ev)
			if err != nil {
				log.Printf("notify: %s send(%s run=%d): %v", n.Name(), ev.Kind, ev.RunID, err)
			}
		}(target)
	}
}
// match evaluates the routes in insertion order and returns the
// notifiers to fire for ev, each at most once and in the order first
// matched. A notifier named by several matching routes is returned once:
// the operator intent is delivery, not duplicate delivery. A matching
// route that names an unregistered notifier is logged and skipped. The
// result is never nil.
func (r *Registry) match(ev Event) []Notifier {
	picked := []Notifier{}
	already := make(map[string]bool)

	for _, route := range r.routes {
		if !matchesKind(route.MatchKind, ev.Kind) ||
			!matchesSeverity(route.MatchSeverity, ev.Severity) ||
			already[route.Notifier] {
			continue
		}
		target, known := r.notifiers[route.Notifier]
		if !known {
			log.Printf("notify: route references unknown notifier %q", route.Notifier)
			continue
		}
		already[route.Notifier] = true
		picked = append(picked, target)
	}
	return picked
}
// matchesKind reports whether got is accepted by allow. An empty or nil
// allow list is a wildcard that accepts every kind.
func matchesKind(allow []Kind, got Kind) bool {
	matched := len(allow) == 0
	for i := 0; !matched && i < len(allow); i++ {
		matched = allow[i] == got
	}
	return matched
}
// matchesSeverity reports whether got is accepted by allow. An empty or
// nil allow list is a wildcard that accepts every severity.
func matchesSeverity(allow []Severity, got Severity) bool {
	matched := len(allow) == 0
	for i := 0; !matched && i < len(allow); i++ {
		matched = allow[i] == got
	}
	return matched
}
+268
View File
@@ -0,0 +1,268 @@
package notify
import (
"context"
"io"
"net/http"
"net/http/httptest"
"net/smtp"
"strings"
"sync"
"sync/atomic"
"testing"
"time"
)
// stubNotifier records every Send call; it's the test harness for
// Registry routing logic without hitting network.
type stubNotifier struct {
name string
calls []Event // every event passed to Send, in arrival order
mu sync.Mutex // guards calls; Send runs on the Registry's goroutines
failOn Kind // if non-empty, returns an error when ev.Kind == failOn
}
// Name implements Notifier.
func (s *stubNotifier) Name() string { return s.name }
// Send implements Notifier. The call is recorded before the failOn
// check, so forced failures still count as calls.
func (s *stubNotifier) Send(_ context.Context, ev Event) error {
s.mu.Lock()
s.calls = append(s.calls, ev)
s.mu.Unlock()
if s.failOn != "" && ev.Kind == s.failOn {
return errFake("forced failure")
}
return nil
}
// seen returns a copy of the recorded calls so callers can inspect it
// without holding the lock.
func (s *stubNotifier) seen() []Event {
s.mu.Lock()
defer s.mu.Unlock()
return append([]Event(nil), s.calls...)
}
// errFake is a string-backed error used for forced failures.
type errFake string
func (e errFake) Error() string { return string(e) }
// awaitCalls polls every 5ms until each stub in want has recorded at
// least its expected number of calls, for at most 2 seconds. Dispatch
// delivers on goroutines, so tests must wait. On timeout it reports one
// t.Errorf per stub (with the observed count) and returns.
func awaitCalls(t *testing.T, want map[*stubNotifier]int) {
	t.Helper()
	deadline := time.Now().Add(2 * time.Second)

	satisfied := func() bool {
		for stub, n := range want {
			if len(stub.seen()) < n {
				return false
			}
		}
		return true
	}

	for !satisfied() {
		if time.Now().After(deadline) {
			for stub, n := range want {
				t.Errorf("notifier %q: got %d calls, want %d", stub.name, len(stub.seen()), n)
			}
			return
		}
		time.Sleep(5 * time.Millisecond)
	}
}
func TestRegistryRoutesByKind(t *testing.T) {
reg := NewRegistry(time.Second)
a := &stubNotifier{name: "fails-only"}
b := &stubNotifier{name: "everything"}
reg.Register(a)
reg.Register(b)
reg.AddRoute(Route{MatchKind: []Kind{KindStageFailed}, Notifier: "fails-only"})
reg.AddRoute(Route{Notifier: "everything"})
reg.Dispatch(Event{Kind: KindStageFailed, Severity: SeverityCritical})
reg.Dispatch(Event{Kind: KindRunCompleted, Severity: SeverityInfo})
awaitCalls(t, map[*stubNotifier]int{a: 1, b: 2})
if got := a.seen()[0].Kind; got != KindStageFailed {
t.Fatalf("a got %q, want StageFailed", got)
}
}
func TestRegistryRoutesBySeverity(t *testing.T) {
reg := NewRegistry(time.Second)
crit := &stubNotifier{name: "crit-only"}
reg.Register(crit)
reg.AddRoute(Route{MatchSeverity: []Severity{SeverityCritical}, Notifier: "crit-only"})
reg.Dispatch(Event{Kind: KindRunCompleted, Severity: SeverityInfo})
reg.Dispatch(Event{Kind: KindHoldingOpened, Severity: SeverityCritical})
awaitCalls(t, map[*stubNotifier]int{crit: 1})
if got := crit.seen()[0].Severity; got != SeverityCritical {
t.Fatalf("got severity %q, want critical", got)
}
}
func TestRegistryDeduplicatesNotifiers(t *testing.T) {
reg := NewRegistry(time.Second)
n := &stubNotifier{name: "only"}
reg.Register(n)
// Two routes naming the same notifier — a single Dispatch should
// fire once, not twice.
reg.AddRoute(Route{MatchKind: []Kind{KindStageFailed}, Notifier: "only"})
reg.AddRoute(Route{MatchSeverity: []Severity{SeverityCritical}, Notifier: "only"})
reg.Dispatch(Event{Kind: KindStageFailed, Severity: SeverityCritical})
awaitCalls(t, map[*stubNotifier]int{n: 1})
}
func TestRegistryUnknownNotifierIsNoop(t *testing.T) {
reg := NewRegistry(time.Second)
reg.AddRoute(Route{Notifier: "does-not-exist"})
// Should not panic or block.
reg.Dispatch(Event{Kind: KindRunCompleted})
}
func TestRegistryFailureDoesNotPoisonOthers(t *testing.T) {
reg := NewRegistry(time.Second)
bad := &stubNotifier{name: "bad", failOn: KindStageFailed}
good := &stubNotifier{name: "good"}
reg.Register(bad)
reg.Register(good)
reg.AddRoute(Route{Notifier: "bad"})
reg.AddRoute(Route{Notifier: "good"})
reg.Dispatch(Event{Kind: KindStageFailed, Severity: SeverityCritical})
awaitCalls(t, map[*stubNotifier]int{bad: 1, good: 1})
}
func TestNtfyNotifierPOSTsBodyAndHeaders(t *testing.T) {
var captured *http.Request
var body string
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
captured = r
b, _ := io.ReadAll(r.Body)
body = string(b)
w.WriteHeader(http.StatusOK)
}))
defer srv.Close()
n := NewNtfy("n", srv.URL, "vetting")
err := n.Send(context.Background(), Event{
Kind: KindStageFailed,
Severity: SeverityCritical,
Title: "host-01 FAILED",
Body: "SMART failed",
URL: "https://vetting.example/reports/42",
})
if err != nil {
t.Fatalf("send: %v", err)
}
if captured.Method != http.MethodPost {
t.Fatalf("method = %s, want POST", captured.Method)
}
if captured.URL.Path != "/vetting" {
t.Fatalf("path = %s, want /vetting", captured.URL.Path)
}
if got := captured.Header.Get("X-Title"); got != "host-01 FAILED" {
t.Fatalf("X-Title = %q", got)
}
if got := captured.Header.Get("X-Click"); got != "https://vetting.example/reports/42" {
t.Fatalf("X-Click = %q", got)
}
if got := captured.Header.Get("X-Priority"); got != "5" {
t.Fatalf("X-Priority = %q, want 5 for critical", got)
}
if body != "SMART failed" {
t.Fatalf("body = %q, want %q", body, "SMART failed")
}
}
// TestNtfyNotifierNon2xxErrors checks that a 429 reply from the server
// surfaces as an error whose text includes the status code.
func TestNtfyNotifierNon2xxErrors(t *testing.T) {
	rateLimited := func(w http.ResponseWriter, _ *http.Request) {
		http.Error(w, "rate limited", http.StatusTooManyRequests)
	}
	srv := httptest.NewServer(http.HandlerFunc(rateLimited))
	defer srv.Close()

	notifier := NewNtfy("n", srv.URL, "t")
	err := notifier.Send(context.Background(), Event{Kind: KindRunCompleted, Body: "x"})
	if err == nil || !strings.Contains(err.Error(), "429") {
		t.Fatalf("want 429 error, got %v", err)
	}
}
func TestDiscordNotifierPOSTsEmbed(t *testing.T) {
var body string
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
b, _ := io.ReadAll(r.Body)
body = string(b)
w.WriteHeader(http.StatusNoContent)
}))
defer srv.Close()
d := NewDiscord("d", srv.URL)
err := d.Send(context.Background(), Event{
Kind: KindRunCompleted,
Severity: SeverityInfo,
Title: "host-01 passed",
Body: "all green",
URL: "https://vetting.example/reports/1",
})
if err != nil {
t.Fatalf("send: %v", err)
}
// Body should be a JSON payload containing an embeds array with our
// title/description/URL.
for _, want := range []string{`"embeds"`, `"host-01 passed"`, `"all green"`, `reports/1`} {
if !strings.Contains(body, want) {
t.Errorf("body missing %q: %s", want, body)
}
}
}
// TestSMTPNotifierInvokesSendMail swaps SendMailFn for a recorder and
// checks that Send calls it exactly once with the host:port address, the
// sender, the recipient list, and a message containing the subject, the
// body and the click-through link.
func TestSMTPNotifierInvokesSendMail(t *testing.T) {
	var (
		called  int32
		gotAddr string
		gotFrom string
		gotTo   []string
		gotMsg  []byte
	)
	notifier := NewSMTP("s", "mail.example", 2525, "vetting@example", []string{"ops@example"})
	notifier.SendMailFn = func(addr string, _ smtp.Auth, from string, to []string, msg []byte) error {
		atomic.AddInt32(&called, 1)
		gotAddr, gotFrom, gotTo, gotMsg = addr, from, to, msg
		return nil
	}

	ev := Event{
		Kind:  KindStageFailed,
		Title: "subj",
		Body:  "failure body",
		URL:   "https://vetting.example/reports/9",
	}
	if err := notifier.Send(context.Background(), ev); err != nil {
		t.Fatalf("send: %v", err)
	}

	if atomic.LoadInt32(&called) != 1 {
		t.Fatal("SendMailFn not called")
	}
	if gotAddr != "mail.example:2525" {
		t.Fatalf("addr = %q", gotAddr)
	}
	if gotFrom != "vetting@example" {
		t.Fatalf("from = %q", gotFrom)
	}
	if len(gotTo) != 1 || gotTo[0] != "ops@example" {
		t.Fatalf("to = %v", gotTo)
	}

	rendered := string(gotMsg)
	expected := []string{"Subject: subj", "failure body", "Link: https://vetting.example/reports/9"}
	for _, want := range expected {
		if !strings.Contains(rendered, want) {
			t.Errorf("message missing %q", want)
		}
	}
}
// TestSMTPNotifierRejectsIncompleteConfig checks that Send refuses to
// run on a notifier that has no host, sender or recipients.
func TestSMTPNotifierRejectsIncompleteConfig(t *testing.T) {
	bare := &SMTPNotifier{NameStr: "s"}
	err := bare.Send(context.Background(), Event{Kind: KindRunCompleted})
	if err == nil {
		t.Fatal("want error, got nil")
	}
}
+90
View File
@@ -0,0 +1,90 @@
package notify
import (
"context"
"fmt"
"io"
"net/http"
"strings"
"time"
)
// NtfyNotifier delivers events to ntfy.sh or a self-hosted ntfy server.
// The event body is sent as the plain-text request body; the title and
// URL travel in the X-Title and X-Click headers so ntfy renders them as
// the push title and deep link.
type NtfyNotifier struct {
	NameStr string       // name under which the notifier is registered
	Server  string       // e.g. "https://ntfy.sh" or self-hosted
	Topic   string       // topic appended to Server; Send fails when empty
	HTTP    *http.Client // client used for the POST
}

// NewNtfy returns an NtfyNotifier whose HTTP client has a 10 second
// timeout. An empty server defaults to "https://ntfy.sh"; trailing
// slashes are trimmed from the server so it can be joined with the topic.
func NewNtfy(name, server, topic string) *NtfyNotifier {
	base := server
	if base == "" {
		base = "https://ntfy.sh"
	}
	n := &NtfyNotifier{NameStr: name, Topic: topic}
	n.Server = strings.TrimRight(base, "/")
	n.HTTP = &http.Client{Timeout: 10 * time.Second}
	return n
}

// Name returns the registered name of the notifier.
func (n *NtfyNotifier) Name() string { return n.NameStr }
// Send POSTs ev.Body to <Server>/<Topic>. X-Title and X-Click are set
// only when the event has a title / URL; X-Priority and X-Tags are
// always set from the event's severity and kind. It returns an error
// when no topic is configured, when the request cannot be built or
// sent, or when the server answers with a status of 300 or above (the
// trimmed response body is included in the error).
func (n *NtfyNotifier) Send(ctx context.Context, ev Event) error {
	if n.Topic == "" {
		return fmt.Errorf("ntfy: no topic configured")
	}

	endpoint := n.Server + "/" + n.Topic
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, strings.NewReader(ev.Body))
	if err != nil {
		return err
	}

	hdr := req.Header
	if ev.Title != "" {
		hdr.Set("X-Title", ev.Title)
	}
	if ev.URL != "" {
		hdr.Set("X-Click", ev.URL)
	}
	hdr.Set("X-Priority", priorityForSeverity(ev.Severity))
	hdr.Set("X-Tags", ntfyTag(ev.Kind, ev.Severity))

	resp, err := n.HTTP.Do(req)
	if err != nil {
		return err
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode < 300 {
		return nil
	}
	// Best effort: the body is only used to enrich the error message.
	raw, _ := io.ReadAll(resp.Body)
	return fmt.Errorf("ntfy: %d: %s", resp.StatusCode, strings.TrimSpace(string(raw)))
}
// priorityForSeverity maps our severities onto ntfy's 1–5 priority
// scale: critical → "5", warning → "4", anything else (including info)
// → "3", which is ntfy's default.
func priorityForSeverity(s Severity) string {
	if s == SeverityCritical {
		return "5"
	}
	if s == SeverityWarning {
		return "4"
	}
	return "3"
}
// ntfyTag builds the X-Tags header value: a tag chosen from the
// severity and kind, followed by the kind name. Critical severity wins
// over the kind-specific tags; kinds without a dedicated tag are sent
// as the bare kind name.
func ntfyTag(k Kind, s Severity) string {
	kind := string(k)
	if s == SeverityCritical {
		return "rotating_light," + kind
	}
	switch k {
	case KindRunCompleted:
		return "white_check_mark," + kind
	case KindHoldingOpened:
		return "construction," + kind
	}
	return kind
}
+81
View File
@@ -0,0 +1,81 @@
package notify
import (
"context"
"fmt"
"net/smtp"
"strconv"
"strings"
)
// SMTPNotifier delivers events as plaintext email. No authentication is
// attempted (a LAN-only relay is assumed); if the configured server
// requires auth, Send returns an error and the Registry logs it.
//
// SendMailFn can be replaced so tests can capture the outgoing message
// without a live SMTP server.
type SMTPNotifier struct {
	NameStr    string   // name under which the notifier is registered
	Host       string   // SMTP relay host
	Port       int      // SMTP relay port
	From       string   // envelope and header sender
	To         []string // envelope and header recipients
	SendMailFn func(addr string, a smtp.Auth, from string, to []string, msg []byte) error
}

// NewSMTP returns an SMTPNotifier that sends through smtp.SendMail. A
// port of 0 defaults to 25.
func NewSMTP(name, host string, port int, from string, to []string) *SMTPNotifier {
	n := &SMTPNotifier{
		NameStr:    name,
		Host:       host,
		Port:       25,
		From:       from,
		To:         to,
		SendMailFn: smtp.SendMail,
	}
	if port != 0 {
		n.Port = port
	}
	return n
}

// Name returns the registered name of the notifier.
func (s *SMTPNotifier) Name() string { return s.NameStr }
// Send renders ev as an email and passes it to SendMailFn, addressed to
// Host:Port with no authentication. It returns an error when Host, From
// or To is missing, or whatever error SendMailFn returns.
func (s *SMTPNotifier) Send(ctx context.Context, ev Event) error {
	configured := s.Host != "" && s.From != "" && len(s.To) > 0
	if !configured {
		return fmt.Errorf("smtp: incomplete config (host/from/to required)")
	}
	// ctx is intentionally not honoured: net/smtp.SendMail doesn't
	// accept a context. For a LAN relay with a short TCP timeout the
	// Registry's goroutine will outlive the timeout, but only by seconds.
	msg := buildEmail(s.From, s.To, ev)
	return s.SendMailFn(s.Host+":"+strconv.Itoa(s.Port), nil, s.From, s.To, msg)
}
// buildEmail produces a minimal RFC 5322 message for ev: From, To,
// Subject and Content-Type headers, a blank line, then the plaintext
// body. When ev.URL is set it is appended as a "Link:" line so the
// recipient can click through from a text mail client. No MIME
// structure for now — keeps it robust.
//
// The subject is ev.Title, or "[vetting] <Kind>" when the title is
// empty. CR and LF characters in the from address, the recipients and
// the subject are replaced with spaces: a raw line break inside a
// header value would otherwise end the header early and let the rest of
// the value be read as additional headers or as the start of the body.
// The body is written as-is.
func buildEmail(from string, to []string, ev Event) []byte {
	headerSafe := strings.NewReplacer("\r", " ", "\n", " ")

	subject := ev.Title
	if subject == "" {
		subject = "[vetting] " + string(ev.Kind)
	}

	var b strings.Builder
	b.WriteString("From: ")
	b.WriteString(headerSafe.Replace(from))
	b.WriteString("\r\n")
	b.WriteString("To: ")
	b.WriteString(headerSafe.Replace(strings.Join(to, ", ")))
	b.WriteString("\r\n")
	b.WriteString("Subject: ")
	b.WriteString(headerSafe.Replace(subject))
	b.WriteString("\r\n")
	b.WriteString("Content-Type: text/plain; charset=UTF-8\r\n")
	// The blank line separates headers from the body.
	b.WriteString("\r\n")
	b.WriteString(ev.Body)
	if ev.URL != "" {
		b.WriteString("\r\n\r\nLink: ")
		b.WriteString(ev.URL)
	}
	b.WriteString("\r\n")
	return []byte(b.String())
}
+124
View File
@@ -0,0 +1,124 @@
package orchestrator
import (
"context"
"log"
"time"
"vetting/internal/model"
"vetting/internal/store"
)
// Dispatcher picks Queued runs off the DB and drives them through
// WaitingWoL (sending a WoL packet). Concurrency is capped at Max.
//
// For Phase 2 the dispatcher's job ends at WaitingWoL; further
// transitions are driven by iPXE and agent callbacks. Phase 4+ will
// return here and shepherd each run through stage execution.
type Dispatcher struct {
Max int // maximum number of in-flight runs; NewDispatcher raises values below 1 to 1
Runs *store.Runs
Hosts *store.Hosts
Runner *Runner // performs the Queued -> WaitingWoL transition
active chan struct{} // buffered to Max; a slot is held for the duration of one pickNext call
stop chan struct{} // closed by Stop to end the loop
}
// NewDispatcher returns a Dispatcher that allows at most max in-flight
// runs. A max below 1 is raised to 1. The dispatcher does nothing until
// Start is called.
func NewDispatcher(max int, runs *store.Runs, hosts *store.Hosts, runner *Runner) *Dispatcher {
	limit := max
	if limit < 1 {
		limit = 1
	}
	d := &Dispatcher{
		Max:    limit,
		Runs:   runs,
		Hosts:  hosts,
		Runner: runner,
	}
	d.active = make(chan struct{}, limit)
	d.stop = make(chan struct{})
	return d
}
// Start launches the polling loop on its own goroutine and returns
// immediately. The loop ends when ctx is cancelled or Stop is called.
func (d *Dispatcher) Start(ctx context.Context) {
go d.loop(ctx)
}
// Stop ends the polling loop by closing the stop channel. It does not
// wait for the loop goroutine to exit. It must be called at most once:
// closing an already-closed channel panics.
func (d *Dispatcher) Stop() {
close(d.stop)
}
// loop calls pickNext once every 2 seconds until ctx is cancelled or
// Stop closes the stop channel.
func (d *Dispatcher) loop(ctx context.Context) {
	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			d.pickNext(ctx)
		case <-d.stop:
			return
		case <-ctx.Done():
			return
		}
	}
}
// pickNext performs one dispatch attempt: if fewer than Max runs are in
// flight and at least one run is Queued, it transitions the first
// Queued run with TriggerDispatched and sends a Wake-on-LAN packet to
// its host. Every failure is logged and ends the attempt; nothing is
// returned to the caller.
func (d *Dispatcher) pickNext(ctx context.Context) {
// Non-blocking acquire of a slot; give up when all slots are taken.
select {
case d.active <- struct{}{}:
default:
return // at capacity
}
// The slot is released on every exit path: by this defer on the early
// returns, and explicitly at the bottom on success (released guards
// against releasing twice).
released := false
defer func() {
if !released {
<-d.active
}
}()
runs, err := d.Runs.Active(ctx)
if err != nil {
log.Printf("dispatcher: list active: %v", err)
return
}
// Single pass: remember the first Queued run and count the runs that
// are between WaitingWoL and Reporting.
var queued *model.Run
inFlight := 0
for i := range runs {
switch runs[i].State {
case model.StateQueued:
if queued == nil {
queued = &runs[i]
}
case model.StateWaitingWoL, model.StateBooting, model.StateInventoryCheck,
model.StateSpecValidate, model.StateSMART, model.StateCPUStress,
model.StateStorage, model.StateNetwork, model.StateGPU,
model.StatePSU, model.StateReporting:
inFlight++
}
}
if inFlight >= d.Max || queued == nil {
return
}
host, err := d.Hosts.Get(ctx, queued.HostID)
if err != nil {
log.Printf("dispatcher: get host %d: %v", queued.HostID, err)
return
}
// The state transition happens before the WoL packet is sent, so a
// WoL failure leaves the run in its post-transition state.
if _, err := d.Runner.Transition(ctx, queued.ID, TriggerDispatched); err != nil {
log.Printf("dispatcher: transition run %d: %v", queued.ID, err)
return
}
if err := SendWoL(host.MAC, host.WoLBroadcastIP, host.WoLPort); err != nil {
log.Printf("dispatcher: WoL run %d host %s: %v", queued.ID, host.Name, err)
// Stay in WaitingWoL; operator can retry or investigate.
return
}
log.Printf("dispatcher: WoL sent for run %d (host=%s mac=%s)", queued.ID, host.Name, host.MAC)
// Slot stays reserved until the run leaves active (Phase 4+).
// Phase 2 lets the loop observe inFlight via DB state.
// As written, the slot is released right here; the concurrency cap is
// enforced by the inFlight count above, not by the channel.
released = true
<-d.active
}
+92
View File
@@ -0,0 +1,92 @@
package orchestrator
import (
"context"
"errors"
"fmt"
"log"
"os"
"os/exec"
"strconv"
"sync"
"time"
)
// IperfSupervisor runs a single `iperf3 -s` process under the
// orchestrator so the Network stage has a stable server to dial. Each
// run's Network test is sequential (stages are always serial), so one
// server process handles every host under test.
//
// Missing iperf3 binary is logged once and the supervisor becomes a
// no-op — the agent's Network stage will then fail to connect and skip
// cleanly via the stage's own error path.
type IperfSupervisor struct {
	Port int // default 5201

	mu      sync.Mutex
	cmd     *exec.Cmd     // most recently started process, if any
	started bool          // true while that process is running
	fatal   error         // why the last Start could not launch iperf3, if it could not
	done    chan struct{} // closed by wait once cmd has exited and been reaped
}

// NewIperfSupervisor returns a supervisor for the given port. A zero or
// negative port selects iperf3's default, 5201.
func NewIperfSupervisor(port int) *IperfSupervisor {
	if port <= 0 {
		port = 5201
	}
	return &IperfSupervisor{Port: port}
}

// Start launches `iperf3 -s -p <Port>` bound to ctx (cancelling ctx
// kills the process). It is a no-op when a process is already running.
// A missing iperf3 binary is logged and reported as success; only a
// failure to start a binary that was found is returned as an error.
func (s *IperfSupervisor) Start(ctx context.Context) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		return nil
	}
	if _, err := exec.LookPath("iperf3"); err != nil {
		s.fatal = fmt.Errorf("iperf3 not in PATH: %w", err)
		log.Printf("iperf supervisor: %v (Network stage will fail to connect)", s.fatal)
		return nil
	}
	cmd := exec.CommandContext(ctx, "iperf3", "-s", "-p", strconv.Itoa(s.Port))
	if err := cmd.Start(); err != nil {
		s.fatal = err
		return err
	}
	s.cmd = cmd
	s.done = make(chan struct{})
	s.started = true
	log.Printf("iperf supervisor: iperf3 -s -p %d (pid=%d)", s.Port, cmd.Process.Pid)
	// wait is the only caller of cmd.Wait; everyone else observes process
	// exit through s.done.
	go s.wait()
	return nil
}

// Shutdown politely stops the iperf3 subprocess. Called from main on
// SIGINT. It sends an interrupt and waits up to timeout for the process
// to exit; after that it kills the process and returns an error. It
// returns nil when no process was ever started or the process has
// already exited.
func (s *IperfSupervisor) Shutdown(timeout time.Duration) error {
	s.mu.Lock()
	cmd, done := s.cmd, s.done
	s.mu.Unlock()
	if cmd == nil || cmd.Process == nil {
		return nil
	}

	// Already exited and reaped: nothing to signal.
	select {
	case <-done:
		return nil
	default:
	}

	// os.Interrupt is cross-platform; on Linux it maps to SIGINT which
	// iperf3 handles gracefully. On Windows (dev only) it's a no-op and
	// we'll fall through to Kill after the timeout.
	_ = cmd.Process.Signal(os.Interrupt)

	// Wait on the channel closed by the wait goroutine rather than
	// calling cmd.Wait again: a second Wait returns an error immediately,
	// which would make Shutdown return before the process has exited and
	// would make the kill-after-timeout path unreachable.
	timer := time.NewTimer(timeout)
	defer timer.Stop()
	select {
	case <-done:
		return nil
	case <-timer.C:
		_ = cmd.Process.Kill()
		return errors.New("iperf3 did not exit in time; killed")
	}
}

// wait reaps the current process, marks the supervisor as not started,
// and then closes the done channel to release any pending Shutdown.
func (s *IperfSupervisor) wait() {
	s.mu.Lock()
	cmd, done := s.cmd, s.done
	s.mu.Unlock()
	if cmd == nil {
		return
	}
	_ = cmd.Wait() // exit status is irrelevant; we only need the reap

	s.mu.Lock()
	s.started = false
	s.mu.Unlock()
	if done != nil {
		close(done)
	}
}
+118
View File
@@ -0,0 +1,118 @@
package orchestrator
import (
"context"
"fmt"
"log"
"time"
"vetting/internal/events"
"vetting/internal/model"
"vetting/internal/store"
)
// Runner is the authoritative mutator for run state. All state
// transitions go through (*Runner).Transition so the DB update and
// the event publication happen together.
type Runner struct {
	Runs *store.Runs // run rows: lookups, state changes, override flags
	Hosts *store.Hosts // host rows, read when rendering a tile update
	Stages *store.Stages // stage rows, marked running by StartStage
	EventHub *events.Hub // receives one "tile-<hostID>" event per refresh
}
// Transition applies trigger to the run identified by runID. It loads
// the run, asks Next for the target state, persists that state, logs
// the change and publishes a tile refresh for the run's host. The new
// state is returned; on any failure the returned state is empty and
// nothing is published.
func (r *Runner) Transition(ctx context.Context, runID int64, trigger Trigger) (model.RunState, error) {
	current, err := r.Runs.Get(ctx, runID)
	if err != nil {
		return "", fmt.Errorf("get run: %w", err)
	}

	target, err := Next(current.State, trigger)
	if err != nil {
		return "", err
	}

	err = r.Runs.SetState(ctx, runID, target)
	if err != nil {
		return "", fmt.Errorf("persist transition: %w", err)
	}

	log.Printf("run %d: %s -> %s (%s)", runID, current.State, target, trigger)
	r.publishTileUpdate(ctx, current.HostID)
	return target, nil
}
// StartStage marks the named stage of run runID as running and then
// publishes a tile refresh for the run's host. Only a failure to mark
// the stage is reported; the refresh is best-effort and is silently
// skipped when the run cannot be loaded.
func (r *Runner) StartStage(ctx context.Context, runID int64, name string) error {
	if startErr := r.Stages.StartByName(ctx, runID, name); startErr != nil {
		return startErr
	}
	if run, getErr := r.Runs.Get(ctx, runID); getErr == nil {
		r.publishTileUpdate(ctx, run.HostID)
	}
	return nil
}
// publishTileUpdate re-renders the tile for hostID and publishes it on
// the event hub under the name "tile-<hostID>". Lookup failures are
// logged and the update is dropped; nothing is returned to the caller.
func (r *Runner) publishTileUpdate(ctx context.Context, hostID int64) {
	host, hostErr := r.Hosts.Get(ctx, hostID)
	if hostErr != nil {
		log.Printf("publishTileUpdate: get host %d: %v", hostID, hostErr)
		return
	}

	latest, runErr := r.Runs.LatestForHost(ctx, hostID)
	if runErr != nil {
		log.Printf("publishTileUpdate: latest run: %v", runErr)
		return
	}

	evt := events.Event{Payload: renderTileSSE(ctx, *host, latest)}
	evt.Name = fmt.Sprintf("tile-%d", hostID)
	r.EventHub.Publish(evt)
}
// TileRenderer renders a single tile fragment. Registered at startup
// so the orchestrator package stays free of template / store-enrichment
// imports. The closure is expected to do any DB lookups itself (spec-
// diff count, hold-key path, …) before handing the data to the
// template package.
//
// When it is left nil, renderTileSSE falls back to a minimal
// placeholder <article> element for the host.
var TileRenderer func(ctx context.Context, host model.Host, latest *model.Run) string
// renderTileSSE produces the payload for a tile event. It delegates to
// the registered TileRenderer and, when none is registered, returns a
// placeholder <article> carrying only the host ID.
func renderTileSSE(ctx context.Context, host model.Host, latest *model.Run) string {
	if render := TileRenderer; render != nil {
		return render(ctx, host, latest)
	}
	return fmt.Sprintf(`<article id="host-%d">state change</article>`, host.ID)
}
// TouchHeartbeat is called on every agent heartbeat so the orchestrator
// can record last-seen. It is currently a placeholder: the run ID and
// the current time are discarded and nothing is logged or persisted.
// Phase 3+ is meant to update a last_seen_at column here.
func (r *Runner) TouchHeartbeat(runID int64) {
	_ = runID
	_ = time.Now()
}
// Override re-enters a held stage after the operator has acknowledged
// the failure condition (e.g. wipe-probe override). The target state is
// chosen by NextForOverride from the run's failed_stage. The override
// flags are stored first, then the state, then the failed-stage marker
// is cleared (a failure there is only logged), and finally a tile
// refresh is published so the UI drops the hold banner.
//
// It returns the state the run was moved to. A run without a
// failed_stage, a disallowed override, or a persistence failure yields
// an empty state and an error.
func (r *Runner) Override(ctx context.Context, runID int64, flagsJSON string) (model.RunState, error) {
	held, err := r.Runs.Get(ctx, runID)
	if err != nil {
		return "", fmt.Errorf("get run: %w", err)
	}

	stage := held.FailedStage
	if stage == "" {
		return "", fmt.Errorf("override: run has no failed_stage")
	}

	target, err := NextForOverride(held.State, stage)
	if err != nil {
		return "", err
	}

	err = r.Runs.SetOverrideFlags(ctx, runID, flagsJSON)
	if err != nil {
		return "", fmt.Errorf("persist override flags: %w", err)
	}

	err = r.Runs.SetState(ctx, runID, target)
	if err != nil {
		return "", fmt.Errorf("override transition: %w", err)
	}

	// The run has already moved on; a stale marker is not worth failing for.
	if clearErr := r.Runs.ClearFailedStage(ctx, runID); clearErr != nil {
		log.Printf("override: clear failed_stage: %v", clearErr)
	}

	log.Printf("run %d: %s -> %s (OperatorOverride stage=%s flags=%s)", runID, held.State, target, stage, flagsJSON)
	r.publishTileUpdate(ctx, held.HostID)
	return target, nil
}
+129
View File
@@ -0,0 +1,129 @@
package orchestrator
import (
"fmt"
"vetting/internal/model"
)
// Trigger is an event that drives a state transition.
type Trigger string

// The triggers understood by the state machine. Most are resolved
// through the transition table; TriggerStageCompleted is special-cased
// in Next, and TriggerOperatorOverride has no table entry because
// overrides are resolved by NextForOverride.
const (
	TriggerStartRequested Trigger = "StartRequested" // user clicks Start Vetting
	TriggerDispatched Trigger = "Dispatched" // dispatcher picked this run
	TriggerPXEObserved Trigger = "PXEObserved" // iPXE fetched cmdline for MAC
	TriggerAgentClaimed Trigger = "AgentClaimed" // agent POSTed /claim with valid token
	TriggerStageFailed Trigger = "StageFailed" // a stage reported failure
	TriggerStageCompleted Trigger = "StageCompleted" // a stage reported success → advance
	TriggerAllStagesPassed Trigger = "AllStagesPassed" // final stage passed
	TriggerOperatorReleased Trigger = "OperatorReleased" // user clicked Release on a held run
	TriggerOperatorOverride Trigger = "OperatorOverride" // user overrode a held stage; re-enter it
)
// stageStates maps the canonical stage name (from DefaultStageOrder)
// to the matching RunState. Named differently for historical reasons:
// the first stage is "Inventory" (stage row name) but the run state is
// "InventoryCheck". Later stages share a name with their state.
//
// It backs both StateForStage and NextForOverride, so a stage name
// missing here can neither be looked up nor overridden.
var stageStates = map[string]model.RunState{
	"Inventory": model.StateInventoryCheck,
	"SpecValidate": model.StateSpecValidate,
	"SMART": model.StateSMART,
	"CPUStress": model.StateCPUStress,
	"Storage": model.StateStorage,
	"Network": model.StateNetwork,
	"GPU": model.StateGPU,
	"PSU": model.StatePSU,
	"Reporting": model.StateReporting,
}
// stageOrder is the sequence of RunStates the run walks through from
// first stage to Completed. Kept in sync with store.DefaultStageOrder.
//
// nextStageState advances one position along this slice per
// StageCompleted trigger; completing the last entry yields
// model.StateCompleted.
var stageOrder = []model.RunState{
	model.StateInventoryCheck,
	model.StateSpecValidate,
	model.StateSMART,
	model.StateCPUStress,
	model.StateStorage,
	model.StateNetwork,
	model.StateGPU,
	model.StatePSU,
	model.StateReporting,
}
// transition is one row of the trigger table: the set of states a
// trigger may fire from and the single state it leads to.
type transition struct {
	from []model.RunState // states in which the trigger is accepted
	to model.RunState // state entered when the trigger is accepted
}
// table holds the fixed-target transitions consulted by Next.
// TriggerStageCompleted is absent because its target depends on the
// current stage (see nextStageState); TriggerOperatorOverride is absent
// because its target depends on the failed stage (see NextForOverride).
var table = map[Trigger]transition{
	TriggerStartRequested: {from: []model.RunState{model.StateRegistered}, to: model.StateQueued},
	TriggerDispatched: {from: []model.RunState{model.StateQueued}, to: model.StateWaitingWoL},
	// PXEObserved is accepted again while already Booting, so a repeated
	// observation leaves the run in Booting instead of erroring.
	TriggerPXEObserved: {from: []model.RunState{model.StateWaitingWoL, model.StateBooting}, to: model.StateBooting},
	// AgentClaimed is also accepted straight from WaitingWoL, i.e. without
	// a PXEObserved transition having been recorded first.
	TriggerAgentClaimed: {from: []model.RunState{model.StateBooting, model.StateWaitingWoL}, to: model.StateInventoryCheck},
	TriggerStageFailed: {from: allActiveStates(), to: model.StateFailedHolding},
	TriggerAllStagesPassed: {from: []model.RunState{model.StateReporting}, to: model.StateCompleted},
	TriggerOperatorReleased: {from: []model.RunState{model.StateFailedHolding}, to: model.StateReleased},
}
// Next computes the target state for trigger t fired in state current.
// StageCompleted is resolved by walking stageOrder; every other trigger
// is looked up in the transition table. An error is returned when the
// trigger is not in the table or is not permitted from current.
func Next(current model.RunState, t Trigger) (model.RunState, error) {
	switch t {
	case TriggerStageCompleted:
		return nextStageState(current)
	}

	rule, known := table[t]
	if !known {
		return "", fmt.Errorf("unknown trigger %q", t)
	}
	for i := range rule.from {
		if rule.from[i] == current {
			return rule.to, nil
		}
	}
	return "", fmt.Errorf("trigger %q not allowed from %q", t, current)
}
// NextForOverride returns the state to jump to when the operator
// overrides a held stage. It lives outside the generic table because
// the target is derived from failedStage rather than from the current
// state, which must be FailedHolding. Any other current state, or a
// stage name that has no run state, is an error.
func NextForOverride(current model.RunState, failedStage string) (model.RunState, error) {
	if current != model.StateFailedHolding {
		return "", fmt.Errorf("override not allowed from %q", current)
	}
	if target, found := stageStates[failedStage]; found {
		return target, nil
	}
	return "", fmt.Errorf("override: unknown failed stage %q", failedStage)
}
// StateForStage looks up the RunState that belongs to a stage name.
// The boolean is false when the name is not a known stage. Handlers
// that receive a stage name use it to guard against stale or
// out-of-order agent reports.
func StateForStage(name string) (model.RunState, bool) {
	state, known := stageStates[name]
	return state, known
}
// nextStageState returns the state that follows current in stageOrder,
// or model.StateCompleted when current is the final stage. A state that
// is not part of stageOrder cannot complete a stage and is an error.
func nextStageState(current model.RunState) (model.RunState, error) {
	last := len(stageOrder) - 1
	for idx := range stageOrder {
		if stageOrder[idx] != current {
			continue
		}
		if idx == last {
			return model.StateCompleted, nil
		}
		return stageOrder[idx+1], nil
	}
	return "", fmt.Errorf("StageCompleted not valid from %q", current)
}
// allActiveStates lists every state from which a StageFailed trigger is
// accepted: the three pre-agent states followed by each stage state.
// A fresh slice is returned on every call.
func allActiveStates() []model.RunState {
	active := []model.RunState{
		// Before the agent has claimed the run.
		model.StateQueued,
		model.StateWaitingWoL,
		model.StateBooting,
		// While stages are executing.
		model.StateInventoryCheck,
		model.StateSpecValidate,
		model.StateSMART,
		model.StateCPUStress,
		model.StateStorage,
		model.StateNetwork,
		model.StateGPU,
		model.StatePSU,
		model.StateReporting,
	}
	return active
}
@@ -0,0 +1,67 @@
package orchestrator_test
import (
"testing"
"vetting/internal/model"
"vetting/internal/orchestrator"
)
// TestNextForOverride checks that an override from FailedHolding lands
// on the state of the failed stage, and that unknown stages or a
// non-holding current state are rejected.
func TestNextForOverride(t *testing.T) {
	type overrideCase struct {
		label  string
		state  model.RunState
		stage  string
		expect model.RunState
		fails  bool
	}
	cases := []overrideCase{
		{label: "storage override", state: model.StateFailedHolding, stage: "Storage", expect: model.StateStorage},
		{label: "smart override", state: model.StateFailedHolding, stage: "SMART", expect: model.StateSMART},
		{label: "inventory override", state: model.StateFailedHolding, stage: "Inventory", expect: model.StateInventoryCheck},
		{label: "unknown stage", state: model.StateFailedHolding, stage: "NotAStage", fails: true},
		{label: "not holding", state: model.StateStorage, stage: "Storage", fails: true},
	}
	for _, c := range cases {
		t.Run(c.label, func(t *testing.T) {
			got, err := orchestrator.NextForOverride(c.state, c.stage)
			switch {
			case c.fails && err == nil:
				t.Fatalf("expected error, got %q", got)
			case c.fails:
				return
			case err != nil:
				t.Fatalf("unexpected error: %v", err)
			case got != c.expect:
				t.Fatalf("got %q, want %q", got, c.expect)
			}
		})
	}
}
// TestNextStageWalk fires StageCompleted from every stage state in
// canonical order and expects the following state each time, ending
// with Reporting → Completed.
func TestNextStageWalk(t *testing.T) {
	chain := []model.RunState{
		model.StateInventoryCheck,
		model.StateSpecValidate,
		model.StateSMART,
		model.StateCPUStress,
		model.StateStorage,
		model.StateNetwork,
		model.StateGPU,
		model.StatePSU,
		model.StateReporting,
		model.StateCompleted,
	}
	for i, from := range chain[:len(chain)-1] {
		want := chain[i+1]
		got, err := orchestrator.Next(from, orchestrator.TriggerStageCompleted)
		if err != nil {
			t.Fatalf("Next(%q): %v", from, err)
		}
		if got != want {
			t.Fatalf("Next(%q) = %q, want %q", from, got, want)
		}
	}
}
+26
View File
@@ -0,0 +1,26 @@
package orchestrator
import (
"crypto/rand"
"crypto/sha256"
"encoding/hex"
"fmt"
)
// IssueRunToken mints a new run token and returns (plaintext, hashHex).
// The plaintext is 32 random bytes, hex-encoded; it is handed to the
// host via the iPXE kernel cmdline. hashHex is the hex SHA-256 of that
// hex string and is what gets persisted in the runs table for later
// comparison. An error is returned only when the random source fails.
func IssueRunToken() (string, string, error) {
	var raw [32]byte
	if _, err := rand.Read(raw[:]); err != nil {
		return "", "", fmt.Errorf("random: %w", err)
	}
	plaintext := hex.EncodeToString(raw[:])
	digest := sha256.Sum256([]byte(plaintext))
	return plaintext, hex.EncodeToString(digest[:]), nil
}
// HashRunToken returns the lowercase hex SHA-256 digest of plain. It
// applies the same hashing that IssueRunToken uses for the stored hash,
// so a presented token can be compared against the persisted value.
func HashRunToken(plain string) string {
	hasher := sha256.New()
	hasher.Write([]byte(plain)) // hash.Hash writes never return an error
	return hex.EncodeToString(hasher.Sum(nil))
}
+38
View File
@@ -0,0 +1,38 @@
package orchestrator
import (
"strings"
"testing"
)
// TestIssueRunTokenRoundTrip verifies the shape of an issued token, that
// HashRunToken reproduces the returned hash, and that two consecutive
// tokens are not identical.
func TestIssueRunTokenRoundTrip(t *testing.T) {
	plain, hash, err := IssueRunToken()
	if err != nil {
		t.Fatalf("IssueRunToken: %v", err)
	}

	if n := len(plain); n != 64 {
		t.Fatalf("plaintext should be 64 hex chars, got %d", n)
	}
	if n := len(hash); n != 64 {
		t.Fatalf("hash should be 64 hex chars, got %d", n)
	}
	if rehashed := HashRunToken(plain); rehashed != hash {
		t.Fatalf("HashRunToken(plain) != hash")
	}

	// A second call must not hand out the same token again.
	another, _, _ := IssueRunToken()
	if another == plain {
		t.Fatalf("expected distinct tokens on consecutive calls")
	}
}
// TestHashRunTokenDeterministic verifies that hashing is repeatable for
// one input and yields a different digest for a different input.
func TestHashRunTokenDeterministic(t *testing.T) {
	first, second := HashRunToken("abc"), HashRunToken("abc")
	if first != second {
		t.Fatalf("hash not deterministic")
	}

	other := HashRunToken("abd")
	if strings.EqualFold(first, other) {
		t.Fatalf("hash should differ for distinct inputs")
	}
}
+57
View File
@@ -0,0 +1,57 @@
package orchestrator
import (
"encoding/hex"
"fmt"
"net"
"strconv"
"strings"
)
// SendWoL sends a Wake-on-LAN magic packet for mac (written as
// aa:bb:cc:dd:ee:ff) over UDP to broadcastIP:port. The payload is six
// 0xFF bytes followed by the 6-byte MAC repeated sixteen times, 102
// bytes in total. An unparsable MAC, a dial failure or a write failure
// is returned as an error.
func SendWoL(mac, broadcastIP string, port int) error {
	hw, err := parseMAC(mac)
	if err != nil {
		return err
	}

	frame := make([]byte, 0, 6+16*6)
	for n := 0; n < 6; n++ {
		frame = append(frame, 0xff)
	}
	for n := 0; n < 16; n++ {
		frame = append(frame, hw...)
	}

	target := net.JoinHostPort(broadcastIP, strconv.Itoa(port))
	conn, err := net.Dial("udp", target)
	if err != nil {
		return fmt.Errorf("dial wol: %w", err)
	}
	defer conn.Close()

	_, err = conn.Write(frame)
	if err != nil {
		return fmt.Errorf("write wol: %w", err)
	}
	return nil
}
// parseMAC converts a colon-separated MAC address into its six raw
// bytes. Surrounding whitespace is ignored and hex digits may be upper
// or lower case. Exactly six two-digit octets are required; any other
// separator, octet count, octet width or non-hex digit is an error.
func parseMAC(s string) ([]byte, error) {
	s = strings.ToLower(strings.TrimSpace(s))

	octets := strings.Split(s, ":")
	if len(octets) != 6 {
		return nil, fmt.Errorf("invalid MAC %q", s)
	}

	mac := make([]byte, 0, 6)
	for _, octet := range octets {
		if len(octet) != 2 {
			return nil, fmt.Errorf("invalid MAC octet %q", octet)
		}
		decoded, err := hex.DecodeString(octet)
		if err != nil {
			return nil, fmt.Errorf("invalid MAC %q: %w", s, err)
		}
		mac = append(mac, decoded[0])
	}
	return mac, nil
}
+37
View File
@@ -0,0 +1,37 @@
package orchestrator
import (
"bytes"
"testing"
)
// TestParseMAC checks that a lowercase colon-separated MAC is decoded
// into its six bytes.
func TestParseMAC(t *testing.T) {
	want := []byte{0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}

	got, err := parseMAC("aa:bb:cc:dd:ee:ff")
	if err != nil {
		t.Fatalf("parseMAC: %v", err)
	}
	if mismatch := !bytes.Equal(got, want); mismatch {
		t.Fatalf("parseMAC: %x != %x", got, want)
	}
}
// TestParseMACUpper checks that parsing is case-insensitive, so users
// can paste a MAC in either upper or lower case.
func TestParseMACUpper(t *testing.T) {
	want := []byte{0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}

	got, err := parseMAC("AA:BB:CC:DD:EE:FF")
	if err != nil {
		t.Fatalf("parseMAC upper: %v", err)
	}
	if mismatch := !bytes.Equal(got, want); mismatch {
		t.Fatalf("parseMAC upper: %x != %x", got, want)
	}
}
// TestParseMACInvalid checks that malformed inputs are rejected: empty,
// too few octets, non-hex digits, dash separators and too many octets.
func TestParseMACInvalid(t *testing.T) {
	malformed := []string{
		"",
		"aa:bb:cc",
		"zz:yy:xx:ww:vv:uu",
		"aa-bb-cc-dd-ee-ff",
		"aa:bb:cc:dd:ee:ff:00",
	}
	for _, input := range malformed {
		_, err := parseMAC(input)
		if err == nil {
			t.Errorf("expected error for %q", input)
		}
	}
}
+231
View File
@@ -0,0 +1,231 @@
package pxe
import (
"context"
"fmt"
"io"
"log"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"sync"
"text/template"
"time"
"vetting/internal/model"
)
// SupervisorConfig controls how dnsmasq is launched and configured.
// Interface, DHCPRange, TFTPRoot and OrchestratorURL are substituted
// verbatim into the generated dnsmasq.conf.
type SupervisorConfig struct {
	Enabled bool // false turns Start, Reload and Shutdown into no-ops
	Interface string // e.g. "eth0"
	DHCPRange string // e.g. "10.77.0.100,10.77.0.200,12h"
	OrchestratorURL string // baked into iPXE scripts
	RuntimeDir string // writable dir for dnsmasq.conf and leases
	TFTPRoot string // holds ipxe.efi, undionly.kpxe
	DNSMasqBin string // path to dnsmasq binary (default: "dnsmasq")
}
// Supervisor owns a dnsmasq subprocess, rewrites its config when the
// host registry changes, and sends SIGHUP to reload. The MAC allowlist
// is the safety barrier: only registered MACs see a DHCP reply.
type Supervisor struct {
	cfg SupervisorConfig
	mu sync.Mutex // guards cmd and cancel
	cmd *exec.Cmd // running dnsmasq process; nil until Start succeeds
	cancel context.CancelFunc // cancels the context the subprocess is bound to
}
// NewSupervisor returns a Supervisor for cfg. An empty DNSMasqBin is
// replaced by "dnsmasq", i.e. the binary is resolved through PATH.
// Nothing is started until Start is called.
func NewSupervisor(cfg SupervisorConfig) *Supervisor {
	sup := &Supervisor{cfg: cfg}
	if sup.cfg.DNSMasqBin == "" {
		sup.cfg.DNSMasqBin = "dnsmasq"
	}
	return sup
}
// Start writes dnsmasq.conf for hosts into the runtime directory and
// launches dnsmasq in the foreground (--no-daemon) as a child bound to
// a context derived from ctx. The child's stdout and stderr are relayed
// to the standard logger.
//
// When cfg.Enabled is false Start only logs and returns nil (useful for
// dev on Windows where dnsmasq isn't available). With supervision
// enabled on Windows it returns an error. Failures creating the runtime
// directory, rendering the config, or starting the process are
// returned. An unexpected exit of the child is only logged.
func (s *Supervisor) Start(ctx context.Context, hosts []model.Host) error {
	switch {
	case !s.cfg.Enabled:
		log.Printf("pxe: disabled in config — skipping dnsmasq")
		return nil
	case runtime.GOOS == "windows":
		return fmt.Errorf("dnsmasq supervision is not supported on Windows — run orchestrator on Linux")
	}

	if err := os.MkdirAll(s.cfg.RuntimeDir, 0o755); err != nil {
		return fmt.Errorf("mkdir runtime: %w", err)
	}
	if err := s.writeConf(hosts); err != nil {
		return err
	}

	runCtx, stop := context.WithCancel(ctx)
	s.mu.Lock()
	s.cancel = stop
	s.mu.Unlock()

	args := []string{
		"--conf-file=" + filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf"),
		"--no-daemon",
		"--log-queries",
		"--log-dhcp",
	}
	proc := exec.CommandContext(runCtx, s.cfg.DNSMasqBin, args...)
	proc.Stdout = logWriter{prefix: "dnsmasq"}
	proc.Stderr = logWriter{prefix: "dnsmasq"}

	if err := proc.Start(); err != nil {
		stop()
		return fmt.Errorf("start dnsmasq: %w", err)
	}

	s.mu.Lock()
	s.cmd = proc
	s.mu.Unlock()

	go func() {
		// An exit after our own cancellation is expected and stays quiet.
		exitErr := proc.Wait()
		if exitErr != nil && runCtx.Err() == nil {
			log.Printf("dnsmasq exited: %v", exitErr)
		}
	}()
	return nil
}
// Reload rewrites dnsmasq.conf from the latest host registry and then
// sends SIGHUP to the running dnsmasq process. It does nothing when
// supervision is disabled, and only rewrites the file when no process
// has been started. There is no restart fallback: if the signal cannot
// be delivered (sighup always fails on Windows) the error is returned.
//
// NOTE(review): the dnsmasq manual says SIGHUP re-reads hosts files and
// files given via --dhcp-hostsfile and similar options, but not the
// main configuration file. The allowlist is written as dhcp-host= lines
// inside dnsmasq.conf, so confirm that a newly registered MAC really
// takes effect without restarting the process.
func (s *Supervisor) Reload(hosts []model.Host) error {
	if !s.cfg.Enabled {
		return nil
	}
	if err := s.writeConf(hosts); err != nil {
		return err
	}

	s.mu.Lock()
	running := s.cmd
	s.mu.Unlock()

	if running == nil || running.Process == nil {
		return nil
	}
	err := sighup(running.Process)
	if err == nil {
		return nil
	}
	return fmt.Errorf("sighup dnsmasq: %w", err)
}
// Shutdown stops dnsmasq. It cancels the context the subprocess was
// started under (os/exec kills a CommandContext process when its
// context is done), then waits up to timeout for the process to be
// gone, sending an explicit Kill if the wait times out. It always
// returns nil, and is a no-op when supervision is disabled or nothing
// was started.
func (s *Supervisor) Shutdown(timeout time.Duration) error {
	if !s.cfg.Enabled {
		return nil
	}

	s.mu.Lock()
	stop, proc := s.cancel, s.cmd
	s.mu.Unlock()

	if stop != nil {
		stop()
	}
	if proc == nil || proc.Process == nil {
		return nil
	}

	exited := make(chan struct{})
	go func() {
		defer close(exited)
		_, _ = proc.Process.Wait()
	}()

	select {
	case <-exited:
	case <-time.After(timeout):
		_ = proc.Process.Kill()
	}
	return nil
}
// writeConf renders dnsmasqTemplate for the supervisor config and the
// given hosts and installs the result as <RuntimeDir>/dnsmasq.conf. The
// content is first written and synced to "dnsmasq.conf.new" and then
// renamed over the live file, so a partially written config never sits
// at the final path.
func (s *Supervisor) writeConf(hosts []model.Host) error {
	tmpl, err := template.New("dnsmasq").Parse(dnsmasqTemplate)
	if err != nil {
		return err
	}

	finalPath := filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf")
	stagingPath := finalPath + ".new"

	out, err := os.Create(stagingPath)
	if err != nil {
		return fmt.Errorf("create conf: %w", err)
	}

	view := struct {
		Cfg   SupervisorConfig
		Hosts []model.Host
	}{Cfg: s.cfg, Hosts: hosts}

	if renderErr := tmpl.Execute(out, view); renderErr != nil {
		_ = out.Close()
		return fmt.Errorf("render conf: %w", renderErr)
	}
	if syncErr := out.Sync(); syncErr != nil {
		_ = out.Close()
		return syncErr
	}
	if closeErr := out.Close(); closeErr != nil {
		return closeErr
	}

	if renameErr := os.Rename(stagingPath, finalPath); renameErr != nil {
		return fmt.Errorf("rename conf: %w", renameErr)
	}
	return nil
}
// ConfPath returns the location of the generated dnsmasq.conf inside
// the runtime directory. It is exposed so UI handlers can show
// operators which config file is live.
func (s *Supervisor) ConfPath() string {
	dir := s.cfg.RuntimeDir
	return filepath.Join(dir, "dnsmasq.conf")
}
// logWriter is an io.Writer that forwards everything written to it to
// the standard logger, one log entry per non-empty line, each tagged
// with "[prefix]".
type logWriter struct{ prefix string }

// Write logs each non-empty line of p and reports the whole slice as
// consumed. It never fails.
func (w logWriter) Write(p []byte) (int, error) {
	text := strings.TrimRight(string(p), "\n")
	for _, line := range strings.Split(text, "\n") {
		if line != "" {
			log.Printf("[%s] %s", w.prefix, line)
		}
	}
	return len(p), nil
}

// Compile-time check that logWriter satisfies io.Writer.
var _ io.Writer = logWriter{}
// dnsmasqTemplate is the text/template source for dnsmasq.conf. It is
// executed by writeConf with a struct exposing .Cfg (SupervisorConfig)
// and .Hosts; each host contributes one dhcp-host= line from its MAC.
// The generated config disables DNS (port=0), ignores DHCP clients not
// tagged "known", enables TFTP, and selects the boot file by client
// type. "${mac}" is emitted literally into the boot URL.
const dnsmasqTemplate = `# Generated by Vetting — do not hand-edit.
interface={{ .Cfg.Interface }}
bind-interfaces
port=0
domain-needed
bogus-priv
no-resolv
# MAC allowlist: dnsmasq only answers DHCP for MACs with a dhcp-host= below.
dhcp-ignore=tag:!known
{{- range .Hosts }}
dhcp-host={{ .MAC }},set:known
{{- end }}
# DHCP range (broader subnet coverage is fine; allowlist above gates replies).
dhcp-range={{ .Cfg.DHCPRange }}
# TFTP + HTTP boot (iPXE chainload).
enable-tftp
tftp-root={{ .Cfg.TFTPRoot }}
# BIOS (undionly.kpxe) and UEFI (ipxe.efi) clients both get iPXE first,
# which then re-requests a per-MAC script from the orchestrator.
dhcp-match=set:bios,option:client-arch,0
dhcp-match=set:efi64,option:client-arch,7
dhcp-match=set:efi64,option:client-arch,9
# If the client is iPXE itself, send it the per-MAC HTTP script.
dhcp-match=set:ipxe,175
dhcp-boot=tag:ipxe,{{ .Cfg.OrchestratorURL }}/ipxe/${mac}
# Otherwise (first boot from ROM) chainload iPXE from TFTP.
dhcp-boot=tag:!ipxe,tag:bios,undionly.kpxe
dhcp-boot=tag:!ipxe,tag:efi64,ipxe.efi
log-facility=-
`
+88
View File
@@ -0,0 +1,88 @@
package pxe
import (
"fmt"
"io"
"strings"
"vetting/internal/model"
)
// IPXEParams is everything an iPXE boot script needs.
// For Phase 2 the boot target is always "linux" — Memtest chain-load
// is not required because we replaced Memtest86+ with stress-ng under
// Linux (see plan §3.2).
//
// BuildScript passes OrchestratorURL, RunID, MAC, Token and (when set)
// TLSCertFPR to the live image as vetting.* kernel command-line
// arguments.
type IPXEParams struct {
	OrchestratorURL string // e.g. http://10.0.0.5:8080
	LiveKernelURL string // e.g. http://10.0.0.5:8080/live/vmlinuz
	LiveInitrdURL string // e.g. http://10.0.0.5:8080/live/initrd.img
	TLSCertFPR string // optional; empty = skip pin
	RunID int64
	MAC string
	Token string // plaintext, hashed on server side
}
// BuildScript returns the iPXE script for one run. The script starts
// with the "#!ipxe" marker, echoes a banner, loads the live kernel with
// the vetting.* parameters on its command line, loads the initrd and
// boots. vetting.cert_fpr is only included when TLSCertFPR is set.
func BuildScript(p IPXEParams) string {
	args := make([]string, 0, 10)
	args = append(args,
		"initrd=initrd.img",
		"vetting.orchestrator="+p.OrchestratorURL,
		fmt.Sprintf("vetting.run_id=%d", p.RunID),
		"vetting.mac="+p.MAC,
		"vetting.token="+p.Token,
	)
	if p.TLSCertFPR != "" {
		args = append(args, "vetting.cert_fpr="+p.TLSCertFPR)
	}
	// Consoles on both VGA and serial, DHCP networking, and a quiet
	// kernel log during the test run.
	args = append(args, "console=tty0", "console=ttyS0,115200n8", "ip=dhcp", "quiet")

	var script strings.Builder
	script.WriteString("#!ipxe\n")
	fmt.Fprintf(&script, "echo Vetting run %d — booting live image for %s\n", p.RunID, p.MAC)
	script.WriteString("kernel " + p.LiveKernelURL + " " + strings.Join(args, " ") + "\n")
	script.WriteString("initrd " + p.LiveInitrdURL + "\n")
	script.WriteString("boot\n")
	return script.String()
}
// NotRegisteredScript returns the iPXE script served for unknown MACs:
// it prints a notice naming the MAC and drops into the iPXE shell. The
// dnsmasq MAC allowlist should keep such hosts from ever getting here;
// this is the belt-and-braces fallback.
func NotRegisteredScript(mac string) string {
	const script = "#!ipxe\necho MAC %s not registered for vetting — halting.\nshell\n"
	return fmt.Sprintf(script, mac)
}
// NoActiveRunScript returns the iPXE script served when a registered
// MAC PXE-boots without a currently active run: it prints a notice,
// sleeps ten seconds and powers the host off rather than letting it
// loop forever.
func NoActiveRunScript(mac string) string {
	const script = "#!ipxe\necho MAC %s has no active run — powering off in 10s.\nsleep 10\npoweroff\n"
	return fmt.Sprintf(script, mac)
}
// BuildLiveURLs derives the live-image kernel and initrd URLs from the
// orchestrator base URL. Trailing slashes on base are dropped before
// "/live/vmlinuz" and "/live/initrd.img" are appended. Used by handlers
// to compose URLs; exposed for tests.
func BuildLiveURLs(base string) (kernel, initrd string) {
	root := strings.TrimRight(base, "/") + "/live/"
	kernel = root + "vmlinuz"
	initrd = root + "initrd.img"
	return kernel, initrd
}
// WriteNotFound writes the not-registered iPXE script for mac to w, so
// handlers can answer iPXE with a script-shaped error without building
// the body themselves. The write is best-effort: its result is
// discarded.
func WriteNotFound(w io.Writer, mac string) {
	script := NotRegisteredScript(mac)
	_, _ = w.Write([]byte(script))
}
// ScriptMarker is used by iPXE to detect that the response is a script.
// It is the same "#!ipxe" line that every script builder in this
// package emits first.
const ScriptMarker = "#!ipxe"
// State returns the run's state as a plain string, the compact
// single-word status used for logging. It takes the Run itself because
// the iPXE handler has already looked it up.
func State(run model.Run) string {
	status := string(run.State)
	return status
}
+61
View File
@@ -0,0 +1,61 @@
package pxe
import (
"strings"
"testing"
)
// TestBuildScriptIncludesAllCmdlineParams checks that a generated script
// starts with the iPXE marker and carries every vetting.* parameter, the
// kernel and initrd URLs, DHCP networking and the final boot command.
func TestBuildScriptIncludesAllCmdlineParams(t *testing.T) {
	s := BuildScript(IPXEParams{
		OrchestratorURL: "http://10.0.0.5:8080",
		LiveKernelURL:   "http://10.0.0.5:8080/live/vmlinuz",
		LiveInitrdURL:   "http://10.0.0.5:8080/live/initrd.img",
		RunID:           42,
		MAC:             "aa:bb:cc:dd:ee:ff",
		Token:           "deadbeefcafe",
	})
	if !strings.HasPrefix(s, "#!ipxe") {
		// Print the whole script: slicing a fixed-length prefix would panic
		// on a short script and hide this failure.
		t.Fatalf("expected #!ipxe header, got %q", s)
	}
	for _, want := range []string{
		"vetting.orchestrator=http://10.0.0.5:8080",
		"vetting.run_id=42",
		"vetting.mac=aa:bb:cc:dd:ee:ff",
		"vetting.token=deadbeefcafe",
		"kernel http://10.0.0.5:8080/live/vmlinuz",
		"initrd http://10.0.0.5:8080/live/initrd.img",
		"ip=dhcp",
		"boot",
	} {
		if !strings.Contains(s, want) {
			t.Errorf("script missing %q\n%s", want, s)
		}
	}
}
func TestBuildScriptOmitsCertFPRWhenEmpty(t *testing.T) {
s := BuildScript(IPXEParams{
OrchestratorURL: "http://x", LiveKernelURL: "http://x/k", LiveInitrdURL: "http://x/i",
RunID: 1, MAC: "aa:bb:cc:dd:ee:ff", Token: "t",
})
if strings.Contains(s, "vetting.cert_fpr") {
t.Fatalf("cert_fpr should be absent when empty:\n%s", s)
}
}
// TestNotRegisteredScriptMentionsMAC checks that the not-registered
// script echoes the offending MAC and starts with the iPXE marker.
func TestNotRegisteredScriptMentionsMAC(t *testing.T) {
	const mac = "aa:bb:cc:dd:ee:ff"
	script := NotRegisteredScript(mac)

	if echoed := strings.Contains(script, mac); !echoed {
		t.Fatalf("not-registered script should echo the MAC: %s", script)
	}
	if marked := strings.HasPrefix(script, "#!ipxe"); !marked {
		t.Fatalf("missing #!ipxe header: %s", script)
	}
}
// TestBuildLiveURLs checks that a trailing slash on the base URL does
// not leak into the kernel and initrd URLs.
func TestBuildLiveURLs(t *testing.T) {
	kernel, initrd := BuildLiveURLs("http://h:8080/")

	kernelOK := kernel == "http://h:8080/live/vmlinuz"
	initrdOK := initrd == "http://h:8080/live/initrd.img"
	if !(kernelOK && initrdOK) {
		t.Fatalf("BuildLiveURLs: %s, %s", kernel, initrd)
	}
}
+12
View File
@@ -0,0 +1,12 @@
//go:build !windows
package pxe
import (
"os"
"syscall"
)
// sighup delivers SIGHUP to p and returns whatever error the signal
// delivery reports. This is the non-Windows implementation.
func sighup(p *os.Process) error {
	const reload = syscall.SIGHUP
	return p.Signal(reload)
}
+12
View File
@@ -0,0 +1,12 @@
//go:build windows
package pxe
import (
"fmt"
"os"
)
// sighup is the Windows stand-in: there is no SIGHUP to send, so it
// ignores the process and always returns an error.
func sighup(_ *os.Process) error {
	unsupported := fmt.Errorf("SIGHUP not supported on Windows")
	return unsupported
}
+245
View File
@@ -0,0 +1,245 @@
// Package report builds the per-run HTML summary artifact. JSON is
// written separately (by the reporting resolver in the api package);
// this package only deals with the human-facing HTML.
//
// Design: a single self-contained HTML file — inline CSS, no external
// fetches — so the artifact is portable and can be opened straight off
// disk. Contents are a summary (per answer to the phase-5 design
// question): run metadata, per-stage pass/fail table, spec diff list,
// and measurement aggregates (min/avg/max by kind+key).
package report
import (
"bytes"
"fmt"
"html/template"
"math"
"sort"
"time"
"vetting/internal/model"
)
// Data is the payload fed to the HTML template. Callers assemble it
// from the DB rows for a given run.
type Data struct {
	GeneratedAt time.Time // shown in the report header, rendered in UTC
	Run model.Run
	Host model.Host
	Stages []model.Stage // one row each in the "Stages" table
	SpecDiffs []model.SpecDiff // empty renders a "no differences" note
	Aggregates []Aggregate // flattened measurement summary; see Aggregate
}
// Aggregate is a per (kind, key) summary of a run's measurements. Min/
// Max/Avg are populated from the Measurement rows; Unit mirrors the raw
// sample unit so the HTML can show "52.5 °C" etc.
type Aggregate struct {
	Kind string
	Key string
	Unit string // unit of the first sample seen for this series
	Count int // number of samples in the series
	Min float64
	Max float64
	Avg float64 // arithmetic mean: sum of values divided by Count
}
// AggregateMeasurements collapses a flat []Measurement into one summary
// per (kind, key) series: sample count, minimum, maximum and arithmetic
// mean, with the unit of the first sample seen in that series. The
// result is sorted by kind and then by key so the HTML renders
// deterministically. An empty input yields an empty, non-nil slice.
func AggregateMeasurements(rows []model.Measurement) []Aggregate {
	// position maps a series identity to its slot in out; sums runs in
	// parallel with out and holds the running total of each series.
	position := map[string]int{}
	sums := []float64{}
	out := make([]Aggregate, 0)

	for _, m := range rows {
		id := m.Kind + "\x00" + m.Key
		slot, seen := position[id]
		if !seen {
			slot = len(out)
			position[id] = slot
			out = append(out, Aggregate{
				Kind: m.Kind,
				Key:  m.Key,
				Unit: m.Unit,
				Min:  math.Inf(1),
				Max:  math.Inf(-1),
			})
			sums = append(sums, 0)
		}

		series := &out[slot]
		series.Count++
		sums[slot] += m.Value
		if m.Value < series.Min {
			series.Min = m.Value
		}
		if m.Value > series.Max {
			series.Max = m.Value
		}
	}

	for slot := range out {
		out[slot].Avg = sums[slot] / float64(out[slot].Count)
	}

	sort.Slice(out, func(a, b int) bool {
		if out[a].Kind == out[b].Kind {
			return out[a].Key < out[b].Key
		}
		return out[a].Kind < out[b].Kind
	})
	return out
}
// RenderHTML executes the report template against d and returns the
// resulting self-contained HTML document. A template execution failure
// is wrapped and returned with a nil byte slice.
func RenderHTML(d Data) ([]byte, error) {
	out := new(bytes.Buffer)
	err := reportTmpl.Execute(out, d)
	if err != nil {
		return nil, fmt.Errorf("report: render: %w", err)
	}
	return out.Bytes(), nil
}
// reportTmpl is the parsed report template, built once at package
// initialisation; template.Must panics if htmlTemplate does not parse.
// Helpers made available to the template:
//   - fmt4 formats a float64 with %.4g.
//   - fmtTime formats a time.Time as RFC 3339 in UTC.
//   - fmtTimep does the same for a *time.Time and renders nil as "—".
//   - resultBadge maps a stage state to the CSS class pass, fail, skip
//     or pend.
var reportTmpl = template.Must(
	template.New("report").
		Funcs(template.FuncMap{
			"fmt4": func(v float64) string {
				return fmt.Sprintf("%.4g", v)
			},
			"fmtTime": func(at time.Time) string {
				return at.UTC().Format(time.RFC3339)
			},
			"fmtTimep": func(at *time.Time) string {
				if at == nil {
					return "—"
				}
				return at.UTC().Format(time.RFC3339)
			},
			"resultBadge": func(state model.StageState) string {
				if state == model.StagePassed {
					return "pass"
				}
				if state == model.StageFailed {
					return "fail"
				}
				if state == model.StageSkipped {
					return "skip"
				}
				return "pend"
			},
		}).
		Parse(htmlTemplate),
)
// htmlTemplate is the html/template source for the report page. It is a
// single string kept next to the code so the package stays
// self-contained. CSS is inlined; no external assets.
//
// The template is executed with a Data value and renders five parts: a
// header, the host table, the run table, the per-stage table, the spec
// diff table and the measurement aggregate table. It relies on the
// fmt4, fmtTime, fmtTimep and resultBadge helpers registered on
// reportTmpl.
const htmlTemplate = `<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Vetting report — {{.Host.Name}} run {{.Run.ID}}</title>
<style>
:root { color-scheme: light dark; }
body { font-family: -apple-system, Segoe UI, Roboto, sans-serif; margin: 2rem; max-width: 960px; }
h1 { margin-bottom: 0; }
.sub { color: #666; margin-top: .2rem; }
section { margin-top: 2rem; }
table { border-collapse: collapse; width: 100%; }
th, td { text-align: left; padding: .35rem .6rem; border-bottom: 1px solid #ccc3; vertical-align: top; }
th { background: #0001; }
.pass { color: #0a0; font-weight: 600; }
.fail { color: #c33; font-weight: 600; }
.skip { color: #888; }
.pend { color: #888; }
.critical { color: #c33; font-weight: 600; }
.warning { color: #c80; }
.info { color: #666; }
code { background: #0001; padding: .05rem .25rem; border-radius: 3px; }
</style>
</head>
<body>
<h1>{{.Host.Name}} — run {{.Run.ID}}</h1>
<div class="sub">State: <b>{{.Run.State}}</b>{{if ne .Run.Result ""}} · result: <b>{{.Run.Result}}</b>{{end}} · generated {{fmtTime .GeneratedAt}}</div>
<section>
<h2>Host</h2>
<table>
<tr><th>Name</th><td>{{.Host.Name}}</td></tr>
<tr><th>MAC</th><td><code>{{.Host.MAC}}</code></td></tr>
<tr><th>WoL</th><td>{{.Host.WoLBroadcastIP}}:{{.Host.WoLPort}}</td></tr>
{{if .Host.Notes}}<tr><th>Notes</th><td>{{.Host.Notes}}</td></tr>{{end}}
</table>
</section>
<section>
<h2>Run</h2>
<table>
<tr><th>Run ID</th><td>{{.Run.ID}}</td></tr>
<tr><th>State</th><td>{{.Run.State}}</td></tr>
<tr><th>Started</th><td>{{fmtTime .Run.StartedAt}}</td></tr>
<tr><th>Completed</th><td>{{fmtTimep .Run.CompletedAt}}</td></tr>
{{if .Run.FailedStage}}<tr><th>Failed stage</th><td class="fail">{{.Run.FailedStage}}</td></tr>{{end}}
{{if .Run.ReportPath}}<tr><th>JSON report</th><td><code>{{.Run.ReportPath}}</code></td></tr>{{end}}
</table>
</section>
<section>
<h2>Stages</h2>
<table>
<thead><tr><th>Stage</th><th>State</th><th>Started</th><th>Completed</th></tr></thead>
<tbody>
{{range .Stages}}
<tr>
<td>{{.Name}}</td>
<td class="{{resultBadge .State}}">{{.State}}</td>
<td>{{fmtTimep .StartedAt}}</td>
<td>{{fmtTimep .CompletedAt}}</td>
</tr>
{{end}}
</tbody>
</table>
</section>
<section>
<h2>Spec diffs ({{len .SpecDiffs}})</h2>
{{if .SpecDiffs}}
<table>
<thead><tr><th>Field</th><th>Expected</th><th>Actual</th><th>Severity</th></tr></thead>
<tbody>
{{range .SpecDiffs}}
<tr>
<td><code>{{.Field}}</code></td>
<td>{{.Expected}}</td>
<td>{{.Actual}}</td>
<td class="{{.Severity}}">{{.Severity}}</td>
</tr>
{{end}}
</tbody>
</table>
{{else}}
<p>No differences between expected and actual hardware.</p>
{{end}}
</section>
<section>
<h2>Measurements ({{len .Aggregates}} series)</h2>
{{if .Aggregates}}
<table>
<thead><tr><th>Kind</th><th>Key</th><th>Samples</th><th>Min</th><th>Avg</th><th>Max</th><th>Unit</th></tr></thead>
<tbody>
{{range .Aggregates}}
<tr>
<td>{{.Kind}}</td>
<td>{{.Key}}</td>
<td>{{.Count}}</td>
<td>{{fmt4 .Min}}</td>
<td>{{fmt4 .Avg}}</td>
<td>{{fmt4 .Max}}</td>
<td>{{.Unit}}</td>
</tr>
{{end}}
</tbody>
</table>
{{else}}
<p>No measurements recorded.</p>
{{end}}
</section>
</body>
</html>
`
+232
View File
@@ -0,0 +1,232 @@
// Package spec owns the expected-vs-actual hardware diff for Vetting.
//
// The operator writes an expected spec YAML per host when registering.
// The agent submits an Inventory artifact after boot. Diff() compares
// them and emits per-field SpecDiff rows; the orchestrator fails the
// SpecValidate stage if any row is classified critical.
//
// Phase 3 rule (operator decision): every mismatch is critical. Missing
// expected fields skip that check entirely so partial specs stay useful
// instead of exploding.
package spec
import (
"fmt"
"sort"
"strings"
"gopkg.in/yaml.v3"
"vetting/internal/model"
)
// Spec is the operator-written expected hardware for one host, decoded
// from YAML by Parse. Every section is optional; CPU and Memory are
// pointers so an absent section can be told apart from a zero one.
type Spec struct {
	CPU *CPUSpec `yaml:"cpu,omitempty"`
	Memory *MemorySpec `yaml:"memory,omitempty"`
	Disks []DiskSpec `yaml:"disks,omitempty"`
	NICs []NICSpec `yaml:"nics,omitempty"`
	GPUs []GPUSpec `yaml:"gpus,omitempty"`
}
// CPUSpec describes a processor. When used as an expectation, an empty
// Model or a non-positive LogicalCores is not checked by Diff; Model is
// matched as a case-insensitive substring and LogicalCores exactly.
type CPUSpec struct {
Model string `json:"model,omitempty" yaml:"model,omitempty"`
LogicalCores int `json:"logical_cores,omitempty" yaml:"logical_cores,omitempty"`
}
// MemorySpec describes installed RAM in GiB. As an expectation, a
// non-positive TotalGiB is not checked; otherwise Diff tolerates a
// difference of up to 2 GiB.
type MemorySpec struct {
TotalGiB int `json:"total_gib,omitempty" yaml:"total_gib,omitempty"`
}
// DiskSpec describes one disk. Serial is the matching key (compared
// case-insensitively by diffDisks); entries without a serial are
// ignored. SizeGB is only compared when the expectation sets it to a
// positive value, with a tolerance of 1.
type DiskSpec struct {
Serial string `json:"serial,omitempty" yaml:"serial,omitempty"`
SizeGB int `json:"size_gb,omitempty" yaml:"size_gb,omitempty"`
}
// NICSpec describes one network interface. MAC is the matching key
// (compared case-insensitively by diffNICs); entries without a MAC are
// ignored. SpeedGbps is compared only when both the expected and the
// measured value are positive.
type NICSpec struct {
MAC string `json:"mac,omitempty" yaml:"mac,omitempty"`
SpeedGbps int `json:"speed_gbps,omitempty" yaml:"speed_gbps,omitempty"`
}
// GPUSpec describes one GPU by model string. diffGPUs lower-cases the
// model and compares per-model counts rather than individual cards.
type GPUSpec struct {
Model string `json:"model,omitempty" yaml:"model,omitempty"`
}
// Inventory is the actual measured hardware that Diff compares against
// a Spec. Field names deliberately match Spec so the diff reads
// cleanly; unlike Spec, CPU and Memory are plain values rather than
// pointers, so they are always present (possibly zero).
type Inventory struct {
CPU CPUSpec `json:"cpu" yaml:"cpu"`
Memory MemorySpec `json:"memory" yaml:"memory"`
Disks []DiskSpec `json:"disks" yaml:"disks"`
NICs []NICSpec `json:"nics" yaml:"nics"`
GPUs []GPUSpec `json:"gpus" yaml:"gpus"`
}
// Parse decodes an operator-written expected-spec YAML document into a
// Spec. An empty document is not an error: it produces a zero Spec, for
// which Diff reports nothing, i.e. "no expectations" is a legal stance.
// A decode failure is returned wrapped with a "parse spec yaml" prefix.
func Parse(src string) (*Spec, error) {
	parsed := new(Spec)
	err := yaml.Unmarshal([]byte(src), parsed)
	if err != nil {
		return nil, fmt.Errorf("parse spec yaml: %w", err)
	}
	return parsed, nil
}
// Diff returns the per-field differences between the expected spec and
// the measured inventory. Phase 3 rule: every expected field that is
// present and mismatches is critical. Missing expected fields are
// skipped (not info-logged) so the diff list stays focused on real
// problems.
//
// A nil expected spec yields nil. A nil actual inventory is compared as
// an empty inventory, so every declared expectation is reported instead
// of the call panicking on a nil dereference. For a non-nil spec the
// result is always a non-nil (possibly empty) slice.
func Diff(expected *Spec, actual *Inventory) []model.SpecDiff {
	if expected == nil {
		return nil
	}
	if actual == nil {
		// No inventory was supplied: treat the host as reporting nothing.
		actual = &Inventory{}
	}
	out := []model.SpecDiff{}

	if cpu := expected.CPU; cpu != nil {
		if cpu.Model != "" && !cpuModelMatches(cpu.Model, actual.CPU.Model) {
			out = append(out, diff("cpu.model", cpu.Model, actual.CPU.Model))
		}
		if cpu.LogicalCores > 0 && cpu.LogicalCores != actual.CPU.LogicalCores {
			out = append(out, diff("cpu.logical_cores", itoa(cpu.LogicalCores), itoa(actual.CPU.LogicalCores)))
		}
	}

	if mem := expected.Memory; mem != nil && mem.TotalGiB > 0 {
		// Allow ±2 GiB tolerance: BIOS-reserved, kernel, reporting
		// quantization. A dead 16 GiB stick will still surface.
		if absInt(mem.TotalGiB-actual.Memory.TotalGiB) > 2 {
			out = append(out, diff("memory.total_gib", itoa(mem.TotalGiB), itoa(actual.Memory.TotalGiB)))
		}
	}

	out = append(out, diffDisks(expected.Disks, actual.Disks)...)
	out = append(out, diffNICs(expected.NICs, actual.NICs)...)
	out = append(out, diffGPUs(expected.GPUs, actual.GPUs)...)
	return out
}
// diffDisks compares declared disks against measured disks, keyed by
// lower-cased serial number. It returns nil when no disks are declared.
//
// For each declared disk with a serial it reports a ".present" row when
// the serial is absent from the inventory, or a ".size_gb" row when a
// positive expected size differs from the measured size by more than 1.
// Entries without a serial, on either side, are ignored.
func diffDisks(expected, actual []DiskSpec) []model.SpecDiff {
	if len(expected) == 0 {
		return nil
	}

	inventory := make(map[string]DiskSpec, len(actual))
	for _, disk := range actual {
		if disk.Serial == "" {
			continue
		}
		inventory[strings.ToLower(disk.Serial)] = disk
	}

	var rows []model.SpecDiff
	declared := make(map[string]struct{}, len(expected))
	for _, want := range expected {
		if want.Serial == "" {
			continue
		}
		serial := strings.ToLower(want.Serial)
		declared[serial] = struct{}{}

		have, found := inventory[serial]
		switch {
		case !found:
			rows = append(rows, diff("disks["+want.Serial+"].present", "true", "false"))
		case want.SizeGB > 0 && absInt(want.SizeGB-have.SizeGB) > 1:
			rows = append(rows, diff("disks["+want.Serial+"].size_gb", itoa(want.SizeGB), itoa(have.SizeGB)))
		}
	}

	// Extra disks on the host that the operator didn't declare are
	// flagged: a leftover USB stick could be a destructive-test target
	// we'd rather the operator know about.
	for _, disk := range actual {
		if disk.Serial == "" {
			continue
		}
		if _, ok := declared[strings.ToLower(disk.Serial)]; ok {
			continue
		}
		rows = append(rows, diff("disks[unexpected "+disk.Serial+"]", "", "present"))
	}
	return rows
}
// diffNICs compares declared NICs against measured NICs, keyed by
// lower-cased MAC address. It returns nil when no NICs are declared.
//
// A declared MAC missing from the inventory yields a ".present" row. A
// ".speed_gbps" row is emitted only when both the expected and measured
// speeds are positive and differ, so an inventory that reports no speed
// is not treated as a mismatch. Undeclared NICs are not reported.
func diffNICs(expected, actual []NICSpec) []model.SpecDiff {
	if len(expected) == 0 {
		return nil
	}

	inventory := make(map[string]NICSpec, len(actual))
	for _, nic := range actual {
		if nic.MAC == "" {
			continue
		}
		inventory[strings.ToLower(nic.MAC)] = nic
	}

	var rows []model.SpecDiff
	for _, want := range expected {
		if want.MAC == "" {
			continue
		}
		have, found := inventory[strings.ToLower(want.MAC)]
		if !found {
			rows = append(rows, diff("nics["+want.MAC+"].present", "true", "false"))
			continue
		}
		speedKnown := want.SpeedGbps > 0 && have.SpeedGbps > 0
		if speedKnown && want.SpeedGbps != have.SpeedGbps {
			rows = append(rows, diff("nics["+want.MAC+"].speed_gbps", itoa(want.SpeedGbps), itoa(have.SpeedGbps)))
		}
	}
	return rows
}
// diffGPUs compares declared GPUs against measured GPUs. It returns nil
// when no GPUs are declared.
//
// GPU matching is by lower-cased model string. Multiple identical cards
// match by count, not identity, since PCI-slot order isn't meaningful.
// A ".count" row is emitted for each declared model the host has fewer
// of than expected; extra or undeclared cards are not reported.
//
// Declared entries with an empty model are skipped, matching how
// diffDisks and diffNICs ignore entries that lack their key and the
// package rule that a missing expected field skips the check.
func diffGPUs(expected, actual []GPUSpec) []model.SpecDiff {
	if len(expected) == 0 {
		return nil
	}

	want := make(map[string]int, len(expected))
	for _, g := range expected {
		if g.Model == "" {
			continue
		}
		want[strings.ToLower(g.Model)]++
	}
	got := make(map[string]int, len(actual))
	for _, g := range actual {
		got[strings.ToLower(g.Model)]++
	}

	// Map iteration order is random; sort so the rows come out in a
	// stable order.
	keys := make([]string, 0, len(want))
	for k := range want {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	var out []model.SpecDiff
	for _, k := range keys {
		if got[k] < want[k] {
			out = append(out, diff("gpus["+k+"].count", itoa(want[k]), itoa(got[k])))
		}
	}
	return out
}
// cpuModelMatches reports whether the measured CPU model satisfies the
// expected one. Both strings are trimmed and lower-cased first; the
// expectation matches when it equals, or is a substring of, the actual
// value. This lets the operator declare a short form (e.g. "E5-2680 v4")
// against the verbose kernel-reported string
// ("Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz").
func cpuModelMatches(expected, actual string) bool {
	normalize := func(s string) string {
		return strings.ToLower(strings.TrimSpace(s))
	}
	want, have := normalize(expected), normalize(actual)
	if want == have {
		return true
	}
	return strings.Contains(have, want)
}
// diff builds one SpecDiff row for the given field with its expected
// and actual values. In Phase 3 all diffs are critical, so the severity
// is fixed here; later phases may tier them.
func diff(field, expected, actual string) model.SpecDiff {
	row := model.SpecDiff{Field: field, Expected: expected, Actual: actual}
	row.Severity = "critical"
	return row
}
// absInt returns the absolute value of n.
func absInt(n int) int {
	if n >= 0 {
		return n
	}
	return -n
}
// itoa renders n in base 10 for use in SpecDiff expected/actual columns.
func itoa(n int) string {
	return fmt.Sprint(n)
}
+121
View File
@@ -0,0 +1,121 @@
package spec
import (
"testing"
"vetting/internal/model"
)
// TestDiffEmptySpec checks that a spec with no expectations produces no
// diff rows against an empty inventory.
func TestDiffEmptySpec(t *testing.T) {
	rows := Diff(&Spec{}, &Inventory{})
	if len(rows) != 0 {
		t.Fatalf("empty spec → empty diff, got %v", rows)
	}
}
// TestDiffCPUMismatch checks that a wrong logical core count is the
// only row reported when the model matches by substring, and that the
// row is critical.
func TestDiffCPUMismatch(t *testing.T) {
	want := &Spec{CPU: &CPUSpec{Model: "E5-2680 v4", LogicalCores: 28}}
	have := &Inventory{CPU: CPUSpec{Model: "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz", LogicalCores: 16}}

	rows := Diff(want, have)
	ok := len(rows) == 1 && rows[0].Field == "cpu.logical_cores" && rows[0].Severity == "critical"
	if !ok {
		t.Fatalf("expected logical_cores critical, got %+v", rows)
	}
}
// TestDiffCPUModelSubstringMatch checks that a short declared model
// matches the verbose kernel-reported model string.
func TestDiffCPUModelSubstringMatch(t *testing.T) {
	want := &Spec{CPU: &CPUSpec{Model: "E5-2680 v4"}}
	have := &Inventory{CPU: CPUSpec{Model: "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz"}}

	rows := Diff(want, have)
	if len(rows) != 0 {
		t.Fatalf("substring should match, got %+v", rows)
	}
}
// TestDiffMemoryTolerance checks both sides of the memory tolerance: a
// 1 GiB shortfall is accepted, a 16 GiB shortfall is reported.
func TestDiffMemoryTolerance(t *testing.T) {
	want := &Spec{Memory: &MemorySpec{TotalGiB: 128}}

	nearMatch := &Inventory{Memory: MemorySpec{TotalGiB: 127}}
	if rows := Diff(want, nearMatch); len(rows) != 0 {
		t.Fatalf("1 GiB variance should be tolerated, got %+v", rows)
	}

	missingStick := &Inventory{Memory: MemorySpec{TotalGiB: 112}}
	rows := Diff(want, missingStick)
	if len(rows) != 1 || rows[0].Field != "memory.total_gib" {
		t.Fatalf("16 GiB drop should be critical, got %+v", rows)
	}
}
// TestDiffDisksMissingAndUnexpected checks that a declared disk absent
// from the host and an undeclared disk present on the host are both
// reported.
func TestDiffDisksMissingAndUnexpected(t *testing.T) {
	want := &Spec{Disks: []DiskSpec{{Serial: "A", SizeGB: 1000}, {Serial: "B", SizeGB: 500}}}
	have := &Inventory{Disks: []DiskSpec{{Serial: "A", SizeGB: 1000}, {Serial: "C", SizeGB: 32}}}
	rows := Diff(want, have)

	// Expect: disk B missing, disk C unexpected.
	fields := make(map[string]bool, len(rows))
	for i := range rows {
		fields[rows[i].Field] = true
	}
	for _, field := range []string{"disks[B].present", "disks[unexpected C]"} {
		if !fields[field] {
			t.Fatalf("expected %s critical; got %+v", field, rows)
		}
	}
}
// TestDiffDisksSerialCaseInsensitive checks that serials differing only
// in letter case are treated as the same disk.
func TestDiffDisksSerialCaseInsensitive(t *testing.T) {
	want := &Spec{Disks: []DiskSpec{{Serial: "wd-abc123", SizeGB: 1000}}}
	have := &Inventory{Disks: []DiskSpec{{Serial: "WD-ABC123", SizeGB: 1000}}}

	rows := Diff(want, have)
	if len(rows) != 0 {
		t.Fatalf("serial compare must be case-insensitive, got %+v", rows)
	}
}
// TestDiffNICMAC checks that a NIC matched by MAC but linked at a
// different speed yields exactly one speed_gbps row.
func TestDiffNICMAC(t *testing.T) {
	want := &Spec{NICs: []NICSpec{{MAC: "aa:bb:cc:dd:ee:ff", SpeedGbps: 10}}}
	have := &Inventory{NICs: []NICSpec{{MAC: "aa:bb:cc:dd:ee:ff", SpeedGbps: 1}}}

	rows := Diff(want, have)
	ok := len(rows) == 1 && rows[0].Field == "nics[aa:bb:cc:dd:ee:ff].speed_gbps"
	if !ok {
		t.Fatalf("expected speed mismatch, got %+v", rows)
	}
}
// TestDiffGPUCount checks that declaring two identical GPUs while the
// host reports one (with different letter case) yields a single count
// row keyed by the lower-cased model.
func TestDiffGPUCount(t *testing.T) {
	want := &Spec{GPUs: []GPUSpec{{Model: "NVIDIA RTX 3090"}, {Model: "NVIDIA RTX 3090"}}}
	have := &Inventory{GPUs: []GPUSpec{{Model: "nvidia rtx 3090"}}}

	rows := Diff(want, have)
	ok := len(rows) == 1 && rows[0].Field == "gpus[nvidia rtx 3090].count"
	if !ok {
		t.Fatalf("expected GPU count critical, got %+v", rows)
	}
}
// TestParseValidYAML checks that a representative expected-spec
// document decodes into the matching Spec sections.
//
// The nested keys must be indented under their parent: with every key
// at column 0, "model", "logical_cores" and the rest become top-level
// keys, "cpu" decodes to nil and the assertions below fail.
func TestParseValidYAML(t *testing.T) {
	src := `
cpu:
  model: "E5-2680 v4"
  logical_cores: 28
memory:
  total_gib: 128
disks:
  - serial: A
    size_gb: 1000
`
	s, err := Parse(src)
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}
	if s.CPU == nil || s.CPU.LogicalCores != 28 {
		t.Fatalf("cpu not parsed: %+v", s)
	}
	if s.Memory == nil || s.Memory.TotalGiB != 128 {
		t.Fatalf("memory not parsed: %+v", s)
	}
	if len(s.Disks) != 1 || s.Disks[0].Serial != "A" {
		t.Fatalf("disks not parsed: %+v", s)
	}
}
// TestDiffSeverityAlwaysCritical pins the Phase 3 rule that every diff
// row is critical. It first requires that the mismatch actually
// produced a row; otherwise the severity loop would pass vacuously.
func TestDiffSeverityAlwaysCritical(t *testing.T) {
	exp := &Spec{CPU: &CPUSpec{LogicalCores: 8}}
	act := &Inventory{CPU: CPUSpec{LogicalCores: 4}}

	// The explicit type pins Diff's return type to []model.SpecDiff.
	var got []model.SpecDiff = Diff(exp, act)
	if len(got) == 0 {
		t.Fatalf("expected at least one diff row for a core-count mismatch, got none")
	}
	for _, row := range got {
		if row.Severity != "critical" {
			t.Fatalf("phase-3 rule: every diff is critical; got %q for %s", row.Severity, row.Field)
		}
	}
}
+126
View File
@@ -0,0 +1,126 @@
package store
import (
"context"
"database/sql"
"fmt"
"vetting/internal/model"
)
// Artifact is one row of the artifacts table: a record of a file
// produced during a run. StageID is nil when the stage_id column is
// NULL.
type Artifact struct {
ID int64
RunID int64
StageID *int64
Kind string // inventory|spec_diff|hold_key|report|log|fio|iperf|smart
Path string
SHA256 string
SizeBytes int64
}
// Artifacts is the store for the artifacts table. DB must be set before
// any method is called.
type Artifacts struct {
DB *sql.DB
}
// Create inserts one artifact row and returns its new row ID. art.ID is
// ignored; a nil art.StageID is stored as NULL. Insert failures are
// wrapped with an "insert artifact" prefix.
func (a *Artifacts) Create(ctx context.Context, art Artifact) (int64, error) {
	const insertArtifact = `
INSERT INTO artifacts(run_id, stage_id, kind, path, sha256, size_bytes)
VALUES(?,?,?,?,?,?)
`
	result, err := a.DB.ExecContext(ctx, insertArtifact,
		art.RunID, nullInt64(art.StageID), art.Kind, art.Path, art.SHA256, art.SizeBytes)
	if err != nil {
		return 0, fmt.Errorf("insert artifact: %w", err)
	}
	return result.LastInsertId()
}
// DeleteForRun removes every artifact row for a run and returns the
// rows that were listed just before the delete, so the caller can
// unlink the on-disk files. Used by the janitor; ordinary flow treats
// artifacts as append-only.
//
// NOTE(review): the list and the delete are two separate statements,
// not one transaction. A row inserted between them would be deleted
// without appearing in the returned slice.
func (a *Artifacts) DeleteForRun(ctx context.Context, runID int64) ([]Artifact, error) {
	doomed, listErr := a.ListForRun(ctx, runID)
	if listErr != nil {
		return nil, listErr
	}

	_, delErr := a.DB.ExecContext(ctx, `DELETE FROM artifacts WHERE run_id = ?`, runID)
	if delErr != nil {
		return nil, fmt.Errorf("delete artifacts for run %d: %w", runID, delErr)
	}
	return doomed, nil
}
// ListForRun returns the run's artifact rows ordered by ID, i.e. in
// insertion order. A NULL stage_id is returned as a nil StageID. The
// result is nil when the run has no artifacts.
func (a *Artifacts) ListForRun(ctx context.Context, runID int64) ([]Artifact, error) {
	const selectArtifacts = `
SELECT id, run_id, stage_id, kind, path, sha256, size_bytes
FROM artifacts WHERE run_id = ? ORDER BY id
`
	rows, err := a.DB.QueryContext(ctx, selectArtifacts, runID)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var list []Artifact
	for rows.Next() {
		var (
			art   Artifact
			stage sql.NullInt64
		)
		scanErr := rows.Scan(&art.ID, &art.RunID, &stage, &art.Kind, &art.Path, &art.SHA256, &art.SizeBytes)
		if scanErr != nil {
			return nil, scanErr
		}
		if stage.Valid {
			id := stage.Int64
			art.StageID = &id
		}
		list = append(list, art)
	}
	return list, rows.Err()
}
// SpecDiffs is the store for the spec_diffs table. DB must be set
// before any method is called.
type SpecDiffs struct {
DB *sql.DB
}
// ReplaceForRun atomically replaces the run's spec-diff rows with
// diffs: existing rows are deleted and the new ones inserted in a
// single transaction, so a failure leaves the previous rows intact.
// Passing an empty slice just clears the run's rows.
//
// Only Field, Expected, Actual and Severity are taken from each diff;
// the row's run_id is always runID and ignored is always written as 0.
//
// Errors are wrapped with the step that failed, matching the other
// store methods.
func (s *SpecDiffs) ReplaceForRun(ctx context.Context, runID int64, diffs []model.SpecDiff) error {
	tx, err := s.DB.BeginTx(ctx, nil)
	if err != nil {
		return fmt.Errorf("replace spec diffs for run %d: begin: %w", runID, err)
	}
	// Rollback is a no-op once Commit has succeeded.
	defer func() { _ = tx.Rollback() }()

	if _, err := tx.ExecContext(ctx, `DELETE FROM spec_diffs WHERE run_id = ?`, runID); err != nil {
		return fmt.Errorf("replace spec diffs for run %d: delete: %w", runID, err)
	}
	for _, d := range diffs {
		if _, err := tx.ExecContext(ctx, `
INSERT INTO spec_diffs(run_id, field, expected, actual, severity, ignored)
VALUES(?,?,?,?,?,?)
`, runID, d.Field, d.Expected, d.Actual, d.Severity, 0); err != nil {
			return fmt.Errorf("replace spec diffs for run %d: insert %q: %w", runID, d.Field, err)
		}
	}
	if err := tx.Commit(); err != nil {
		return fmt.Errorf("replace spec diffs for run %d: commit: %w", runID, err)
	}
	return nil
}
// ListForRun returns the run's spec-diff rows ordered by ID. NULL
// expected/actual columns come back as empty strings, and the integer
// ignored column is mapped to a bool (non-zero means true). The result
// is nil when the run has no rows.
func (s *SpecDiffs) ListForRun(ctx context.Context, runID int64) ([]model.SpecDiff, error) {
	const selectDiffs = `
SELECT id, run_id, field, COALESCE(expected,''), COALESCE(actual,''), severity, ignored
FROM spec_diffs WHERE run_id = ? ORDER BY id
`
	rows, err := s.DB.QueryContext(ctx, selectDiffs, runID)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var list []model.SpecDiff
	for rows.Next() {
		var (
			row         model.SpecDiff
			ignoredFlag int
		)
		scanErr := rows.Scan(&row.ID, &row.RunID, &row.Field, &row.Expected, &row.Actual, &row.Severity, &ignoredFlag)
		if scanErr != nil {
			return nil, scanErr
		}
		row.Ignored = ignoredFlag != 0
		list = append(list, row)
	}
	return list, rows.Err()
}
// nullInt64 converts an optional int64 into a SQL argument: nil for a
// nil pointer (stored as NULL), otherwise the pointed-to value.
func nullInt64(p *int64) any {
	if p != nil {
		return *p
	}
	return nil
}
+98
View File
@@ -0,0 +1,98 @@
package store
import (
"context"
"database/sql"
"errors"
"fmt"
"strings"
"vetting/internal/model"
)
// Hosts is the store for the hosts table. DB must be set before any
// method is called.
type Hosts struct {
DB *sql.DB
}
// ErrNotFound is returned by store lookups and deletes when no row
// matches the requested ID. Compare with errors.Is.
var ErrNotFound = errors.New("not found")
// Create inserts a host and returns its new row ID. The MAC is stored
// normalized (trimmed and lower-cased), and empty PDU/IPMI config
// strings are stored as NULL. in.ID and the timestamp fields are not
// written. Insert failures are wrapped with an "insert host" prefix.
func (h *Hosts) Create(ctx context.Context, in model.Host) (int64, error) {
	const insertHost = `
INSERT INTO hosts(name, mac, wol_broadcast_ip, wol_port, expected_spec_yaml, pdu_config_json, ipmi_config_json, notes)
VALUES(?,?,?,?,?,?,?,?)
`
	in.MAC = normalizeMAC(in.MAC)
	pdu, ipmi := nullIfEmpty(in.PDUConfigJSON), nullIfEmpty(in.IPMIConfigJSON)

	result, err := h.DB.ExecContext(ctx, insertHost,
		in.Name, in.MAC, in.WoLBroadcastIP, in.WoLPort, in.ExpectedSpecYAML, pdu, ipmi, in.Notes)
	if err != nil {
		return 0, fmt.Errorf("insert host: %w", err)
	}
	return result.LastInsertId()
}
// List returns every host ordered by name, case-insensitively. NULL
// PDU/IPMI config columns come back as empty strings. The result is nil
// when the table is empty.
func (h *Hosts) List(ctx context.Context) ([]model.Host, error) {
	const selectHosts = `
SELECT id, name, mac, wol_broadcast_ip, wol_port, expected_spec_yaml,
COALESCE(pdu_config_json,''), COALESCE(ipmi_config_json,''),
notes, created_at, updated_at
FROM hosts
ORDER BY name COLLATE NOCASE
`
	rows, err := h.DB.QueryContext(ctx, selectHosts)
	if err != nil {
		return nil, fmt.Errorf("list hosts: %w", err)
	}
	defer rows.Close()

	var list []model.Host
	for rows.Next() {
		var item model.Host
		scanErr := rows.Scan(
			&item.ID, &item.Name, &item.MAC, &item.WoLBroadcastIP, &item.WoLPort,
			&item.ExpectedSpecYAML, &item.PDUConfigJSON, &item.IPMIConfigJSON,
			&item.Notes, &item.CreatedAt, &item.UpdatedAt,
		)
		if scanErr != nil {
			return nil, fmt.Errorf("scan host: %w", scanErr)
		}
		list = append(list, item)
	}
	return list, rows.Err()
}
// Get returns the host with the given ID. It returns ErrNotFound when
// no such row exists; any other failure is wrapped with a "get host"
// prefix. NULL PDU/IPMI config columns come back as empty strings.
func (h *Hosts) Get(ctx context.Context, id int64) (*model.Host, error) {
	const selectHost = `
SELECT id, name, mac, wol_broadcast_ip, wol_port, expected_spec_yaml,
COALESCE(pdu_config_json,''), COALESCE(ipmi_config_json,''),
notes, created_at, updated_at
FROM hosts WHERE id = ?
`
	found := new(model.Host)
	err := h.DB.QueryRowContext(ctx, selectHost, id).Scan(
		&found.ID, &found.Name, &found.MAC, &found.WoLBroadcastIP, &found.WoLPort,
		&found.ExpectedSpecYAML, &found.PDUConfigJSON, &found.IPMIConfigJSON,
		&found.Notes, &found.CreatedAt, &found.UpdatedAt,
	)
	switch {
	case errors.Is(err, sql.ErrNoRows):
		return nil, ErrNotFound
	case err != nil:
		return nil, fmt.Errorf("get host: %w", err)
	}
	return found, nil
}
// Delete removes the host with the given ID. It returns ErrNotFound
// when no row was deleted. Failures of the DELETE itself, or of reading
// the affected-row count, are wrapped with a "delete host" prefix
// rather than being mistaken for "not found".
func (h *Hosts) Delete(ctx context.Context, id int64) error {
	res, err := h.DB.ExecContext(ctx, `DELETE FROM hosts WHERE id = ?`, id)
	if err != nil {
		return fmt.Errorf("delete host: %w", err)
	}
	affected, err := res.RowsAffected()
	if err != nil {
		// Without a count we cannot tell "deleted" from "not found".
		return fmt.Errorf("delete host: rows affected: %w", err)
	}
	if affected == 0 {
		return ErrNotFound
	}
	return nil
}
// normalizeMAC returns m with surrounding whitespace removed and all
// letters lower-cased. Separators are left untouched.
func normalizeMAC(m string) string {
	trimmed := strings.TrimSpace(m)
	return strings.ToLower(trimmed)
}
// nullIfEmpty converts a string into a SQL argument: nil for the empty
// string (stored as NULL), otherwise the string itself.
func nullIfEmpty(s string) any {
	if s != "" {
		return s
	}
	return nil
}
+85
View File
@@ -0,0 +1,85 @@
package store
import (
"context"
"database/sql"
"fmt"
"time"
"vetting/internal/model"
)
// Measurements persists timestamped numeric samples: temps, fan speeds,
// PSU voltages, fio IOPS, iperf throughput, SMART attributes. The schema
// stores (kind, key, value, unit) so Phase 5 reports can group freely
// without new tables per source. DB must be set before any method is
// called.
type Measurements struct {
DB *sql.DB
}
// Create inserts a single measurement and returns its new row ID. A
// zero timestamp is replaced with the current UTC time; a nil StageID
// is stored as NULL. Insert failures are wrapped with an
// "insert measurement" prefix.
func (m *Measurements) Create(ctx context.Context, in model.Measurement) (int64, error) {
	const insertMeasurement = `
INSERT INTO measurements(run_id, stage_id, ts, kind, key, value, unit)
VALUES(?,?,?,?,?,?,?)
`
	if in.TS.IsZero() {
		in.TS = time.Now().UTC()
	}
	result, err := m.DB.ExecContext(ctx, insertMeasurement,
		in.RunID, nullInt64(in.StageID), in.TS, in.Kind, in.Key, in.Value, in.Unit)
	if err != nil {
		return 0, fmt.Errorf("insert measurement: %w", err)
	}
	return result.LastInsertId()
}
// CreateBatch inserts a batch in one transaction. The sensor endpoint
// hands us ~520 samples per tick; a single commit keeps SQLite happy.
//
// An empty batch is a no-op. Rows with a zero timestamp all receive the
// same "now" (UTC), captured once per call. The insert statement is
// prepared once and reused for every row instead of being re-parsed per
// sample. If any insert fails the whole batch is rolled back and the
// error is wrapped with the step that failed.
func (m *Measurements) CreateBatch(ctx context.Context, rows []model.Measurement) error {
	if len(rows) == 0 {
		return nil
	}
	tx, err := m.DB.BeginTx(ctx, nil)
	if err != nil {
		return fmt.Errorf("begin measurement batch: %w", err)
	}
	// Rollback is a no-op once Commit has succeeded.
	defer func() { _ = tx.Rollback() }()

	stmt, err := tx.PrepareContext(ctx, `
INSERT INTO measurements(run_id, stage_id, ts, kind, key, value, unit)
VALUES(?,?,?,?,?,?,?)
`)
	if err != nil {
		return fmt.Errorf("prepare measurement insert: %w", err)
	}
	// Runs before the deferred Rollback; closing a statement whose
	// transaction has already ended is harmless.
	defer func() { _ = stmt.Close() }()

	now := time.Now().UTC()
	for _, r := range rows {
		if r.TS.IsZero() {
			r.TS = now
		}
		if _, err := stmt.ExecContext(ctx, r.RunID, nullInt64(r.StageID), r.TS, r.Kind, r.Key, r.Value, r.Unit); err != nil {
			return fmt.Errorf("insert measurement: %w", err)
		}
	}
	if err := tx.Commit(); err != nil {
		return fmt.Errorf("commit measurement batch: %w", err)
	}
	return nil
}
// ListForRun returns all measurements for a run, ordered by timestamp
// and then ID. Callers filter by kind in memory; the row count is small
// per run (≈thousands). A NULL unit comes back as an empty string and a
// NULL stage_id as a nil StageID.
func (m *Measurements) ListForRun(ctx context.Context, runID int64) ([]model.Measurement, error) {
	const selectMeasurements = `
SELECT id, run_id, stage_id, ts, kind, key, value, COALESCE(unit,'')
FROM measurements WHERE run_id = ? ORDER BY ts, id
`
	rows, err := m.DB.QueryContext(ctx, selectMeasurements, runID)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var list []model.Measurement
	for rows.Next() {
		var (
			sample model.Measurement
			stage  sql.NullInt64
		)
		scanErr := rows.Scan(&sample.ID, &sample.RunID, &stage, &sample.TS, &sample.Kind, &sample.Key, &sample.Value, &sample.Unit)
		if scanErr != nil {
			return nil, scanErr
		}
		if stage.Valid {
			id := stage.Int64
			sample.StageID = &id
		}
		list = append(list, sample)
	}
	return list, rows.Err()
}
+226
View File
@@ -0,0 +1,226 @@
package store
import (
"context"
"database/sql"
"errors"
"fmt"
"time"
"vetting/internal/model"
)
// Runs is the store for the runs table. DB must be set before any
// method is called.
type Runs struct {
DB *sql.DB
}
// Create inserts a new run for the host and returns its row ID. The run
// starts in the queued state with next_boot_target "linux" and
// started_at set to the current UTC time. Insert failures are wrapped
// with an "insert run" prefix.
func (r *Runs) Create(ctx context.Context, hostID int64, tokenHash string) (int64, error) {
	const insertRun = `
INSERT INTO runs(host_id, state, agent_token_hash, next_boot_target, started_at)
VALUES(?,?,?,?,?)
`
	startedAt := time.Now().UTC()
	result, err := r.DB.ExecContext(ctx, insertRun,
		hostID, string(model.StateQueued), tokenHash, "linux", startedAt)
	if err != nil {
		return 0, fmt.Errorf("insert run: %w", err)
	}
	return result.LastInsertId()
}
// SetState overwrites the run's state column. No other column is
// touched. A run ID that matches no row is not an error: the
// affected-row count is not inspected. Failures are wrapped with the
// run ID and target state for context.
func (r *Runs) SetState(ctx context.Context, runID int64, state model.RunState) error {
	if _, err := r.DB.ExecContext(ctx, `UPDATE runs SET state = ? WHERE id = ?`, string(state), runID); err != nil {
		return fmt.Errorf("set run %d state %q: %w", runID, string(state), err)
	}
	return nil
}
// RotateTokenHash replaces the stored token hash. Called on each iPXE
// fetch so only the most-recently-booted agent can claim the run.
// A run ID that matches no row is not an error. Failures are wrapped
// with the run ID; the hash itself is never put in the message.
func (r *Runs) RotateTokenHash(ctx context.Context, runID int64, hash string) error {
	if _, err := r.DB.ExecContext(ctx, `UPDATE runs SET agent_token_hash = ? WHERE id = ?`, hash, runID); err != nil {
		return fmt.Errorf("rotate token hash for run %d: %w", runID, err)
	}
	return nil
}
// SetHoldIP records the agent's LAN IP so the UI can show the ssh
// command. Called when the agent POSTs /hold. A run ID that matches no
// row is not an error. Failures are wrapped with the run ID.
func (r *Runs) SetHoldIP(ctx context.Context, runID int64, ip string) error {
	if _, err := r.DB.ExecContext(ctx, `UPDATE runs SET hold_ip = ? WHERE id = ?`, ip, runID); err != nil {
		return fmt.Errorf("set hold ip for run %d: %w", runID, err)
	}
	return nil
}
// SetFailedStage records which stage tripped the run; used by the tile
// and by reports. Does not change state. A run ID that matches no row
// is not an error. Failures are wrapped with the run ID and stage name.
func (r *Runs) SetFailedStage(ctx context.Context, runID int64, stage string) error {
	if _, err := r.DB.ExecContext(ctx, `UPDATE runs SET failed_stage = ? WHERE id = ?`, stage, runID); err != nil {
		return fmt.Errorf("set failed stage %q for run %d: %w", stage, runID, err)
	}
	return nil
}
// ClearFailedStage wipes the failed_stage marker by setting it to NULL.
// Called when the operator overrides a stage and the run re-enters the
// pipeline. No other column is touched. A run ID that matches no row is
// not an error. Failures are wrapped with the run ID.
func (r *Runs) ClearFailedStage(ctx context.Context, runID int64) error {
	if _, err := r.DB.ExecContext(ctx, `UPDATE runs SET failed_stage = NULL WHERE id = ?`, runID); err != nil {
		return fmt.Errorf("clear failed stage for run %d: %w", runID, err)
	}
	return nil
}
// SetOverrideFlags persists the operator's override decisions (JSON blob
// like `{"wipe":true}`). Passed back to the agent on the next heartbeat
// so it can resume the held stage with the gate bypassed. The blob is
// stored as given, without validation. A run ID that matches no row is
// not an error. Failures are wrapped with the run ID.
func (r *Runs) SetOverrideFlags(ctx context.Context, runID int64, flagsJSON string) error {
	if _, err := r.DB.ExecContext(ctx, `UPDATE runs SET override_flags_json = ? WHERE id = ?`, flagsJSON, runID); err != nil {
		return fmt.Errorf("set override flags for run %d: %w", runID, err)
	}
	return nil
}
// MarkFailed moves the run into the failed-holding state in a single
// UPDATE: result becomes 'fail', failed_stage and hold_ip are set from
// the arguments, and completed_at is stamped with the current UTC time.
// A run ID that matches no row is not an error. Failures are wrapped
// with the run ID.
func (r *Runs) MarkFailed(ctx context.Context, runID int64, failedStage, holdIP string) error {
	now := time.Now().UTC()
	if _, err := r.DB.ExecContext(ctx, `
UPDATE runs SET state = ?, result = 'fail', failed_stage = ?, hold_ip = ?, completed_at = ?
WHERE id = ?
`, string(model.StateFailedHolding), failedStage, holdIP, now, runID); err != nil {
		return fmt.Errorf("mark run %d failed: %w", runID, err)
	}
	return nil
}
// MarkCompleted moves the run into the completed state in a single
// UPDATE: result becomes 'pass', report_path is set from the argument,
// and completed_at is stamped with the current UTC time. A run ID that
// matches no row is not an error. Failures are wrapped with the run ID.
func (r *Runs) MarkCompleted(ctx context.Context, runID int64, reportPath string) error {
	now := time.Now().UTC()
	if _, err := r.DB.ExecContext(ctx, `
UPDATE runs SET state = ?, result = 'pass', report_path = ?, completed_at = ?
WHERE id = ?
`, string(model.StateCompleted), reportPath, now, runID); err != nil {
		return fmt.Errorf("mark run %d completed: %w", runID, err)
	}
	return nil
}
// Get returns the run with the given ID. It returns ErrNotFound when no
// such row exists; other failures are wrapped with a "get run" prefix.
// NULL text columns come back as empty strings, and CompletedAt is nil
// while completed_at is NULL.
func (r *Runs) Get(ctx context.Context, id int64) (*model.Run, error) {
	const selectRun = `
SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
COALESCE(next_boot_target,''), agent_token_hash, started_at,
completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
COALESCE(override_flags_json,'')
FROM runs WHERE id = ?
`
	var (
		found    model.Run
		finished sql.NullTime
	)
	err := r.DB.QueryRowContext(ctx, selectRun, id).Scan(
		&found.ID, &found.HostID, &found.State, &found.Result, &found.FailedStage,
		&found.NextBootTarget, &found.AgentTokenHash, &found.StartedAt,
		&finished, &found.ReportPath, &found.HoldIP, &found.OverrideFlagsJSON,
	)
	switch {
	case errors.Is(err, sql.ErrNoRows):
		return nil, ErrNotFound
	case err != nil:
		return nil, fmt.Errorf("get run: %w", err)
	}
	if finished.Valid {
		at := finished.Time
		found.CompletedAt = &at
	}
	return &found, nil
}
// LatestForHost returns the host's most recent run, i.e. the one with
// the highest ID, regardless of state. When the host has no runs it
// returns (nil, nil) rather than ErrNotFound. Other failures are
// wrapped with a "latest run" prefix.
func (r *Runs) LatestForHost(ctx context.Context, hostID int64) (*model.Run, error) {
	const selectLatest = `
SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
COALESCE(next_boot_target,''), agent_token_hash, started_at,
completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
COALESCE(override_flags_json,'')
FROM runs WHERE host_id = ?
ORDER BY id DESC LIMIT 1
`
	var (
		latest   model.Run
		finished sql.NullTime
	)
	err := r.DB.QueryRowContext(ctx, selectLatest, hostID).Scan(
		&latest.ID, &latest.HostID, &latest.State, &latest.Result, &latest.FailedStage,
		&latest.NextBootTarget, &latest.AgentTokenHash, &latest.StartedAt,
		&finished, &latest.ReportPath, &latest.HoldIP, &latest.OverrideFlagsJSON,
	)
	switch {
	case errors.Is(err, sql.ErrNoRows):
		return nil, nil
	case err != nil:
		return nil, fmt.Errorf("latest run: %w", err)
	}
	if finished.Valid {
		at := finished.Time
		latest.CompletedAt = &at
	}
	return &latest, nil
}
// Active returns all runs whose state is neither 'Completed' nor
// 'Released', ordered by ID. Every other state, including
// 'FailedHolding', counts as active here. The result is nil when there
// are no such runs.
func (r *Runs) Active(ctx context.Context) ([]model.Run, error) {
	const selectActive = `
SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
COALESCE(next_boot_target,''), agent_token_hash, started_at,
completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
COALESCE(override_flags_json,'')
FROM runs
WHERE state NOT IN ('Completed','Released')
ORDER BY id
`
	rows, err := r.DB.QueryContext(ctx, selectActive)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var list []model.Run
	for rows.Next() {
		var (
			item     model.Run
			finished sql.NullTime
		)
		scanErr := rows.Scan(
			&item.ID, &item.HostID, &item.State, &item.Result, &item.FailedStage,
			&item.NextBootTarget, &item.AgentTokenHash, &item.StartedAt,
			&finished, &item.ReportPath, &item.HoldIP, &item.OverrideFlagsJSON,
		)
		if scanErr != nil {
			return nil, scanErr
		}
		if finished.Valid {
			at := finished.Time
			item.CompletedAt = &at
		}
		list = append(list, item)
	}
	return list, rows.Err()
}
// CompletedOlderThan returns, in ascending order, the IDs of runs in
// the Completed, Released or FailedHolding state whose completed_at is
// strictly older than cutoff. A run with a NULL completed_at is aged by
// its started_at instead. Used by the janitor.
func (r *Runs) CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error) {
	const selectExpired = `
SELECT id FROM runs
WHERE state IN ('Completed','Released','FailedHolding')
AND COALESCE(completed_at, started_at) < ?
ORDER BY id
`
	rows, err := r.DB.QueryContext(ctx, selectExpired, cutoff)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var ids []int64
	for rows.Next() {
		var runID int64
		scanErr := rows.Scan(&runID)
		if scanErr != nil {
			return nil, scanErr
		}
		ids = append(ids, runID)
	}
	return ids, rows.Err()
}
// FindActiveByMAC returns the current active run for the host with the
// given MAC, or (nil, nil) if the MAC is unknown or has no active run.
// "Active" means any state other than 'Completed' or 'Released'; if
// several qualify, the one with the highest ID wins.
//
// The MAC is compared to hosts.mac exactly as given. Hosts.Create
// stores MACs trimmed and lower-cased, so callers must pass the address
// in that same form.
//
// Failures other than "no rows" are wrapped with a
// "find active run by mac" prefix.
func (r *Runs) FindActiveByMAC(ctx context.Context, mac string) (*model.Run, error) {
	row := r.DB.QueryRowContext(ctx, `
SELECT r.id, r.host_id, r.state, COALESCE(r.result,''), COALESCE(r.failed_stage,''),
COALESCE(r.next_boot_target,''), r.agent_token_hash, r.started_at,
r.completed_at, COALESCE(r.report_path,''), COALESCE(r.hold_ip,''),
COALESCE(r.override_flags_json,'')
FROM runs r
JOIN hosts h ON h.id = r.host_id
WHERE h.mac = ? AND r.state NOT IN ('Completed','Released')
ORDER BY r.id DESC LIMIT 1
`, mac)
	var run model.Run
	var completedAt sql.NullTime
	err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
		&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON)
	if errors.Is(err, sql.ErrNoRows) {
		return nil, nil
	}
	if err != nil {
		return nil, fmt.Errorf("find active run by mac: %w", err)
	}
	if completedAt.Valid {
		run.CompletedAt = &completedAt.Time
	}
	return &run, nil
}
+91
View File
@@ -0,0 +1,91 @@
package store
import (
"context"
"database/sql"
"fmt"
"time"
"vetting/internal/model"
)
// Stages is the store for the stages table. DB must be set before any
// method is called.
type Stages struct {
DB *sql.DB
}
// DefaultStageOrder is the canonical sequence for every run. Phase 2 only
// reaches Inventory; later phases add more executors but the list is fixed.
// Seed inserts one row per entry and uses the slice index as the
// stage's ordinal, so the order here is the order stages are listed in.
var DefaultStageOrder = []string{
"Inventory",
"SpecValidate",
"SMART",
"CPUStress",
"Storage",
"Network",
"GPU",
"PSU",
"Reporting",
}
// Seed creates one pending stage row per DefaultStageOrder entry for
// the given run, using the entry's index as its ordinal. All rows are
// inserted in one transaction, so a failure leaves no partial set. An
// insert failure is wrapped with the name of the stage being seeded.
func (s *Stages) Seed(ctx context.Context, runID int64) error {
	tx, err := s.DB.BeginTx(ctx, nil)
	if err != nil {
		return err
	}
	defer func() { _ = tx.Rollback() }()

	const insertStage = `INSERT INTO stages(run_id, name, ordinal, state) VALUES(?,?,?,?)`
	pending := string(model.StagePending)
	for ordinal, stage := range DefaultStageOrder {
		_, insertErr := tx.ExecContext(ctx, insertStage, runID, stage, ordinal, pending)
		if insertErr != nil {
			return fmt.Errorf("seed stage %s: %w", stage, insertErr)
		}
	}
	return tx.Commit()
}
// ListForRun returns the run's stages ordered by ordinal. StartedAt and
// CompletedAt are nil while their columns are NULL, and a NULL
// summary_json comes back as an empty string. The result is nil when
// the run has no stage rows.
func (s *Stages) ListForRun(ctx context.Context, runID int64) ([]model.Stage, error) {
	const selectStages = `
SELECT id, run_id, name, ordinal, state, started_at, completed_at, COALESCE(summary_json,'')
FROM stages WHERE run_id = ? ORDER BY ordinal
`
	rows, err := s.DB.QueryContext(ctx, selectStages, runID)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var list []model.Stage
	for rows.Next() {
		var (
			stage           model.Stage
			began, finished sql.NullTime
		)
		scanErr := rows.Scan(&stage.ID, &stage.RunID, &stage.Name, &stage.Ordinal, &stage.State,
			&began, &finished, &stage.SummaryJSON)
		if scanErr != nil {
			return nil, scanErr
		}
		if began.Valid {
			at := began.Time
			stage.StartedAt = &at
		}
		if finished.Valid {
			at := finished.Time
			stage.CompletedAt = &at
		}
		list = append(list, stage)
	}
	return list, rows.Err()
}
// StartByName marks the named stage of a run as running and stamps
// started_at with the current UTC time, overwriting any earlier value.
// A run/name pair that matches no row is not an error: the affected-row
// count is not inspected. Failures are wrapped with the stage name and
// run ID.
func (s *Stages) StartByName(ctx context.Context, runID int64, name string) error {
	now := time.Now().UTC()
	if _, err := s.DB.ExecContext(ctx, `
UPDATE stages SET state = ?, started_at = ?
WHERE run_id = ? AND name = ?
`, string(model.StageRunning), now, runID, name); err != nil {
		return fmt.Errorf("start stage %s for run %d: %w", name, runID, err)
	}
	return nil
}
// CompleteByName records the final state of the named stage of a run,
// stamps completed_at with the current UTC time and stores summaryJSON
// (an empty string is stored as NULL). The state is written as given;
// it is not validated here. A run/name pair that matches no row is not
// an error. Failures are wrapped with the stage name and run ID.
func (s *Stages) CompleteByName(ctx context.Context, runID int64, name string, state model.StageState, summaryJSON string) error {
	now := time.Now().UTC()
	if _, err := s.DB.ExecContext(ctx, `
UPDATE stages SET state = ?, completed_at = ?, summary_json = ?
WHERE run_id = ? AND name = ?
`, string(state), now, nullIfEmpty(summaryJSON), runID, name); err != nil {
		return fmt.Errorf("complete stage %s for run %d: %w", name, runID, err)
	}
	return nil
}
+229
View File
@@ -0,0 +1,229 @@
package store_test
import (
"context"
"path/filepath"
"testing"
"vetting/internal/db"
"vetting/internal/model"
"vetting/internal/store"
)
// newDB opens a fresh database file in the test's temp directory and
// returns a Runs store on it. The connection is closed when the test
// finishes. Other stores are built by the tests from the returned
// store's DB field.
func newDB(t *testing.T) *store.Runs {
	t.Helper()
	conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
	if err != nil {
		t.Fatalf("open db: %v", err)
	}
	t.Cleanup(func() {
		_ = conn.Close()
	})
	return &store.Runs{DB: conn}
}
// seedRun inserts a host and then a run for it, returning
// (hostID, runID). Every subsequent store test builds on this so run_id
// foreign keys resolve. Any failure aborts the calling test.
func seedRun(t *testing.T, runs *store.Runs) (int64, int64) {
	t.Helper()
	ctx := context.Background()

	fixture := model.Host{
		Name:             "t-host",
		MAC:              "aa:bb:cc:dd:ee:ff",
		WoLBroadcastIP:   "10.0.0.255",
		WoLPort:          9,
		ExpectedSpecYAML: "memory:\n total_gib: 16\n",
	}
	hostStore := &store.Hosts{DB: runs.DB}
	hostID, err := hostStore.Create(ctx, fixture)
	if err != nil {
		t.Fatalf("create host: %v", err)
	}

	runID, err := runs.Create(ctx, hostID, "deadbeef")
	if err != nil {
		t.Fatalf("create run: %v", err)
	}
	return hostID, runID
}
// TestArtifactsRoundtrip creates two artifacts on one run and checks
// that ListForRun returns both, in insertion order, with paths intact.
func TestArtifactsRoundtrip(t *testing.T) {
	runs := newDB(t)
	_, runID := seedRun(t, runs)
	ctx := context.Background()
	artifacts := &store.Artifacts{DB: runs.DB}

	inventory := store.Artifact{
		RunID:     runID,
		Kind:      "inventory",
		Path:      "/var/artifacts/run-1/inventory.json",
		SHA256:    "abc123",
		SizeBytes: 42,
	}
	id, err := artifacts.Create(ctx, inventory)
	if err != nil {
		t.Fatalf("Create: %v", err)
	}
	if id == 0 {
		t.Fatalf("expected non-zero id")
	}

	// Hold key on the same run — ListForRun should return both in
	// insertion order and TileEnricher picks the hold_key row.
	holdKey := store.Artifact{
		RunID: runID, Kind: "hold_key", Path: "/var/artifacts/run-1/hold.key", SHA256: "def456", SizeBytes: 400,
	}
	if _, err := artifacts.Create(ctx, holdKey); err != nil {
		t.Fatalf("Create hold_key: %v", err)
	}

	listed, err := artifacts.ListForRun(ctx, runID)
	if err != nil {
		t.Fatalf("ListForRun: %v", err)
	}
	if len(listed) != 2 {
		t.Fatalf("ListForRun returned %d, want 2", len(listed))
	}
	inOrder := listed[0].Kind == "inventory" && listed[1].Kind == "hold_key"
	if !inOrder {
		t.Fatalf("unexpected order: %+v", listed)
	}
	if listed[1].Path != "/var/artifacts/run-1/hold.key" {
		t.Fatalf("hold_key path lost: %q", listed[1].Path)
	}
}
// TestSpecDiffsReplaceForRun checks that ReplaceForRun stores the rows
// it is given and that a second call replaces, rather than appends to,
// the first set.
func TestSpecDiffsReplaceForRun(t *testing.T) {
	runs := newDB(t)
	_, runID := seedRun(t, runs)
	ctx := context.Background()
	diffs := &store.SpecDiffs{DB: runs.DB}

	// First write: three diffs.
	first := []model.SpecDiff{
		{RunID: runID, Field: "cpu.model", Expected: "Xeon", Actual: "EPYC", Severity: "critical"},
		{RunID: runID, Field: "memory.total_gib", Expected: "16", Actual: "8", Severity: "critical"},
		{RunID: runID, Field: "note", Expected: "", Actual: "dusty", Severity: "info"},
	}
	if err := diffs.ReplaceForRun(ctx, runID, first); err != nil {
		t.Fatalf("ReplaceForRun: %v", err)
	}
	stored, err := diffs.ListForRun(ctx, runID)
	if err != nil {
		t.Fatalf("ListForRun: %v", err)
	}
	if len(stored) != 3 {
		t.Fatalf("got %d rows, want 3", len(stored))
	}

	// Second write replaces, doesn't append — otherwise a re-run would
	// double-count spec diffs and the tile badge would grow without bound.
	second := []model.SpecDiff{
		{RunID: runID, Field: "cpu.model", Expected: "Xeon", Actual: "Xeon Gold", Severity: "info"},
	}
	if err := diffs.ReplaceForRun(ctx, runID, second); err != nil {
		t.Fatalf("second ReplaceForRun: %v", err)
	}
	stored, err = diffs.ListForRun(ctx, runID)
	if err != nil {
		t.Fatalf("ListForRun after replace: %v", err)
	}
	if len(stored) != 1 {
		t.Fatalf("expected 1 row after replace, got %d", len(stored))
	}
	if stored[0].Severity != "info" {
		t.Fatalf("expected severity info, got %q", stored[0].Severity)
	}
}
// TestMeasurementsBatchAndList checks that CreateBatch stores every
// sample, that an empty batch is a silent no-op, and that ListForRun
// returns the stored values.
func TestMeasurementsBatchAndList(t *testing.T) {
	runs := newDB(t)
	_, runID := seedRun(t, runs)
	ctx := context.Background()
	measurements := &store.Measurements{DB: runs.DB}

	batch := []model.Measurement{
		{RunID: runID, Kind: "thermal", Key: "cpu", Value: 52.5, Unit: "C"},
		{RunID: runID, Kind: "iperf", Key: "throughput_mbps", Value: 940.1, Unit: "Mbps"},
		{RunID: runID, Kind: "psu", Key: "in0", Value: 12.04, Unit: "V"},
	}
	if err := measurements.CreateBatch(ctx, batch); err != nil {
		t.Fatalf("CreateBatch: %v", err)
	}

	// Zero-length batch must be a no-op, not an error.
	if err := measurements.CreateBatch(ctx, nil); err != nil {
		t.Fatalf("empty CreateBatch: %v", err)
	}

	stored, err := measurements.ListForRun(ctx, runID)
	if err != nil {
		t.Fatalf("ListForRun: %v", err)
	}
	if len(stored) != 3 {
		t.Fatalf("got %d rows, want 3", len(stored))
	}

	sawIperf := false
	for i := range stored {
		sample := stored[i]
		if sample.Kind != "iperf" || sample.Key != "throughput_mbps" {
			continue
		}
		if sample.Value > 900 {
			sawIperf = true
		}
	}
	if !sawIperf {
		t.Fatalf("iperf row missing or wrong value: %+v", stored)
	}
}
// TestRunsOverrideFlagsAndClearFailedStage sets a failed stage and an
// override-flags payload on a seeded run, reads both back, then clears
// the failed stage and verifies the override flags are left in place.
func TestRunsOverrideFlagsAndClearFailedStage(t *testing.T) {
	const wantFlags = `{"wipe":true}`

	db := newDB(t)
	_, runID := seedRun(t, db)
	ctx := context.Background()

	err := db.SetFailedStage(ctx, runID, "Storage")
	if err != nil {
		t.Fatalf("SetFailedStage: %v", err)
	}
	err = db.SetOverrideFlags(ctx, runID, wantFlags)
	if err != nil {
		t.Fatalf("SetOverrideFlags: %v", err)
	}

	stored, err := db.Get(ctx, runID)
	if err != nil {
		t.Fatalf("Get: %v", err)
	}
	if stored.OverrideFlagsJSON != wantFlags {
		t.Fatalf("OverrideFlagsJSON = %q, want {\"wipe\":true}", stored.OverrideFlagsJSON)
	}
	if stored.FailedStage != "Storage" {
		t.Fatalf("FailedStage = %q, want Storage", stored.FailedStage)
	}

	err = db.ClearFailedStage(ctx, runID)
	if err != nil {
		t.Fatalf("ClearFailedStage: %v", err)
	}
	cleared, err := db.Get(ctx, runID)
	if err != nil {
		t.Fatalf("Get after clear: %v", err)
	}
	if cleared.FailedStage != "" {
		t.Fatalf("FailedStage not cleared: %q", cleared.FailedStage)
	}
	// Clearing the failed stage must not touch override_flags_json: the
	// agent still has to be able to read it on its next heartbeat.
	if cleared.OverrideFlagsJSON != wantFlags {
		t.Fatalf("OverrideFlagsJSON lost after ClearFailedStage: %q", cleared.OverrideFlagsJSON)
	}
}
// TestRunsHoldAndFailedStage stores a hold IP and a failed stage on a
// seeded run and checks that both values come back from Get.
func TestRunsHoldAndFailedStage(t *testing.T) {
	db := newDB(t)
	_, runID := seedRun(t, db)
	ctx := context.Background()

	err := db.SetHoldIP(ctx, runID, "10.0.0.42")
	if err != nil {
		t.Fatalf("SetHoldIP: %v", err)
	}
	err = db.SetFailedStage(ctx, runID, "SpecValidate")
	if err != nil {
		t.Fatalf("SetFailedStage: %v", err)
	}

	stored, err := db.Get(ctx, runID)
	if err != nil {
		t.Fatalf("Get: %v", err)
	}
	if stored.HoldIP != "10.0.0.42" {
		t.Fatalf("HoldIP = %q, want 10.0.0.42", stored.HoldIP)
	}
	if stored.FailedStage != "SpecValidate" {
		t.Fatalf("FailedStage = %q, want SpecValidate", stored.FailedStage)
	}
}
+6
View File
@@ -0,0 +1,6 @@
package web
import "embed"
// Static is the embedded file system holding the web UI's static
// assets. Files are compiled into the binary from the static/
// directory next to this file and keep their "static/" path prefix
// inside the FS.
//
//go:embed static/*
var Static embed.FS
+210
View File
@@ -0,0 +1,210 @@
/* Design tokens shared by every rule below: the dark colour palette
   (backgrounds, border, text, accent and status colours), the common
   corner radius, and the UI and monospace font stacks. */
:root {
  --bg: #0f1115;
  --bg-elev: #171a21;
  --bg-elev-2: #1f232c;
  --border: #2a2f3a;
  --text: #e5e8ef;
  --text-dim: #9aa2b1;
  --accent: #6aa9ff;
  --accent-strong: #3c82f6;
  --success: #35c27b;
  --warn: #e4a94b;
  --danger: #e56466;
  --radius: 8px;
  --font: system-ui, -apple-system, "Segoe UI", Roboto, sans-serif;
  --mono: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
}
/* Base element rules: border-box sizing everywhere, page background
   and text colour taken from the tokens, and accent-coloured links
   that are underlined only on hover. */
* { box-sizing: border-box; }
html, body {
  margin: 0;
  padding: 0;
  background: var(--bg);
  color: var(--text);
  font: 15px/1.45 var(--font);
}
a { color: var(--accent); text-decoration: none; }
a:hover { text-decoration: underline; }
/* Top bar: brand, navigation links (which take the remaining width),
   and the session area holding the heartbeat indicator and the
   log-out form. */
.topbar {
  display: flex;
  align-items: center;
  gap: 24px;
  padding: 12px 24px;
  border-bottom: 1px solid var(--border);
  background: var(--bg-elev);
}
.topbar .brand { font-weight: 700; letter-spacing: .2px; }
.topbar nav { display: flex; gap: 16px; flex: 1; }
.topbar nav a { color: var(--text-dim); }
.topbar nav a:hover { color: var(--text); text-decoration: none; }
.topbar .session { display: flex; align-items: center; gap: 12px; }
.topbar .heartbeat { color: var(--text-dim); font-family: var(--mono); font-size: 12px; }
.topbar .logout-form { margin: 0; }
/* Page content is centred and capped at 1280px. */
main { max-width: 1280px; margin: 0 auto; padding: 24px; }
/* Shared button look for real <button> elements and for links styled
   as buttons. `.button-like` is the class the host tile puts on its
   "View report" link; it is listed here so that link picks up the same
   styling as the other tile actions instead of rendering as a bare
   anchor. */
button, .button, .button-secondary, .button-like {
  appearance: none;
  font: inherit;
  padding: 8px 14px;
  border-radius: var(--radius);
  border: 1px solid var(--border);
  background: var(--bg-elev-2);
  color: var(--text);
  cursor: pointer;
  text-decoration: none;
  display: inline-block;
}
/* Hover highlights the border only; this also overrides the generic
   a:hover underline for link-based buttons via text-decoration above. */
button:hover, .button:hover, .button-like:hover { border-color: var(--accent); }
button:disabled { opacity: .5; cursor: not-allowed; }
/* Destructive actions: outlined in the danger colour, tinted on hover. */
button.danger { border-color: var(--danger); color: var(--danger); background: transparent; }
button.danger:hover { background: rgba(229,100,102,.1); }
.button-secondary { background: transparent; }
/* Inline error banner in the danger colour. */
.error {
  background: rgba(229,100,102,.12);
  border: 1px solid var(--danger);
  color: var(--danger);
  padding: 10px 14px;
  border-radius: var(--radius);
  margin-bottom: 16px;
}
/* Dashboard heading row: title on the left, action on the right. */
.dashboard-header {
  display: flex;
  justify-content: space-between;
  align-items: center;
  margin-bottom: 20px;
}
.dashboard-header h1 { font-size: 20px; margin: 0; }
/* Placeholder shown by the dashboard when no hosts are registered. */
.empty {
  text-align: center;
  padding: 48px 24px;
  border: 1px dashed var(--border);
  border-radius: var(--radius);
  color: var(--text-dim);
}
.empty .button { margin-top: 12px; }
/* Host tiles: a responsive grid of cards, each at least 320px wide. */
.tile-grid {
  display: grid;
  grid-template-columns: repeat(auto-fill, minmax(320px, 1fr));
  gap: 16px;
}
/* A single host card; its children stack vertically. */
.tile {
  background: var(--bg-elev);
  border: 1px solid var(--border);
  border-radius: var(--radius);
  padding: 16px;
  display: flex;
  flex-direction: column;
  gap: 12px;
}
.tile-head { display: flex; justify-content: space-between; align-items: center; }
.tile-name { font-weight: 600; }
.tile-status { font-size: 12px; color: var(--text-dim); text-transform: uppercase; letter-spacing: .5px; }
.tile-idle .tile-status { color: var(--text-dim); }
/* Two-column key/value list (MAC, WoL, failed stage, spec diffs). */
.tile-meta { display: grid; grid-template-columns: 1fr 1fr; gap: 4px 16px; margin: 0; font-size: 13px; }
.tile-meta div { display: flex; justify-content: space-between; align-items: baseline; }
.tile-meta dt { color: var(--text-dim); }
.tile-meta dd { margin: 0; font-family: var(--mono); }
.tile-actions { display: flex; gap: 8px; }
.tile-actions .inline { margin: 0; flex: 0; }
/* Values flagged as bad (e.g. the critical spec-diff count). */
.tile-meta dd.bad { color: var(--danger); }
/* Banner shown on a tile while the host is holding after a failure:
   a danger-tinted box with a title and the SSH command to reach it. */
.tile-hold {
  background: rgba(229,100,102,.08);
  border: 1px solid rgba(229,100,102,.35);
  border-radius: var(--radius);
  padding: 8px 10px;
  display: flex;
  flex-direction: column;
  gap: 4px;
}
.tile-hold .hold-title {
  font-size: 12px;
  color: var(--danger);
  text-transform: uppercase;
  letter-spacing: .5px;
}
/* user-select: all makes a single click select the whole command so
   it can be copied in one go; long key paths may wrap anywhere. */
.tile-hold .hold-ssh {
  font-family: var(--mono);
  font-size: 12px;
  color: var(--text);
  word-break: break-all;
  user-select: all;
}
/* Per-run log pane inside a tile: a scrollable monospace box capped
   at 160px tall. */
.tile-log {
  background: #0b0d12;
  border: 1px solid var(--border);
  border-radius: var(--radius);
  padding: 8px 10px;
  font-family: var(--mono);
  font-size: 12px;
  color: var(--text-dim);
  max-height: 160px;
  overflow-y: auto;
  display: flex;
  flex-direction: column;
  gap: 2px;
}
/* The pane is rendered empty and stays hidden until it has content. */
.tile-log:empty { display: none; }
/* Line classes — presumably set on the fragments the events hub
   appends; confirm against the publisher. */
.tile-log .log-line { white-space: pre-wrap; }
.tile-log .log-warn { color: var(--warn); }
.tile-log .log-error { color: var(--danger); }
/* Border colours for the `tile-<mood>` class the host tile template
   derives from the latest run's state. */
.tile-fail { border-color: rgba(229,100,102,.6); }
.tile-pass { border-color: rgba(53,194,123,.5); }
.tile-active { border-color: var(--accent); }
/* Host form: a narrow single-column form with stacked labels and
   monospace inputs. */
.form-wrap { max-width: 640px; }
.form-wrap h1 { font-size: 20px; }
.host-form { display: flex; flex-direction: column; gap: 14px; }
.host-form label { display: flex; flex-direction: column; gap: 4px; color: var(--text-dim); font-size: 13px; }
.host-form input,
.host-form textarea {
  font: inherit;
  font-family: var(--mono);
  color: var(--text);
  background: var(--bg-elev);
  border: 1px solid var(--border);
  border-radius: var(--radius);
  padding: 8px 10px;
}
.host-form textarea { resize: vertical; min-height: 96px; }
/* Two fields side by side, the first twice as wide as the second. */
.host-form .grid-2 { display: grid; grid-template-columns: 2fr 1fr; gap: 14px; }
.host-form .actions { display: flex; gap: 10px; margin-top: 4px; }
/* Login card: a small centred panel with a full-width primary button. */
.login-card {
  max-width: 360px;
  margin: 12vh auto;
  padding: 28px;
  background: var(--bg-elev);
  border: 1px solid var(--border);
  border-radius: var(--radius);
}
.login-card h1 { margin: 0 0 16px; font-size: 22px; }
.login-card label { display: flex; flex-direction: column; gap: 4px; color: var(--text-dim); font-size: 13px; }
.login-card input {
  font: inherit;
  color: var(--text);
  background: var(--bg-elev-2);
  border: 1px solid var(--border);
  border-radius: var(--radius);
  padding: 10px;
  margin-bottom: 12px;
}
.login-card button { width: 100%; background: var(--accent-strong); border-color: var(--accent-strong); color: #fff; }
.login-card button:hover { background: var(--accent); border-color: var(--accent); }
/* Pages rendered with <body class="bare"> drop the 1280px content cap. */
body.bare main { max-width: none; }
+36
View File
@@ -0,0 +1,36 @@
package templates
import "vetting/internal/model"
// TileData pairs a host with its latest run and the derived fields the
// tile needs to render: spec-diff count (server-side diff result) and
// the on-disk path to the hold-key artifact when the run is holding.
type TileData struct {
	// Host is the registered machine the tile describes; its ID is used
	// for the tile's element id, SSE event name and action URLs.
	Host model.Host
	// Latest is the host's most recent run. A nil value renders the
	// tile as "Idle" with the start action enabled.
	Latest *model.Run
	// SpecDiffCritical is rendered as "N critical" when greater than zero.
	SpecDiffCritical int
	// HoldKeyPath is the key file passed to `ssh -i` in the hold banner.
	// When empty, the banner says the hold key is not yet recorded.
	HoldKeyPath string
}
// Dashboard renders the main page inside Layout: a header with a
// "Register host" link, then either an empty-state prompt when tiles
// is empty or a grid with one HostTile per entry. The grid opens an
// SSE connection to /events so the tiles inside it can be swapped live.
// NOTE(review): Layout's heartbeat span connects to /events as well, so
// this page appears to hold two event streams — confirm that is intended.
templ Dashboard(tiles []TileData) {
	@Layout("Dashboard") {
		<section class="dashboard">
			<div class="dashboard-header">
				<h1>Registered hosts</h1>
				<a class="button" href="/hosts/new">Register host</a>
			</div>
			if len(tiles) == 0 {
				<div class="empty">
					<p>No hosts registered yet.</p>
					<a class="button" href="/hosts/new">Register your first host</a>
				</div>
			} else {
				<div class="tile-grid" hx-ext="sse" sse-connect="/events">
					for _, t := range tiles {
						@HostTile(t)
					}
				</div>
			}
		</section>
	}
}
+95
View File
@@ -0,0 +1,95 @@
// Code generated by templ - DO NOT EDIT.
// templ: version: v0.3.1001
package templates
//lint:file-ignore SA4006 This context is only used if a nested component is present.
import "github.com/a-h/templ"
import templruntime "github.com/a-h/templ/runtime"
import "vetting/internal/model"
// TileData pairs a host with its latest run and the derived fields the
// tile needs to render: spec-diff count (server-side diff result) and
// the on-disk path to the hold-key artifact when the run is holding.
type TileData struct {
Host model.Host
Latest *model.Run
SpecDiffCritical int
HoldKeyPath string
}
func Dashboard(tiles []TileData) templ.Component {
return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil {
return templ_7745c5c3_CtxErr
}
templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
if !templ_7745c5c3_IsBuffer {
defer func() {
templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
if templ_7745c5c3_Err == nil {
templ_7745c5c3_Err = templ_7745c5c3_BufErr
}
}()
}
ctx = templ.InitializeContext(ctx)
templ_7745c5c3_Var1 := templ.GetChildren(ctx)
if templ_7745c5c3_Var1 == nil {
templ_7745c5c3_Var1 = templ.NopComponent
}
ctx = templ.ClearChildren(ctx)
templ_7745c5c3_Var2 := templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
if !templ_7745c5c3_IsBuffer {
defer func() {
templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
if templ_7745c5c3_Err == nil {
templ_7745c5c3_Err = templ_7745c5c3_BufErr
}
}()
}
ctx = templ.InitializeContext(ctx)
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "<section class=\"dashboard\"><div class=\"dashboard-header\"><h1>Registered hosts</h1><a class=\"button\" href=\"/hosts/new\">Register host</a></div>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
if len(tiles) == 0 {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 2, "<div class=\"empty\"><p>No hosts registered yet.</p><a class=\"button\" href=\"/hosts/new\">Register your first host</a></div>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
} else {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "<div class=\"tile-grid\" hx-ext=\"sse\" sse-connect=\"/events\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
for _, t := range tiles {
templ_7745c5c3_Err = HostTile(t).Render(ctx, templ_7745c5c3_Buffer)
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "</div>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, "</section>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
return nil
})
templ_7745c5c3_Err = Layout("Dashboard").Render(templ.WithChildren(ctx, templ_7745c5c3_Var2), templ_7745c5c3_Buffer)
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
return nil
})
}
var _ = templruntime.GeneratedTemplate
+144
View File
@@ -0,0 +1,144 @@
package templates
import (
"bytes"
"context"
"fmt"
"vetting/internal/model"
)
// HostTile renders a single dashboard card. It's the SSE-swap target
// for per-host tile refreshes (`tile-N`) and contains a per-run log
// pane (`log-M`) whose live tail is appended by the events hub.
//
// N is the host ID and M is the ID of the latest run. The card is
// swapped with outerHTML, and the log pane is always rendered empty, so
// a tile refresh replaces the pane along with the rest of the card.
// NOTE(review): log lines appended before a refresh look like they are
// dropped from the page at that point — confirm this is acceptable.
//
// The action row is gated by canStart, canOverrideWipe and hasReport;
// the Delete form is always rendered.
templ HostTile(t TileData) {
	<article
		id={ fmt.Sprintf("host-%d", t.Host.ID) }
		class={ "tile", "tile-" + tileMood(t.Latest) }
		sse-swap={ fmt.Sprintf("tile-%d", t.Host.ID) }
		hx-swap="outerHTML"
	>
		<header class="tile-head">
			<div class="tile-name">{ t.Host.Name }</div>
			<div class="tile-status">{ tileStatus(t.Latest) }</div>
		</header>
		<dl class="tile-meta">
			<div>
				<dt>MAC</dt>
				<dd>{ t.Host.MAC }</dd>
			</div>
			<div>
				<dt>WoL</dt>
				<dd>{ fmt.Sprintf("%s:%d", t.Host.WoLBroadcastIP, t.Host.WoLPort) }</dd>
			</div>
			if t.Latest != nil && t.Latest.FailedStage != "" {
				<div>
					<dt>Failed at</dt>
					<dd>{ t.Latest.FailedStage }</dd>
				</div>
			}
			if t.SpecDiffCritical > 0 {
				<div>
					<dt>Spec diffs</dt>
					<dd class="bad">{ fmt.Sprintf("%d critical", t.SpecDiffCritical) }</dd>
				</div>
			}
		</dl>
		if t.Latest != nil && t.Latest.State == model.StateFailedHolding && t.Latest.HoldIP != "" {
			<div class="tile-hold">
				<div class="hold-title">Host is holding — SSH available</div>
				<code class="hold-ssh">{ sshInvocation(t.HoldKeyPath, t.Latest.HoldIP) }</code>
			</div>
		}
		if t.Latest != nil {
			<div
				class="tile-log"
				id={ fmt.Sprintf("log-%d", t.Latest.ID) }
				sse-swap={ fmt.Sprintf("log-%d", t.Latest.ID) }
				hx-swap="beforeend"
			></div>
		}
		<div class="tile-actions">
			if canStart(t.Latest) {
				<form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/start", t.Host.ID)) } class="inline">
					<button type="submit">Start vetting</button>
				</form>
			} else {
				<button type="button" disabled>Run in flight</button>
			}
			if canOverrideWipe(t.Latest) {
				<form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/override-wipe", t.Host.ID)) } class="inline">
					<button type="submit" class="danger">Override wipe-probe</button>
				</form>
			}
			if hasReport(t.Latest) {
				<a class="button-like" href={ templ.SafeURL(fmt.Sprintf("/reports/%d", t.Latest.ID)) } target="_blank" rel="noopener">View report</a>
			}
			<form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/delete", t.Host.ID)) } class="inline">
				<button type="submit" class="danger">Delete</button>
			</form>
		</div>
	</article>
}
// canOverrideWipe reports whether the tile should offer the
// "Override wipe-probe" action: only for a run that is holding after
// failing at the "Storage" stage. A nil run never qualifies.
func canOverrideWipe(r *model.Run) bool {
	return r != nil &&
		r.State == model.StateFailedHolding &&
		r.FailedStage == "Storage"
}
// hasReport reports whether the tile should link to the run's HTML
// report. It does not look for the artifact itself: a Completed run is
// taken to always have one, and Completed is the only state in which
// the tile wants to show the link.
func hasReport(r *model.Run) bool {
	if r == nil {
		return false
	}
	return r.State == model.StateCompleted
}
// canStart reports whether a new vetting run may be started from the
// tile: when the host has no run yet, or when its latest run is
// Completed, Released or FailedHolding. Any other state makes the tile
// show the disabled "Run in flight" button instead.
// NOTE(review): StateFailed is not in this set even though tileMood
// treats it as a failure — confirm a plain-failed run is meant to stay
// non-restartable from the tile.
func canStart(r *model.Run) bool {
	if r == nil {
		return true
	}
	state := r.State
	return state == model.StateCompleted ||
		state == model.StateReleased ||
		state == model.StateFailedHolding
}
// tileStatus returns the status label for the tile header: the latest
// run's state as a string, or "Idle" when the host has no run.
func tileStatus(r *model.Run) string {
	if r != nil {
		return string(r.State)
	}
	return "Idle"
}
// tileMood maps the latest run to the suffix of the tile's
// `tile-<mood>` CSS class: "idle" for no run or a released run, "pass"
// for a completed run, "fail" for Failed and FailedHolding, and
// "active" for every other state.
func tileMood(r *model.Run) string {
	if r == nil {
		return "idle"
	}
	switch r.State {
	case model.StateReleased:
		return "idle"
	case model.StateCompleted:
		return "pass"
	case model.StateFailed, model.StateFailedHolding:
		return "fail"
	default:
		return "active"
	}
}
// sshInvocation builds the SSH command shown in the hold banner for
// logging in as root at ip. With a key path it returns
// "ssh -i <keyPath> root@<ip>"; with an empty key path it returns the
// plain command followed by a note that the hold key is not yet
// recorded. The key path is inserted verbatim, without shell quoting.
func sshInvocation(keyPath, ip string) string {
	if keyPath != "" {
		return fmt.Sprintf("ssh -i %s root@%s", keyPath, ip)
	}
	return fmt.Sprintf("ssh root@%s (hold key not yet recorded)", ip)
}
// RenderTileString renders one HostTile fragment to a string using a
// background context, so the orchestrator can publish it over SSE
// without threading a context through every event publisher.
//
// Rendering is best-effort: a render error is discarded and whatever
// was written to the buffer up to that point is returned.
func RenderTileString(t TileData) string {
	out := new(bytes.Buffer)
	// The error is dropped on purpose: the signature has no way to
	// report it, and callers publish whatever markup was produced.
	_ = HostTile(t).Render(context.Background(), out)
	return out.String()
}
+385
View File
@@ -0,0 +1,385 @@
// Code generated by templ - DO NOT EDIT.
// templ: version: v0.3.1001
package templates
//lint:file-ignore SA4006 This context is only used if a nested component is present.
import "github.com/a-h/templ"
import templruntime "github.com/a-h/templ/runtime"
import (
"bytes"
"context"
"fmt"
"vetting/internal/model"
)
// HostTile renders a single dashboard card. It's the SSE-swap target
// for per-host tile refreshes (`tile-N`) and contains a per-run log
// pane (`log-M`) whose live tail is appended by the events hub.
func HostTile(t TileData) templ.Component {
return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil {
return templ_7745c5c3_CtxErr
}
templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
if !templ_7745c5c3_IsBuffer {
defer func() {
templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
if templ_7745c5c3_Err == nil {
templ_7745c5c3_Err = templ_7745c5c3_BufErr
}
}()
}
ctx = templ.InitializeContext(ctx)
templ_7745c5c3_Var1 := templ.GetChildren(ctx)
if templ_7745c5c3_Var1 == nil {
templ_7745c5c3_Var1 = templ.NopComponent
}
ctx = templ.ClearChildren(ctx)
var templ_7745c5c3_Var2 = []any{"tile", "tile-" + tileMood(t.Latest)}
templ_7745c5c3_Err = templ.RenderCSSItems(ctx, templ_7745c5c3_Buffer, templ_7745c5c3_Var2...)
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "<article id=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var3 string
templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("host-%d", t.Host.ID))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 15, Col: 40}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 2, "\" class=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var4 string
templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String())
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 1, Col: 0}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "\" sse-swap=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var5 string
templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("tile-%d", t.Host.ID))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 17, Col: 46}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "\" hx-swap=\"outerHTML\"><header class=\"tile-head\"><div class=\"tile-name\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var6 string
templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(t.Host.Name)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 21, Col: 39}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, "</div><div class=\"tile-status\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var7 string
templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(tileStatus(t.Latest))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 22, Col: 50}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 6, "</div></header><dl class=\"tile-meta\"><div><dt>MAC</dt><dd>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var8 string
templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(t.Host.MAC)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 27, Col: 20}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 7, "</dd></div><div><dt>WoL</dt><dd>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var9 string
templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%s:%d", t.Host.WoLBroadcastIP, t.Host.WoLPort))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 31, Col: 69}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 8, "</dd></div>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
if t.Latest != nil && t.Latest.FailedStage != "" {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 9, "<div><dt>Failed at</dt><dd>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var10 string
templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(t.Latest.FailedStage)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 36, Col: 31}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 10, "</dd></div>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
if t.SpecDiffCritical > 0 {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 11, "<div><dt>Spec diffs</dt><dd class=\"bad\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var11 string
templ_7745c5c3_Var11, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d critical", t.SpecDiffCritical))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 42, Col: 69}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var11))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 12, "</dd></div>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 13, "</dl>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
if t.Latest != nil && t.Latest.State == model.StateFailedHolding && t.Latest.HoldIP != "" {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 14, "<div class=\"tile-hold\"><div class=\"hold-title\">Host is holding — SSH available</div><code class=\"hold-ssh\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var12 string
templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(sshInvocation(t.HoldKeyPath, t.Latest.HoldIP))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 49, Col: 74}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 15, "</code></div>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
if t.Latest != nil {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 16, "<div class=\"tile-log\" id=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var13 string
templ_7745c5c3_Var13, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("log-%d", t.Latest.ID))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 55, Col: 43}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var13))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 17, "\" sse-swap=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var14 string
templ_7745c5c3_Var14, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("log-%d", t.Latest.ID))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 56, Col: 49}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var14))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 18, "\" hx-swap=\"beforeend\"></div>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 19, "<div class=\"tile-actions\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
if canStart(t.Latest) {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 20, "<form method=\"post\" action=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var15 templ.SafeURL
templ_7745c5c3_Var15, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/start", t.Host.ID)))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 62, Col: 89}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var15))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 21, "\" class=\"inline\"><button type=\"submit\">Start vetting</button></form>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
} else {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 22, "<button type=\"button\" disabled>Run in flight</button> ")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
if canOverrideWipe(t.Latest) {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 23, "<form method=\"post\" action=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var16 templ.SafeURL
templ_7745c5c3_Var16, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/override-wipe", t.Host.ID)))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 69, Col: 97}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var16))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 24, "\" class=\"inline\"><button type=\"submit\" class=\"danger\">Override wipe-probe</button></form>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
if hasReport(t.Latest) {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 25, "<a class=\"button-like\" href=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var17 templ.SafeURL
templ_7745c5c3_Var17, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/reports/%d", t.Latest.ID)))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 74, Col: 88}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var17))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 26, "\" target=\"_blank\" rel=\"noopener\">View report</a>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 27, "<form method=\"post\" action=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var18 templ.SafeURL
templ_7745c5c3_Var18, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/delete", t.Host.ID)))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 76, Col: 89}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var18))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 28, "\" class=\"inline\"><button type=\"submit\" class=\"danger\">Delete</button></form></div></article>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
return nil
})
}
func canOverrideWipe(r *model.Run) bool {
if r == nil {
return false
}
return r.State == model.StateFailedHolding && r.FailedStage == "Storage"
}
// hasReport is true once the reporting stage has produced an HTML
// artifact. We cheat slightly: Completed runs always have one, and
// that's the only state in which the tile wants to surface a link.
func hasReport(r *model.Run) bool {
return r != nil && r.State == model.StateCompleted
}
func canStart(r *model.Run) bool {
if r == nil {
return true
}
switch r.State {
case model.StateCompleted, model.StateReleased, model.StateFailedHolding:
return true
}
return false
}
func tileStatus(r *model.Run) string {
if r == nil {
return "Idle"
}
return string(r.State)
}
func tileMood(r *model.Run) string {
if r == nil {
return "idle"
}
switch r.State {
case model.StateCompleted:
return "pass"
case model.StateFailed, model.StateFailedHolding:
return "fail"
case model.StateReleased:
return "idle"
}
return "active"
}
func sshInvocation(keyPath, ip string) string {
if keyPath == "" {
return "ssh root@" + ip + " (hold key not yet recorded)"
}
return fmt.Sprintf("ssh -i %s root@%s", keyPath, ip)
}
// RenderTileString renders a single tile fragment so the orchestrator
// can publish it over SSE without threading a context through every
// event publisher.
func RenderTileString(t TileData) string {
var buf bytes.Buffer
_ = HostTile(t).Render(context.Background(), &buf)
return buf.String()
}
var _ = templruntime.GeneratedTemplate
+50
View File
@@ -0,0 +1,50 @@
package templates
// Layout is the full page shell: document head (title suffixed with
// " — Vetting", app stylesheet, htmx and its SSE extension loaded with
// integrity hashes), a top bar with navigation, an SSE-driven heartbeat
// indicator and the logout form, and a <main> element that wraps the
// caller-supplied children.
templ Layout(title string) {
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<title>{ title } — Vetting</title>
<link rel="stylesheet" href="/static/app.css"/>
<script src="https://unpkg.com/htmx.org@2.0.2" integrity="sha384-Y7hw+L/jvKeWIRRkqWYfPcvVxHzVzn5REgzbawhxAuQGwX1XWe70vji+VSeHOThJ" crossorigin="anonymous"></script>
<script src="https://unpkg.com/htmx-ext-sse@2.2.2" integrity="sha384-Y4gc0CK6Kg4hmulDc1rNM+vbMvjbW/5rRCA6pC5gj5dLV1/4+OZGkQpJtHQvQTCr" crossorigin="anonymous"></script>
</head>
<body hx-boost="true">
<header class="topbar">
<div class="brand">Vetting</div>
<nav>
<a href="/">Dashboard</a>
<a href="/hosts/new">Register host</a>
</nav>
<div class="session">
<span class="heartbeat" hx-ext="sse" sse-connect="/events" sse-swap="heartbeat">·</span>
<form method="post" action="/logout" class="logout-form">
<button type="submit">Log out</button>
</form>
</div>
</header>
<main>
{ children... }
</main>
</body>
</html>
}
// BareLayout is the minimal page shell: same title suffix and
// stylesheet as Layout, but no scripts and no top bar — just the
// children inside <main> on a body with class "bare".
templ BareLayout(title string) {
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<title>{ title } — Vetting</title>
<link rel="stylesheet" href="/static/app.css"/>
</head>
<body class="bare">
<main>
{ children... }
</main>
</body>
</html>
}
+111
View File
@@ -0,0 +1,111 @@
// Code generated by templ - DO NOT EDIT.
// templ: version: v0.3.1001
package templates
//lint:file-ignore SA4006 This context is only used if a nested component is present.
import "github.com/a-h/templ"
import templruntime "github.com/a-h/templ/runtime"
// Layout returns the component for the full page shell. It writes the
// document head (with the HTML-escaped title), the top bar, then
// renders the children taken from the context inside <main>.
//
// This function is generated by templ from
// internal/web/templates/layout.templ; change that file and regenerate
// instead of editing the code below.
func Layout(title string) templ.Component {
return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil {
return templ_7745c5c3_CtxErr
}
templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
if !templ_7745c5c3_IsBuffer {
defer func() {
templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
if templ_7745c5c3_Err == nil {
templ_7745c5c3_Err = templ_7745c5c3_BufErr
}
}()
}
ctx = templ.InitializeContext(ctx)
templ_7745c5c3_Var1 := templ.GetChildren(ctx)
if templ_7745c5c3_Var1 == nil {
templ_7745c5c3_Var1 = templ.NopComponent
}
ctx = templ.ClearChildren(ctx)
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "<!doctype html><html lang=\"en\"><head><meta charset=\"utf-8\"><meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"><title>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var2 string
templ_7745c5c3_Var2, templ_7745c5c3_Err = templ.JoinStringErrs(title)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/layout.templ`, Line: 9, Col: 17}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var2))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 2, " — Vetting</title><link rel=\"stylesheet\" href=\"/static/app.css\"><script src=\"https://unpkg.com/htmx.org@2.0.2\" integrity=\"sha384-Y7hw+L/jvKeWIRRkqWYfPcvVxHzVzn5REgzbawhxAuQGwX1XWe70vji+VSeHOThJ\" crossorigin=\"anonymous\"></script><script src=\"https://unpkg.com/htmx-ext-sse@2.2.2\" integrity=\"sha384-Y4gc0CK6Kg4hmulDc1rNM+vbMvjbW/5rRCA6pC5gj5dLV1/4+OZGkQpJtHQvQTCr\" crossorigin=\"anonymous\"></script></head><body hx-boost=\"true\"><header class=\"topbar\"><div class=\"brand\">Vetting</div><nav><a href=\"/\">Dashboard</a> <a href=\"/hosts/new\">Register host</a></nav><div class=\"session\"><span class=\"heartbeat\" hx-ext=\"sse\" sse-connect=\"/events\" sse-swap=\"heartbeat\">·</span><form method=\"post\" action=\"/logout\" class=\"logout-form\"><button type=\"submit\">Log out</button></form></div></header><main>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templ_7745c5c3_Var1.Render(ctx, templ_7745c5c3_Buffer)
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "</main></body></html>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
return nil
})
}
// BareLayout returns the component for the minimal page shell: head
// with the HTML-escaped title and stylesheet, then the children taken
// from the context rendered inside <main> on a "bare" body.
//
// This function is generated by templ from
// internal/web/templates/layout.templ; change that file and regenerate
// instead of editing the code below.
func BareLayout(title string) templ.Component {
return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil {
return templ_7745c5c3_CtxErr
}
templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
if !templ_7745c5c3_IsBuffer {
defer func() {
templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
if templ_7745c5c3_Err == nil {
templ_7745c5c3_Err = templ_7745c5c3_BufErr
}
}()
}
ctx = templ.InitializeContext(ctx)
templ_7745c5c3_Var3 := templ.GetChildren(ctx)
if templ_7745c5c3_Var3 == nil {
templ_7745c5c3_Var3 = templ.NopComponent
}
ctx = templ.ClearChildren(ctx)
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "<!doctype html><html lang=\"en\"><head><meta charset=\"utf-8\"><meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"><title>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var4 string
templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(title)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/layout.templ`, Line: 41, Col: 17}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, " — Vetting</title><link rel=\"stylesheet\" href=\"/static/app.css\"></head><body class=\"bare\"><main>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templ_7745c5c3_Var3.Render(ctx, templ_7745c5c3_Buffer)
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 6, "</main></body></html>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
return nil
})
}
var _ = templruntime.GeneratedTemplate
+20
View File
@@ -0,0 +1,20 @@
package templates
// Login renders the sign-in card inside BareLayout. errMsg, when
// non-empty, is shown in an error box above the form; next is carried
// through as a hidden field and posted back to /login together with
// the password.
templ Login(errMsg, next string) {
@BareLayout("Sign in") {
<div class="login-card">
<h1>Vetting</h1>
if errMsg != "" {
<div class="error">{ errMsg }</div>
}
<form method="post" action="/login">
<input type="hidden" name="next" value={ next }/>
<label>
Password
<input type="password" name="password" autofocus required/>
</label>
<button type="submit">Sign in</button>
</form>
</div>
}
}
+94
View File
@@ -0,0 +1,94 @@
// Code generated by templ - DO NOT EDIT.
// templ: version: v0.3.1001
package templates
//lint:file-ignore SA4006 This context is only used if a nested component is present.
import "github.com/a-h/templ"
import templruntime "github.com/a-h/templ/runtime"
// Login returns the sign-in page component. The inner component writes
// the login card (optional escaped error message, hidden "next" field,
// password input) and is passed to BareLayout("Sign in") as children.
//
// This function is generated by templ from
// internal/web/templates/login.templ; change that file and regenerate
// instead of editing the code below.
func Login(errMsg, next string) templ.Component {
return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil {
return templ_7745c5c3_CtxErr
}
templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
if !templ_7745c5c3_IsBuffer {
defer func() {
templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
if templ_7745c5c3_Err == nil {
templ_7745c5c3_Err = templ_7745c5c3_BufErr
}
}()
}
ctx = templ.InitializeContext(ctx)
templ_7745c5c3_Var1 := templ.GetChildren(ctx)
if templ_7745c5c3_Var1 == nil {
templ_7745c5c3_Var1 = templ.NopComponent
}
ctx = templ.ClearChildren(ctx)
templ_7745c5c3_Var2 := templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
if !templ_7745c5c3_IsBuffer {
defer func() {
templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
if templ_7745c5c3_Err == nil {
templ_7745c5c3_Err = templ_7745c5c3_BufErr
}
}()
}
ctx = templ.InitializeContext(ctx)
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "<div class=\"login-card\"><h1>Vetting</h1>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
if errMsg != "" {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 2, "<div class=\"error\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var3 string
templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(errMsg)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/login.templ`, Line: 8, Col: 31}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "</div>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "<form method=\"post\" action=\"/login\"><input type=\"hidden\" name=\"next\" value=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var4 string
templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(next)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/login.templ`, Line: 11, Col: 49}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, "\"> <label>Password <input type=\"password\" name=\"password\" autofocus required></label> <button type=\"submit\">Sign in</button></form></div>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
return nil
})
templ_7745c5c3_Err = BareLayout("Sign in").Render(templ.WithChildren(ctx, templ_7745c5c3_Var2), templ_7745c5c3_Buffer)
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
return nil
})
}
var _ = templruntime.GeneratedTemplate
+61
View File
@@ -0,0 +1,61 @@
package templates
// RegistrationForm carries the values used to (re)populate the
// host-registration form. Every field is a string because it is echoed
// straight back into the corresponding input or textarea.
type RegistrationForm struct {
// Name fills the "name" input.
Name string
// MAC fills the "mac" input.
MAC string
// WoLBroadcastIP fills the "wol_broadcast_ip" input.
WoLBroadcastIP string
// WoLPort fills the "wol_port" input; empty is rendered as "9".
WoLPort string
// ExpectedSpecYAML fills the "expected_spec_yaml" textarea.
ExpectedSpecYAML string
// Notes fills the "notes" textarea.
Notes string
// Error, when non-empty, is shown in an error box above the form.
Error string
}
// Registration renders the "Register host" page inside Layout: an
// optional error box followed by a form that posts to /hosts, with each
// input pre-filled from the given RegistrationForm. An empty WoL port
// is replaced by the default from defaultPort.
templ Registration(form RegistrationForm) {
@Layout("Register host") {
<section class="form-wrap">
<h1>Register host</h1>
if form.Error != "" {
<div class="error">{ form.Error }</div>
}
<form method="post" action="/hosts" class="host-form">
<label>
Name
<input type="text" name="name" value={ form.Name } required pattern="[A-Za-z0-9_\-\.]+" placeholder="pve-node-03"/>
</label>
<label>
MAC address
<input type="text" name="mac" value={ form.MAC } required placeholder="aa:bb:cc:dd:ee:ff"/>
</label>
<div class="grid-2">
<label>
WoL broadcast IP
<input type="text" name="wol_broadcast_ip" value={ form.WoLBroadcastIP } required placeholder="10.0.0.255"/>
</label>
<label>
WoL port
<input type="number" name="wol_port" value={ defaultPort(form.WoLPort) } min="1" max="65535"/>
</label>
</div>
<label>
Expected hardware spec (YAML)
<textarea name="expected_spec_yaml" rows="12" required placeholder="cpu:&#10; model_match: ...">{ form.ExpectedSpecYAML }</textarea>
</label>
<label>
Notes
<textarea name="notes" rows="3">{ form.Notes }</textarea>
</label>
<div class="actions">
<button type="submit">Register</button>
<a class="button-secondary" href="/">Cancel</a>
</div>
</form>
</section>
}
}
// defaultPort returns v unchanged unless it is empty, in which case it
// returns "9" — the value pre-filled into the WoL port field.
func defaultPort(v string) string {
	if v != "" {
		return v
	}
	return "9"
}
@@ -0,0 +1,176 @@
// Code generated by templ - DO NOT EDIT.
// templ: version: v0.3.1001
package templates
//lint:file-ignore SA4006 This context is only used if a nested component is present.
import "github.com/a-h/templ"
import templruntime "github.com/a-h/templ/runtime"
// RegistrationForm carries the values used to (re)populate the
// host-registration form rendered by Registration. Every field is a
// string because it is echoed straight back into the matching input.
type RegistrationForm struct {
// Name fills the "name" input.
Name string
// MAC fills the "mac" input.
MAC string
// WoLBroadcastIP fills the "wol_broadcast_ip" input.
WoLBroadcastIP string
// WoLPort fills the "wol_port" input; empty is rendered as "9".
WoLPort string
// ExpectedSpecYAML fills the "expected_spec_yaml" textarea.
ExpectedSpecYAML string
// Notes fills the "notes" textarea.
Notes string
// Error, when non-empty, is shown in an error box above the form.
Error string
}
// Registration returns the "Register host" page component. The inner
// component writes the form (optional escaped error box, then each
// input pre-filled with the escaped value from form) and is passed to
// Layout("Register host") as children.
//
// This function is generated by templ from
// internal/web/templates/registration.templ; change that file and
// regenerate instead of editing the code below.
func Registration(form RegistrationForm) templ.Component {
return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil {
return templ_7745c5c3_CtxErr
}
templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
if !templ_7745c5c3_IsBuffer {
defer func() {
templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
if templ_7745c5c3_Err == nil {
templ_7745c5c3_Err = templ_7745c5c3_BufErr
}
}()
}
ctx = templ.InitializeContext(ctx)
templ_7745c5c3_Var1 := templ.GetChildren(ctx)
if templ_7745c5c3_Var1 == nil {
templ_7745c5c3_Var1 = templ.NopComponent
}
ctx = templ.ClearChildren(ctx)
templ_7745c5c3_Var2 := templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) {
templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context
templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W)
if !templ_7745c5c3_IsBuffer {
defer func() {
templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer)
if templ_7745c5c3_Err == nil {
templ_7745c5c3_Err = templ_7745c5c3_BufErr
}
}()
}
ctx = templ.InitializeContext(ctx)
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "<section class=\"form-wrap\"><h1>Register host</h1>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
if form.Error != "" {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 2, "<div class=\"error\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var3 string
templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(form.Error)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 18, Col: 35}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "</div>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "<form method=\"post\" action=\"/hosts\" class=\"host-form\"><label>Name <input type=\"text\" name=\"name\" value=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var4 string
templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(form.Name)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 23, Col: 53}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, "\" required pattern=\"[A-Za-z0-9_\\-\\.]+\" placeholder=\"pve-node-03\"></label> <label>MAC address <input type=\"text\" name=\"mac\" value=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var5 string
templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(form.MAC)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 27, Col: 51}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 6, "\" required placeholder=\"aa:bb:cc:dd:ee:ff\"></label><div class=\"grid-2\"><label>WoL broadcast IP <input type=\"text\" name=\"wol_broadcast_ip\" value=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var6 string
templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(form.WoLBroadcastIP)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 32, Col: 76}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 7, "\" required placeholder=\"10.0.0.255\"></label> <label>WoL port <input type=\"number\" name=\"wol_port\" value=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var7 string
templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(defaultPort(form.WoLPort))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 36, Col: 76}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 8, "\" min=\"1\" max=\"65535\"></label></div><label>Expected hardware spec (YAML) <textarea name=\"expected_spec_yaml\" rows=\"12\" required placeholder=\"cpu:&#10; model_match: ...\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var8 string
templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(form.ExpectedSpecYAML)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 41, Col: 125}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 9, "</textarea></label> <label>Notes <textarea name=\"notes\" rows=\"3\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var9 string
templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(form.Notes)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 45, Col: 49}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 10, "</textarea></label><div class=\"actions\"><button type=\"submit\">Register</button> <a class=\"button-secondary\" href=\"/\">Cancel</a></div></form></section>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
return nil
})
templ_7745c5c3_Err = Layout("Register host").Render(templ.WithChildren(ctx, templ_7745c5c3_Var2), templ_7745c5c3_Buffer)
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
return nil
})
}
// defaultPort returns v unchanged unless it is empty, in which case it
// returns "9". Registration uses it to pre-fill the WoL port input.
func defaultPort(v string) string {
	if v != "" {
		return v
	}
	return "9"
}
var _ = templruntime.GeneratedTemplate
+32
View File
@@ -0,0 +1,32 @@
# live-image/Makefile — builds the Debian live image that PXE-booted
# hosts land in. Requires a Linux host (or WSL) with mkosi installed.
# On native Windows this Makefile short-circuits with a clear message.

# Detect the host OS without calling uname on native Windows, where it
# may not exist.
ifeq ($(OS),Windows_NT)
UNAME_S := Windows
else
UNAME_S := $(shell uname -s)
endif

REPO_ROOT := $(abspath ..)
AGENT_BIN := $(REPO_ROOT)/bin/vetting-agent.linux-amd64

.PHONY: all check-linux agent clean FORCE

# Build the agent, then the image. check-linux is listed first so an
# unsupported host fails before any work is done.
all: check-linux agent
	mkosi --force build

agent: $(AGENT_BIN)

# The binary depends on FORCE so the recipe runs on every invocation.
# Without a prerequisite, make would consider an existing binary up to
# date forever and a stale agent would be copied into the image after
# source changes. `go build` has its own build cache, so re-running it
# when nothing changed is cheap.
$(AGENT_BIN): FORCE
	cd $(REPO_ROOT) && GOOS=linux GOARCH=amd64 go build -o $(AGENT_BIN) ./cmd/vetting-agent

# Empty always-out-of-date target used as a prerequisite above.
FORCE:

# Refuse to build anywhere but Linux, and require mkosi on PATH.
check-linux:
ifneq ($(UNAME_S),Linux)
	@echo "ERROR: live-image must be built on Linux (you're on $(UNAME_S))."
	@echo "Run 'wsl make -C live-image all' from Windows instead."
	@exit 1
endif
	@command -v mkosi >/dev/null 2>&1 || { echo "ERROR: mkosi not installed. Try: apt install mkosi"; exit 1; }

clean:
	rm -rf build mkosi.output mkosi.cache
+36
View File
@@ -0,0 +1,36 @@
# Vetting live image
Debian-based Linux live image that PXE-booted hosts drop into. Runs the
`vetting-agent` binary under systemd and reaches back to the orchestrator
over HTTP+SSE.
## Building
Must be built on Linux (or WSL). On Windows:
```sh
wsl make -C live-image all
```
On Linux:
```sh
make -C live-image all
```
This produces `live-image/build/vmlinuz` and `live-image/build/initrd.img`.
Copy (or symlink) them into the directory configured as `pxe.live_dir` in
`deploy/vetting.yaml`; the orchestrator serves them at `/live/*`.
## iPXE binaries
The dnsmasq supervisor expects `ipxe.efi` and `undionly.kpxe` to live in
`pxe.tftp_root`. Fetch the latest release binaries from
https://boot.ipxe.org and drop them in that directory. The Makefile does
not download them automatically so their SHA256 can be operator-verified.
## WSL prerequisites (Windows dev)
```sh
sudo apt install mkosi debootstrap squashfs-tools dosfstools
```
+38
View File
@@ -0,0 +1,38 @@
# Vetting live image (Phase 2 skeleton).
#
# Produces a Debian-based rootfs packaged as squashfs plus a kernel
# image, ready to be served over HTTP to iPXE. The image is deliberately
# small: only what the agent needs to run Phase 2 (the Hello / Claim /
# Heartbeat loop). Phase 4+ adds smartctl, stress-ng, fio, iperf3, etc.
[Distribution]
Distribution=debian
Release=bookworm
Repositories=main
[Output]
Format=directory
Output=build
[Content]
Bootable=yes
BuildPackages=
Packages=
systemd
systemd-sysv
udev
linux-image-amd64
live-boot
iproute2
iputils-ping
openssh-server
ca-certificates
curl
dmidecode
pciutils
usbutils
# Phase 4 will add: smartmontools stress-ng fio iperf3 lshw lm-sensors
[Host]
# Copy the prebuilt Go agent in from the repo root via postinst.
+15
View File
@@ -0,0 +1,15 @@
#!/bin/sh
# mkosi postinst: install the vetting-agent binary and its systemd unit
# into the image. The binary must already be built for linux-amd64 at
# repo root under bin/vetting-agent.linux-amd64 (the top-level Makefile
# does this via `make agent-linux`).
#
# NOTE(review): the agent path treats $SRCDIR as the repo root while the
# unit path treats it as the live-image directory (the one containing
# mkosi.skeleton). Both cannot be right for the same $SRCDIR — confirm
# which directory mkosi exports and adjust the other path.
set -eu

# Fail with a readable message instead of a bare "parameter not set"
# from `set -u` when the script is run outside mkosi.
: "${BUILDROOT:?BUILDROOT must point at the image root}"

AGENT_BIN="${SRCDIR:-..}/bin/vetting-agent.linux-amd64"
# Give SRCDIR a fallback here as well: the original used a bare $SRCDIR,
# which aborts under `set -u` in exactly the case the line above
# defaults for. With SRCDIR unset the unit is looked up relative to the
# current directory.
UNIT_SRC="${SRCDIR:-.}/mkosi.skeleton/etc/systemd/system/vetting-agent.service"
UNIT_DST="$BUILDROOT/etc/systemd/system/vetting-agent.service"
WANTS_DIR="$BUILDROOT/etc/systemd/system/multi-user.target.wants"

install -D -m 0755 "$AGENT_BIN" "$BUILDROOT/usr/local/sbin/vetting-agent"
install -D -m 0644 "$UNIT_SRC" "$UNIT_DST"

# Enable the unit by hand (equivalent to `systemctl enable`). ln does
# not create parent directories, so make sure the wants dir exists.
mkdir -p "$WANTS_DIR"
ln -sf /etc/systemd/system/vetting-agent.service \
    "$WANTS_DIR/vetting-agent.service"
@@ -0,0 +1,18 @@
[Unit]
Description=Vetting hardware-validation agent
# Wait until networking is minimally up (the agent itself retries
# dial failures, but no point hammering before DHCP finishes).
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
ExecStart=/usr/local/sbin/vetting-agent
Restart=on-failure
RestartSec=5s
# The agent reads /proc/cmdline; it needs no extra env.
StandardOutput=journal+console
StandardError=journal+console
[Install]
WantedBy=multi-user.target
+225
View File
@@ -0,0 +1,225 @@
//go:build e2e
// Package e2e exercises the orchestrator end-to-end against a real QEMU
// VM PXE-booting from the orchestrator-supervised dnsmasq into the
// mkosi-built live image.
//
// This test is gated behind the `e2e` build tag because:
// - it requires root (for bridge + qemu-system-x86_64 network setup),
// - it needs a pre-built live image at live-image/out/{vmlinuz,initrd.img},
// - it only runs on Linux (mkosi + qemu-kvm).
//
// Run with:
//
// sudo go test -tags=e2e -run TestQEMUFullRun ./test/e2e/...
//
// See docs/operations.md for the manual QEMU invocation equivalent.
package e2e
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"testing"
"time"
)
// Tunables — overridable via env for CI, defaults match the manual
// setup documented in docs/operations.md.
//
// NOTE(review): liveKernel and liveInitrd default to relative paths.
// `go test` runs a test binary with the package directory as its
// working directory, so unless the env overrides are set these resolve
// under the test package rather than the repo root — confirm. The
// live-image README also names live-image/build/ as the output
// directory, not live-image/out/ — confirm which is current.
var (
// bridgeName is the host bridge QEMU's netdev attaches to.
bridgeName = envOr("VETTING_E2E_BRIDGE", "br-vetting")
// liveKernel and liveInitrd are only checked for existence as a
// precondition; the VM itself network-boots.
liveKernel = envOr("VETTING_E2E_KERNEL", "live-image/out/vmlinuz")
liveInitrd = envOr("VETTING_E2E_INITRD", "live-image/out/initrd.img")
// testMAC is the MAC assigned to the VM's virtio NIC.
testMAC = envOr("VETTING_E2E_MAC", "52:54:00:12:34:56")
// publicURL is the base URL of the already-running orchestrator.
publicURL = envOr("VETTING_E2E_URL", "http://10.77.0.1:8080")
// Overall budget for the run to reach Completed. Stage timeouts in
// the config should be tuned down for E2E to well under this.
runBudget = 10 * time.Minute
)
// envOr returns the value of environment variable k, or d when the
// variable is unset or set to the empty string.
func envOr(k, d string) string {
	v := os.Getenv(k)
	if v == "" {
		return d
	}
	return v
}
// TestQEMUFullRun boots a QEMU VM against a running orchestrator and
// waits for the Run state to reach Completed.
//
// Preconditions (test skips unless all are true):
//   - Linux host
//   - Running as root (bridge networking + qemu-kvm)
//   - `qemu-system-x86_64` and `qemu-img` on PATH
//   - Live image built (kernel + initrd exist)
//   - An orchestrator is already running at $VETTING_E2E_URL with a
//     host registered for $VETTING_E2E_MAC and a run already queued
//     (start the run via the UI before invoking this test, or via the
//     orchestrator's /hosts/{id}/start endpoint).
//
// The test exercises the real PXE path. It does NOT embed its own
// orchestrator because dnsmasq needs CAP_NET_ADMIN and the test binary
// should stay focused on the "did the run complete?" assertion.
//
// The test fails as soon as QEMU exits without the run having
// completed, rather than polling until runBudget is exhausted.
func TestQEMUFullRun(t *testing.T) {
	if runtime.GOOS != "linux" {
		t.Skip("E2E test requires Linux")
	}
	if os.Geteuid() != 0 {
		t.Skip("E2E test requires root (sudo go test -tags=e2e ...)")
	}
	// qemu-img is needed by makeThrowawayDisk; check it here so a
	// missing tool is a skip like every other unmet precondition.
	for _, bin := range []string{"qemu-system-x86_64", "qemu-img"} {
		if _, err := exec.LookPath(bin); err != nil {
			t.Skipf("%s not on PATH", bin)
		}
	}
	if _, err := os.Stat(liveKernel); err != nil {
		t.Skipf("live kernel missing at %s (run `make live-image`)", liveKernel)
	}
	if _, err := os.Stat(liveInitrd); err != nil {
		t.Skipf("live initrd missing at %s", liveInitrd)
	}
	if err := pingOrchestrator(publicURL); err != nil {
		t.Skipf("orchestrator not reachable at %s: %v", publicURL, err)
	}

	runID, err := findQueuedRunForMAC(publicURL, testMAC)
	if err != nil {
		t.Fatalf("no queued run for %s: %v (register the host and click Start Vetting first)", testMAC, err)
	}
	t.Logf("driving run %d for MAC %s", runID, testMAC)

	disk, cleanup := makeThrowawayDisk(t)
	defer cleanup()

	qemuCtx, cancel := context.WithTimeout(context.Background(), runBudget)
	defer cancel()
	cmd := exec.CommandContext(qemuCtx, "qemu-system-x86_64",
		"-enable-kvm", "-cpu", "host", "-smp", "4", "-m", "4096",
		"-netdev", "bridge,id=n0,br="+bridgeName,
		"-device", "virtio-net-pci,netdev=n0,mac="+testMAC,
		"-drive", "file="+disk+",format=raw,if=virtio",
		"-boot", "n", "-serial", "file:"+filepath.Join(os.TempDir(), fmt.Sprintf("vetting-e2e-%d.serial", runID)),
		"-display", "none",
	)
	cmd.Stdout = testLogger{t}
	cmd.Stderr = testLogger{t}
	if err := cmd.Start(); err != nil {
		t.Fatalf("start qemu: %v", err)
	}

	// Reap QEMU in the background so the poll loop can notice an early
	// exit. waitErr is written before exited is closed, so reading it
	// after receiving from exited is race-free.
	exited := make(chan struct{})
	var waitErr error
	go func() {
		waitErr = cmd.Wait()
		close(exited)
	}()
	defer func() {
		_ = cmd.Process.Kill()
		// Wait also drains the stdout/stderr copiers, so no t.Logf
		// call from testLogger can outlive the test.
		<-exited
	}()

	// Poll the orchestrator until the run reaches a terminal state.
	poll := time.NewTicker(5 * time.Second)
	defer poll.Stop()
	for {
		select {
		case <-qemuCtx.Done():
			t.Fatalf("run %d did not complete within %s", runID, runBudget)
		case <-exited:
			if qemuCtx.Err() != nil {
				// The context expiring is what killed QEMU.
				t.Fatalf("run %d did not complete within %s", runID, runBudget)
			}
			// The VM may have shut down right after finishing, before
			// the next tick; check once more before declaring failure.
			if state, err := getRunState(publicURL, runID); err == nil && state == "Completed" {
				return
			}
			t.Fatalf("qemu exited before run %d completed: %v", runID, waitErr)
		case <-poll.C:
			state, err := getRunState(publicURL, runID)
			if err != nil {
				t.Logf("poll state: %v (will retry)", err)
				continue
			}
			t.Logf("run %d state = %s", runID, state)
			switch state {
			case "Completed":
				return // green path
			case "FailedHolding", "Failed", "Released":
				t.Fatalf("run %d ended in non-success state %q", runID, state)
			}
		}
	}
}
// ---- helpers ------------------------------------------------------------
// pingOrchestrator checks that something is answering HTTP at url by
// requesting url+"/login". Any response below 500 counts as reachable;
// a transport error or a 5xx status is returned as an error.
//
// The request is bounded by a client timeout so an unresponsive host
// turns into a skipped test instead of a hang.
func pingOrchestrator(url string) error {
	req, err := http.NewRequest(http.MethodGet, url+"/login", nil)
	if err != nil {
		return err
	}
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	// Drain the body so the connection can be reused.
	_, _ = io.Copy(io.Discard, resp.Body)
	if resp.StatusCode >= 500 {
		return fmt.Errorf("status %d", resp.StatusCode)
	}
	return nil
}
// findQueuedRunForMAC returns the ID of the run the test should drive.
//
// Despite the name it performs no lookup: the orchestrator has no debug
// endpoint for listing runs by MAC (UI routes stay browser-session
// gated), so the run ID must be supplied via VETTING_E2E_RUN_ID.
// baseURL and mac are accepted for a future API-backed implementation
// and are currently unused. This is a pragmatic hack — the E2E harness
// is developer-facing and the alternative would be scraping HTML.
//
// It returns an error when the variable is unset, is not an integer,
// or is not a positive number.
func findQueuedRunForMAC(baseURL, mac string) (int64, error) {
	s := os.Getenv("VETTING_E2E_RUN_ID")
	if s == "" {
		return 0, fmt.Errorf("set VETTING_E2E_RUN_ID (no debug API for MAC lookup yet)")
	}
	var id int64
	if _, err := fmt.Sscanf(s, "%d", &id); err != nil {
		return 0, fmt.Errorf("parsing VETTING_E2E_RUN_ID %q: %w", s, err)
	}
	if id <= 0 {
		return 0, fmt.Errorf("VETTING_E2E_RUN_ID must be a positive run ID, got %d", id)
	}
	return id, nil
}
// getRunState reads the run's current state via the report route's
// fall-through: /reports/{id} returns 404 until Completed, which gives
// us a cheap terminal-check without a JSON API. For intermediate
// states we need a debug endpoint — deliberately left as a TODO so
// the test doesn't depend on an API surface that isn't stable.
//
// Consequently only two states are ever returned: "Completed" (200)
// and "InProgress" (404). Failure states cannot be observed this way.
//
// If VETTING_E2E_COOKIE is set, its value is sent verbatim as the
// Cookie request header (e.g. "name=value").
// NOTE(review): the session cookie's name is not visible from here —
// confirm the expected format against the orchestrator's auth code.
//
// Redirects are not followed. A session-gated route that bounces to a
// login page would otherwise end in a 200 and be misread as Completed;
// any 3xx is reported as an auth failure instead.
func getRunState(baseURL string, runID int64) (string, error) {
	req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("%s/reports/%d", baseURL, runID), nil)
	if err != nil {
		return "", err
	}
	if cookie := os.Getenv("VETTING_E2E_COOKIE"); cookie != "" {
		req.Header.Set("Cookie", cookie)
	}
	client := &http.Client{
		Timeout: 10 * time.Second,
		CheckRedirect: func(*http.Request, []*http.Request) error {
			return http.ErrUseLastResponse
		},
	}
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	_, _ = io.Copy(io.Discard, resp.Body)

	// Proxy: if /reports/{id} returns 200, the run is Completed.
	switch code := resp.StatusCode; {
	case code == http.StatusOK:
		return "Completed", nil
	case code == http.StatusUnauthorized, code == http.StatusForbidden:
		// Session-gated; caller must export VETTING_E2E_COOKIE to bypass.
		return "", fmt.Errorf("auth required; set VETTING_E2E_COOKIE")
	case code >= 300 && code < 400:
		return "", fmt.Errorf("redirected to %q (status %d); auth likely required, set VETTING_E2E_COOKIE",
			resp.Header.Get("Location"), code)
	case code == http.StatusNotFound:
		return "InProgress", nil
	default:
		return "", fmt.Errorf("unexpected %d", code)
	}
}
// makeThrowawayDisk creates a 4G raw disk image named test-disk.img in
// the test's temp directory using `qemu-img create`, failing the test
// if the command errors. It returns the image path and a function that
// removes the file.
func makeThrowawayDisk(t *testing.T) (string, func()) {
	t.Helper()
	img := filepath.Join(t.TempDir(), "test-disk.img")
	out, err := exec.Command("qemu-img", "create", "-f", "raw", img, "4G").CombinedOutput()
	if err != nil {
		t.Fatalf("qemu-img create: %v\n%s", err, strings.TrimSpace(string(out)))
	}
	remove := func() { _ = os.Remove(img) }
	return img, remove
}
// testLogger is an io.Writer that forwards whatever exec.Cmd writes to
// the test log, so QEMU's output is attributed to the test instead of
// appearing as unlabelled text.
type testLogger struct{ t *testing.T }

// Write logs p with a "qemu: " prefix after stripping trailing CR/LF
// characters. It always reports the whole of p as written.
func (w testLogger) Write(p []byte) (int, error) {
	line := strings.TrimRight(string(p), "\r\n")
	w.t.Logf("qemu: %s", line)
	return len(p), nil
}
// Nothing in this file uses encoding/json yet. This blank assignment
// references the package so the import stays in place (an unused import
// would not compile) until a debug API lands and the test starts
// parsing the orchestrator's response bodies.
var _ = json.Marshal
+21
View File
@@ -0,0 +1,21 @@
package main
import (
"fmt"
"os"
"vetting/internal/auth"
)
// main prints the bcrypt hash of the plaintext password given as the
// single command-line argument.
//
// Exit status: 2 on a usage error (wrong argument count or an empty
// plaintext), 1 if hashing fails, 0 on success.
//
// An empty plaintext is rejected so that an unset shell variable
// (gen-admin-password "$PW") cannot silently produce the hash of the
// empty password.
//
// NOTE(review): the plaintext is taken from argv, which ends up in
// shell history and is visible to other local users in the process
// list. Consider offering a stdin or prompt mode.
func main() {
	if len(os.Args) != 2 {
		fmt.Fprintln(os.Stderr, "usage: gen-admin-password <plaintext>")
		os.Exit(2)
	}
	plaintext := os.Args[1]
	if plaintext == "" {
		fmt.Fprintln(os.Stderr, "gen-admin-password: plaintext must not be empty")
		os.Exit(2)
	}
	hash, err := auth.BcryptHash(plaintext)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println(hash)
}