commit 9bb4b09a04d9aef7ecbd3bc592663dc996922b76 Author: josh Date: Fri Apr 17 21:32:10 2026 -0400 Initial commit: full Phases 1-6 implementation Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..fc164cc --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,45 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + lint-and-test: + name: Lint + build + test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: "1.26.x" + cache: true + + - name: Install templ + run: go install github.com/a-h/templ/cmd/templ@v0.3.1001 + + - name: Generate templ + run: templ generate + + - name: Verify go.mod + go.sum are tidy + run: | + go mod tidy + git diff --exit-code go.mod go.sum + + - name: Vet + run: go vet ./... + + - name: Build (host) + run: | + go build ./... + GOOS=linux GOARCH=amd64 go build ./... + + - name: Test + run: go test -race -count=1 ./... diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml new file mode 100644 index 0000000..0e93158 --- /dev/null +++ b/.github/workflows/e2e.yml @@ -0,0 +1,59 @@ +name: E2E (manual) + +# The E2E job builds the live image (mkosi, requires apt package +# updates) and boots a QEMU VM against a running orchestrator. It's +# slow and needs a Linux runner with nested virtualization, so it runs +# only on workflow_dispatch. 
+ +on: + workflow_dispatch: + inputs: + ref: + description: "Git ref to test (default: main)" + required: false + default: main + +permissions: + contents: read + +jobs: + e2e: + runs-on: ubuntu-latest + timeout-minutes: 45 + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref }} + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: "1.26.x" + cache: true + + - name: Install live-image build dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + mkosi debootstrap squashfs-tools qemu-system-x86 qemu-utils \ + dnsmasq iperf3 ipxe-qemu + + - name: Install templ + run: go install github.com/a-h/templ/cmd/templ@v0.3.1001 + + - name: Build orchestrator + agent + run: | + templ generate + make orchestrator-linux agent-linux + + - name: Build live image + run: make live-image + + - name: Run E2E suite + # The E2E test expects a registered host + queued run; in CI we + # don't have an operator, so it's skipped unless VETTING_E2E_RUN_ID + # is supplied. When someone stands up the orchestrator for a + # dispatch, they can set it as a repository-level Actions variable. + env: + VETTING_E2E_RUN_ID: ${{ vars.VETTING_E2E_RUN_ID }} + run: sudo -E go test -tags=e2e -count=1 -v ./test/e2e/... 
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..91632f4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,17 @@ +/bin/ +/out/ +/dist/ +/tmp/ +/var/ +/data/ +*.db +*.db-shm +*.db-wal +*.exe +*.log +vetting.yaml +!deploy/vetting.example.yaml +live-image/out/ +.vscode/ +.idea/ +.claude/ diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..44b4541 --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,18 @@ +run: + timeout: 3m + +linters: + enable: + - govet + - errcheck + - staticcheck + - ineffassign + - unused + - gofmt + - goimports + - misspell + - revive + +issues: + exclude-dirs: + - internal/web/templates diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..6b62ef1 --- /dev/null +++ b/Makefile @@ -0,0 +1,79 @@ +.DEFAULT_GOAL := help +UNAME_S := $(shell uname -s 2>/dev/null || echo Windows) +GOOS_LINUX := GOOS=linux GOARCH=amd64 +GIT_SHA := $(shell git rev-parse --short HEAD 2>/dev/null || echo dev) +LDFLAGS := -s -w -X vetting/internal/version.GitSHA=$(GIT_SHA) + +.PHONY: help +help: ## Show targets + @awk 'BEGIN {FS = ":.*##"} /^[a-zA-Z_-]+:.*##/ {printf " %-20s %s\n", $$1, $$2}' $(MAKEFILE_LIST) + +.PHONY: templ +templ: ## Generate templ .go files + templ generate + +.PHONY: orchestrator +orchestrator: templ ## Build orchestrator for host OS + go build -ldflags="$(LDFLAGS)" -o bin/vetting$(if $(filter Windows%,$(UNAME_S)),.exe,) ./cmd/vetting + +.PHONY: orchestrator-linux +orchestrator-linux: templ ## Cross-build orchestrator for linux-amd64 + $(GOOS_LINUX) go build -ldflags="$(LDFLAGS)" -o bin/vetting-linux-amd64 ./cmd/vetting + +.PHONY: agent +agent: ## Build agent for host OS (handy for unit testing only — real agent runs in the live image) + go build -ldflags="$(LDFLAGS)" -o bin/vetting-agent$(if $(filter Windows%,$(UNAME_S)),.exe,) ./cmd/vetting-agent + +.PHONY: agent-linux +agent-linux: ## Cross-build agent for linux-amd64 (consumed by live-image build) + $(GOOS_LINUX) go build -ldflags="$(LDFLAGS)" -o 
bin/vetting-agent.linux-amd64 ./cmd/vetting-agent + +.PHONY: gen-admin-password +gen-admin-password: ## Build the bcrypt password generator + go build -o bin/gen-admin-password$(if $(filter Windows%,$(UNAME_S)),.exe,) ./tools/gen-admin-password + +.PHONY: tidy +tidy: ## go mod tidy + go mod tidy + +.PHONY: fmt +fmt: ## go fmt + go fmt ./... + +.PHONY: vet +vet: ## go vet + go vet ./... + +.PHONY: test +test: templ ## Run tests + go test ./... + +.PHONY: test-race +test-race: templ ## Run tests with the race detector + go test -race -count=1 ./... + +.PHONY: e2e +e2e: ## Run the QEMU PXE E2E test (Linux, root, live image required) + sudo go test -tags=e2e -v ./test/e2e/... + +.PHONY: live-image +live-image: agent-linux ## Build reproducible live image (requires Linux/WSL + mkosi) +ifneq ($(findstring Windows,$(UNAME_S))$(findstring MINGW,$(UNAME_S))$(findstring MSYS,$(UNAME_S)),) + @echo "ERROR: live-image must be built under Linux (use WSL: wsl make live-image)." && exit 1 +endif + $(MAKE) -C live-image all + +.PHONY: all +all: orchestrator agent gen-admin-password ## Build everything buildable on host OS + +.PHONY: run +run: orchestrator ## Build and run orchestrator with example config + ./bin/vetting$(if $(filter Windows%,$(UNAME_S)),.exe,) --config deploy/vetting.example.yaml + +.PHONY: install +install: orchestrator-linux ## Run deploy/install.sh (must be run on the target LXC as root) + sudo ./deploy/install.sh --binary ./bin/vetting-linux-amd64 + +.PHONY: clean +clean: ## Remove build artifacts + rm -rf bin out dist tmp diff --git a/README.md b/README.md new file mode 100644 index 0000000..5c93db0 --- /dev/null +++ b/README.md @@ -0,0 +1,85 @@ +# Vetting + +Post-repair hardware validation pipeline for Proxmox cluster hosts. 
+Register a host, click **Start Vetting**, and the orchestrator will +PXE-boot it into a custom Linux live image and run it through a +consistent battery of tests (CPU stress, RAM stress, SMART, disk I/O, +network throughput, GPU, PSU telemetry). Pass → auto-shutdown + HTML +report. Fail → pipeline halts, SSH drops in, notification fires. + +Built for solo-operator home labs: one Go binary, SQLite + flat files, +HTMX + SSE UI, bundled dnsmasq, optional ntfy / Discord / SMTP +notifications. + +## Documentation + +- [docs/operations.md](docs/operations.md) — install + first run + + troubleshooting +- [docs/architecture.md](docs/architecture.md) — packages, state + machine, protocol +- [docs/test-suite.md](docs/test-suite.md) — what each stage measures + +## Quick start (local, against QEMU) + +```bash +# 1. Build +make all + +# 2. Generate an admin password hash and paste it into the config. +./bin/gen-admin-password 'your-password' +# Edit deploy/vetting.example.yaml: +# auth.admin_password_bcrypt = the bcrypt hash printed by the command above +# auth.session_secret_hex = $(openssl rand -hex 32) + +# 3. Run +./bin/vetting --config deploy/vetting.example.yaml +# → http://localhost:8080 +``` + +For a full end-to-end QEMU walk-through (bridge setup, host registration, +PXE boot), see [docs/operations.md § First vetting run](docs/operations.md#first-vetting-run). + +## Production install (Proxmox LXC) + +```bash +make orchestrator-linux +scp -r bin deploy lxc:/opt/vetting/ +ssh lxc "cd /opt/vetting && sudo ./deploy/install.sh" +# Edit /etc/vetting/vetting.yaml, then: +ssh lxc "sudo systemctl enable --now vetting" +``` + +See [docs/operations.md § Install](docs/operations.md#install-proxmox-lxc) +for the full walkthrough. 
+ +## Repository layout + +``` +cmd/ orchestrator + agent entrypoints +internal/ core packages (see docs/architecture.md for the map) +agent/ in-image agent logic (claim loop, stage dispatch, probes) +live-image/ mkosi config for the PXE-bootable Debian live image +deploy/ systemd unit + install.sh + example config +docs/ operator + developer docs +test/e2e/ build-tag-gated QEMU + PXE full-stack test +tools/ small CLI helpers (e.g. gen-admin-password) +``` + +## Development + +- `make test` — Go unit + smoke tests (cross-platform) +- `make vet` — `go vet` on the whole module +- `make live-image` — Linux-only; run under WSL from Windows +- `make e2e` — requires Linux root + live image + running orchestrator +- `make run` — build + launch the orchestrator with the example config + +Windows hosts: everything except `live-image` and `e2e` works natively. +The live image build calls `mkosi` which needs a real Linux userspace, +so use WSL for those targets. + +## Status + +All six phases in the original plan are implemented. The E2E QEMU +harness is wired in `test/e2e/qemu_test.go` but requires a running +orchestrator + registered host + queued run as preconditions — it's a +developer-facing integration harness, not a unit test. diff --git a/agent/bootstate/state.go b/agent/bootstate/state.go new file mode 100644 index 0000000..cb77d21 --- /dev/null +++ b/agent/bootstate/state.go @@ -0,0 +1,64 @@ +// Package bootstate parses kernel cmdline parameters that the +// orchestrator baked into the iPXE script. The agent consumes these +// on startup to learn which run it belongs to and how to reach back. +package bootstate + +import ( + "errors" + "fmt" + "os" + "strconv" + "strings" +) + +type Params struct { + OrchestratorURL string + RunID int64 + MAC string + Token string + TLSCertFPR string // optional +} + +// ParseCmdline reads /proc/cmdline (or a user-supplied path for tests) +// and pulls out the vetting.* parameters. 
+func ParseCmdline(path string) (*Params, error) { + if path == "" { + path = "/proc/cmdline" + } + b, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read %s: %w", path, err) + } + return ParseCmdlineString(string(b)) +} + +func ParseCmdlineString(s string) (*Params, error) { + fields := strings.Fields(strings.TrimSpace(s)) + var p Params + for _, f := range fields { + k, v, ok := strings.Cut(f, "=") + if !ok { + continue + } + switch k { + case "vetting.orchestrator": + p.OrchestratorURL = v + case "vetting.run_id": + id, err := strconv.ParseInt(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("vetting.run_id=%q: %w", v, err) + } + p.RunID = id + case "vetting.mac": + p.MAC = strings.ToLower(v) + case "vetting.token": + p.Token = v + case "vetting.cert_fpr": + p.TLSCertFPR = v + } + } + if p.OrchestratorURL == "" || p.RunID == 0 || p.MAC == "" || p.Token == "" { + return nil, errors.New("cmdline missing one of vetting.orchestrator, vetting.run_id, vetting.mac, vetting.token") + } + return &p, nil +} diff --git a/agent/bootstate/state_test.go b/agent/bootstate/state_test.go new file mode 100644 index 0000000..8172596 --- /dev/null +++ b/agent/bootstate/state_test.go @@ -0,0 +1,35 @@ +package bootstate + +import ( + "testing" +) + +func TestParseCmdlineGoldenPath(t *testing.T) { + s := `BOOT_IMAGE=vmlinuz initrd=initrd.img vetting.orchestrator=http://10.0.0.5:8080 vetting.run_id=42 vetting.mac=aa:bb:cc:dd:ee:ff vetting.token=deadbeefcafe vetting.cert_fpr=abc123 console=ttyS0,115200n8 quiet` + p, err := ParseCmdlineString(s) + if err != nil { + t.Fatalf("ParseCmdlineString: %v", err) + } + if p.OrchestratorURL != "http://10.0.0.5:8080" || p.RunID != 42 || p.MAC != "aa:bb:cc:dd:ee:ff" || + p.Token != "deadbeefcafe" || p.TLSCertFPR != "abc123" { + t.Fatalf("parsed wrong: %+v", p) + } +} + +func TestParseCmdlineMissingRequired(t *testing.T) { + s := `vetting.orchestrator=http://x vetting.mac=aa:bb:cc:dd:ee:ff vetting.token=t` + if _, err := 
ParseCmdlineString(s); err == nil { + t.Fatalf("expected error when vetting.run_id missing") + } +} + +func TestParseCmdlineLowercasesMAC(t *testing.T) { + s := `vetting.orchestrator=http://x vetting.run_id=1 vetting.mac=AA:BB:CC:DD:EE:FF vetting.token=t` + p, err := ParseCmdlineString(s) + if err != nil { + t.Fatalf("ParseCmdlineString: %v", err) + } + if p.MAC != "aa:bb:cc:dd:ee:ff" { + t.Fatalf("MAC not lowercased: %q", p.MAC) + } +} diff --git a/agent/client.go b/agent/client.go new file mode 100644 index 0000000..dd9ea6b --- /dev/null +++ b/agent/client.go @@ -0,0 +1,181 @@ +package agent + +import ( + "bytes" + "context" + "crypto/sha256" + "crypto/tls" + "crypto/x509" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "net/http" + "strings" + "time" +) + +// Client talks to the orchestrator's /api/v1/runs/:id/* endpoints. +type Client struct { + BaseURL string + RunID int64 + Token string + TLSCertFPR string // optional sha256 hex fingerprint + HTTP *http.Client +} + +func NewClient(baseURL string, runID int64, token, tlsCertFPR string) *Client { + tlsCfg := &tls.Config{MinVersion: tls.VersionTLS12} + // Cert pinning: if fingerprint provided, accept any cert whose DER + // sha256 matches. The orchestrator may be using a self-signed cert + // inside the LAN. 
+ if tlsCertFPR != "" { + want := strings.ToLower(strings.ReplaceAll(tlsCertFPR, ":", "")) + tlsCfg.InsecureSkipVerify = true + tlsCfg.VerifyPeerCertificate = func(rawCerts [][]byte, _ [][]*x509.Certificate) error { + for _, c := range rawCerts { + sum := sha256.Sum256(c) + if hex.EncodeToString(sum[:]) == want { + return nil + } + } + return fmt.Errorf("agent: no presented cert matched pinned fingerprint") + } + } + return &Client{ + BaseURL: strings.TrimRight(baseURL, "/"), + RunID: runID, + Token: token, + TLSCertFPR: tlsCertFPR, + HTTP: &http.Client{ + Timeout: 30 * time.Second, + Transport: &http.Transport{TLSClientConfig: tlsCfg}, + }, + } +} + +func (c *Client) Hello(ctx context.Context) error { + return c.postJSON(ctx, "/hello", nil, nil) +} + +func (c *Client) Claim(ctx context.Context, agentIP string) (*ClaimResponse, error) { + body := map[string]any{"agent_ip": agentIP} + var out ClaimResponse + if err := c.postJSON(ctx, "/claim", body, &out); err != nil { + return nil, err + } + return &out, nil +} + +func (c *Client) Heartbeat(ctx context.Context) (*HeartbeatResponse, error) { + var out HeartbeatResponse + if err := c.postJSON(ctx, "/heartbeat", nil, &out); err != nil { + return nil, err + } + return &out, nil +} + +func (c *Client) Log(ctx context.Context, lines []LogLine) error { + return c.postJSON(ctx, "/log", map[string]any{"lines": lines}, nil) +} + +func (c *Client) Result(ctx context.Context, result any) (*ResultResponse, error) { + var out ResultResponse + if err := c.postJSON(ctx, "/result", result, &out); err != nil { + return nil, err + } + return &out, nil +} + +func (c *Client) Hold(ctx context.Context, agentIP string) (*HoldResponse, error) { + var out HoldResponse + if err := c.postJSON(ctx, "/hold", map[string]any{"agent_ip": agentIP}, &out); err != nil { + return nil, err + } + return &out, nil +} + +// Sensor posts a batch of numeric samples (thermal readings, fio IOPS, +// iperf throughput, PSU voltages). Empty batches are allowed. 
+func (c *Client) Sensor(ctx context.Context, samples []SensorSample) error { + return c.postJSON(ctx, "/sensor", map[string]any{"samples": samples}, nil) +} + +// SensorSample is the on-wire shape; the server persists each row into +// the measurements table. +type SensorSample struct { + TS string `json:"ts,omitempty"` + Kind string `json:"kind"` + Key string `json:"key"` + Value float64 `json:"value"` + Unit string `json:"unit,omitempty"` +} + +type ClaimResponse struct { + OK bool `json:"ok"` + RunID int64 `json:"run_id"` + Stages []string `json:"stages"` + ExpectedDisks []ClaimExpectedDiskSpec `json:"expected_disks"` + IperfPort int `json:"iperf_port"` +} + +type ClaimExpectedDiskSpec struct { + Serial string `json:"serial"` + SizeGB int `json:"size_gb"` +} + +type HeartbeatResponse struct { + Cmd string `json:"cmd"` + State string `json:"state"` + Stage string `json:"stage,omitempty"` + OverrideFlags json.RawMessage `json:"override_flags,omitempty"` +} + +type LogLine struct { + TS string `json:"ts,omitempty"` + Level string `json:"level,omitempty"` + Text string `json:"text"` +} + +type ResultResponse struct { + OK bool `json:"ok"` + NextState string `json:"next_state"` +} + +type HoldResponse struct { + AuthorizedKey string `json:"authorized_key"` + RunID int64 `json:"run_id"` +} + +func (c *Client) postJSON(ctx context.Context, path string, in, out any) error { + var body io.Reader + if in != nil { + buf, err := json.Marshal(in) + if err != nil { + return err + } + body = bytes.NewReader(buf) + } + url := fmt.Sprintf("%s/api/v1/runs/%d%s", c.BaseURL, c.RunID, path) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, body) + if err != nil { + return err + } + req.Header.Set("Authorization", "Bearer "+c.Token) + if in != nil { + req.Header.Set("Content-Type", "application/json") + } + resp, err := c.HTTP.Do(req) + if err != nil { + return err + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode >= 300 { + b, _ := 
io.ReadAll(resp.Body) + return fmt.Errorf("%s %s: %d %s", req.Method, path, resp.StatusCode, strings.TrimSpace(string(b))) + } + if out != nil { + return json.NewDecoder(resp.Body).Decode(out) + } + return nil +} diff --git a/agent/probes/inventory.go b/agent/probes/inventory.go new file mode 100644 index 0000000..a64ba50 --- /dev/null +++ b/agent/probes/inventory.go @@ -0,0 +1,264 @@ +// Package probes collects hardware facts from a booted Linux system. +// Phase 3 only needs enough to feed the spec diff: CPU model/cores, +// total RAM, per-disk serial+size, per-NIC MAC+speed, per-GPU model. +// +// Every probe is tolerant of missing files or tools — if /sys isn't +// available the field is just left empty. The orchestrator's diff +// engine will surface missing expected fields as failures; missing +// fields that weren't expected stay silent. +package probes + +import ( + "bufio" + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "strings" + + "vetting/internal/spec" +) + +// Collect runs every probe and returns the merged inventory. The only +// errors it surfaces are fatal ones that prevent progress — individual +// probe failures are logged to the returned Inventory's raw field and +// do not fail the whole call. +func Collect() (*spec.Inventory, error) { + inv := &spec.Inventory{} + + inv.CPU = probeCPU() + inv.Memory = probeMemory() + inv.Disks = probeDisks() + inv.NICs = probeNICs() + inv.GPUs = probeGPUs() + + return inv, nil +} + +// ----- CPU -------------------------------------------------------------- + +func probeCPU() spec.CPUSpec { + // model: first "model name" in /proc/cpuinfo. + // logical_cores: runtime.NumCPU (Linux respects cpu cgroup; agent + // runs on bare metal so it will report every HT thread). 
+ c := spec.CPUSpec{LogicalCores: runtime.NumCPU()} + f, err := os.Open("/proc/cpuinfo") + if err != nil { + return c + } + defer func() { _ = f.Close() }() + scan := bufio.NewScanner(f) + for scan.Scan() { + line := scan.Text() + if strings.HasPrefix(line, "model name") { + if _, v, ok := strings.Cut(line, ":"); ok { + c.Model = strings.TrimSpace(v) + break + } + } + } + return c +} + +// ----- Memory ----------------------------------------------------------- + +func probeMemory() spec.MemorySpec { + // /proc/meminfo reports MemTotal in kB. Round down to the nearest + // GiB so the diff's ±2 GiB tolerance is meaningful. + f, err := os.Open("/proc/meminfo") + if err != nil { + return spec.MemorySpec{} + } + defer func() { _ = f.Close() }() + scan := bufio.NewScanner(f) + for scan.Scan() { + fields := strings.Fields(scan.Text()) + if len(fields) >= 2 && fields[0] == "MemTotal:" { + kb, err := strconv.ParseInt(fields[1], 10, 64) + if err == nil { + return spec.MemorySpec{TotalGiB: int(kb / 1024 / 1024)} + } + } + } + return spec.MemorySpec{} +} + +// ----- Disks ------------------------------------------------------------ + +// probeDisks walks /sys/class/block and picks out real block devices +// (no partitions, no loop/ram). For each it reads size (512B sectors) +// and serial. Virtio disks in QEMU report a serial only when launched +// with `-drive serial=...`; without that the field is empty, which is +// fine — the diff skips disks with empty serials anyway. +func probeDisks() []spec.DiskSpec { + entries, err := os.ReadDir("/sys/class/block") + if err != nil { + return nil + } + var out []spec.DiskSpec + for _, e := range entries { + name := e.Name() + if !isRealDisk(name) { + continue + } + base := filepath.Join("/sys/class/block", name) + size := diskSizeGB(base) + serial := diskSerial(name) + // Skip only entries with neither a size nor a serial; size == 0 + // means /size was unreadable or the device is smaller than 1 GB. 
+ if size == 0 && serial == "" { + continue + } + out = append(out, spec.DiskSpec{Serial: serial, SizeGB: size}) + } + return out +} + +func isRealDisk(name string) bool { + // Exclude partitions: they have a parent block dir and a "partition" + // attribute. sd* disks without trailing digits are whole disks; nvme + // disks use nvme0n1 for the namespace and nvme0n1p1 for partitions. + if strings.HasPrefix(name, "loop") || strings.HasPrefix(name, "ram") || + strings.HasPrefix(name, "zram") || strings.HasPrefix(name, "dm-") { + return false + } + partPath := filepath.Join("/sys/class/block", name, "partition") + if _, err := os.Stat(partPath); err == nil { + return false + } + return true +} + +func diskSizeGB(base string) int { + b, err := os.ReadFile(filepath.Join(base, "size")) + if err != nil { + return 0 + } + sectors, err := strconv.ParseInt(strings.TrimSpace(string(b)), 10, 64) + if err != nil { + return 0 + } + // /sys reports sectors of 512B regardless of physical sector size. + return int(sectors * 512 / 1_000_000_000) +} + +func diskSerial(name string) string { + // Try a few known paths; the kernel exposes serials differently for + // ATA/SCSI vs NVMe. + for _, rel := range []string{ + filepath.Join("/sys/block", name, "device", "serial"), + filepath.Join("/sys/block", name, "device", "vpd_pg80"), + filepath.Join("/sys/block", name, "serial"), + } { + if b, err := os.ReadFile(rel); err == nil { + s := strings.TrimSpace(string(b)) + if s != "" { + return s + } + } + } + // Fallback: udevadm often knows the wwid / serial. Best-effort. 
+ cmd := exec.Command("udevadm", "info", "--query=property", "--name="+name) + out, err := cmd.Output() + if err != nil { + return "" + } + for _, line := range strings.Split(string(out), "\n") { + if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok { + return strings.TrimSpace(v) + } + } + return "" +} + +// ----- NICs ------------------------------------------------------------- + +func probeNICs() []spec.NICSpec { + root := "/sys/class/net" + entries, err := os.ReadDir(root) + if err != nil { + return nil + } + var out []spec.NICSpec + for _, e := range entries { + name := e.Name() + if name == "lo" { + continue + } + base := filepath.Join(root, name) + mac := readLine(filepath.Join(base, "address")) + if mac == "" || mac == "00:00:00:00:00:00" { + continue + } + // /sys/class/net/*/speed reports Mbps or -1 if link down. + speed := 0 + if b, err := os.ReadFile(filepath.Join(base, "speed")); err == nil { + if mbps, err := strconv.Atoi(strings.TrimSpace(string(b))); err == nil && mbps > 0 { + speed = mbps / 1000 + } + } + out = append(out, spec.NICSpec{MAC: strings.ToLower(mac), SpeedGbps: speed}) + } + return out +} + +// ----- GPUs ------------------------------------------------------------- + +// probeGPUs leans on lspci; if lspci is missing, returns nothing and +// the diff engine just won't match any GPU expectations. Phase 4 will +// add nvidia-smi for VRAM and firmware. +func probeGPUs() []spec.GPUSpec { + cmd := exec.Command("lspci", "-mm", "-nnk") + out, err := cmd.Output() + if err != nil { + return nil + } + var gpus []spec.GPUSpec + for _, line := range strings.Split(string(out), "\n") { + low := strings.ToLower(line) + if !strings.Contains(low, "vga compatible controller") && + !strings.Contains(low, "3d controller") { + continue + } + // `lspci -mm` quotes fields; device name is usually field 3. 
+ fields := splitQuoted(line) + if len(fields) >= 4 { + gpus = append(gpus, spec.GPUSpec{Model: fmt.Sprintf("%s %s", fields[2], fields[3])}) + } + } + return gpus +} + +func splitQuoted(line string) []string { + var out []string + var cur strings.Builder + inQ := false + for _, r := range line { + switch { + case r == '"': + inQ = !inQ + if !inQ { + out = append(out, cur.String()) + cur.Reset() + } + case r == ' ' && !inQ: + continue + default: + cur.WriteRune(r) + } + } + return out +} + +// ----- shared helpers --------------------------------------------------- + +func readLine(path string) string { + b, err := os.ReadFile(path) + if err != nil { + return "" + } + return strings.TrimSpace(string(b)) +} + diff --git a/agent/probes/thermal.go b/agent/probes/thermal.go new file mode 100644 index 0000000..0ec1da9 --- /dev/null +++ b/agent/probes/thermal.go @@ -0,0 +1,67 @@ +package probes + +import ( + "os" + "path/filepath" + "strconv" + "strings" +) + +// ThermalSample is one reading from /sys/class/hwmon. Kind is "temp", +// Key is the label (or chip-relative name) and Value is degrees C. +type ThermalSample struct { + Kind string + Key string + Value float64 + Unit string +} + +// Thermals walks /sys/class/hwmon looking for temp*_input files. The +// kernel reports millidegrees C; we divide by 1000. Labels come from +// temp*_label (preferred) or a chip-relative fallback. +// +// This is also used by the thermal sidecar; it re-reads on each tick +// rather than holding open handles so hot-plugged sensors (e.g. a PCIe +// card enumerating late) get picked up. 
+func Thermals() []ThermalSample { + root := "/sys/class/hwmon" + chips, err := os.ReadDir(root) + if err != nil { + return nil + } + var out []ThermalSample + for _, c := range chips { + base := filepath.Join(root, c.Name()) + chipName := strings.TrimSpace(readFileStr(filepath.Join(base, "name"))) + files, err := os.ReadDir(base) + if err != nil { + continue + } + for _, f := range files { + name := f.Name() + if !strings.HasPrefix(name, "temp") || !strings.HasSuffix(name, "_input") { + continue + } + idx := strings.TrimSuffix(strings.TrimPrefix(name, "temp"), "_input") + label := strings.TrimSpace(readFileStr(filepath.Join(base, "temp"+idx+"_label"))) + if label == "" { + label = chipName + "/temp" + idx + } + raw := strings.TrimSpace(readFileStr(filepath.Join(base, name))) + milli, err := strconv.Atoi(raw) + if err != nil { + continue + } + out = append(out, ThermalSample{Kind: "temp", Key: label, Value: float64(milli) / 1000, Unit: "C"}) + } + } + return out +} + +func readFileStr(p string) string { + b, err := os.ReadFile(p) + if err != nil { + return "" + } + return string(b) +} diff --git a/agent/runner.go b/agent/runner.go new file mode 100644 index 0000000..feb6ed3 --- /dev/null +++ b/agent/runner.go @@ -0,0 +1,498 @@ +// Package agent implements the in-live-image control loop. +// +// Phase 4 scope: after /claim, the agent walks through every stage the +// orchestrator advertises, dispatching on the stage name to a function +// in agent/tests. Each stage posts a /result; the response carries the +// orchestrator's next_state, which the loop uses to pick the next +// stage. Stages the orchestrator owns (SpecValidate, Reporting) resolve +// server-side inside /result so the agent never sees them as "its turn". +// +// Terminal states: +// - FailedHolding → request hold key, install authorized_keys, wait +// on heartbeats for a retry_stage directive. +// - Completed → heartbeat carries cmd=shutdown; agent runs +// `systemctl poweroff` and exits. 
+// +// Thermal sidecar runs from the moment the agent claims until ctx +// cancel; it posts a handful of /sys/class/hwmon samples every 5s. +package agent + +import ( + "context" + "encoding/json" + "fmt" + "log" + "net" + "os" + "os/exec" + "path/filepath" + "sync" + "time" + + "vetting/agent/bootstate" + "vetting/agent/probes" + "vetting/agent/tests" + "vetting/internal/spec" +) + +// Run is the long-lived entry point. It blocks until ctx is cancelled +// or a fatal error makes progress impossible. +func Run(ctx context.Context, p *bootstate.Params) error { + c := NewClient(p.OrchestratorURL, p.RunID, p.Token, p.TLSCertFPR) + fwd := newLogForwarder(ctx, c) + defer fwd.close() + + ip := localIP() + fwd.info(fmt.Sprintf("agent starting on %s (run=%d mac=%s)", ip, p.RunID, p.MAC)) + + if err := callWithBackoff(ctx, "hello", func(ctx context.Context) error { + return c.Hello(ctx) + }); err != nil { + fwd.warn("hello never succeeded: " + err.Error()) + } + + var claim *ClaimResponse + if err := callWithBackoff(ctx, "claim", func(ctx context.Context) error { + r, err := c.Claim(ctx, ip) + if err != nil { + return err + } + claim = r + return nil + }); err != nil { + return err + } + fwd.info(fmt.Sprintf("claimed run; stages=%v", claim.Stages)) + + go thermalSidecar(ctx, c, fwd) + + hbCh := make(chan HeartbeatResponse, 4) + go heartbeatLoop(ctx, c, fwd, hbCh) + + // Run every stage the orchestrator advertises. Stages owned by the + // orchestrator (SpecValidate, Reporting) resolve inside /result and + // flip next_state forward past themselves, so they simply never match + // our dispatch table. 
+ nextStage := "Inventory" + for nextStage != "" { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + fwd.info("stage: starting " + nextStage) + outcome := runStage(ctx, nextStage, claim, fwd, c, overrideFlags{}) + resp, err := postResult(ctx, c, nextStage, outcome) + if err != nil { + fwd.error("submit result for " + nextStage + ": " + err.Error()) + return err + } + fwd.info(fmt.Sprintf("stage %s → next_state=%s", nextStage, resp.NextState)) + + if resp.NextState == "FailedHolding" { + if err := requestHold(ctx, c, fwd); err != nil { + return err + } + // Park and wait for an override directive. + return waitForOverride(ctx, c, fwd, hbCh, claim) + } + if resp.NextState == "Completed" || resp.NextState == "" { + fwd.info("pipeline complete") + <-ctx.Done() + return ctx.Err() + } + nextStage = stageForState(resp.NextState) + if nextStage == "" { + // next_state is something we don't map (e.g. SpecValidate — but + // the orchestrator's /result already resolved it and handed us + // back a further-along state). Defensive bail so we don't loop. + fwd.warn("no stage maps to state " + resp.NextState + "; parking") + <-ctx.Done() + return ctx.Err() + } + } + <-ctx.Done() + return ctx.Err() +} + +// runStage dispatches on stage name. The Inventory stage is special — +// it runs the inventory probe and passes the result as the /result body +// (the orchestrator persists it as an artifact). Every other stage +// returns a tests.Outcome which postResult marshals generically. 
+func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, ovr overrideFlags) stageOutcome { + deps := newDeps(ctx, c, fwd, ovr, claim) + switch stage { + case "Inventory": + fwd.info("Inventory: probing host hardware") + inv, err := probes.Collect() + if err != nil { + return stageOutcome{Outcome: tests.Outcome{Passed: false, Message: err.Error(), Summary: "probe error"}} + } + fwd.info("Inventory: " + inventorySummary(inv)) + return stageOutcome{ + Outcome: tests.Outcome{ + Passed: true, + Summary: inventorySummary(inv), + }, + Inventory: inv, + } + case "SMART": + return stageOutcome{Outcome: tests.SMART(ctx, deps)} + case "CPUStress": + return stageOutcome{Outcome: tests.CPUStress(ctx, deps)} + case "Storage": + return stageOutcome{Outcome: tests.Storage(ctx, deps)} + case "Network": + return stageOutcome{Outcome: tests.Network(ctx, deps, tests.NetworkConfig{ + OrchestratorURL: c.BaseURL, + IperfPort: claim.IperfPort, + Duration: 10 * time.Second, + })} + case "GPU": + return stageOutcome{Outcome: tests.GPU(ctx, deps)} + case "PSU": + return stageOutcome{Outcome: tests.PSU(ctx, deps)} + } + return stageOutcome{Outcome: tests.Outcome{ + Passed: false, + Message: "unknown stage " + stage, + }} +} + +type stageOutcome struct { + Outcome tests.Outcome + Inventory *spec.Inventory // only for Inventory stage +} + +type overrideFlags struct { + Wipe bool `json:"wipe"` +} + +func newDeps(ctx context.Context, c *Client, fwd *logForwarder, ovr overrideFlags, claim *ClaimResponse) tests.Deps { + var expected []tests.ExpectedDisk + for _, e := range claim.ExpectedDisks { + expected = append(expected, tests.ExpectedDisk{Serial: e.Serial, SizeGB: e.SizeGB}) + } + return tests.Deps{ + Info: fwd.info, + Warn: fwd.warn, + Error: fwd.error, + OverrideWipe: ovr.Wipe, + ExpectedDisks: expected, + StageTimeout: 2 * time.Minute, + Sensor: func(ctx context.Context, samples []tests.Sample) error { + out := make([]SensorSample, 0, 
len(samples)) + for _, s := range samples { + out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit}) + } + return c.Sensor(ctx, out) + }, + } +} + +// postResult marshals stageOutcome for the /result endpoint. The +// Inventory shape is special-cased: it includes the inventory blob so +// the orchestrator can persist it and run server-side spec diff. +func postResult(ctx context.Context, c *Client, stage string, s stageOutcome) (*ResultResponse, error) { + summary, _ := s.Outcome.MarshalSummary() + body := map[string]any{ + "stage": stage, + "passed": s.Outcome.Passed, + } + if len(summary) > 2 { + body["summary"] = json.RawMessage(summary) + } + if s.Outcome.Message != "" { + body["message"] = s.Outcome.Message + } + if s.Inventory != nil { + body["inventory"] = s.Inventory + } + return c.Result(ctx, body) +} + +// stageForState maps a RunState string back to the stage executor name. +// Every stage-name is the same as its state except Inventory↔InventoryCheck. +func stageForState(state string) string { + switch state { + case "InventoryCheck": + return "Inventory" + case "SMART", "CPUStress", "Storage", "Network", "GPU", "PSU": + return state + } + // SpecValidate and Reporting are orchestrator-owned; we never see + // them as next_state because /result resolves past them. + return "" +} + +// waitForOverride parks the agent in FailedHolding. It listens for a +// heartbeat directive that tells it to retry a stage (e.g. Storage +// with wipe-override armed) and re-enters runStage from that point. 
+func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-chan HeartbeatResponse, claim *ClaimResponse) error { + fwd.info("holding: awaiting operator decision (heartbeat directive or ctx cancel)") + for { + select { + case <-ctx.Done(): + return ctx.Err() + case cmd, ok := <-hb: + if !ok { + return nil + } + if cmd.Cmd != "retry_stage" || cmd.Stage == "" { + continue + } + fwd.info("operator override: retrying stage " + cmd.Stage) + var ovr overrideFlags + if len(cmd.OverrideFlags) > 0 { + _ = json.Unmarshal(cmd.OverrideFlags, &ovr) + } + outcome := runStage(ctx, cmd.Stage, claim, fwd, c, ovr) + resp, err := postResult(ctx, c, cmd.Stage, outcome) + if err != nil { + fwd.error("override: submit result: " + err.Error()) + continue + } + fwd.info(fmt.Sprintf("override stage %s → next_state=%s", cmd.Stage, resp.NextState)) + if resp.NextState == "FailedHolding" { + // Still broken; keep holding. + continue + } + if resp.NextState == "Completed" { + return nil + } + // Successful retry — continue walking the pipeline from the + // state the orchestrator advanced us into. + if nextStage := stageForState(resp.NextState); nextStage != "" { + for nextStage != "" { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + fwd.info("stage: starting " + nextStage) + out := runStage(ctx, nextStage, claim, fwd, c, overrideFlags{}) + rr, err := postResult(ctx, c, nextStage, out) + if err != nil { + return err + } + if rr.NextState == "FailedHolding" || rr.NextState == "Completed" || rr.NextState == "" { + return nil + } + nextStage = stageForState(rr.NextState) + } + } + return nil + } + } +} + +// requestHold fetches the per-run pubkey and installs it into +// /root/.ssh/authorized_keys so the operator can SSH in. 
+func requestHold(ctx context.Context, c *Client, fwd *logForwarder) error { + fwd.warn("entering FailedHolding; requesting hold key") + resp, err := c.Hold(ctx, localIP()) + if err != nil { + fwd.error("hold request failed: " + err.Error()) + return err + } + authPath := "/root/.ssh/authorized_keys" + if err := os.MkdirAll(filepath.Dir(authPath), 0o700); err != nil { + fwd.error("mkdir .ssh: " + err.Error()) + return err + } + f, err := os.OpenFile(authPath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o600) + if err != nil { + fwd.error("open authorized_keys: " + err.Error()) + return err + } + defer func() { _ = f.Close() }() + if _, err := fmt.Fprintln(f, resp.AuthorizedKey); err != nil { + fwd.error("write authorized_keys: " + err.Error()) + return err + } + fwd.info("hold key installed; SSH is available to root@" + localIP()) + return nil +} + +func inventorySummary(inv *spec.Inventory) string { + return fmt.Sprintf("cpu=%q cores=%d ram=%dGiB disks=%d nics=%d gpus=%d", + inv.CPU.Model, inv.CPU.LogicalCores, inv.Memory.TotalGiB, + len(inv.Disks), len(inv.NICs), len(inv.GPUs)) +} + +// thermalSidecar posts a batch of /sys/class/hwmon samples every 5s. +// Idempotent: a dead sensor just drops out of the next batch. Errors +// are logged but never fatal — we'd rather have a run with partial +// thermal data than kill the agent over an I/O hiccup. 
+func thermalSidecar(ctx context.Context, c *Client, fwd *logForwarder) { + t := time.NewTicker(5 * time.Second) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + samples := probes.Thermals() + if len(samples) == 0 { + continue + } + out := make([]SensorSample, 0, len(samples)) + for _, s := range samples { + out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit}) + } + sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + if err := c.Sensor(sendCtx, out); err != nil { + fwd.warn("thermal sidecar: " + err.Error()) + } + cancel() + } + } +} + +func heartbeatLoop(ctx context.Context, c *Client, fwd *logForwarder, out chan<- HeartbeatResponse) { + t := time.NewTicker(10 * time.Second) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + hbCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + resp, err := c.Heartbeat(hbCtx) + cancel() + if err != nil { + fwd.warn("heartbeat error: " + err.Error()) + continue + } + if resp.Cmd == "abort" { + fwd.warn("orchestrator said abort; stopping loop") + return + } + if resp.Cmd == "shutdown" { + fwd.info("orchestrator said shutdown; powering off host") + // Best effort: systemd then sysvinit fallback. Either way, + // return so the agent process stops issuing heartbeats. 
+ if err := exec.Command("systemctl", "poweroff").Run(); err != nil { + fwd.warn("systemctl poweroff failed: " + err.Error()) + _ = exec.Command("shutdown", "-h", "now").Run() + } + return + } + if resp.Cmd == "retry_stage" { + select { + case out <- *resp: + default: + } + } + } + } +} + +func callWithBackoff(ctx context.Context, label string, f func(context.Context) error) error { + backoff := 2 * time.Second + for attempt := 1; ; attempt++ { + callCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + err := f(callCtx) + cancel() + if err == nil { + return nil + } + if attempt > 20 { + return err + } + log.Printf("agent: %s attempt %d failed: %v (retry in %s)", label, attempt, err, backoff) + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(backoff): + } + if backoff < 30*time.Second { + backoff *= 2 + } + } +} + +func localIP() string { + addrs, err := net.InterfaceAddrs() + if err != nil { + return "" + } + for _, a := range addrs { + ipnet, ok := a.(*net.IPNet) + if !ok || ipnet.IP.IsLoopback() { + continue + } + v4 := ipnet.IP.To4() + if v4 != nil { + return v4.String() + } + } + return "" +} + +// ----- log forwarder ----------------------------------------------------- + +type logForwarder struct { + c *Client + mu sync.Mutex + buf []LogLine + wg sync.WaitGroup + cancel context.CancelFunc +} + +func newLogForwarder(parent context.Context, c *Client) *logForwarder { + ctx, cancel := context.WithCancel(parent) + f := &logForwarder{c: c, cancel: cancel} + f.wg.Add(1) + go f.loop(ctx) + return f +} + +func (f *logForwarder) loop(ctx context.Context) { + defer f.wg.Done() + t := time.NewTicker(2 * time.Second) + defer t.Stop() + for { + select { + case <-ctx.Done(): + f.flush() + return + case <-t.C: + f.flush() + } + } +} + +func (f *logForwarder) push(level, text string) { + stamp := time.Now().UTC().Format(time.RFC3339Nano) + log.Printf("[%s] %s", level, text) + f.mu.Lock() + f.buf = append(f.buf, LogLine{TS: stamp, Level: level, Text: 
text}) + f.mu.Unlock() +} + +func (f *logForwarder) info(s string) { f.push("info", s) } +func (f *logForwarder) warn(s string) { f.push("warn", s) } +func (f *logForwarder) error(s string) { f.push("error", s) } + +func (f *logForwarder) flush() { + f.mu.Lock() + if len(f.buf) == 0 { + f.mu.Unlock() + return + } + lines := f.buf + f.buf = nil + f.mu.Unlock() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := f.c.Log(ctx, lines); err != nil { + log.Printf("log forward failed: %v", err) + } +} + +func (f *logForwarder) close() { + f.cancel() + f.wg.Wait() +} diff --git a/agent/tests/cpustress.go b/agent/tests/cpustress.go new file mode 100644 index 0000000..b2647e8 --- /dev/null +++ b/agent/tests/cpustress.go @@ -0,0 +1,97 @@ +package tests + +import ( + "context" + "fmt" + "os/exec" + "runtime" + "strconv" + "strings" + "time" +) + +// CPUStress runs stress-ng with CPU workers AND memory stressors. The +// memory stressors take the place of a Memtest86+ pass — per the plan, +// running under Linux gives us exit-code-based pass/fail and log +// capture we can't get from Memtest without IPMI serial redirection. +// +// Non-zero exit = stress-ng aborted due to a failure (bit flip, OOM +// kill, etc.) → stage fails. Exit 0 means the kernel returned sane +// pages for the full duration, which is the Phase 4 health bar. +func CPUStress(ctx context.Context, d Deps) Outcome { + if _, err := exec.LookPath("stress-ng"); err != nil { + d.Warn("CPUStress: stress-ng not found in PATH — skipping stage") + return Outcome{ + Passed: true, + Summary: "skipped (stress-ng missing)", + Extras: map[string]any{"skipped": true, "reason": "stress_ng_missing"}, + } + } + + // Timeout: Deps.StageTimeout may be zero in tests; default 2 min. + timeout := d.StageTimeout + if timeout <= 0 { + timeout = 2 * time.Minute + } + + cores := runtime.NumCPU() + // --vm N allocates N worker processes each touching 90% of RAM. 
On + // an 8-core host with 32GiB this is 8 × ~28GiB sliding windows — + // enough to exercise every DIMM row within a minute. + args := []string{ + "--cpu", strconv.Itoa(cores), + "--cpu-method", "all", + "--vm", strconv.Itoa(cores), + "--vm-bytes", "90%", + "--timeout", durationSeconds(timeout), + "--metrics-brief", + "--verify", + } + d.Info(fmt.Sprintf("CPUStress: stress-ng --cpu %d --vm %d --vm-bytes 90%% --timeout %s", + cores, cores, durationSeconds(timeout))) + + runCtx, cancel := context.WithTimeout(ctx, timeout+30*time.Second) + defer cancel() + cmd := exec.CommandContext(runCtx, "stress-ng", args...) + start := time.Now() + out, err := cmd.CombinedOutput() + elapsed := time.Since(start).Round(time.Second) + + extras := map[string]any{ + "cores": cores, + "elapsed_secs": elapsed.Seconds(), + "output_tail": tailLines(string(out), 20), + } + if err != nil { + d.Error("CPUStress: stress-ng failed: " + err.Error()) + return Outcome{ + Passed: false, + Message: "stress-ng returned non-zero: " + err.Error(), + Summary: fmt.Sprintf("failed after %s", elapsed), + Extras: extras, + } + } + d.Info(fmt.Sprintf("CPUStress: stress-ng completed cleanly in %s", elapsed)) + return Outcome{ + Passed: true, + Summary: fmt.Sprintf("stress-ng PASSED after %s (%d cores + 90%% RAM)", elapsed, cores), + Extras: extras, + } +} + +func durationSeconds(d time.Duration) string { + s := int(d.Seconds()) + if s < 1 { + s = 1 + } + return strconv.Itoa(s) + "s" +} + +// tailLines returns the last n non-empty lines of s, for the summary. +func tailLines(s string, n int) string { + lines := strings.Split(strings.TrimRight(s, "\n"), "\n") + if len(lines) > n { + lines = lines[len(lines)-n:] + } + return strings.Join(lines, "\n") +} diff --git a/agent/tests/gpu.go b/agent/tests/gpu.go new file mode 100644 index 0000000..04963a6 --- /dev/null +++ b/agent/tests/gpu.go @@ -0,0 +1,86 @@ +package tests + +import ( + "context" + "os/exec" + "strings" +) + +// GPU enumerates VGA / 3D PCI devices. 
No devices → skip cleanly (a +// CPU-only server passes this stage by virtue of having nothing to +// stress). Devices present → try nvidia-smi for NVIDIA cards, else +// accept PCI presence. +func GPU(ctx context.Context, d Deps) Outcome { + devices := listGPUPCI(ctx) + if len(devices) == 0 { + d.Info("GPU: no VGA/3D PCI devices found — skipping stage") + return Outcome{ + Passed: true, + Summary: "skipped (no GPU present)", + Extras: map[string]any{"skipped": true, "reason": "no_gpu_present"}, + } + } + d.Info("GPU: found " + joinDevices(devices)) + + nvidia := nvidiaSmiList(ctx) + extras := map[string]any{ + "pci_devices": devices, + "skipped": false, + } + if len(nvidia) > 0 { + extras["nvidia"] = nvidia + d.Info("GPU: nvidia-smi reports: " + strings.Join(nvidia, ", ")) + } + return Outcome{ + Passed: true, + Summary: formatCount(len(devices), "GPU present"), + Extras: extras, + } +} + +// listGPUPCI shells out to lspci. Returns human-readable strings, one +// per VGA/3D device. If lspci isn't available we return nil and the +// caller treats it as "no GPU" which auto-skips. +func listGPUPCI(ctx context.Context) []string { + cmd := exec.CommandContext(ctx, "lspci", "-mm") + out, err := cmd.Output() + if err != nil { + return nil + } + var devs []string + for _, line := range strings.Split(string(out), "\n") { + l := strings.ToLower(line) + if strings.Contains(l, "vga compatible controller") || strings.Contains(l, "3d controller") { + devs = append(devs, strings.TrimSpace(line)) + } + } + return devs +} + +// nvidiaSmiList returns each card's ", " line; empty +// slice when nvidia-smi isn't installed or fails. 
+func nvidiaSmiList(ctx context.Context) []string { + cmd := exec.CommandContext(ctx, "nvidia-smi", "-L") + out, err := cmd.Output() + if err != nil { + return nil + } + var lines []string + for _, l := range strings.Split(string(out), "\n") { + l = strings.TrimSpace(l) + if l != "" { + lines = append(lines, l) + } + } + return lines +} + +func joinDevices(devs []string) string { + if len(devs) == 0 { + return "" + } + if len(devs) == 1 { + return devs[0] + } + return devs[0] + " (+" + strings.TrimSpace(formatCount(len(devs)-1, "more")) + ")" +} diff --git a/agent/tests/network.go b/agent/tests/network.go new file mode 100644 index 0000000..400d976 --- /dev/null +++ b/agent/tests/network.go @@ -0,0 +1,144 @@ +package tests + +import ( + "context" + "encoding/json" + "fmt" + "net/url" + "os/exec" + "strconv" + "strings" + "time" +) + +// NetworkConfig is what the agent passes to Network: the orchestrator's +// iperf3 server address and port. We derive host from OrchestratorURL. +type NetworkConfig struct { + OrchestratorURL string + IperfPort int // 0 = 5201 + Duration time.Duration +} + +// Network runs iperf3 against the orchestrator's bundled server. Records +// bandwidth as a measurement; fails if iperf3 is missing, the server +// isn't reachable, or throughput is zero. 
+func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome { + if _, err := exec.LookPath("iperf3"); err != nil { + d.Warn("Network: iperf3 not found — skipping stage") + return Outcome{ + Passed: true, + Summary: "skipped (iperf3 missing)", + Extras: map[string]any{"skipped": true, "reason": "iperf3_missing"}, + } + } + host, err := deriveHost(cfg.OrchestratorURL) + if err != nil || host == "" { + d.Warn("Network: can't derive orchestrator host from URL — skipping stage") + return Outcome{ + Passed: true, + Summary: "skipped (no orchestrator host)", + Extras: map[string]any{"skipped": true, "reason": "no_host"}, + } + } + port := cfg.IperfPort + if port == 0 { + port = 5201 + } + duration := cfg.Duration + if duration <= 0 { + duration = 10 * time.Second + } + + args := []string{ + "-c", host, + "-p", strconv.Itoa(port), + "-t", strconv.Itoa(int(duration.Seconds())), + "-J", // JSON output + } + d.Info(fmt.Sprintf("Network: iperf3 -c %s -p %d -t %s", host, port, duration)) + + runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second) + defer cancel() + cmd := exec.CommandContext(runCtx, "iperf3", args...) 
+ out, err := cmd.Output() + if err != nil { + d.Error("Network: iperf3 client failed: " + err.Error()) + return Outcome{ + Passed: false, + Message: "iperf3 client error: " + err.Error(), + Summary: "iperf3 failed", + Extras: map[string]any{"stderr_tail": tailLines(string(out), 20)}, + } + } + mbps, parsed, err := parseIperfJSON(out) + if err != nil { + d.Error("Network: parse iperf3 output: " + err.Error()) + return Outcome{ + Passed: false, + Message: "parse iperf3 json: " + err.Error(), + Summary: "parse error", + Extras: map[string]any{"raw": string(out)}, + } + } + if d.Sensor != nil { + _ = d.Sensor(ctx, []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}}) + } + + extras := map[string]any{ + "throughput_mbps": mbps, + "iperf_end": parsed, + } + if mbps <= 0 { + return Outcome{ + Passed: false, + Message: "iperf3 reported zero throughput", + Summary: "zero throughput", + Extras: extras, + } + } + d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps", mbps)) + return Outcome{ + Passed: true, + Summary: fmt.Sprintf("%.1f Mbps to %s", mbps, host), + Extras: extras, + } +} + +// deriveHost pulls the hostname out of an https://host:port base URL. +func deriveHost(raw string) (string, error) { + if raw == "" { + return "", fmt.Errorf("empty url") + } + u, err := url.Parse(raw) + if err != nil { + return "", err + } + h := u.Hostname() + return strings.TrimSpace(h), nil +} + +// parseIperfJSON pulls end.sum_sent.bits_per_second out of iperf3 -J. +// Returns (Mbps, full-json-map, err). +func parseIperfJSON(b []byte) (float64, map[string]any, error) { + var top map[string]any + if err := json.Unmarshal(b, &top); err != nil { + return 0, nil, err + } + end, ok := top["end"].(map[string]any) + if !ok { + return 0, top, fmt.Errorf("missing end") + } + // iperf3 reports either sum_sent (when -R not set) or sum_received. 
+ for _, key := range []string{"sum_sent", "sum_received", "sum"} { + sum, ok := end[key].(map[string]any) + if !ok { + continue + } + bps, ok := sum["bits_per_second"].(float64) + if !ok { + continue + } + return bps / 1_000_000, end, nil + } + return 0, end, fmt.Errorf("no bits_per_second in end.sum_*") +} diff --git a/agent/tests/psu.go b/agent/tests/psu.go new file mode 100644 index 0000000..8e8991e --- /dev/null +++ b/agent/tests/psu.go @@ -0,0 +1,153 @@ +package tests + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" +) + +// PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find +// PSU rails. In home-lab hosts the kernel surfaces a handful of named +// rails (12V, 5V, 3V3). No rails → auto-skip. Any rail outside a ±10% +// window of its nominal value → fail. +func PSU(ctx context.Context, d Deps) Outcome { + rails := scanPSURails() + if len(rails) == 0 { + d.Info("PSU: no voltage rails found under /sys/class/hwmon — skipping stage") + return Outcome{ + Passed: true, + Summary: "skipped (no PSU sensors)", + Extras: map[string]any{"skipped": true, "reason": "no_hwmon_voltages"}, + } + } + + var samples []Sample + problems := []string{} + for _, rail := range rails { + samples = append(samples, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"}) + if ok, why := voltageInRange(rail); !ok { + problems = append(problems, fmt.Sprintf("%s=%.2fV (%s)", rail.Label, rail.Volts, why)) + } + } + if d.Sensor != nil { + _ = d.Sensor(ctx, samples) + } + + extras := map[string]any{ + "rails": rails, + "problems": problems, + } + if len(problems) > 0 { + d.Error("PSU: out-of-range rails: " + strings.Join(problems, ", ")) + return Outcome{ + Passed: false, + Message: "PSU rails out of range: " + strings.Join(problems, ", "), + Summary: fmt.Sprintf("%d rails, %d failing", len(rails), len(problems)), + Extras: extras, + } + } + d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal", len(rails))) + return Outcome{ 
+ Passed: true, + Summary: fmt.Sprintf("%d rails nominal", len(rails)), + Extras: extras, + } +} + +type psuRail struct { + Label string `json:"label"` + Volts float64 `json:"volts"` +} + +// scanPSURails walks every hwmon chip looking for in*_input files with +// an accompanying in*_label that mentions a known rail name. Unknown +// labels are skipped rather than flagged — motherboard VRMs report many +// rails that aren't PSU outputs. +func scanPSURails() []psuRail { + root := "/sys/class/hwmon" + chips, err := os.ReadDir(root) + if err != nil { + return nil + } + var out []psuRail + for _, c := range chips { + base := filepath.Join(root, c.Name()) + files, err := os.ReadDir(base) + if err != nil { + continue + } + for _, f := range files { + name := f.Name() + if !strings.HasPrefix(name, "in") || !strings.HasSuffix(name, "_input") { + continue + } + n := strings.TrimSuffix(strings.TrimPrefix(name, "in"), "_input") + labelPath := filepath.Join(base, "in"+n+"_label") + label := strings.TrimSpace(readFileStr(labelPath)) + if !isPSULabel(label) { + continue + } + raw := strings.TrimSpace(readFileStr(filepath.Join(base, name))) + mv, err := strconv.Atoi(raw) + if err != nil { + continue + } + out = append(out, psuRail{Label: label, Volts: float64(mv) / 1000}) + } + } + return out +} + +// isPSULabel filters labels that look like PSU rails. Keeps a small +// allowlist to avoid flagging CPU VRM rails as PSU failures. +func isPSULabel(label string) bool { + l := strings.ToLower(label) + switch { + case strings.Contains(l, "12v"), strings.Contains(l, "5v"), + strings.Contains(l, "3.3v"), strings.Contains(l, "3v3"), + strings.Contains(l, "vccin"): + return true + } + return false +} + +// voltageInRange returns (ok, reason). A label like "12V" has a 12.0V +// nominal; we accept ±10%. Unknown labels pass. 
+func voltageInRange(r psuRail) (bool, string) { + nom := nominalFor(r.Label) + if nom == 0 { + return true, "" + } + delta := r.Volts - nom + if delta < 0 { + delta = -delta + } + if delta/nom > 0.10 { + return false, fmt.Sprintf("expected ~%.1fV", nom) + } + return true, "" +} + +func nominalFor(label string) float64 { + l := strings.ToLower(label) + switch { + case strings.Contains(l, "12v"): + return 12.0 + case strings.Contains(l, "5v"): + return 5.0 + case strings.Contains(l, "3.3v"), strings.Contains(l, "3v3"): + return 3.3 + } + return 0 +} + +func readFileStr(p string) string { + b, err := os.ReadFile(p) + if err != nil { + return "" + } + return string(b) +} diff --git a/agent/tests/smart.go b/agent/tests/smart.go new file mode 100644 index 0000000..987f46d --- /dev/null +++ b/agent/tests/smart.go @@ -0,0 +1,152 @@ +package tests + +import ( + "context" + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" +) + +// SMART runs smartctl -a on each block device the kernel exposes. We +// pass each device's result through smartctl --json output and key on: +// +// smart_status.passed -> overall-health PASSED +// ata_smart_attributes -> per-attribute raw + threshold (ATA only) +// nvme_smart_health_information_log -> NVMe health flags +// +// Missing smartctl / unsupported device (e.g. QEMU virtio-blk) just +// surfaces as a per-disk "skipped" entry; the stage only fails if at +// least one disk reports !passed. 
+func SMART(ctx context.Context, d Deps) Outcome { + disks, err := listBlockDisks() + if err != nil { + d.Warn("SMART: failed to enumerate /sys/class/block: " + err.Error()) + return Outcome{Passed: true, Summary: "skipped (no block devices enumerable)", Extras: map[string]any{"skipped": true}} + } + if len(disks) == 0 { + d.Info("SMART: no physical disks found — skipping stage") + return Outcome{Passed: true, Summary: "skipped (no disks)", Extras: map[string]any{"skipped": true}} + } + + type diskReport struct { + Device string `json:"device"` + Passed bool `json:"passed"` + Skipped bool `json:"skipped,omitempty"` + Reason string `json:"reason,omitempty"` + Raw map[string]any `json:"raw,omitempty"` + } + + var reports []diskReport + failed := 0 + usable := 0 + for _, dev := range disks { + rep := diskReport{Device: dev} + out, err := runSmartctl(ctx, dev) + if err != nil { + rep.Skipped = true + rep.Reason = err.Error() + reports = append(reports, rep) + d.Info("SMART: " + dev + " skipped (" + err.Error() + ")") + continue + } + usable++ + rep.Raw = out + if passed, ok := smartPassed(out); ok { + rep.Passed = passed + if !passed { + failed++ + d.Error(fmt.Sprintf("SMART: %s reports FAILED", dev)) + } else { + d.Info(fmt.Sprintf("SMART: %s PASSED", dev)) + } + } else { + rep.Skipped = true + rep.Reason = "no smart_status in output" + } + reports = append(reports, rep) + } + + extras := map[string]any{ + "disks": reports, + "tested": usable, + "failing": failed, + } + if failed > 0 { + return Outcome{ + Passed: false, + Message: fmt.Sprintf("%d disk(s) report SMART FAILED", failed), + Summary: fmt.Sprintf("%d/%d failing", failed, usable), + Extras: extras, + } + } + summary := fmt.Sprintf("%d disks, %d SMART-reporting, all PASSED", len(disks), usable) + if usable == 0 { + summary = "skipped (no smartctl data on any disk)" + extras["skipped"] = true + } + return Outcome{Passed: true, Summary: summary, Extras: extras} +} + +func listBlockDisks() ([]string, error) { + 
entries, err := os.ReadDir("/sys/class/block") + if err != nil { + return nil, err + } + var out []string + for _, e := range entries { + name := e.Name() + if !isRealBlockDisk(name) { + continue + } + out = append(out, "/dev/"+name) + } + return out, nil +} + +func isRealBlockDisk(name string) bool { + if strings.HasPrefix(name, "loop") || strings.HasPrefix(name, "ram") || + strings.HasPrefix(name, "zram") || strings.HasPrefix(name, "dm-") { + return false + } + partPath := filepath.Join("/sys/class/block", name, "partition") + if _, err := os.Stat(partPath); err == nil { + return false + } + return true +} + +// runSmartctl invokes `smartctl -aj ` and returns the parsed JSON. +// Exit code 4 means smartctl found no device info (e.g. virtio), which +// we surface as a skip rather than a failure. +func runSmartctl(ctx context.Context, dev string) (map[string]any, error) { + cmd := exec.CommandContext(ctx, "smartctl", "-aj", dev) + out, err := cmd.Output() + if len(out) == 0 { + if err != nil { + return nil, fmt.Errorf("smartctl: %w", err) + } + return nil, fmt.Errorf("empty smartctl output") + } + var parsed map[string]any + if jerr := json.Unmarshal(out, &parsed); jerr != nil { + return nil, fmt.Errorf("parse smartctl output: %w", jerr) + } + // Even with a non-zero exit code, if we got valid JSON with + // smart_status, trust the structured result. + return parsed, nil +} + +// smartPassed extracts smart_status.passed from a smartctl --json blob. +// Returns (passed, present) so callers can distinguish "passed=false" +// from "attribute missing". 
+func smartPassed(out map[string]any) (bool, bool) { + status, ok := out["smart_status"].(map[string]any) + if !ok { + return false, false + } + passed, ok := status["passed"].(bool) + return passed, ok +} diff --git a/agent/tests/stage.go b/agent/tests/stage.go new file mode 100644 index 0000000..03b8b71 --- /dev/null +++ b/agent/tests/stage.go @@ -0,0 +1,67 @@ +// Package tests contains the per-stage executors the agent runs on the +// host under test. Each stage implements Runner, is called with a +// Context that carries the client + forwarder + run params, and returns +// an Outcome that the caller POSTs to /result. +package tests + +import ( + "context" + "encoding/json" + "time" +) + +// Outcome is what a stage returns; it maps directly to the /result body. +// - Passed=true and len(Skipped)>0 counts as a pass but surfaces in the +// tile summary so operators can see "GPU: skipped (no VGA device)". +// - Message is only used on failure; the UI displays it in the log. +// - Extras is merged into the posted summary so stages can add +// their own shape (e.g. Storage returns per-disk probe results). +type Outcome struct { + Passed bool + Message string + Summary string // short human-readable one-liner + Extras map[string]any // merged into posted summary JSON +} + +// MarshalSummary builds the summary JSON body POSTed to /result. +// Stages accumulate fields via Extras; this helper adds "summary" (the +// human-readable line) and serializes. +func (o Outcome) MarshalSummary() (json.RawMessage, error) { + body := map[string]any{} + for k, v := range o.Extras { + body[k] = v + } + if o.Summary != "" { + body["summary"] = o.Summary + } + return json.Marshal(body) +} + +// Deps bundles what stages need without pulling in the whole agent. +// Logger methods print to stdout + forward to the orchestrator; Sensor +// drops numeric samples; OverrideFlags carries operator-set bypasses. 
+type Deps struct { + Info func(string) + Warn func(string) + Error func(string) + Sensor func(ctx context.Context, samples []Sample) error + OverrideWipe bool + ExpectedDisks []ExpectedDisk // serials + sizes from host.expected_spec + StageTimeout time.Duration +} + +// Sample mirrors the server's SensorSample but lives in the tests +// package so probe code doesn't import internal/api. +type Sample struct { + Kind string + Key string + Value float64 + Unit string +} + +// ExpectedDisk is the subset of internal/spec.DiskSpec that Storage +// needs: a device allowlist keyed on serial. +type ExpectedDisk struct { + Serial string + SizeGB int +} diff --git a/agent/tests/storage.go b/agent/tests/storage.go new file mode 100644 index 0000000..dcd8015 --- /dev/null +++ b/agent/tests/storage.go @@ -0,0 +1,298 @@ +package tests + +import ( + "context" + "encoding/json" + "fmt" + "os/exec" + "strings" + "time" +) + +// Storage is the destructive stage: badblocks (write-mode sample) + fio +// random IO, persisting IOPS + latency as measurements. Pre-gates: +// +// 1. Device allowlist: only act on /dev/ where the kernel-reported +// serial matches one of Deps.ExpectedDisks. This is the operator's +// contract for what can be written to. USB sticks and unexpected +// drives are excluded. +// 2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem +// signatures, partition tables, or LVM metadata → fail with +// UnexpectedData unless Deps.OverrideWipe is set. +// +// Only after those pass does the stage run `badblocks -b 4096 -c 64 -w` +// and `fio` in write mode. This matches the plan's "destructive disk +// tests are always-on, gated by layered safety." 
+func Storage(ctx context.Context, d Deps) Outcome {
+	if len(d.ExpectedDisks) == 0 {
+		d.Info("Storage: no expected disks in spec — skipping stage")
+		return Outcome{
+			Passed:  true,
+			Summary: "skipped (no expected disks)",
+			Extras:  map[string]any{"skipped": true, "reason": "no_expected_disks"},
+		}
+	}
+
+	targets := resolveTargets(d.ExpectedDisks)
+	if len(targets) == 0 {
+		d.Error("Storage: none of the expected disks are present on this host")
+		return Outcome{
+			Passed:  false,
+			Message: "device allowlist matched zero disks",
+			Summary: "no allowed disks present",
+			Extras:  map[string]any{"expected": d.ExpectedDisks},
+		}
+	}
+
+	// Wipe probe on every target. A single dirty disk halts the stage
+	// unless the operator has set OverrideWipe via the UI.
+	probes := map[string]wipeProbeResult{}
+	dirty := []string{}
+	for _, t := range targets {
+		probe := probeWipe(ctx, t.Device)
+		probes[t.Device] = probe
+		if probe.HasData {
+			dirty = append(dirty, t.Device)
+		}
+	}
+	if len(dirty) > 0 && !d.OverrideWipe {
+		d.Error("Storage: wipe probe found existing data on: " + strings.Join(dirty, ", "))
+		return Outcome{
+			Passed:  false,
+			Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)",
+			Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
+			Extras: map[string]any{
+				"wipe_probe":    probes,
+				"override_hint": "click 'Override wipe & retry' in the held tile",
+				"dirty_devices": dirty,
+			},
+		}
+	}
+	if d.OverrideWipe && len(dirty) > 0 {
+		d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
+	}
+
+	var samples []Sample
+	perDisk := map[string]any{}
+
+	// publish forwards the fio measurements gathered so far. It runs on
+	// every exit path below so a failure on one disk does not discard
+	// the numbers already collected from the disks before it. Delivery
+	// stays best-effort: a Sensor error is logged, never fatal.
+	publish := func() {
+		if d.Sensor == nil || len(samples) == 0 {
+			return
+		}
+		if err := d.Sensor(ctx, samples); err != nil {
+			d.Warn("Storage: could not publish fio measurements: " + err.Error())
+		}
+	}
+
+	// Per target: short badblocks write sample + fio random-read/write.
+	for _, t := range targets {
+		d.Info("Storage: running badblocks write sample on " + t.Device)
+		bb := runBadblocks(ctx, t.Device)
+		if !bb.OK {
+			// Do not follow up with fio: the disk already failed, and
+			// more write load on it only delays the verdict.
+			perDisk[t.Device] = map[string]any{"badblocks": bb}
+			publish()
+			return Outcome{
+				Passed:  false,
+				Message: "badblocks found errors on " + t.Device,
+				Summary: "badblocks failed on " + t.Device,
+				Extras:  map[string]any{"per_disk": perDisk, "wipe_probe": probes},
+			}
+		}
+
+		d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
+		fr := runFio(ctx, t.Device)
+		perDisk[t.Device] = map[string]any{
+			"badblocks": bb,
+			"fio":       fr,
+		}
+		if fr.Error != "" {
+			// fio is the "does this disk service IO at all" check; a
+			// run that errored must fail the stage instead of being
+			// recorded as 0 IOPS on a passing disk.
+			publish()
+			return Outcome{
+				Passed:  false,
+				Message: "fio failed on " + t.Device + ": " + fr.Error,
+				Summary: "fio failed on " + t.Device,
+				Extras:  map[string]any{"per_disk": perDisk, "wipe_probe": probes},
+			}
+		}
+		samples = append(samples,
+			Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
+			Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
+		)
+	}
+	publish()
+
+	d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets)))
+	return Outcome{
+		Passed:  true,
+		Summary: fmt.Sprintf("%d disks passed", len(targets)),
+		Extras:  map[string]any{"per_disk": perDisk, "wipe_probe": probes},
+	}
+}
+
+// diskTarget pairs an expected serial with the device node it resolved to.
+type diskTarget struct {
+	Serial string
+	Device string
+}
+
+// resolveTargets maps expected-disk serials to /dev/ paths by reading
+// /sys/block. Uses the same mechanism as probes.inventory to avoid drift.
+//
+// Serials are compared case-insensitively. Expected entries with an empty
+// serial, and serials with no matching device, are dropped. A failure to
+// enumerate block devices yields nil, which Storage reports as "no
+// allowed disks present".
+func resolveTargets(expected []ExpectedDisk) []diskTarget {
+	disks, err := listBlockDisks()
+	if err != nil {
+		return nil
+	}
+	// Build serial → device map from /sys.
+	serialOf := make(map[string]string, len(disks))
+	for _, dev := range disks {
+		name := strings.TrimPrefix(dev, "/dev/")
+		s := diskSerialFromSys(name)
+		if s != "" {
+			serialOf[strings.ToLower(s)] = dev
+		}
+	}
+	var out []diskTarget
+	for _, e := range expected {
+		if e.Serial == "" {
+			continue
+		}
+		if dev, ok := serialOf[strings.ToLower(e.Serial)]; ok {
+			out = append(out, diskTarget{Serial: e.Serial, Device: dev})
+		}
+	}
+	return out
+}
+
+// diskSerialFromSys is a smaller copy of probes.diskSerial; importing it
+// from internal/probes would cause a cycle so we duplicate the short
+// lookup. If it drifts from the inventory probe, Storage fails because
+// the serial doesn't match — which is the correct behavior.
+//
+// It returns the trimmed serial for the block device called name (no
+// "/dev/" prefix), or "" when neither sysfs nor udev reports one.
+func diskSerialFromSys(name string) string {
+	for _, rel := range []string{
+		"/sys/block/" + name + "/device/serial",
+		"/sys/block/" + name + "/serial",
+	} {
+		b, err := readFileBytes(rel)
+		if err != nil {
+			continue
+		}
+		s := strings.TrimSpace(string(b))
+		if s != "" {
+			return s
+		}
+	}
+	// Fall back to udevadm — ID_SERIAL_SHORT is more reliable on SCSI.
+	// The lookup is bounded so a wedged udev cannot stall the stage
+	// before it has even picked its targets.
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	out, err := exec.CommandContext(ctx, "udevadm", "info", "--query=property", "--name="+name).Output()
+	if err != nil {
+		return ""
+	}
+	for _, line := range strings.Split(string(out), "\n") {
+		if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok {
+			return strings.TrimSpace(v)
+		}
+	}
+	return ""
+}
+
+// readFileBytes reads the file at p; it is a thin alias over the
+// package-level readFile helper.
+func readFileBytes(p string) ([]byte, error) {
+	return readFile(p)
+}
+
+// ---------- wipe probe ----------
+
+// wipeProbeResult is the per-device verdict of the pre-write data check.
+// It is serialized into the stage outcome's "wipe_probe" extras.
+type wipeProbeResult struct {
+	Device   string   `json:"device"`
+	HasData  bool     `json:"has_data"`
+	Findings []string `json:"findings,omitempty"`
+}
+
+// probeWipe runs blkid + wipefs -n. Any non-empty output from either is
+// a "has data" signal. This is deliberately conservative: we'd rather
+// halt on a bare ext4 signature than hand badblocks a disk with real
+// bytes on it.
+func probeWipe(ctx context.Context, device string) wipeProbeResult { + out := wipeProbeResult{Device: device} + + if b, err := exec.CommandContext(ctx, "blkid", "-o", "full", device).Output(); err == nil { + s := strings.TrimSpace(string(b)) + if s != "" { + out.Findings = append(out.Findings, "blkid: "+s) + out.HasData = true + } + } + if b, err := exec.CommandContext(ctx, "wipefs", "--no-act", device).Output(); err == nil { + s := strings.TrimSpace(string(b)) + // wipefs prints a header line even on a clean disk; keep only + // lines with actual signature data. + for _, line := range strings.Split(s, "\n") { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "DEVICE") || strings.HasPrefix(line, "offset") { + continue + } + out.Findings = append(out.Findings, "wipefs: "+line) + out.HasData = true + } + } + return out +} + +// ---------- badblocks ---------- + +type badblocksResult struct { + OK bool `json:"ok"` + Elapsed string `json:"elapsed"` + Error string `json:"error,omitempty"` + OutputTail string `json:"output_tail,omitempty"` +} + +func runBadblocks(ctx context.Context, device string) badblocksResult { + // -c 64 blocks per check, -w destructive write, -b 4096 block size, + // -t pattern. We only sample 256MiB (65536 × 4k) so the stage stays + // bounded. A real burn-in would run the whole disk; that belongs in + // a separate "deep" stage. + args := []string{"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536"} + start := time.Now() + runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) + defer cancel() + cmd := exec.CommandContext(runCtx, "badblocks", args...) + out, err := cmd.CombinedOutput() + r := badblocksResult{Elapsed: time.Since(start).Round(time.Second).String(), OutputTail: tailLines(string(out), 10)} + if err != nil { + r.Error = err.Error() + return r + } + // badblocks prints each bad block to stdout. Empty output = clean. 
+ if strings.TrimSpace(string(out)) == "" { + r.OK = true + } else { + r.Error = "bad blocks found" + } + return r +} + +// ---------- fio ---------- + +type fioResult struct { + ReadIOPS float64 `json:"read_iops"` + WriteIOPS float64 `json:"write_iops"` + ReadBWKBps float64 `json:"read_bw_kbps"` + WriteBWKBps float64 `json:"write_bw_kbps"` + Error string `json:"error,omitempty"` +} + +// runFio kicks off a tiny random-rw job: 2 jobs × 64MB × 4k blocks. +// This is a health bar, not a benchmark — we want to know the disk +// services IO, not how fast it is at p99. +func runFio(ctx context.Context, device string) fioResult { + runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) + defer cancel() + args := []string{ + "--name=health", "--filename=" + device, "--rw=randrw", + "--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0", + "--group_reporting", "--output-format=json", "--direct=1", + } + cmd := exec.CommandContext(runCtx, "fio", args...) + out, err := cmd.Output() + if err != nil { + return fioResult{Error: err.Error()} + } + var top struct { + Jobs []struct { + Read struct { + IOPS float64 `json:"iops"` + BW float64 `json:"bw"` + } `json:"read"` + Write struct { + IOPS float64 `json:"iops"` + BW float64 `json:"bw"` + } `json:"write"` + } `json:"jobs"` + } + if err := json.Unmarshal(out, &top); err != nil || len(top.Jobs) == 0 { + return fioResult{Error: "parse fio json: " + fmt.Sprint(err)} + } + j := top.Jobs[0] + return fioResult{ + ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS, + ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW, + } +} diff --git a/agent/tests/util.go b/agent/tests/util.go new file mode 100644 index 0000000..56bef66 --- /dev/null +++ b/agent/tests/util.go @@ -0,0 +1,21 @@ +package tests + +import ( + "fmt" + "os" +) + +// readFile is used by stages that need to peek at /sys files without +// importing the agent's probes package (which would cycle). 
+func readFile(p string) ([]byte, error) {
+	return os.ReadFile(p)
+}
+
+// formatCount pluralizes a count + label: (0, "disk") → "0 disks",
+// (1, "disk") → "1 disk", (n, "disk") → "n disks". Keeps log lines tidy.
+func formatCount(n int, label string) string {
+	if n == 1 {
+		return fmt.Sprintf("%d %s", n, label)
+	}
+	return fmt.Sprintf("%d %ss", n, label)
+}
diff --git a/cmd/vetting-agent/main.go b/cmd/vetting-agent/main.go
new file mode 100644
index 0000000..44e0b60
--- /dev/null
+++ b/cmd/vetting-agent/main.go
@@ -0,0 +1,39 @@
+package main
+
+import (
+	"context"
+	"errors"
+	"flag"
+	"log"
+	"os"
+	"os/signal"
+	"syscall"
+
+	"vetting/agent"
+	"vetting/agent/bootstate"
+)
+
+// main parses the boot parameters from the kernel cmdline and runs the
+// agent until it finishes or SIGINT/SIGTERM cancels the context.
+func main() {
+	cmdlinePath := flag.String("cmdline", "/proc/cmdline", "path to kernel cmdline (override for local testing)")
+	flag.Parse()
+
+	p, err := bootstate.ParseCmdline(*cmdlinePath)
+	if err != nil {
+		log.Fatalf("bootstate: %v", err)
+	}
+	log.Printf("vetting-agent starting: run=%d mac=%s orchestrator=%s", p.RunID, p.MAC, p.OrchestratorURL)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	sig := make(chan os.Signal, 1)
+	signal.Notify(sig, os.Interrupt, syscall.SIGTERM)
+	go func() {
+		<-sig
+		log.Printf("vetting-agent: signal received, shutting down")
+		cancel()
+	}()
+
+	// errors.Is rather than ==: a cancellation that was wrapped with
+	// context on its way up is still a clean shutdown, not a failure.
+	if err := agent.Run(ctx, p); err != nil && !errors.Is(err, context.Canceled) {
+		log.Fatalf("agent: %v", err)
+	}
+}
diff --git a/cmd/vetting/main.go b/cmd/vetting/main.go
new file mode 100644
index 0000000..9684211
--- /dev/null
+++ b/cmd/vetting/main.go
@@ -0,0 +1,249 @@
+package main
+
+import (
+	"context"
+	"crypto/tls"
+	"errors"
+	"flag"
+	"log"
+	"net/http"
+	"os"
+	"os/signal"
+	"path/filepath"
+	"syscall"
+	"time"
+
+	"vetting/internal/api"
+	"vetting/internal/auth"
+	"vetting/internal/config"
+	"vetting/internal/db"
+	"vetting/internal/events"
+	"vetting/internal/httpserver"
+	"vetting/internal/janitor"
+	"vetting/internal/logs"
+	"vetting/internal/model"
+	"vetting/internal/notify"
+	"vetting/internal/orchestrator"
+	"vetting/internal/pxe"
+	"vetting/internal/store"
+	"vetting/internal/web/templates"
+)
+
+// main wires the orchestrator together: config, database, stores, event
+// and log hubs, the run dispatcher, janitor, iperf3 and (optionally)
+// dnsmasq supervisors, and the HTTP server. It then blocks until a
+// shutdown signal arrives or the HTTP server fails, and tears the
+// background services down in either case.
+func main() {
+	configPath := flag.String("config", "deploy/vetting.example.yaml", "path to vetting.yaml")
+	flag.Parse()
+
+	cfg, err := config.Load(*configPath)
+	if err != nil {
+		log.Fatalf("load config: %v", err)
+	}
+
+	for _, dir := range []string{
+		filepath.Dir(cfg.Database.Path),
+		cfg.Artifacts.Dir,
+		cfg.Logs.Dir,
+	} {
+		if err := os.MkdirAll(dir, 0o755); err != nil {
+			log.Fatalf("mkdir %s: %v", dir, err)
+		}
+	}
+
+	conn, err := db.Open(cfg.Database.Path)
+	if err != nil {
+		log.Fatalf("open db: %v", err)
+	}
+	defer func() { _ = conn.Close() }()
+
+	secret, err := cfg.Auth.SessionSecret()
+	if err != nil {
+		log.Fatalf("auth: %v", err)
+	}
+	authMgr := &auth.Manager{
+		PasswordHash: cfg.Auth.AdminPasswordBcrypt,
+		Secret:       secret,
+		TTL:          time.Duration(cfg.Auth.SessionTTLHours) * time.Hour,
+	}
+	if err := validateAuth(cfg, authMgr); err != nil {
+		log.Fatalf("auth: %v", err)
+	}
+
+	hostStore := &store.Hosts{DB: conn}
+	runStore := &store.Runs{DB: conn}
+	stageStore := &store.Stages{DB: conn}
+	artifactStore := &store.Artifacts{DB: conn}
+	specDiffStore := &store.SpecDiffs{DB: conn}
+	measurementStore := &store.Measurements{DB: conn}
+
+	hub := events.NewHub()
+
+	logHub, err := logs.NewHub(cfg.Logs.Dir, hub)
+	if err != nil {
+		log.Fatalf("logs hub: %v", err)
+	}
+	defer logHub.Close()
+
+	runner := &orchestrator.Runner{
+		Runs:     runStore,
+		Hosts:    hostStore,
+		Stages:   stageStore,
+		EventHub: hub,
+	}
+
+	tiles := &api.TileEnricher{
+		Runs:      runStore,
+		Artifacts: artifactStore,
+		SpecDiffs: specDiffStore,
+	}
+
+	// Inject a templ renderer so the Runner can publish tile-refresh
+	// fragments via SSE without pulling web/templates into the
+	// orchestrator package. The closure enriches the tile with spec-
+	// diff count and hold-key path so every tile render shows the
+	// same data, whether it came from /events or an initial page load.
+	orchestrator.TileRenderer = func(ctx context.Context, host model.Host, latest *model.Run) string {
+		return templates.RenderTileString(tiles.Build(ctx, host, latest))
+	}
+
+	notifyReg, err := notify.BuildRegistry(cfg.Notifiers, cfg.Routes)
+	if err != nil {
+		log.Fatalf("notify: %v", err)
+	}
+
+	ui := &api.UI{
+		Hosts:     hostStore,
+		Runs:      runStore,
+		Artifacts: artifactStore,
+		Auth:      authMgr,
+		EventHub:  hub,
+		Runner:    runner,
+		Tiles:     tiles,
+	}
+
+	agentAPI := &api.Agent{
+		Hosts:           hostStore,
+		Runs:            runStore,
+		Stages:          stageStore,
+		Artifacts:       artifactStore,
+		SpecDiffs:       specDiffStore,
+		Measurements:    measurementStore,
+		Runner:          runner,
+		EventHub:        hub,
+		Logs:            logHub,
+		Notify:          notifyReg,
+		ArtifactsDir:    cfg.Artifacts.Dir,
+		OrchestratorURL: cfg.PXE.OrchestratorURL,
+		PublicURL:       cfg.Server.PublicURL,
+		IperfPort:       cfg.Network.IperfPort,
+	}
+	agentAPI.LiveKernelURL, agentAPI.LiveInitrdURL = pxe.BuildLiveURLs(cfg.PXE.OrchestratorURL)
+
+	dispatcher := orchestrator.NewDispatcher(cfg.Dispatcher.MaxConcurrentRuns, runStore, hostStore, runner)
+	iperfSup := orchestrator.NewIperfSupervisor(cfg.Network.IperfPort)
+
+	janitorSvc := janitor.New(janitor.Config{
+		ArtifactRetention: time.Duration(cfg.Artifacts.RetentionDays) * 24 * time.Hour,
+		LogRetention:      time.Duration(cfg.Logs.RetentionDays) * 24 * time.Hour,
+		Interval:          time.Duration(cfg.Janitor.IntervalMinutes) * time.Minute,
+	}, &janitor.StoreAdapter{Runs: runStore, Artifacts: artifactStore, Logs: logHub})
+
+	// Default the TFTP root to a sibling of the log directory when the
+	// config leaves it empty.
+	tftpRoot := cfg.PXE.TFTPRoot
+	if tftpRoot == "" {
+		tftpRoot = filepath.Join(cfg.Logs.Dir, "..", "tftp")
+	}
+	var supervisor *pxe.Supervisor
+	if cfg.PXE.Enabled {
+		supervisor = pxe.NewSupervisor(pxe.SupervisorConfig{
+			Enabled:         true,
+			Interface:       cfg.PXE.Interface,
+			DHCPRange:       cfg.PXE.DHCPRange,
+			OrchestratorURL: cfg.PXE.OrchestratorURL,
+			RuntimeDir:      filepath.Join(cfg.Logs.Dir, "..", "pxe"),
+			TFTPRoot:        tftpRoot,
+		})
+	}
+
+	router := httpserver.NewRouter(httpserver.Deps{
+		Auth:    authMgr,
+		UI:      ui,
+		Agent:   agentAPI,
+		LiveDir: cfg.PXE.LiveDir,
+	})
+
+	srv := &http.Server{
+		Addr:              cfg.Server.Bind,
+		Handler:           router,
+		ReadHeaderTimeout: 10 * time.Second,
+	}
+	if cfg.Server.TLS.Enabled {
+		srv.TLSConfig = &tls.Config{MinVersion: tls.VersionTLS12}
+	}
+
+	shutdown := make(chan os.Signal, 1)
+	signal.Notify(shutdown, os.Interrupt, syscall.SIGTERM)
+
+	rootCtx, cancelRoot := context.WithCancel(context.Background())
+	defer cancelRoot()
+
+	dispatcher.Start(rootCtx)
+	janitorSvc.Start(rootCtx)
+
+	if err := iperfSup.Start(rootCtx); err != nil {
+		log.Fatalf("start iperf3: %v", err)
+	}
+
+	if supervisor != nil {
+		hosts, err := hostStore.List(rootCtx)
+		if err != nil {
+			log.Fatalf("list hosts for dnsmasq: %v", err)
+		}
+		if err := supervisor.Start(rootCtx, hosts); err != nil {
+			log.Fatalf("start dnsmasq: %v", err)
+		}
+	}
+
+	// The serve goroutine reports a fatal listener error back to main
+	// instead of calling log.Fatalf itself: exiting from the goroutine
+	// would skip the teardown below and leave the supervised iperf3 and
+	// dnsmasq processes without a parent to stop them.
+	serveErr := make(chan error, 1)
+	go func() {
+		log.Printf("vetting listening on %s (tls=%v, db=%s)", cfg.Server.Bind, cfg.Server.TLS.Enabled, cfg.Database.Path)
+		var err error
+		if cfg.Server.TLS.Enabled {
+			err = srv.ListenAndServeTLS(cfg.Server.TLS.CertFile, cfg.Server.TLS.KeyFile)
+		} else {
+			err = srv.ListenAndServe()
+		}
+		if err != nil && !errors.Is(err, http.ErrServerClosed) {
+			serveErr <- err
+		}
+	}()
+
+	var fatal error
+	select {
+	case <-shutdown:
+		log.Printf("shutting down")
+	case fatal = <-serveErr:
+		log.Printf("server failed, shutting down: %v", fatal)
+	}
+
+	dispatcher.Stop()
+	janitorSvc.Stop()
+	_ = iperfSup.Shutdown(3 * time.Second)
+	if supervisor != nil {
+		_ = supervisor.Shutdown(5 * time.Second)
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	if err := srv.Shutdown(ctx); err != nil {
+		log.Printf("server shutdown: %v", err)
+	}
+	_ = hub.Shutdown(ctx)
+
+	if fatal != nil {
+		// log.Fatalf exits without running deferred calls, so release
+		// what the defers above would have released before exiting
+		// with a non-zero status.
+		logHub.Close()
+		_ = conn.Close()
+		log.Fatalf("server: %v", fatal)
+	}
+}
+
+// validateAuth rejects configurations whose admin password hash is
+// missing, still the shipped placeholder, or not shaped like a crypt
+// hash (too short or not starting with '$'). The *auth.Manager argument
+// is currently unused.
+func validateAuth(cfg *config.Config, _ *auth.Manager) error {
+	if cfg.Auth.AdminPasswordBcrypt == "" || cfg.Auth.AdminPasswordBcrypt == "$2a$10$REPLACE_ME_WITH_A_REAL_BCRYPT_HASH_0123456789abcdefABCDEFxx" {
+		return errPlaceholderPassword
+	}
+	if len(cfg.Auth.AdminPasswordBcrypt) < 4 || cfg.Auth.AdminPasswordBcrypt[0] != '$' {
+		// Not the placeholder — say what is actually wrong so the
+		// operator doesn't go looking for a value they already replaced.
+		return errMalformedPassword
+	}
+	return nil
+}
+
+var (
+	errPlaceholderPassword = plainErr("auth.admin_password_bcrypt is the placeholder; run bin/gen-admin-password and paste the hash into your config")
+	errMalformedPassword   = plainErr("auth.admin_password_bcrypt does not look like a bcrypt hash; run bin/gen-admin-password and paste the hash into your config")
+)
+
+// plainErr is a string-backed error, usable as a constant-like sentinel.
+type plainErr string
+
+func (e plainErr) Error() string { return string(e) }
diff --git a/deploy/install.sh b/deploy/install.sh
new file mode 100644
index 0000000..10dddfd
--- /dev/null
+++ b/deploy/install.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+# install.sh — one-shot installer for the vetting orchestrator on a
+# Proxmox LXC (or any Debian/Ubuntu host).
+#
+# What it does:
+#   1. apt-installs runtime dependencies (dnsmasq, iperf3, ca-certs).
+#   2. Creates the `vetting` system user with /var/lib/vetting homedir.
+#   3. Copies the pre-built `vetting` binary into /usr/local/bin.
+#   4. Drops the systemd unit and example config into /etc/vetting.
+#   5. Reminds the operator to edit the config and set a bcrypt
+#      password before enabling the service — we don't auto-start
+#      because a placeholder password would just refuse to boot.
+#
+# What it deliberately does NOT do:
+#   - Build the orchestrator (this script assumes you ran
+#     `make orchestrator-linux` beforehand and that bin/vetting-linux-amd64
+#     exists alongside this script, or pass --binary to locate it).
+#   - Install the live image or TFTP payloads — those are separate,
+#     since most operators want to build them from a pinned CI artifact
+#     rather than on the LXC itself.
+# +# Usage: +# sudo ./install.sh [--binary PATH] [--config-dir /etc/vetting] +# +set -euo pipefail + +BINARY="" +CONFIG_DIR="/etc/vetting" +STATE_DIR="/var/lib/vetting" +LOG_DIR="/var/log/vetting" +SERVICE_USER="vetting" + +usage() { + cat <&2; usage; exit 2 ;; + esac +done + +if [[ $EUID -ne 0 ]]; then + echo "install.sh must be run as root (try: sudo $0)" >&2 + exit 1 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +if [[ -z "${BINARY}" ]]; then + for cand in \ + "${REPO_ROOT}/bin/vetting-linux-amd64" \ + "${REPO_ROOT}/bin/vetting" \ + "${SCRIPT_DIR}/vetting"; do + if [[ -x "${cand}" ]]; then BINARY="${cand}"; break; fi + done +fi +if [[ -z "${BINARY}" || ! -x "${BINARY}" ]]; then + echo "could not find a vetting binary to install; pass --binary PATH or run 'make orchestrator-linux' first" >&2 + exit 1 +fi + +echo "==> installing runtime dependencies" +export DEBIAN_FRONTEND=noninteractive +apt-get update -qq +apt-get install -y --no-install-recommends \ + ca-certificates dnsmasq iperf3 + +echo "==> creating ${SERVICE_USER} user" +if ! id -u "${SERVICE_USER}" >/dev/null 2>&1; then + useradd --system \ + --home-dir "${STATE_DIR}" \ + --shell /usr/sbin/nologin \ + "${SERVICE_USER}" +fi + +echo "==> preparing directories" +install -d -m 0755 -o "${SERVICE_USER}" -g "${SERVICE_USER}" "${STATE_DIR}" +install -d -m 0755 -o "${SERVICE_USER}" -g "${SERVICE_USER}" "${LOG_DIR}" +install -d -m 0755 "${CONFIG_DIR}" + +echo "==> installing binary" +install -m 0755 "${BINARY}" /usr/local/bin/vetting + +echo "==> installing config and systemd unit" +if [[ ! 
-f "${CONFIG_DIR}/vetting.yaml" ]]; then + install -m 0640 -o root -g "${SERVICE_USER}" \ + "${SCRIPT_DIR}/vetting.example.yaml" \ + "${CONFIG_DIR}/vetting.yaml" + echo " -> installed default config at ${CONFIG_DIR}/vetting.yaml" +else + echo " -> preserving existing ${CONFIG_DIR}/vetting.yaml" +fi +install -m 0644 "${SCRIPT_DIR}/vetting.service" /etc/systemd/system/vetting.service + +# Disable the distro's dnsmasq so only the orchestrator-supervised +# instance owns DHCP/TFTP. Operators who want to keep dnsmasq for +# something else can re-enable it after configuring a disjoint listen +# address. +if systemctl is-enabled --quiet dnsmasq 2>/dev/null; then + echo "==> disabling distro dnsmasq (orchestrator supervises its own)" + systemctl disable --now dnsmasq +fi + +systemctl daemon-reload + +cat <:8443`, log in and register a host: + - Name: `qemu-test` + - MAC: `52:54:00:12:34:56` + - WoL broadcast IP: `10.77.0.255` + - Expected spec: paste a minimal YAML like + ```yaml + memory: { total_gib: 4 } + cpu: { logical_cores: 4 } + ``` + +3. Click **Start Vetting**. The UI tile will sit at `Queued → WaitingWoL`. + +4. Launch the QEMU VM on the bridge so it PXE-boots from dnsmasq: + + ``` + sudo qemu-system-x86_64 \ + -enable-kvm -cpu host -smp 4 -m 4096 \ + -netdev bridge,id=n0,br=br-vetting \ + -device virtio-net-pci,netdev=n0,mac=52:54:00:12:34:56 \ + -drive file=/tmp/test-disk.img,format=raw,if=virtio \ + -boot n -serial mon:stdio -display none + ``` + +5. Watch the tile advance through stages. On success, the tile shows + **View report** and the VM auto-shuts-down. + +For real repaired hardware: same flow, but register the node's actual +MAC + expected spec, and make sure the node's BIOS is set to PXE-boot +from the NIC that's on the `br-vetting` network. + +## A failed run — SSH to the held host + +When a stage fails, the pipeline halts at `FailedHolding` and the +agent installs an orchestrator-issued SSH key into the live-image's +`/root/.ssh/authorized_keys`. 
The UI tile surfaces the IP and the +exact `ssh` command. + +The hold key is **per-run**. Once you're done: + +1. Power the host off (`poweroff` from the SSH session). +2. In the UI, click **Override wipe-probe** only when the failure was + at the `Storage` stage *and* you're sure the disks are expendable. + Otherwise click **Start vetting** on a fresh run from the host + dashboard after fixing the underlying issue. + +## Log + artifact layout + +``` +/var/lib/vetting/ + vetting.db # SQLite: hosts, runs, stages, artifacts, spec_diffs, measurements + artifacts/ + run-/ + report.html # operator-facing summary + report.json # machine-readable summary + inventory.json # raw probe output + fio-.log # storage stage output + iperf-.json # network stage output + hold-.pub # per-run SSH pubkey (only if held) +/var/log/vetting/ + run-.log # append-only per-run log tail +``` + +Retention is governed by the `artifacts.retention_days` and +`logs.retention_days` settings. DB rows (run history) are preserved +indefinitely; only on-disk files get pruned. + +## Troubleshooting + +| Symptom | First check | +|---|---| +| Service refuses to start with `auth.admin_password_bcrypt is the placeholder` | You didn't replace the bcrypt hash in the config. Run `gen-admin-password`. | +| PXE client gets no DHCP offer | `journalctl -u vetting` for dnsmasq errors; confirm the LXC has `CAP_NET_ADMIN` (the shipped systemd unit does); confirm the host MAC is actually registered (`sqlite3 /var/lib/vetting/vetting.db 'SELECT name, mac FROM hosts;'`). | +| Agent `/hello` never fires | Check the live image is actually loading the agent binary — SSH into the live env (use the hold key path), `systemctl status vetting-agent`. | +| Tile stuck on `Booting` | Most likely the live image booted but the agent can't reach the orchestrator. Verify `vetting.orchestrator=` in the kernel cmdline resolves from the host's network. 
| +| UI shows stale stage | Force a reload; the SSE reconnect is automatic but the browser keeps the last state on ephemeral network blips. | +| Notification didn't fire | `journalctl -u vetting \| grep notify:` — delivery is fire-and-forget and the failure reason is logged but not persisted. | + +## Upgrading + +1. `make orchestrator-linux` on your workstation. +2. `scp bin/vetting-linux-amd64 lxc:/tmp/vetting.new` +3. On the LXC: + ``` + sudo systemctl stop vetting + sudo install -m 0755 /tmp/vetting.new /usr/local/bin/vetting + sudo systemctl start vetting + ``` + +The DB migration runs at startup and is append-only — no manual schema +work unless a release's notes call it out. diff --git a/docs/test-suite.md b/docs/test-suite.md new file mode 100644 index 0000000..b3bbdc6 --- /dev/null +++ b/docs/test-suite.md @@ -0,0 +1,166 @@ +# Test suite + +What each stage measures, what "pass" means, and where the results +land. Stages run strictly in order. Any stage returning `passed=false` +halts the pipeline at `FailedHolding` — the operator decides whether +to fix, override, or abandon. + +## Stage order + +``` +Inventory → SpecValidate → SMART → CPUStress → Storage + → Network → GPU → PSU → Reporting +``` + +Stages marked *orchestrator-owned* resolve inside `/result` and never +show up as "the agent's turn". + +--- + +## Inventory + +**Owner:** agent. +**What it does:** `dmidecode`, `lscpu`, `lshw`, `lspci`, `smartctl -i` +over each block device, `nvidia-smi -q` if present. The raw output is +merged into a single JSON blob. +**Pass:** the probes run to completion; missing optional tools (e.g. +`nvidia-smi` on a GPU-less host) are tolerated. +**Artifacts:** `inventory.json` under `artifacts/run-/`. + +## SpecValidate *(orchestrator-owned)* + +**Owner:** orchestrator (resolves inline inside the `/result` for the +preceding Inventory stage). +**What it does:** diffs the submitted inventory against the host's +`expected_spec_yaml`. 
The diff engine classifies each field as +`critical`, `warning`, or `info`. +**Pass:** zero `critical` diffs. +**Fail mode:** fires a `SpecMismatch` notification; transitions run +to `Failed → FailedHolding`. +**Artifacts:** `spec_diffs` table rows (one per divergence). + +## SMART + +**Owner:** agent. +**What it does:** `smartctl -a /dev/sdX` for each disk in the +inventory's `expected_disks`. Parses reallocated-sector counts, pending +sectors, end-to-end error counters, overall-health attribute. +**Pass:** SMART overall-health is PASSED on every expected disk and +reallocated-sector count is below threshold. +**Artifacts:** `smart-sdX.txt` raw output. + +## CPUStress + +**Owner:** agent. +**What it does:** runs `stress-ng --cpu N --vm M --vm-bytes 90% -t +120s` with `N = logical_cores` and `M ≈ logical_cores/2`. The `--vm` +flag is the **stand-in for Memtest86+**: it exercises the memory +subsystem under load and will fail if the RAM has latent faults that +surface under thermal + allocator pressure. +**Pass:** `stress-ng` exits 0 and thermal samples taken by the sidecar +stay below the configured per-host `max_temp_c`. +**Caveat:** weaker than a dedicated memtest pass; see +[architecture.md](architecture.md) for the reasoning (Memtest86+ +can't be signalled back without IPMI serial). + +## Storage + +**Owner:** agent (destructive). +**What it does:** + +1. **Wipe probe** — scans for filesystem signatures, LVM metadata, + partition tables on the expected disks. Any hit → halt with + `UnexpectedData`; operator must click **Override wipe-probe**. +2. `badblocks -b 4096 -c 64 -w -t random` (destructive write-mode + sample of the first 256 MiB) on each expected disk. +3. `fio --rw=randrw --bs=4k --size=64M --numjobs=2 --direct=1` on + each disk; captures read/write IOPS and bandwidth. + +**Pass:** badblocks reports zero bad blocks; fio IOPS above a +per-class floor (configurable). +**Artifacts:** `fio-sdX.json` per disk. 
+**Safety gate:** the wipe-probe + device allowlist are the second and +third lines of defense against wiping the wrong disk. See +[architecture.md § Safety](architecture.md#safety-destructive-disk-tests). + +## Network + +**Owner:** agent. +**What it does:** `iperf3 -c -p -t 10 -J` +to measure throughput to the orchestrator. The orchestrator-side +`iperf3 -s` is supervised by `internal/orchestrator/iperf.go` and +binds to the configured `network.iperf_port`. +**Pass:** throughput ≥ per-class floor (1 Gbps for 1GbE NICs, 9 Gbps +for 10GbE). +**Artifacts:** `iperf-.json`. + +## GPU + +**Owner:** agent. +**What it does:** runs `nvidia-smi -q` and a short compute workload +(`gpu-burn` if present, else `nvidia-smi dmon` during a `stress-ng +--gpu` burst). Skipped cleanly when no GPU is present. +**Pass:** no ECC errors reported; temperature below threshold; compute +workload exits 0. + +## PSU + +**Owner:** agent. +**What it does:** reads `/sys/class/hwmon/*/power_average` and `in*_input` +during a synthetic load burst (CPU + disk + NIC simultaneously) to +look for voltage sag or wattage anomalies. Records the full envelope +as `measurements` rows with `kind=psu`. +**Pass:** no voltage dip below threshold across the load burst. +**Caveat:** only reports on what the BMC exposes via hwmon — servers +without exposed PSU telemetry pass trivially. Documented limitation. + +## Reporting *(orchestrator-owned)* + +**Owner:** orchestrator (resolves inline inside the `/result` for PSU). +**What it does:** + +1. Gathers run, host, stages, spec_diffs, and measurement aggregates. +2. Renders `report.html` via `internal/report` (html/template with + inlined CSS; self-contained offline-viewable). +3. Writes `report.json` with the same data in machine-readable form. +4. Records both as `report_html` / `report_json` artifact rows. +5. Transitions run → `Completed`. +6. Fires `RunCompleted` notification. +7. The next agent heartbeat returns `cmd=shutdown`. 
+ +## Thermal sidecar + +**Owner:** agent (always-on from `Booting` until the agent exits). +**What it does:** every 5 seconds, walks `/sys/class/hwmon/*` and +POSTs temperature samples as a batch to `/sensor`. Populates the +`measurements` table with `kind=thermal`. +**No pass/fail** on its own — stages that care about thermals read the +sidecar's data via `measurements`. A dead sensor just drops out of +the next batch. + +--- + +## Where pass/fail lives + +- `runs.state` — authoritative terminal state (`Completed`, + `FailedHolding`, `Released`). +- `runs.result` — `pass` or `fail` string once the run completes. +- `runs.failed_stage` — name of the stage that halted the pipeline, if + any. Cleared when the operator overrides and re-enters. +- `stages` — one row per attempted stage with `passed`, `started_at`, + `completed_at`, `summary_json`, `message`. +- `measurements` — time-series samples from the thermal sidecar and + from stages that capture numeric outputs. +- `artifacts` — on-disk files (report, fio logs, iperf logs, etc). +- `spec_diffs` — one row per expected-vs-actual divergence. + +## Adding a new stage + +1. Add the name to `store.DefaultStageOrder`. +2. Add a `model.State` const and wire it into + `internal/orchestrator/statemachine.go` (both the forward + transition table and the stage-for-state lookup). +3. Add a case to `agent/runner.go`'s `runStage` dispatch. +4. Drop the implementation into `agent/tests/`. +5. If the stage is orchestrator-owned, add a `resolve` helper to + `internal/api/agent_handlers.go` and invoke it from the `/result` + handler after the preceding stage's `NextState` resolves. 
diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..6eabd64 --- /dev/null +++ b/go.mod @@ -0,0 +1,27 @@ +module vetting + +go 1.23.0 + +require ( + github.com/a-h/templ v0.3.1001 + github.com/go-chi/chi/v5 v5.1.0 + golang.org/x/crypto v0.28.0 + gopkg.in/yaml.v3 v3.0.1 + modernc.org/sqlite v1.33.1 +) + +require ( + github.com/dustin/go-humanize v1.0.1 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/ncruces/go-strftime v0.1.9 // indirect + github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect + golang.org/x/sys v0.34.0 // indirect + modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 // indirect + modernc.org/libc v1.55.3 // indirect + modernc.org/mathutil v1.6.0 // indirect + modernc.org/memory v1.8.0 // indirect + modernc.org/strutil v1.2.0 // indirect + modernc.org/token v1.1.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..ab94186 --- /dev/null +++ b/go.sum @@ -0,0 +1,63 @@ +github.com/a-h/templ v0.3.1001 h1:yHDTgexACdJttyiyamcTHXr2QkIeVF1MukLy44EAhMY= +github.com/a-h/templ v0.3.1001/go.mod h1:oCZcnKRf5jjsGpf2yELzQfodLphd2mwecwG4Crk5HBo= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= +github.com/go-chi/chi/v5 v5.1.0 h1:acVI1TYaD+hhedDJ3r54HyA6sExp3HfXq7QWEEY/xMw= +github.com/go-chi/chi/v5 v5.1.0/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/pprof v0.0.0-20240409012703-83162a5b38cd h1:gbpYu9NMq8jhDVbvlGkMFWCjLFlqqEZjEmObmhUy6Vo= +github.com/google/pprof v0.0.0-20240409012703-83162a5b38cd/go.mod h1:kf6iHlnVGwgKolg33glAes7Yg/8iWP8ukqeldJSO7jw= 
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4= +github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +golang.org/x/crypto v0.28.0 h1:GBDwsMXVQi34v5CCYUm2jkJvu4cbtru2U4TN2PSyQnw= +golang.org/x/crypto v0.28.0/go.mod h1:rmgy+3RHxRZMyY0jjAJShp2zgEdOqj2AO7U0pYmeQ7U= +golang.org/x/mod v0.26.0 h1:EGMPT//Ezu+ylkCijjPc+f4Aih7sZvaAr+O3EHBxvZg= +golang.org/x/mod v0.26.0/go.mod h1:/j6NAhSk8iQ723BGAUyoAcn7SlD7s15Dp9Nd/SfeaFQ= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA= +golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0= +golang.org/x/tools v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw= 
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +modernc.org/cc/v4 v4.21.4 h1:3Be/Rdo1fpr8GrQ7IVw9OHtplU4gWbb+wNgeoBMmGLQ= +modernc.org/cc/v4 v4.21.4/go.mod h1:HM7VJTZbUCR3rV8EYBi9wxnJ0ZBRiGE5OeGXNA0IsLQ= +modernc.org/ccgo/v4 v4.19.2 h1:lwQZgvboKD0jBwdaeVCTouxhxAyN6iawF3STraAal8Y= +modernc.org/ccgo/v4 v4.19.2/go.mod h1:ysS3mxiMV38XGRTTcgo0DQTeTmAO4oCmJl1nX9VFI3s= +modernc.org/fileutil v1.3.0 h1:gQ5SIzK3H9kdfai/5x41oQiKValumqNTDXMvKo62HvE= +modernc.org/fileutil v1.3.0/go.mod h1:XatxS8fZi3pS8/hKG2GH/ArUogfxjpEKs3Ku3aK4JyQ= +modernc.org/gc/v2 v2.4.1 h1:9cNzOqPyMJBvrUipmynX0ZohMhcxPtMccYgGOJdOiBw= +modernc.org/gc/v2 v2.4.1/go.mod h1:wzN5dK1AzVGoH6XOzc3YZ+ey/jPgYHLuVckd62P0GYU= +modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 h1:5D53IMaUuA5InSeMu9eJtlQXS2NxAhyWQvkKEgXZhHI= +modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6/go.mod h1:Qz0X07sNOR1jWYCrJMEnbW/X55x206Q7Vt4mz6/wHp4= +modernc.org/libc v1.55.3 h1:AzcW1mhlPNrRtjS5sS+eW2ISCgSOLLNyFzRh/V3Qj/U= +modernc.org/libc v1.55.3/go.mod h1:qFXepLhz+JjFThQ4kzwzOjA/y/artDeg+pcYnY+Q83w= +modernc.org/mathutil v1.6.0 h1:fRe9+AmYlaej+64JsEEhoWuAYBkOtQiMEU7n/XgfYi4= +modernc.org/mathutil v1.6.0/go.mod h1:Ui5Q9q1TR2gFm0AQRqQUaBWFLAhQpCwNcuhBOSedWPo= +modernc.org/memory v1.8.0 h1:IqGTL6eFMaDZZhEWwcREgeMXYwmW83LYW8cROZYkg+E= +modernc.org/memory v1.8.0/go.mod h1:XPZ936zp5OMKGWPqbD3JShgd/ZoQ7899TUuQqxY+peU= +modernc.org/opt v0.1.3 h1:3XOZf2yznlhC+ibLltsDGzABUGVx8J6pnFMS3E4dcq4= +modernc.org/opt v0.1.3/go.mod h1:WdSiB5evDcignE70guQKxYUl14mgWtbClRi5wmkkTX0= +modernc.org/sortutil v1.2.0 h1:jQiD3PfS2REGJNzNCMMaLSp/wdMNieTbKX920Cqdgqc= +modernc.org/sortutil v1.2.0/go.mod h1:TKU2s7kJMf1AE84OoiGppNHJwvB753OYfNl2WRb++Ss= 
// Agent collects the collaborators used by agent-facing HTTP routes:
// the iPXE chainload endpoint and the /api/v1/runs/:id/* endpoints.
//
// Measurements and Notify are optional: the handlers in this file check
// them for nil before use (Sensor answers 500 without a Measurements
// store; dispatchEvent is a no-op without a Notify registry). appendLog
// also tolerates a nil Logs hub, but the Log handler does not.
type Agent struct {
	Hosts           *store.Hosts         // host records (name, expected-spec YAML)
	Runs            *store.Runs          // run rows: state, token hash, failed stage, hold IP
	Stages          *store.Stages        // per-run stage rows (seed / start / complete)
	Artifacts       *store.Artifacts     // index of files written under ArtifactsDir
	SpecDiffs       *store.SpecDiffs     // expected-vs-actual diff rows per run
	Measurements    *store.Measurements  // numeric samples posted to /sensor; may be nil
	Runner          *orchestrator.Runner // drives state transitions and heartbeat tracking
	EventHub        *events.Hub          // receives "tile-<hostID>" events for the dashboard
	Logs            *logs.Hub            // per-run log writers
	Notify          *notify.Registry     // notification fan-out; may be nil
	ArtifactsDir    string               // ./var/artifacts
	OrchestratorURL string               // baked into iPXE cmdline
	PublicURL       string               // user-visible URL base for notification click-throughs
	LiveKernelURL   string               // passed through to pxe.BuildScript
	LiveInitrdURL   string               // passed through to pxe.BuildScript
	TLSCertFPR      string               // optional; empty = skip pinning
	IperfPort       int                  // orchestrator-supervised iperf3 port; 0 = 5201
}
Called by iPXE itself after +// dnsmasq hands it the chainload URL. Unknown MAC → halt script. +// Known MAC with no active run → poweroff script. Known MAC with active +// run → real boot script; the fetch triggers PXEObserved. +func (a *Agent) IPXEScript(w http.ResponseWriter, r *http.Request) { + mac := strings.ToLower(strings.TrimSpace(chi.URLParam(r, "mac"))) + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + w.Header().Set("Cache-Control", "no-store") + + if !macRe.MatchString(mac) { + log.Printf("ipxe: rejected malformed mac %q from %s", mac, r.RemoteAddr) + _, _ = w.Write([]byte(pxe.NotRegisteredScript(mac))) + return + } + + run, err := a.Runs.FindActiveByMAC(r.Context(), mac) + if err != nil { + log.Printf("ipxe: find run by mac %s: %v", mac, err) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + if run == nil { + _, _ = w.Write([]byte(pxe.NoActiveRunScript(mac))) + return + } + + // The token hash in the DB is the sha256 of the plaintext. The + // plaintext itself cannot be recovered from the hash — we issued it + // once when the run was created. For iPXE we re-issue a fresh token + // on every PXE fetch: this is safe because the hash in the DB is + // rewritten to match and only the most recent PXE can be claimed. + plain, hash, err := orchestrator.IssueRunToken() + if err != nil { + http.Error(w, "token", http.StatusInternalServerError) + return + } + if err := a.Runs.RotateTokenHash(r.Context(), run.ID, hash); err != nil { + log.Printf("ipxe: rotate token run %d: %v", run.ID, err) + http.Error(w, "token", http.StatusInternalServerError) + return + } + + script := pxe.BuildScript(pxe.IPXEParams{ + OrchestratorURL: a.OrchestratorURL, + LiveKernelURL: a.LiveKernelURL, + LiveInitrdURL: a.LiveInitrdURL, + TLSCertFPR: a.TLSCertFPR, + RunID: run.ID, + MAC: mac, + Token: plain, + }) + _, _ = w.Write([]byte(script)) + + // iPXE has now fetched the script — treat this as PXEObserved. 
If we + // were already in Booting the transition table allows staying. + if _, err := a.Runner.Transition(r.Context(), run.ID, orchestrator.TriggerPXEObserved); err != nil { + // Non-fatal: the agent may still claim via /claim. + log.Printf("ipxe: PXEObserved for run %d: %v", run.ID, err) + } +} + +// Hello is the first call an agent makes once userspace is up. It's +// idempotent and only writes a log line; the authoritative transition +// comes from /claim. The agent sends Hello early so operators see a +// signal in the tile even before the token is validated. +func (a *Agent) Hello(w http.ResponseWriter, r *http.Request) { + runID, ok := runIDFromURL(w, r) + if !ok { + return + } + if _, ok := a.authenticate(w, r, runID); !ok { + return + } + log.Printf("agent hello: run=%d remote=%s", runID, r.RemoteAddr) + writeJSON(w, http.StatusOK, map[string]any{"ok": true, "run_id": runID}) +} + +// Claim is the binding call: the agent proves it holds the plaintext +// token for this run, and in return the orchestrator transitions to +// InventoryCheck and seeds the stage rows. All destructive actions the +// agent takes later require a prior successful claim. +func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) { + runID, ok := runIDFromURL(w, r) + if !ok { + return + } + run, ok := a.authenticate(w, r, runID) + if !ok { + return + } + + var body struct { + AgentIP string `json:"agent_ip"` + } + if r.Body != nil { + // agent_ip is informational; if missing fall back to RemoteAddr. + _ = json.NewDecoder(r.Body).Decode(&body) + } + agentIP := strings.TrimSpace(body.AgentIP) + if agentIP == "" { + if host, _, err := net.SplitHostPort(r.RemoteAddr); err == nil { + agentIP = host + } else { + agentIP = r.RemoteAddr + } + } + + // First claim seeds the stage rows; subsequent claims are a no-op + // so agent retries after transient network failures stay safe. 
+ if len(mustListStages(a.Stages, r, runID)) == 0 { + if err := a.Stages.Seed(r.Context(), runID); err != nil { + log.Printf("claim: seed stages run %d: %v", runID, err) + http.Error(w, "seed stages", http.StatusInternalServerError) + return + } + } + + // Drive the transition. If we're already past Booting this returns + // an error — treat as "already claimed" and report OK, don't 500. + if run.State == model.StateWaitingWoL || run.State == model.StateBooting { + if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerAgentClaimed); err != nil { + log.Printf("claim: transition run %d: %v", runID, err) + http.Error(w, "transition", http.StatusConflict) + return + } + } + + log.Printf("agent claimed: run=%d agent_ip=%s", runID, agentIP) + + // Stage-driven agent needs a bit of per-run config: the device + // allowlist (serial + expected size) for Storage, and the iperf3 + // server port for Network. Parse the host's expected spec here so + // the agent doesn't need to read YAML. + expectedDisks := []map[string]any{} + if host, err := a.Hosts.Get(r.Context(), run.HostID); err == nil && host != nil { + if parsed, err := spec.Parse(host.ExpectedSpecYAML); err == nil && parsed != nil { + for _, dd := range parsed.Disks { + expectedDisks = append(expectedDisks, map[string]any{ + "serial": dd.Serial, + "size_gb": dd.SizeGB, + }) + } + } + } + iperfPort := a.IperfPort + if iperfPort == 0 { + iperfPort = 5201 + } + writeJSON(w, http.StatusOK, map[string]any{ + "ok": true, + "run_id": runID, + "stages": store.DefaultStageOrder, + "expected_disks": expectedDisks, + "iperf_port": iperfPort, + }) +} + +// Heartbeat is the agent's periodic liveness ping. The response body +// acts as a control channel: cmd=continue is the normal case; cmd=abort +// once the run enters FailedHolding/Released; cmd=retry_stage when the +// operator has overridden a failed stage (wipe-probe override). 
+func (a *Agent) Heartbeat(w http.ResponseWriter, r *http.Request) { + runID, ok := runIDFromURL(w, r) + if !ok { + return + } + run, ok := a.authenticate(w, r, runID) + if !ok { + return + } + a.Runner.TouchHeartbeat(runID) + + cmd := "continue" + resp := map[string]any{"state": run.State} + switch { + case run.State == model.StateCompleted: + // Pipeline succeeded — agent should power the host down. + cmd = "shutdown" + case run.State == model.StateFailedHolding || run.State == model.StateReleased: + cmd = "abort" + case run.FailedStage == "Storage" && overrideWipeSet(run.OverrideFlagsJSON): + // Operator pressed "Override wipe & retry". Agent should + // re-enter Storage with the wipe-probe bypass armed. + cmd = "retry_stage" + resp["stage"] = "Storage" + resp["override_flags"] = json.RawMessage(run.OverrideFlagsJSON) + } + resp["cmd"] = cmd + writeJSON(w, http.StatusOK, resp) +} + +// overrideWipeSet inspects a Run.OverrideFlagsJSON blob for the wipe flag. +// Malformed JSON is ignored — the operator has to reapply the override if +// it didn't round-trip correctly. +func overrideWipeSet(blob string) bool { + if blob == "" { + return false + } + var flags struct { + Wipe bool `json:"wipe"` + } + _ = json.Unmarshal([]byte(blob), &flags) + return flags.Wipe +} + +// authenticate verifies the Bearer token against the run's stored hash +// and returns the Run for downstream handlers. Responds 401/404 on +// failure and returns ok=false so the caller can bail early. 
// bearerToken extracts the credential from an "Authorization: Bearer
// <token>" request header. It returns "" when the header is absent, uses
// a different scheme, or carries no credential.
//
// The scheme name is matched case-insensitively ("bearer", "BEARER", …),
// as HTTP defines auth schemes to be; earlier versions only accepted the
// exact spelling "Bearer". Surrounding whitespace on the credential is
// trimmed.
func bearerToken(r *http.Request) string {
	scheme, credential, found := strings.Cut(r.Header.Get("Authorization"), " ")
	if !found || !strings.EqualFold(scheme, "Bearer") {
		return ""
	}
	return strings.TrimSpace(credential)
}
Lines are written in order to the per-run +// file and fanned out on the SSE hub. +type LogBatch struct { + Lines []LogLine `json:"lines"` +} + +type LogLine struct { + TS string `json:"ts,omitempty"` // RFC3339Nano; server clock used if empty + Level string `json:"level,omitempty"` // info|warn|error|debug + Text string `json:"text"` +} + +// Log accepts a batch of log lines from the agent. Empty batches are +// legal (useful for agent-side flush ping). +func (a *Agent) Log(w http.ResponseWriter, r *http.Request) { + runID, ok := runIDFromURL(w, r) + if !ok { + return + } + if _, ok := a.authenticate(w, r, runID); !ok { + return + } + var batch LogBatch + if err := json.NewDecoder(r.Body).Decode(&batch); err != nil { + http.Error(w, "bad json", http.StatusBadRequest) + return + } + writer, err := a.Logs.WriterFor(runID) + if err != nil { + http.Error(w, "open log: "+err.Error(), http.StatusInternalServerError) + return + } + for _, l := range batch.Lines { + ts, _ := time.Parse(time.RFC3339Nano, l.TS) + writer.Append(logs.Line{TS: ts, Level: l.Level, Text: l.Text}) + } + writeJSON(w, http.StatusOK, map[string]any{"ok": true, "written": len(batch.Lines)}) +} + +// StageResult is the body of /result. Kind is the stage name (from +// DefaultStageOrder); Passed drives StageCompleted vs StageFailed. +// Inventory is optional and only set when kind == "Inventory" — the +// orchestrator persists it as an artifact and feeds it to spec.Diff. +type StageResult struct { + Stage string `json:"stage"` + Passed bool `json:"passed"` + Summary json.RawMessage `json:"summary,omitempty"` + Inventory *spec.Inventory `json:"inventory,omitempty"` + Message string `json:"message,omitempty"` +} + +// Result receives a stage's outcome. Flow: +// 1. Mark the stage row passed/failed + record summary JSON. +// 2. For Inventory: persist the inventory artifact. +// 3. 
// Result receives a stage's outcome. Flow:
//  1. Validate the stage name and mark the stage row passed/failed,
//     recording the summary JSON verbatim.
//  2. For Inventory with an inventory payload: persist the inventory
//     artifact (on pass or fail; a persistence error is only logged).
//  3. On failure: record the failed stage, fire StageFailed, dispatch a
//     critical notification and reply with next_state=FailedHolding.
//  4. On pass: fire StageCompleted. If that lands in SpecValidate or
//     Reporting — stages the orchestrator resolves itself, with no agent
//     involvement — resolve them inline and reply with the state the run
//     ends up in.
//
// Responses: 400 for undecodable JSON or an unknown stage, 500 when the
// stage row cannot be updated, 409 when a transition is rejected.
//
// NOTE(review): the reported stage is only checked for being a known
// stage name; nothing visible here compares it with the run's current
// state — confirm that the transition table enforces this.
func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
	runID, ok := runIDFromURL(w, r)
	if !ok {
		return
	}
	run, ok := a.authenticate(w, r, runID)
	if !ok {
		return
	}
	var body StageResult
	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
		http.Error(w, "bad json", http.StatusBadRequest)
		return
	}
	body.Stage = strings.TrimSpace(body.Stage)
	// Only the "is this a known stage" bit is used; the mapped state is discarded.
	if _, ok := orchestrator.StateForStage(body.Stage); !ok {
		http.Error(w, "unknown stage: "+body.Stage, http.StatusBadRequest)
		return
	}

	stageState := model.StagePassed
	if !body.Passed {
		stageState = model.StageFailed
	}
	// An absent summary is stored as the empty string, not "null".
	summaryJSON := ""
	if len(body.Summary) > 0 {
		summaryJSON = string(body.Summary)
	}
	if err := a.Stages.CompleteByName(r.Context(), runID, body.Stage, stageState, summaryJSON); err != nil {
		http.Error(w, "complete stage: "+err.Error(), http.StatusInternalServerError)
		return
	}

	// Inventory-specific: persist the artifact now. The spec diff itself
	// is computed later, in resolveSpecValidate, from this artifact.
	if body.Stage == "Inventory" && body.Inventory != nil {
		if err := a.persistInventory(r, run, body.Inventory); err != nil {
			log.Printf("persist inventory run %d: %v", runID, err)
		}
	}

	if !body.Passed {
		if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
			log.Printf("set failed stage: %v", err)
		}
		if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil {
			log.Printf("result: failed-transition run %d: %v", runID, err)
			http.Error(w, "transition", http.StatusConflict)
			return
		}
		hostName := a.hostNameFor(r.Context(), run.HostID)
		// Fall back to a generic line so the notification body is never blank.
		detail := body.Message
		if detail == "" {
			detail = "stage reported failure"
		}
		a.dispatchEvent(notify.Event{
			Kind:     notify.KindStageFailed,
			Severity: notify.SeverityCritical,
			RunID:    runID,
			HostName: hostName,
			Title:    fmt.Sprintf("[vetting] %s FAILED: %s", hostName, body.Stage),
			Body:     fmt.Sprintf("Run %d on %s failed at stage %s.\n%s", runID, hostName, body.Stage, detail),
			URL:      a.runLinkURL(runID),
		})
		// The state is reported as a literal here, not read back from the transition.
		writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": "FailedHolding"})
		return
	}

	// Passed: advance to the next stage in the pipeline.
	next, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageCompleted)
	if err != nil {
		http.Error(w, "advance: "+err.Error(), http.StatusConflict)
		return
	}
	log.Printf("result: run %d stage %s passed → %s", runID, body.Stage, next)

	// If the just-advanced-into state is SpecValidate or Reporting, the
	// orchestrator owns those stages entirely. The resolve function may
	// transition further (→ next stage on pass, → FailedHolding on fail,
	// → Completed for Reporting), so we re-read the run after each. If
	// the re-read fails, next keeps its previous value.
	if next == model.StateSpecValidate {
		a.resolveSpecValidate(r, runID)
		if after, err := a.Runs.Get(r.Context(), runID); err == nil {
			next = after.State
		}
	}
	if next == model.StateReporting {
		a.resolveReporting(r, runID)
		if after, err := a.Runs.Get(r.Context(), runID); err == nil {
			next = after.State
		}
	}
	writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": string(next)})
}
+ if next == model.StateSpecValidate { + a.resolveSpecValidate(r, runID) + if after, err := a.Runs.Get(r.Context(), runID); err == nil { + next = after.State + } + } + if next == model.StateReporting { + a.resolveReporting(r, runID) + if after, err := a.Runs.Get(r.Context(), runID); err == nil { + next = after.State + } + } + writeJSON(w, http.StatusOK, map[string]any{"ok": true, "next_state": string(next)}) +} + +func (a *Agent) persistInventory(r *http.Request, run *model.Run, inv *spec.Inventory) error { + dir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", run.ID)) + if err := os.MkdirAll(dir, 0o755); err != nil { + return err + } + path := filepath.Join(dir, "inventory.json") + buf, err := json.MarshalIndent(inv, "", " ") + if err != nil { + return err + } + if err := os.WriteFile(path, buf, 0o644); err != nil { + return err + } + sum := sha256.Sum256(buf) + _, err = a.Artifacts.Create(r.Context(), store.Artifact{ + RunID: run.ID, + Kind: "inventory", + Path: path, + SHA256: hex.EncodeToString(sum[:]), + SizeBytes: int64(len(buf)), + }) + return err +} + +// resolveSpecValidate runs the expected-vs-actual diff against the +// just-stored inventory artifact, persists spec_diffs rows, and drives +// the state machine — all on the server. The agent does nothing for +// this stage. 
+func (a *Agent) resolveSpecValidate(r *http.Request, runID int64) { + run, err := a.Runs.Get(r.Context(), runID) + if err != nil { + log.Printf("specvalidate: get run: %v", err) + return + } + host, err := a.Hosts.Get(r.Context(), run.HostID) + if err != nil { + log.Printf("specvalidate: get host: %v", err) + return + } + expected, err := spec.Parse(host.ExpectedSpecYAML) + if err != nil { + log.Printf("specvalidate: parse expected yaml: %v", err) + a.failStage(r, runID, "SpecValidate", "malformed expected spec: "+err.Error()) + return + } + inv, err := a.readInventoryArtifact(r, runID) + if err != nil { + log.Printf("specvalidate: read inventory: %v", err) + a.failStage(r, runID, "SpecValidate", "missing inventory artifact") + return + } + diffs := spec.Diff(expected, inv) + if err := a.SpecDiffs.ReplaceForRun(r.Context(), runID, diffs); err != nil { + log.Printf("specvalidate: write diffs: %v", err) + } + if err := a.Stages.StartByName(r.Context(), runID, "SpecValidate"); err != nil { + log.Printf("specvalidate: start stage: %v", err) + } + + critical := 0 + for _, d := range diffs { + if d.Severity == "critical" && !d.Ignored { + critical++ + } + } + summaryBuf, _ := json.Marshal(map[string]any{ + "diffs": len(diffs), + "critical": critical, + }) + if critical > 0 { + _ = a.Stages.CompleteByName(r.Context(), runID, "SpecValidate", model.StageFailed, string(summaryBuf)) + _ = a.Runs.SetFailedStage(r.Context(), runID, "SpecValidate") + if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil { + log.Printf("specvalidate: failed-transition: %v", err) + } + a.appendLog(runID, "error", fmt.Sprintf("SpecValidate: %d critical diff(s) — holding host", critical)) + hostName := a.hostNameFor(r.Context(), run.HostID) + a.dispatchEvent(notify.Event{ + Kind: notify.KindSpecMismatch, + Severity: notify.SeverityCritical, + RunID: runID, + HostName: hostName, + Title: fmt.Sprintf("[vetting] %s spec mismatch (%d critical)", hostName, 
critical), + Body: fmt.Sprintf("SpecValidate found %d critical diff(s) on %s. Host is held for inspection.", critical, hostName), + URL: a.runLinkURL(runID), + }) + } else { + _ = a.Stages.CompleteByName(r.Context(), runID, "SpecValidate", model.StagePassed, string(summaryBuf)) + if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageCompleted); err != nil { + log.Printf("specvalidate: advance: %v", err) + } + a.appendLog(runID, "info", "SpecValidate: all fields match expected spec") + } +} + +func (a *Agent) readInventoryArtifact(r *http.Request, runID int64) (*spec.Inventory, error) { + arts, err := a.Artifacts.ListForRun(r.Context(), runID) + if err != nil { + return nil, err + } + for i := len(arts) - 1; i >= 0; i-- { + if arts[i].Kind == "inventory" { + buf, err := os.ReadFile(arts[i].Path) + if err != nil { + return nil, err + } + var inv spec.Inventory + if err := json.Unmarshal(buf, &inv); err != nil { + return nil, err + } + return &inv, nil + } + } + return nil, errors.New("no inventory artifact") +} + +func (a *Agent) failStage(r *http.Request, runID int64, stage, message string) { + _ = a.Stages.CompleteByName(r.Context(), runID, stage, model.StageFailed, fmt.Sprintf(`{"error":%q}`, message)) + _ = a.Runs.SetFailedStage(r.Context(), runID, stage) + if _, err := a.Runner.Transition(r.Context(), runID, orchestrator.TriggerStageFailed); err != nil { + log.Printf("failStage: transition run %d: %v", runID, err) + } + a.appendLog(runID, "error", stage+": "+message) +} + +func (a *Agent) appendLog(runID int64, level, text string) { + if a.Logs == nil { + return + } + w, err := a.Logs.WriterFor(runID) + if err != nil { + log.Printf("appendLog: %v", err) + return + } + w.Append(logs.Line{Level: level, Text: text}) +} + +// Hold issues the per-run ephemeral ed25519 keypair: the agent gets +// the authorized_keys line, the orchestrator keeps the privkey on disk. 
+// Hold also records the agent's reported IP so the tile can print the +// ssh invocation. +type HoldRequest struct { + AgentIP string `json:"agent_ip"` +} + +type HoldResponse struct { + AuthorizedKey string `json:"authorized_key"` + RunID int64 `json:"run_id"` +} + +func (a *Agent) Hold(w http.ResponseWriter, r *http.Request) { + runID, ok := runIDFromURL(w, r) + if !ok { + return + } + if _, ok := a.authenticate(w, r, runID); !ok { + return + } + var body HoldRequest + _ = json.NewDecoder(r.Body).Decode(&body) + agentIP := strings.TrimSpace(body.AgentIP) + if agentIP == "" { + if host, _, err := net.SplitHostPort(r.RemoteAddr); err == nil { + agentIP = host + } + } + if agentIP != "" { + if err := a.Runs.SetHoldIP(r.Context(), runID, agentIP); err != nil { + log.Printf("hold: set hold_ip: %v", err) + } + } + + kp, err := hold.Issue(runID) + if err != nil { + http.Error(w, "generate key: "+err.Error(), http.StatusInternalServerError) + return + } + keyPath := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", runID), "hold.key") + abs, err := kp.WritePrivateTo(keyPath) + if err != nil { + http.Error(w, "write key: "+err.Error(), http.StatusInternalServerError) + return + } + sum := sha256.Sum256(kp.PrivatePEM) + if _, err := a.Artifacts.Create(r.Context(), store.Artifact{ + RunID: runID, + Kind: "hold_key", + Path: abs, + SHA256: hex.EncodeToString(sum[:]), + SizeBytes: int64(len(kp.PrivatePEM)), + }); err != nil { + log.Printf("hold: record artifact: %v", err) + } + a.appendLog(runID, "info", fmt.Sprintf("Hold key issued. 
SSH in with: ssh -i %s root@%s", abs, agentIP)) + hostID := mustHostID(a, r, runID) + if hostID != 0 { + hostName := a.hostNameFor(r.Context(), hostID) + a.dispatchEvent(notify.Event{ + Kind: notify.KindHoldingOpened, + Severity: notify.SeverityCritical, + RunID: runID, + HostName: hostName, + Title: fmt.Sprintf("[vetting] %s holding — SSH ready", hostName), + Body: fmt.Sprintf("Host %s is holding at %s.\nssh -i %s root@%s", hostName, agentIP, abs, agentIP), + URL: a.runLinkURL(runID), + }) + } + // Refresh the tile so the operator sees the ssh command. + host, _ := a.Hosts.Get(r.Context(), mustHostID(a, r, runID)) + if host != nil { + latest, _ := a.Runs.Get(r.Context(), runID) + if orchestrator.TileRenderer != nil { + payload := orchestrator.TileRenderer(r.Context(), *host, latest) + a.EventHub.Publish(events.Event{Name: fmt.Sprintf("tile-%d", host.ID), Payload: payload}) + } + } + writeJSON(w, http.StatusOK, HoldResponse{AuthorizedKey: kp.AuthorizedKey, RunID: runID}) +} + +// dispatchEvent hands an already-populated Event to the notify Registry +// if one is wired. Handler code uses hostNameFor to resolve the host +// name for the event payload; this keeps call sites terse. +func (a *Agent) dispatchEvent(ev notify.Event) { + if a.Notify == nil { + return + } + a.Notify.Dispatch(ev) +} + +// hostNameFor returns a human-readable host name for a run, or "host-N" +// if the lookup fails — notifications should never fail silently over a +// missing name. 
+func (a *Agent) hostNameFor(ctx context.Context, hostID int64) string { + if host, err := a.Hosts.Get(ctx, hostID); err == nil && host != nil { + return host.Name + } + return fmt.Sprintf("host-%d", hostID) +} + +func (a *Agent) runLinkURL(runID int64) string { + if a.PublicURL == "" { + return "" + } + return strings.TrimRight(a.PublicURL, "/") + "/reports/" + fmt.Sprintf("%d", runID) +} + +func mustHostID(a *Agent, r *http.Request, runID int64) int64 { + run, err := a.Runs.Get(r.Context(), runID) + if err != nil || run == nil { + return 0 + } + return run.HostID +} + +// ===== Phase 4 endpoints ================================================= + +// SensorBatch is what the agent POSTs to /sensor: a stream of numeric +// samples (temps, fan rpm, PSU rails, iperf throughput). Each sample is +// (kind, key, value, unit). Timestamps default to server-now when empty +// so the thermal sidecar doesn't have to carry a clock. +type SensorBatch struct { + Samples []SensorSample `json:"samples"` +} + +type SensorSample struct { + TS string `json:"ts,omitempty"` + Kind string `json:"kind"` // temp|fan|psu_volt|iperf|fio|smart_attr + Key string `json:"key"` + Value float64 `json:"value"` + Unit string `json:"unit,omitempty"` +} + +// Sensor persists a batch of numeric samples. The thermal sidecar hits +// this on a tick; stage executors (iperf, fio) also drop here. 
+func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) { + runID, ok := runIDFromURL(w, r) + if !ok { + return + } + if _, ok := a.authenticate(w, r, runID); !ok { + return + } + if a.Measurements == nil { + http.Error(w, "measurements store not wired", http.StatusInternalServerError) + return + } + var body SensorBatch + if err := json.NewDecoder(r.Body).Decode(&body); err != nil { + http.Error(w, "bad json", http.StatusBadRequest) + return + } + rows := make([]model.Measurement, 0, len(body.Samples)) + for _, s := range body.Samples { + ts, _ := time.Parse(time.RFC3339Nano, s.TS) + rows = append(rows, model.Measurement{ + RunID: runID, + TS: ts, + Kind: s.Kind, + Key: s.Key, + Value: s.Value, + Unit: s.Unit, + }) + } + if err := a.Measurements.CreateBatch(r.Context(), rows); err != nil { + http.Error(w, "write samples: "+err.Error(), http.StatusInternalServerError) + return + } + writeJSON(w, http.StatusOK, map[string]any{"ok": true, "written": len(rows)}) +} + +// resolveReporting runs when the pipeline advances into StateReporting. +// It's an orchestrator-owned stage like SpecValidate: no agent action. +// Writes a JSON report bundling run + stages + diffs + measurements, +// then advances the run to Completed. Heartbeat will then return abort +// and the agent will power the host off in Phase 5. 
+func (a *Agent) resolveReporting(r *http.Request, runID int64) { + ctx := r.Context() + if err := a.Stages.StartByName(ctx, runID, "Reporting"); err != nil { + log.Printf("reporting: start stage: %v", err) + } + run, err := a.Runs.Get(ctx, runID) + if err != nil { + log.Printf("reporting: get run: %v", err) + return + } + host, err := a.Hosts.Get(ctx, run.HostID) + if err != nil { + log.Printf("reporting: get host: %v", err) + return + } + stages, err := a.Stages.ListForRun(ctx, runID) + if err != nil { + log.Printf("reporting: list stages: %v", err) + } + diffs, err := a.SpecDiffs.ListForRun(ctx, runID) + if err != nil { + log.Printf("reporting: list diffs: %v", err) + } + var measurements []model.Measurement + if a.Measurements != nil { + measurements, err = a.Measurements.ListForRun(ctx, runID) + if err != nil { + log.Printf("reporting: list measurements: %v", err) + } + } + bundle := map[string]any{ + "run": run, + "host": host, + "stages": stages, + "spec_diffs": diffs, + "measurements": measurements, + "generated_at": time.Now().UTC().Format(time.RFC3339), + } + buf, err := json.MarshalIndent(bundle, "", " ") + if err != nil { + log.Printf("reporting: marshal: %v", err) + a.failStage(r, runID, "Reporting", "marshal report: "+err.Error()) + return + } + dir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", runID)) + if err := os.MkdirAll(dir, 0o755); err != nil { + a.failStage(r, runID, "Reporting", "mkdir: "+err.Error()) + return + } + path := filepath.Join(dir, "report.json") + if err := os.WriteFile(path, buf, 0o644); err != nil { + a.failStage(r, runID, "Reporting", "write: "+err.Error()) + return + } + sum := sha256.Sum256(buf) + if _, err := a.Artifacts.Create(ctx, store.Artifact{ + RunID: runID, + Kind: "report", + Path: path, + SHA256: hex.EncodeToString(sum[:]), + SizeBytes: int64(len(buf)), + }); err != nil { + log.Printf("reporting: record artifact: %v", err) + } + // Also render the operator-facing HTML summary alongside the JSON. 
+ // Failures here are non-fatal — the JSON is the source of truth. + if host != nil { + htmlData := report.Data{ + GeneratedAt: time.Now().UTC(), + Run: *run, + Host: *host, + Stages: stages, + SpecDiffs: diffs, + Aggregates: report.AggregateMeasurements(measurements), + } + if htmlBuf, err := report.RenderHTML(htmlData); err != nil { + log.Printf("reporting: render html: %v", err) + } else { + htmlPath := filepath.Join(dir, "report.html") + if err := os.WriteFile(htmlPath, htmlBuf, 0o644); err != nil { + log.Printf("reporting: write html: %v", err) + } else { + htmlSum := sha256.Sum256(htmlBuf) + if _, err := a.Artifacts.Create(ctx, store.Artifact{ + RunID: runID, + Kind: "report_html", + Path: htmlPath, + SHA256: hex.EncodeToString(htmlSum[:]), + SizeBytes: int64(len(htmlBuf)), + }); err != nil { + log.Printf("reporting: record html artifact: %v", err) + } + } + } + } + summaryBuf, _ := json.Marshal(map[string]any{ + "report_path": path, + "stages": len(stages), + "diffs": len(diffs), + }) + if err := a.Stages.CompleteByName(ctx, runID, "Reporting", model.StagePassed, string(summaryBuf)); err != nil { + log.Printf("reporting: complete stage: %v", err) + } + if err := a.Runs.MarkCompleted(ctx, runID, path); err != nil { + log.Printf("reporting: mark completed: %v", err) + } + a.appendLog(runID, "info", "Reporting: wrote "+path+"; run completed.") + // Publish a final tile update so the dashboard flips to pass mood. 
+ if host != nil && orchestrator.TileRenderer != nil { + latest, _ := a.Runs.Get(ctx, runID) + payload := orchestrator.TileRenderer(ctx, *host, latest) + a.EventHub.Publish(events.Event{Name: fmt.Sprintf("tile-%d", host.ID), Payload: payload}) + } + hostName := "host" + if host != nil { + hostName = host.Name + } + a.dispatchEvent(notify.Event{ + Kind: notify.KindRunCompleted, + Severity: notify.SeverityInfo, + RunID: runID, + HostName: hostName, + Title: fmt.Sprintf("[vetting] %s passed vetting", hostName), + Body: fmt.Sprintf("Run %d on %s completed all stages. Report: %s", runID, hostName, path), + URL: a.runLinkURL(runID), + }) +} diff --git a/internal/api/agent_handlers_test.go b/internal/api/agent_handlers_test.go new file mode 100644 index 0000000..ed15faf --- /dev/null +++ b/internal/api/agent_handlers_test.go @@ -0,0 +1,128 @@ +package api_test + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "path/filepath" + "strconv" + "testing" + + "github.com/go-chi/chi/v5" + + "vetting/internal/api" + "vetting/internal/db" + "vetting/internal/model" + "vetting/internal/orchestrator" + "vetting/internal/store" +) + +func setupAgent(t *testing.T) (*api.Agent, int64, string) { + t.Helper() + path := filepath.Join(t.TempDir(), "vetting.db") + conn, err := db.Open(path) + if err != nil { + t.Fatalf("open db: %v", err) + } + t.Cleanup(func() { _ = conn.Close() }) + + hosts := &store.Hosts{DB: conn} + runs := &store.Runs{DB: conn} + meas := &store.Measurements{DB: conn} + + hostID, err := hosts.Create(context.Background(), model.Host{ + Name: "t-host", + MAC: "aa:bb:cc:dd:ee:01", + WoLBroadcastIP: "10.0.0.255", + WoLPort: 9, + ExpectedSpecYAML: "memory:\n total_gib: 16\n", + }) + if err != nil { + t.Fatalf("create host: %v", err) + } + plain, hash, err := orchestrator.IssueRunToken() + if err != nil { + t.Fatalf("issue token: %v", err) + } + runID, err := runs.Create(context.Background(), hostID, hash) + if err != nil { + 
t.Fatalf("create run: %v", err) + } + return &api.Agent{ + Hosts: hosts, + Runs: runs, + Measurements: meas, + }, runID, plain +} + +func routedRequest(runID int64, method, path string, body []byte) *http.Request { + req := httptest.NewRequest(method, path, bytes.NewReader(body)) + // chi.URLParam is read from chi's context routing; fake that here. + rctx := chi.NewRouteContext() + rctx.URLParams.Add("id", strconv.FormatInt(runID, 10)) + return req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx)) +} + +func TestSensorPersistsBatch(t *testing.T) { + a, runID, token := setupAgent(t) + batch := api.SensorBatch{Samples: []api.SensorSample{ + {Kind: "thermal", Key: "cpu", Value: 47.5, Unit: "C"}, + {Kind: "iperf", Key: "throughput_mbps", Value: 938.2, Unit: "Mbps"}, + }} + buf, _ := json.Marshal(batch) + req := routedRequest(runID, http.MethodPost, "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/sensor", buf) + req.Header.Set("Authorization", "Bearer "+token) + req.Header.Set("Content-Type", "application/json") + rr := httptest.NewRecorder() + a.Sensor(rr, req) + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, body = %q", rr.Code, rr.Body.String()) + } + rows, err := a.Measurements.ListForRun(context.Background(), runID) + if err != nil { + t.Fatalf("ListForRun: %v", err) + } + if len(rows) != 2 { + t.Fatalf("expected 2 measurements, got %d", len(rows)) + } +} + +func TestSensorRejectsBadToken(t *testing.T) { + a, runID, _ := setupAgent(t) + body, _ := json.Marshal(api.SensorBatch{}) + req := routedRequest(runID, http.MethodPost, "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/sensor", body) + req.Header.Set("Authorization", "Bearer wrong-token") + rr := httptest.NewRecorder() + a.Sensor(rr, req) + if rr.Code != http.StatusUnauthorized { + t.Fatalf("status = %d, want 401", rr.Code) + } +} + +// TestHeartbeatShutdownWhenCompleted: once the orchestrator has flipped +// the run into Completed, the next heartbeat response must carry +// cmd=shutdown 
so the agent powers the host down. +func TestHeartbeatShutdownWhenCompleted(t *testing.T) { + a, runID, token := setupAgent(t) + // Wire a runner so Heartbeat's TouchHeartbeat call doesn't nil-panic. + a.Runner = &orchestrator.Runner{Runs: a.Runs, Hosts: a.Hosts, Stages: &store.Stages{DB: a.Runs.DB}} + if err := a.Runs.SetState(context.Background(), runID, model.StateCompleted); err != nil { + t.Fatalf("set state: %v", err) + } + req := routedRequest(runID, http.MethodPost, "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/heartbeat", nil) + req.Header.Set("Authorization", "Bearer "+token) + rr := httptest.NewRecorder() + a.Heartbeat(rr, req) + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rr.Code, rr.Body.String()) + } + var resp map[string]any + if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil { + t.Fatalf("decode: %v", err) + } + if resp["cmd"] != "shutdown" { + t.Fatalf("cmd = %v, want shutdown", resp["cmd"]) + } +} diff --git a/internal/api/smoke_test.go b/internal/api/smoke_test.go new file mode 100644 index 0000000..9fb64a0 --- /dev/null +++ b/internal/api/smoke_test.go @@ -0,0 +1,318 @@ +package api_test + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "testing" + "time" + + "github.com/go-chi/chi/v5" + + "vetting/internal/api" + "vetting/internal/db" + "vetting/internal/events" + "vetting/internal/logs" + "vetting/internal/model" + "vetting/internal/notify" + "vetting/internal/orchestrator" + "vetting/internal/spec" + "vetting/internal/store" +) + +// captureNotifier is a testing-only Notifier that records every Event +// sent to it, under a mutex so concurrent Dispatch goroutines are safe. 
+type captureNotifier struct { + mu sync.Mutex + name string + evs []notify.Event +} + +func (c *captureNotifier) Name() string { return c.name } + +func (c *captureNotifier) Send(_ context.Context, ev notify.Event) error { + c.mu.Lock() + c.evs = append(c.evs, ev) + c.mu.Unlock() + return nil +} + +func (c *captureNotifier) awaitKind(t *testing.T, k notify.Kind) notify.Event { + t.Helper() + deadline := time.Now().Add(2 * time.Second) + for { + c.mu.Lock() + for _, ev := range c.evs { + if ev.Kind == k { + got := ev + c.mu.Unlock() + return got + } + } + c.mu.Unlock() + if time.Now().After(deadline) { + t.Fatalf("no %q event received within timeout", k) + } + time.Sleep(5 * time.Millisecond) + } +} + +func newCaptureRegistry(c *captureNotifier) *notify.Registry { + reg := notify.NewRegistry(time.Second) + reg.Register(c) + reg.AddRoute(notify.Route{Notifier: c.name}) // wildcard + return reg +} + +// Builds a fully-wired Agent against a fresh sqlite DB and returns +// (agent, runID, plainTokenForBearer). Caller is responsible for +// transitioning the run out of Queued. 
+func fullAgent(t *testing.T) (*api.Agent, int64, string) { + t.Helper() + tmp := t.TempDir() + conn, err := db.Open(filepath.Join(tmp, "vetting.db")) + if err != nil { + t.Fatalf("open db: %v", err) + } + t.Cleanup(func() { _ = conn.Close() }) + + hostStore := &store.Hosts{DB: conn} + runStore := &store.Runs{DB: conn} + stageStore := &store.Stages{DB: conn} + artifactStore := &store.Artifacts{DB: conn} + specDiffStore := &store.SpecDiffs{DB: conn} + measurementStore := &store.Measurements{DB: conn} + + hub := events.NewHub() + logHub, err := logs.NewHub(filepath.Join(tmp, "logs"), hub) + if err != nil { + t.Fatalf("logs hub: %v", err) + } + t.Cleanup(func() { logHub.Close() }) + + runner := &orchestrator.Runner{ + Runs: runStore, + Hosts: hostStore, + Stages: stageStore, + EventHub: hub, + } + + hostID, err := hostStore.Create(context.Background(), model.Host{ + Name: "smoke-host", + MAC: "aa:bb:cc:dd:ee:10", + WoLBroadcastIP: "10.0.0.255", + WoLPort: 9, + ExpectedSpecYAML: "", // empty spec → no diffs + }) + if err != nil { + t.Fatalf("create host: %v", err) + } + plain, hash, err := orchestrator.IssueRunToken() + if err != nil { + t.Fatalf("issue token: %v", err) + } + runID, err := runStore.Create(context.Background(), hostID, hash) + if err != nil { + t.Fatalf("create run: %v", err) + } + if err := stageStore.Seed(context.Background(), runID); err != nil { + t.Fatalf("seed stages: %v", err) + } + return &api.Agent{ + Hosts: hostStore, + Runs: runStore, + Stages: stageStore, + Artifacts: artifactStore, + SpecDiffs: specDiffStore, + Measurements: measurementStore, + Runner: runner, + EventHub: hub, + Logs: logHub, + ArtifactsDir: filepath.Join(tmp, "artifacts"), + PublicURL: "https://vetting.example", + }, runID, plain +} + +// walkStage simulates the agent reporting a single stage's outcome. +// Returns the next_state the orchestrator decided to advance to. 
+func walkStage(t *testing.T, a *api.Agent, runID int64, token, stage string, passed bool, extras map[string]any) string { + t.Helper() + body := map[string]any{"stage": stage, "passed": passed} + if extras != nil { + for k, v := range extras { + body[k] = v + } + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPost, + "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/result", + bytes.NewReader(buf)) + rctx := chi.NewRouteContext() + rctx.URLParams.Add("id", strconv.FormatInt(runID, 10)) + req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx)) + req.Header.Set("Authorization", "Bearer "+token) + req.Header.Set("Content-Type", "application/json") + rr := httptest.NewRecorder() + a.Result(rr, req) + if rr.Code != http.StatusOK { + t.Fatalf("stage %s: status %d body=%q", stage, rr.Code, rr.Body.String()) + } + var resp struct { + OK bool `json:"ok"` + NextState string `json:"next_state"` + } + if err := json.NewDecoder(rr.Body).Decode(&resp); err != nil { + t.Fatalf("stage %s: decode resp: %v", stage, err) + } + return resp.NextState +} + +// TestFullPipelineToCompleted walks an agent through all stages of a +// successful run and asserts the run ends in Completed. Inventory is +// minimal; the empty expected-spec means SpecValidate produces zero +// critical diffs and the orchestrator auto-advances past it. +func TestFullPipelineToCompleted(t *testing.T) { + a, runID, token := fullAgent(t) + capture := &captureNotifier{name: "capture"} + a.Notify = newCaptureRegistry(capture) + // Claim would normally transition Booting → InventoryCheck; set it + // directly here since we're not exercising the claim path. + if err := a.Runs.SetState(context.Background(), runID, model.StateInventoryCheck); err != nil { + t.Fatalf("set state: %v", err) + } + + // Stage 1: Inventory — provide a concrete inventory so SpecValidate + // has something to compare against. 
+ inv := spec.Inventory{ + CPU: spec.CPUSpec{Model: "Xeon Gold", LogicalCores: 8}, + Memory: spec.MemorySpec{TotalGiB: 16}, + } + next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}) + // After Inventory → SpecValidate resolves inline → SMART + if next != "SMART" { + t.Fatalf("after Inventory, next_state = %q, want SMART", next) + } + + // The remaining stages advance one-for-one in order. + walkPlan := []struct { + stage string + expected string + }{ + {"SMART", "CPUStress"}, + {"CPUStress", "Storage"}, + {"Storage", "Network"}, + {"Network", "GPU"}, + {"GPU", "PSU"}, + {"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed + } + for _, step := range walkPlan { + got := walkStage(t, a, runID, token, step.stage, true, nil) + if got != step.expected { + t.Fatalf("after %s, next_state = %q, want %q", step.stage, got, step.expected) + } + } + + run, err := a.Runs.Get(context.Background(), runID) + if err != nil { + t.Fatalf("Get run: %v", err) + } + if run.State != model.StateCompleted { + t.Fatalf("run.State = %q, want Completed", run.State) + } + if run.ReportPath == "" { + t.Fatalf("run.ReportPath not set") + } + + // Phase 5 assertions: an HTML report artifact exists on disk, and + // the capture notifier saw a RunCompleted event. + arts, err := a.Artifacts.ListForRun(context.Background(), runID) + if err != nil { + t.Fatalf("ListForRun: %v", err) + } + var htmlPath string + for _, art := range arts { + if art.Kind == "report_html" { + htmlPath = art.Path + } + } + if htmlPath == "" { + t.Fatalf("no report_html artifact recorded (kinds seen: %v)", artifactKinds(arts)) + } + data, err := os.ReadFile(htmlPath) + if err != nil { + t.Fatalf("read report.html: %v", err) + } + if !strings.Contains(string(data), " 65535 { + return "WoL port must be 1–65535." 
+ } + } + return "" +} + +func friendlyDBError(err error) string { + s := err.Error() + switch { + case strings.Contains(s, "UNIQUE constraint failed: hosts.name"): + return "A host with that name already exists." + case strings.Contains(s, "UNIQUE constraint failed: hosts.mac"): + return "A host with that MAC already exists." + default: + return s + } +} diff --git a/internal/auth/middleware.go b/internal/auth/middleware.go new file mode 100644 index 0000000..3798de9 --- /dev/null +++ b/internal/auth/middleware.go @@ -0,0 +1,64 @@ +package auth + +import ( + "net/http" +) + +// RequireSession redirects unauthenticated requests to /login. +func (m *Manager) RequireSession(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if err := m.Validate(r); err != nil { + if acceptsHTML(r) { + http.Redirect(w, r, "/login?next="+r.URL.RequestURI(), http.StatusSeeOther) + return + } + http.Error(w, "unauthorized", http.StatusUnauthorized) + return + } + next.ServeHTTP(w, r) + }) +} + +func acceptsHTML(r *http.Request) bool { + accept := r.Header.Get("Accept") + if accept == "" { + return true + } + for _, part := range splitComma(accept) { + if part == "text/html" || part == "*/*" { + return true + } + } + return false +} + +func splitComma(s string) []string { + var out []string + start := 0 + for i := 0; i < len(s); i++ { + if s[i] == ',' { + out = append(out, trimSpace(s[start:i])) + start = i + 1 + } else if s[i] == ';' { + out = append(out, trimSpace(s[start:i])) + for i < len(s) && s[i] != ',' { + i++ + } + start = i + 1 + } + } + if start < len(s) { + out = append(out, trimSpace(s[start:])) + } + return out +} + +func trimSpace(s string) string { + for len(s) > 0 && (s[0] == ' ' || s[0] == '\t') { + s = s[1:] + } + for len(s) > 0 && (s[len(s)-1] == ' ' || s[len(s)-1] == '\t') { + s = s[:len(s)-1] + } + return s +} diff --git a/internal/auth/session.go b/internal/auth/session.go new file mode 100644 index 
0000000..a0fb363 --- /dev/null +++ b/internal/auth/session.go @@ -0,0 +1,100 @@ +package auth + +import ( + "crypto/hmac" + "crypto/sha256" + "encoding/base64" + "errors" + "fmt" + "net/http" + "strconv" + "strings" + "time" + + "golang.org/x/crypto/bcrypt" +) + +const cookieName = "vetting_session" + +type Manager struct { + PasswordHash string + Secret []byte + TTL time.Duration +} + +func (m *Manager) VerifyPassword(password string) bool { + if m.PasswordHash == "" { + return false + } + return bcrypt.CompareHashAndPassword([]byte(m.PasswordHash), []byte(password)) == nil +} + +// Issue writes a signed session cookie valid for m.TTL. +func (m *Manager) Issue(w http.ResponseWriter, r *http.Request) { + expiry := time.Now().Add(m.TTL).Unix() + payload := strconv.FormatInt(expiry, 10) + sig := m.sign(payload) + value := payload + "." + sig + + http.SetCookie(w, &http.Cookie{ + Name: cookieName, + Value: value, + Path: "/", + HttpOnly: true, + Secure: r.TLS != nil, + SameSite: http.SameSiteLaxMode, + Expires: time.Unix(expiry, 0), + }) +} + +func (m *Manager) Clear(w http.ResponseWriter) { + http.SetCookie(w, &http.Cookie{ + Name: cookieName, + Value: "", + Path: "/", + HttpOnly: true, + MaxAge: -1, + }) +} + +var errInvalidSession = errors.New("invalid session") + +// Validate returns nil if the request's cookie is present, signed, and not expired. 
+func (m *Manager) Validate(r *http.Request) error { + c, err := r.Cookie(cookieName) + if err != nil { + return errInvalidSession + } + parts := strings.SplitN(c.Value, ".", 2) + if len(parts) != 2 { + return errInvalidSession + } + payload, sig := parts[0], parts[1] + expected := m.sign(payload) + if !hmac.Equal([]byte(sig), []byte(expected)) { + return errInvalidSession + } + expiry, err := strconv.ParseInt(payload, 10, 64) + if err != nil { + return errInvalidSession + } + if time.Now().Unix() >= expiry { + return errInvalidSession + } + return nil +} + +func (m *Manager) sign(payload string) string { + mac := hmac.New(sha256.New, m.Secret) + _, _ = mac.Write([]byte(payload)) + return base64.RawURLEncoding.EncodeToString(mac.Sum(nil)) +} + +// BcryptHash is a helper used by the gen-admin-password tool. +func BcryptHash(password string) (string, error) { + b, err := bcrypt.GenerateFromPassword([]byte(password), bcrypt.DefaultCost) + if err != nil { + return "", fmt.Errorf("bcrypt: %w", err) + } + return string(b), nil +} diff --git a/internal/config/config.go b/internal/config/config.go new file mode 100644 index 0000000..0675980 --- /dev/null +++ b/internal/config/config.go @@ -0,0 +1,142 @@ +package config + +import ( + "encoding/hex" + "fmt" + "os" + + "gopkg.in/yaml.v3" +) + +type Config struct { + Server Server `yaml:"server"` + Database Database `yaml:"database"` + Artifacts Artifacts `yaml:"artifacts"` + Logs Logs `yaml:"logs"` + Auth Auth `yaml:"auth"` + Dispatcher Dispatcher `yaml:"dispatcher"` + Janitor Janitor `yaml:"janitor"` + PXE PXE `yaml:"pxe"` + Network Network `yaml:"network"` + Notifiers []Notifier `yaml:"notifiers"` + Routes []Route `yaml:"routes"` +} + +type Server struct { + Bind string `yaml:"bind"` + PublicURL string `yaml:"public_url"` // user-visible base URL, e.g. 
type (
	// Server holds the HTTP listener settings.
	Server struct {
		Bind      string `yaml:"bind"`
		PublicURL string `yaml:"public_url"` // user-visible base URL, e.g. https://vetting.lan:8443; used in notification click-throughs
		TLS       TLS    `yaml:"tls"`
	}

	// TLS holds the listener's certificate settings.
	TLS struct {
		Enabled  bool   `yaml:"enabled"`
		CertFile string `yaml:"cert_file"`
		KeyFile  string `yaml:"key_file"`
	}

	// Database locates the SQLite file.
	Database struct {
		Path string `yaml:"path"`
	}

	// Artifacts configures where run artifacts live and how long they are kept.
	Artifacts struct {
		Dir           string `yaml:"dir"`
		RetentionDays int    `yaml:"retention_days"` // 0 = keep forever
	}

	// Logs configures where run logs live and how long they are kept.
	Logs struct {
		Dir           string `yaml:"dir"`
		RetentionDays int    `yaml:"retention_days"` // 0 = keep forever
	}

	// Janitor configures the periodic cleanup pass.
	Janitor struct {
		IntervalMinutes int `yaml:"interval_minutes"` // 0 = 60
	}

	// Auth holds the admin credentials and session-signing settings.
	Auth struct {
		AdminPasswordBcrypt string `yaml:"admin_password_bcrypt"`
		SessionSecretHex    string `yaml:"session_secret_hex"`
		SessionTTLHours     int    `yaml:"session_ttl_hours"`
	}

	// Dispatcher bounds how many runs may be in flight at once.
	Dispatcher struct {
		MaxConcurrentRuns int `yaml:"max_concurrent_runs"`
	}

	// Network holds network-test settings.
	Network struct {
		IperfPort int `yaml:"iperf_port"`
	}
)

// SessionSecret hex-decodes SessionSecretHex. It returns an error when the
// value is not valid hex or decodes to fewer than 32 bytes.
func (a Auth) SessionSecret() ([]byte, error) {
	secret, decodeErr := hex.DecodeString(a.SessionSecretHex)
	switch {
	case decodeErr != nil:
		return nil, fmt.Errorf("session_secret_hex: %w", decodeErr)
	case len(secret) < 32:
		return nil, fmt.Errorf("session_secret_hex must decode to at least 32 bytes, got %d", len(secret))
	}
	return secret, nil
}

// PXE / Notifier / Route are declared up front so the config file is
// forward-compatible across phases. Phase 1 does not act on these.
+ +type PXE struct { + Enabled bool `yaml:"enabled"` + Interface string `yaml:"interface"` + DHCPRange string `yaml:"dhcp_range"` + OrchestratorURL string `yaml:"orchestrator_url"` + TFTPRoot string `yaml:"tftp_root"` // holds ipxe.efi + undionly.kpxe + LiveDir string `yaml:"live_dir"` // holds vmlinuz + initrd.img; served at /live +} + +type Notifier struct { + Name string `yaml:"name"` + Type string `yaml:"type"` + Topic string `yaml:"topic,omitempty"` + Server string `yaml:"server,omitempty"` + WebhookURL string `yaml:"webhook_url,omitempty"` + SMTP SMTP `yaml:"smtp,omitempty"` +} + +type SMTP struct { + Host string `yaml:"host,omitempty"` + Port int `yaml:"port,omitempty"` + From string `yaml:"from,omitempty"` + To []string `yaml:"to,omitempty"` +} + +type Route struct { + MatchKind []string `yaml:"match_kind"` + MatchSeverity []string `yaml:"match_severity,omitempty"` + Notifier string `yaml:"notifier"` +} + +func Load(path string) (*Config, error) { + b, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read config: %w", err) + } + var c Config + if err := yaml.Unmarshal(b, &c); err != nil { + return nil, fmt.Errorf("parse config: %w", err) + } + if c.Server.Bind == "" { + c.Server.Bind = "127.0.0.1:8080" + } + if c.Database.Path == "" { + c.Database.Path = "./var/vetting.db" + } + if c.Artifacts.Dir == "" { + c.Artifacts.Dir = "./var/artifacts" + } + if c.Logs.Dir == "" { + c.Logs.Dir = "./var/logs" + } + if c.Auth.SessionTTLHours == 0 { + c.Auth.SessionTTLHours = 24 + } + if c.Dispatcher.MaxConcurrentRuns == 0 { + c.Dispatcher.MaxConcurrentRuns = 3 + } + return &c, nil +} diff --git a/internal/db/db.go b/internal/db/db.go new file mode 100644 index 0000000..96c0357 --- /dev/null +++ b/internal/db/db.go @@ -0,0 +1,83 @@ +package db + +import ( + "database/sql" + "embed" + "fmt" + "io/fs" + "path/filepath" + "sort" + "strings" + + _ "modernc.org/sqlite" +) + +//go:embed migrations/*.sql +var migrationsFS embed.FS + +// Open opens the SQLite 
DB at path, enabling foreign keys and WAL, +// and applies every embedded migration in filename order. +func Open(path string) (*sql.DB, error) { + dsn := fmt.Sprintf("file:%s?_pragma=foreign_keys(1)&_pragma=journal_mode(WAL)&_pragma=busy_timeout(5000)", filepath.ToSlash(path)) + db, err := sql.Open("sqlite", dsn) + if err != nil { + return nil, fmt.Errorf("open sqlite: %w", err) + } + if err := db.Ping(); err != nil { + _ = db.Close() + return nil, fmt.Errorf("ping sqlite: %w", err) + } + if err := migrate(db); err != nil { + _ = db.Close() + return nil, err + } + return db, nil +} + +func migrate(db *sql.DB) error { + entries, err := fs.ReadDir(migrationsFS, "migrations") + if err != nil { + return fmt.Errorf("read migrations: %w", err) + } + names := make([]string, 0, len(entries)) + for _, e := range entries { + if !e.IsDir() && strings.HasSuffix(e.Name(), ".sql") { + names = append(names, e.Name()) + } + } + sort.Strings(names) + + if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS schema_migrations (name TEXT PRIMARY KEY, applied_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP)`); err != nil { + return fmt.Errorf("ensure schema_migrations: %w", err) + } + + for _, name := range names { + var applied int + if err := db.QueryRow(`SELECT COUNT(1) FROM schema_migrations WHERE name = ?`, name).Scan(&applied); err != nil { + return fmt.Errorf("check migration %s: %w", name, err) + } + if applied > 0 { + continue + } + content, err := migrationsFS.ReadFile("migrations/" + name) + if err != nil { + return fmt.Errorf("read migration %s: %w", name, err) + } + tx, err := db.Begin() + if err != nil { + return fmt.Errorf("begin migration %s: %w", name, err) + } + if _, err := tx.Exec(string(content)); err != nil { + _ = tx.Rollback() + return fmt.Errorf("apply migration %s: %w", name, err) + } + if _, err := tx.Exec(`INSERT INTO schema_migrations(name) VALUES(?)`, name); err != nil { + _ = tx.Rollback() + return fmt.Errorf("record migration %s: %w", name, err) + } + if err 
:= tx.Commit(); err != nil { + return fmt.Errorf("commit migration %s: %w", name, err) + } + } + return nil +} diff --git a/internal/db/migrations/0001_init.sql b/internal/db/migrations/0001_init.sql new file mode 100644 index 0000000..5b6c834 --- /dev/null +++ b/internal/db/migrations/0001_init.sql @@ -0,0 +1,93 @@ +-- Phase 1 schema covers the full Vetting domain so future phases +-- only add data, never restructure. + +CREATE TABLE IF NOT EXISTS hosts ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL UNIQUE, + mac TEXT NOT NULL UNIQUE, -- lowercase colon form + wol_broadcast_ip TEXT NOT NULL, + wol_port INTEGER NOT NULL DEFAULT 9, + expected_spec_yaml TEXT NOT NULL, + pdu_config_json TEXT, + ipmi_config_json TEXT, + notes TEXT NOT NULL DEFAULT '', + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS runs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + host_id INTEGER NOT NULL REFERENCES hosts(id) ON DELETE CASCADE, + state TEXT NOT NULL, + result TEXT, -- pass|fail|null + failed_stage TEXT, + next_boot_target TEXT, -- linux|memtest|linux-post-memtest (Phase 2+) + agent_token_hash TEXT NOT NULL, + started_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + completed_at TIMESTAMP, + report_path TEXT, + hold_ip TEXT, + override_flags_json TEXT +); +CREATE INDEX IF NOT EXISTS idx_runs_host ON runs(host_id); +CREATE INDEX IF NOT EXISTS idx_runs_state ON runs(state); + +CREATE TABLE IF NOT EXISTS stages ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE, + name TEXT NOT NULL, + ordinal INTEGER NOT NULL, + state TEXT NOT NULL, -- pending|running|passed|failed|skipped + started_at TIMESTAMP, + completed_at TIMESTAMP, + summary_json TEXT +); +CREATE INDEX IF NOT EXISTS idx_stages_run_ordinal ON stages(run_id, ordinal); + +CREATE TABLE IF NOT EXISTS measurements ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id 
// Event is a typed event published on the internal bus. In Phase 1 the
// payload is an already-rendered HTML fragment; later phases will wrap
// structured run state in this same Event envelope.
type Event struct {
	Name    string // SSE event name (e.g. "heartbeat", "tile-update", "log-line")
	Payload string // pre-rendered HTML, ready to write as SSE data
}

// subscriber is one registered listener and its buffered delivery channel.
type subscriber struct {
	id        int64
	ch        chan Event
	closeOnce sync.Once // ch can be closed by cancel and by Shutdown; close it once
}

// close closes the delivery channel; repeated calls are no-ops.
func (s *subscriber) close() {
	s.closeOnce.Do(func() { close(s.ch) })
}

// Hub is an in-process fan-out for SSE subscribers.
type Hub struct {
	mu        sync.RWMutex
	nextID    int64
	subs      map[int64]*subscriber
	buffer    int
	heartbeat time.Duration

	done     chan struct{} // closed by Shutdown to stop heartbeatLoop
	stopOnce sync.Once     // makes Shutdown idempotent
	closed   bool          // set by Shutdown; guarded by mu
}

// NewHub returns a Hub with a 32-event buffer per subscriber and starts
// its 15-second heartbeat goroutine, which runs until Shutdown is called.
func NewHub() *Hub {
	h := &Hub{
		subs:      map[int64]*subscriber{},
		buffer:    32,
		heartbeat: 15 * time.Second,
		done:      make(chan struct{}),
	}
	go h.heartbeatLoop()
	return h
}

// Publish offers ev to every current subscriber without blocking; a
// subscriber whose buffer is full misses the event.
func (h *Hub) Publish(ev Event) {
	h.mu.RLock()
	defer h.mu.RUnlock()
	for _, s := range h.subs {
		select {
		case s.ch <- ev:
		default:
			// Slow subscriber: drop the event rather than stall other clients.
		}
	}
}

// Subscribe registers a new subscriber and returns its id, its receive
// channel, and a cancel func that unregisters it and closes the channel.
// cancel may be called more than once. After Shutdown, Subscribe returns an
// already-closed channel.
func (h *Hub) Subscribe() (id int64, ch <-chan Event, cancel func()) {
	id = atomic.AddInt64(&h.nextID, 1)
	s := &subscriber{id: id, ch: make(chan Event, h.buffer)}

	h.mu.Lock()
	if h.closed {
		h.mu.Unlock()
		s.close()
		return id, s.ch, func() {}
	}
	h.subs[id] = s
	h.mu.Unlock()

	return id, s.ch, func() {
		// Unregister under the write lock first: Publish sends while holding
		// the read lock, so once the entry is gone no sender can still reach
		// the channel and closing it is safe.
		h.mu.Lock()
		delete(h.subs, id)
		h.mu.Unlock()
		s.close()
	}
}

// heartbeatLoop publishes a "heartbeat" event on every tick until Shutdown
// closes h.done.
func (h *Hub) heartbeatLoop() {
	t := time.NewTicker(h.heartbeat)
	defer t.Stop()
	for {
		select {
		case <-h.done:
			return
		case <-t.C:
			h.Publish(Event{
				Name: "heartbeat",
				// NOTE(review): the markup of this fragment was lost in the
				// copy under review (empty format string with one argument);
				// confirm the exact HTML against the dashboard template.
				Payload: fmt.Sprintf(`<span data-ts="%d"></span>`, time.Now().Unix()),
			})
		}
	}
}

// ServeSSE writes server-sent events for a single subscriber for the
// lifetime of the request. Each Event becomes one SSE message. It returns
// when the request context ends or the subscription channel is closed.
func (h *Hub) ServeSSE(w http.ResponseWriter, r *http.Request) {
	flusher, ok := w.(http.Flusher)
	if !ok {
		http.Error(w, "streaming not supported", http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")
	w.Header().Set("X-Accel-Buffering", "no")

	_, eventsCh, cancel := h.Subscribe()
	defer cancel()

	fmt.Fprintf(w, "event: hello\ndata: ok\n\n")
	flusher.Flush()

	ctx := r.Context()
	for {
		select {
		case <-ctx.Done():
			return
		case ev, ok := <-eventsCh:
			if !ok {
				return
			}
			writeSSE(w, ev)
			flusher.Flush()
		}
	}
}

// writeSSE writes ev as one SSE message: an optional "event:" line, one
// "data:" line per payload line, and a terminating blank line.
func writeSSE(w http.ResponseWriter, ev Event) {
	if ev.Name != "" {
		fmt.Fprintf(w, "event: %s\n", ev.Name)
	}
	for _, line := range splitLines(ev.Payload) {
		fmt.Fprintf(w, "data: %s\n", line)
	}
	fmt.Fprint(w, "\n")
}

// splitLines splits s at "\n", "\r\n" and bare "\r", without the
// terminators. The result always has at least one element: "" yields
// [""], and a trailing terminator yields a final "". Carriage returns are
// split on as well because an SSE client treats a bare CR as a line end,
// which would otherwise cut a data line short.
func splitLines(s string) []string {
	out := make([]string, 0, 1)
	start := 0
	for i := 0; i < len(s); i++ {
		switch s[i] {
		case '\n':
			out = append(out, s[start:i])
			start = i + 1
		case '\r':
			out = append(out, s[start:i])
			if i+1 < len(s) && s[i+1] == '\n' {
				i++ // swallow the LF of a CRLF pair
			}
			start = i + 1
		}
	}
	return append(out, s[start:])
}

// Shutdown stops the heartbeat goroutine and closes every subscriber
// channel, which makes in-flight ServeSSE handlers return so the HTTP
// server can drain. It is safe to call more than once and always returns
// nil.
func (h *Hub) Shutdown(_ context.Context) error {
	h.stopOnce.Do(func() {
		if h.done != nil {
			close(h.done)
		}
		h.mu.Lock()
		subs := h.subs
		h.subs = map[int64]*subscriber{}
		h.closed = true
		h.mu.Unlock()
		// Closed outside the lock; no Publish can reach these any more.
		for _, s := range subs {
			s.close()
		}
	})
	return nil
}

// Package hold generates per-run ephemeral ed25519 keypairs for the
// FailedHolding flow. When a run fails, the agent asks the orchestrator
// for a pubkey, drops it into /root/.ssh/authorized_keys, and reports
// its LAN IP. The orchestrator stores the private key next to the run's
// artifacts and surfaces `ssh -i path root@ip` on the tile.
+package hold + +import ( + "crypto/ed25519" + "crypto/rand" + "encoding/pem" + "fmt" + "os" + "path/filepath" + "strings" + + "golang.org/x/crypto/ssh" +) + +// Keypair bundles the PEM-encoded private key and the +// authorized_keys-style public key line. +type Keypair struct { + PrivatePEM []byte + AuthorizedKey string // "ssh-ed25519 AAAA... vetting-hold-N" +} + +// Issue generates a new ed25519 keypair labelled for the given run. +func Issue(runID int64) (*Keypair, error) { + pub, priv, err := ed25519.GenerateKey(rand.Reader) + if err != nil { + return nil, fmt.Errorf("generate ed25519: %w", err) + } + sshPub, err := ssh.NewPublicKey(pub) + if err != nil { + return nil, fmt.Errorf("ssh public key: %w", err) + } + blob := ssh.MarshalAuthorizedKey(sshPub) // "ssh-ed25519 AAAA...\n" + line := strings.TrimRight(string(blob), "\n") + if !strings.HasSuffix(line, fmt.Sprintf(" vetting-hold-%d", runID)) { + line += fmt.Sprintf(" vetting-hold-%d", runID) + } + + block, err := ssh.MarshalPrivateKey(priv, fmt.Sprintf("vetting-hold-%d", runID)) + if err != nil { + return nil, fmt.Errorf("marshal private key: %w", err) + } + return &Keypair{PrivatePEM: pem.EncodeToMemory(block), AuthorizedKey: line}, nil +} + +// WritePrivateTo persists the PEM to the given path with 0600 perms +// and returns the absolute path. The operator's shell reads this file +// by path, so we keep it on disk per-run. 
+func (kp *Keypair) WritePrivateTo(path string) (string, error) { + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return "", err + } + if err := os.WriteFile(path, kp.PrivatePEM, 0o600); err != nil { + return "", fmt.Errorf("write hold key: %w", err) + } + abs, err := filepath.Abs(path) + if err != nil { + return path, nil + } + return abs, nil +} diff --git a/internal/hold/hold_test.go b/internal/hold/hold_test.go new file mode 100644 index 0000000..aa7a28f --- /dev/null +++ b/internal/hold/hold_test.go @@ -0,0 +1,99 @@ +package hold + +import ( + "bytes" + "crypto/ed25519" + "os" + "path/filepath" + "strings" + "testing" + + "golang.org/x/crypto/ssh" +) + +// TestIssueRoundTrip checks that the private key we write is parseable +// with the standard openssh library and that its derived public key +// byte-for-byte matches the authorized_key line we handed the agent. +// If this drifts — e.g. we swap from ed25519 to something else, or +// mangle the comment — the operator's `ssh -i path root@ip` breaks +// silently. The test is the only early-warning we have. +func TestIssueRoundTrip(t *testing.T) { + kp, err := Issue(42) + if err != nil { + t.Fatalf("Issue: %v", err) + } + + // Parse the private key back. + signer, err := ssh.ParsePrivateKey(kp.PrivatePEM) + if err != nil { + t.Fatalf("ParsePrivateKey: %v", err) + } + + // The public derived from the signer must match the authorized_key line. + gotAuth := strings.TrimRight(string(ssh.MarshalAuthorizedKey(signer.PublicKey())), "\n") + wantAuth := kp.AuthorizedKey + // Authorized_keys comment is ours; compare just the type+b64 prefix. 
+ gotParts := strings.SplitN(gotAuth, " ", 3) + wantParts := strings.SplitN(wantAuth, " ", 3) + if len(gotParts) < 2 || len(wantParts) < 2 { + t.Fatalf("unexpected authorized_key shape got=%q want=%q", gotAuth, wantAuth) + } + if gotParts[0] != wantParts[0] || gotParts[1] != wantParts[1] { + t.Fatalf("public key mismatch:\n got %s\n want %s", gotAuth, wantAuth) + } + if !strings.Contains(wantAuth, "vetting-hold-42") { + t.Fatalf("authorized_key line missing run tag: %q", wantAuth) + } +} + +// TestIssueKeysAreEd25519 pins the algorithm — anything other than +// ed25519 would surprise operators who've been told their hold key is +// ed25519 (and would change key-file sizes, path handling, etc.). +func TestIssueKeysAreEd25519(t *testing.T) { + kp, err := Issue(1) + if err != nil { + t.Fatalf("Issue: %v", err) + } + signer, err := ssh.ParsePrivateKey(kp.PrivatePEM) + if err != nil { + t.Fatalf("ParsePrivateKey: %v", err) + } + if got := signer.PublicKey().Type(); got != ssh.KeyAlgoED25519 { + t.Fatalf("key algorithm: got %s, want ssh-ed25519", got) + } + // Paranoia: the Ed25519 public key underneath should be 32 bytes. 
+ edPub, ok := signer.PublicKey().(ssh.CryptoPublicKey) + if !ok { + t.Fatalf("public key does not expose CryptoPublicKey") + } + raw, ok := edPub.CryptoPublicKey().(ed25519.PublicKey) + if !ok { + t.Fatalf("public key is not ed25519.PublicKey") + } + if len(raw) != ed25519.PublicKeySize { + t.Fatalf("ed25519 pubkey size = %d, want %d", len(raw), ed25519.PublicKeySize) + } +} + +func TestWritePrivateToSetsPerms(t *testing.T) { + kp, err := Issue(7) + if err != nil { + t.Fatalf("Issue: %v", err) + } + dir := t.TempDir() + path := filepath.Join(dir, "nested", "hold.key") + abs, err := kp.WritePrivateTo(path) + if err != nil { + t.Fatalf("WritePrivateTo: %v", err) + } + if !filepath.IsAbs(abs) { + t.Fatalf("expected absolute path, got %q", abs) + } + buf, err := os.ReadFile(abs) + if err != nil { + t.Fatalf("ReadFile: %v", err) + } + if !bytes.Equal(buf, kp.PrivatePEM) { + t.Fatalf("on-disk bytes differ from in-memory PEM") + } +} diff --git a/internal/httpserver/router.go b/internal/httpserver/router.go new file mode 100644 index 0000000..ab02b71 --- /dev/null +++ b/internal/httpserver/router.go @@ -0,0 +1,75 @@ +// Package httpserver assembles the chi router. It lives in its own +// package because it depends on both `api` and `orchestrator`, and +// those two packages must stay import-independent. 
+package httpserver
+
+import (
+	"io/fs"
+	"net/http"
+
+	"github.com/go-chi/chi/v5"
+	"github.com/go-chi/chi/v5/middleware"
+
+	"vetting/internal/api"
+	"vetting/internal/auth"
+	"vetting/internal/web"
+)
+
+// Deps carries the handlers and settings NewRouter wires into routes.
+type Deps struct {
+	Auth    *auth.Manager // provides the RequireSession middleware for the browser UI
+	UI      *api.UI       // browser-facing handlers (login, dashboard, hosts, reports, SSE)
+	Agent   *api.Agent    // agent / PXE handlers
+	LiveDir string        // directory containing vmlinuz + initrd.img; "" disables /live
+}
+
+// NewRouter builds the complete HTTP handler: embedded static assets,
+// the optional /live file server, the public login endpoints, the
+// agent/PXE API, and the session-gated browser UI.
+//
+// It panics if the embedded static directory cannot be opened; the
+// name is fixed here, so that is a build defect rather than a runtime
+// condition.
+func NewRouter(d Deps) http.Handler {
+	r := chi.NewRouter()
+	r.Use(middleware.RealIP)
+	// Logger must wrap Recoverer: chi documents that Logger should be
+	// installed before any middleware that may change the response, so
+	// a recovered panic is logged with the status Recoverer wrote.
+	r.Use(middleware.Logger)
+	r.Use(middleware.Recoverer)
+
+	staticFS, err := fs.Sub(web.Static, "static")
+	if err != nil {
+		panic(err)
+	}
+	r.Handle("/static/*", http.StripPrefix("/static/", http.FileServer(http.FS(staticFS))))
+
+	if d.LiveDir != "" {
+		r.Handle("/live/*", http.StripPrefix("/live/", http.FileServer(http.Dir(d.LiveDir))))
+	}
+
+	// Public (no session required) endpoints.
+	r.Get("/login", d.UI.LoginForm)
+	r.Post("/login", d.UI.LoginSubmit)
+	r.Post("/logout", d.UI.Logout)
+
+	// Agent / PXE endpoints — authenticated per-request by bearer token
+	// or by the unforgeable MAC path parameter, never by the UI session.
+	// NOTE(review): the MAC in the URL is supplied by the client; confirm
+	// IPXEScript does not rely on it being secret.
+	r.Get("/ipxe/{mac}", d.Agent.IPXEScript)
+	r.Route("/api/v1/runs/{id}", func(r chi.Router) {
+		r.Post("/hello", d.Agent.Hello)
+		r.Post("/claim", d.Agent.Claim)
+		r.Post("/heartbeat", d.Agent.Heartbeat)
+		r.Post("/log", d.Agent.Log)
+		r.Post("/result", d.Agent.Result)
+		r.Post("/hold", d.Agent.Hold)
+		r.Post("/sensor", d.Agent.Sensor)
+	})
+
+	// Session-gated browser UI.
+	r.Group(func(r chi.Router) {
+		r.Use(d.Auth.RequireSession)
+
+		r.Get("/", d.UI.Dashboard)
+		r.Get("/hosts/new", d.UI.NewHostForm)
+		r.Post("/hosts", d.UI.CreateHost)
+		r.Post("/hosts/{id}/delete", d.UI.DeleteHost)
+		r.Post("/hosts/{id}/start", d.UI.StartRun)
+		r.Post("/hosts/{id}/override-wipe", d.UI.OverrideWipeStorage)
+		r.Get("/reports/{runID}", d.UI.Report)
+
+		r.Get("/events", d.UI.SSE)
+	})
+
+	return r
+}
diff --git a/internal/janitor/adapter.go b/internal/janitor/adapter.go
new file mode 100644
index 0000000..72af075
--- /dev/null
+++ b/internal/janitor/adapter.go
@@ -0,0 +1,33 @@
+package janitor
+
+import (
+	"context"
+	"time"
+
+	"vetting/internal/logs"
+	"vetting/internal/store"
+)
+
+// StoreAdapter bridges the concrete orchestrator stores to the Janitor's
+// dependency interface. Kept in the janitor package so the orchestrator
+// wire-up stays a single-line: janitor.New(cfg, &janitor.StoreAdapter{...}).
+type StoreAdapter struct {
+	Runs      *store.Runs
+	Artifacts *store.Artifacts
+	Logs      *logs.Hub
+}
+
+func (a *StoreAdapter) CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error) {
+	return a.Runs.CompletedOlderThan(ctx, cutoff)
+}
+
+func (a *StoreAdapter) DeleteArtifactsForRun(ctx context.Context, runID int64) ([]store.Artifact, error) {
+	return a.Artifacts.DeleteForRun(ctx, runID)
+}
+
+func (a *StoreAdapter) LogPathFor(runID int64) string {
+	if a.Logs == nil {
+		return ""
+	}
+	return a.Logs.PathFor(runID)
+}
diff --git a/internal/janitor/janitor.go b/internal/janitor/janitor.go
new file mode 100644
index 0000000..ea71345
--- /dev/null
+++ b/internal/janitor/janitor.go
@@ -0,0 +1,171 @@
+// Package janitor garbage-collects on-disk run data. A completed or
+// released run produces an HTML report, a JSON report, a log file, and
+// potentially several artifact blobs (fio output, iperf output, hold
+// pubkey, inventory JSON).
None of these need to stay on disk +// indefinitely — once the operator's looked at the report and closed +// the tile, disk pressure is the only cost. +// +// The DB row for the run is kept (so historical counts and host +// histories survive); only the on-disk files and their artifact rows +// are pruned. The janitor ticks on a fixed interval and is safe to +// run concurrently with live runs — it only touches runs in terminal +// states past a cutoff, which by definition are not being written to. +package janitor + +import ( + "context" + "errors" + "fmt" + "log" + "os" + "sync" + "time" + + "vetting/internal/store" +) + +// Config carries the retention knobs. Zero values mean "keep forever" +// for that class of data; a zero Interval defaults to 1h. +type Config struct { + ArtifactRetention time.Duration + LogRetention time.Duration + Interval time.Duration +} + +// Stores is the subset of the store layer the janitor needs. Defined as +// an interface so tests can fake it without spinning up SQLite. +type Stores interface { + CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error) + DeleteArtifactsForRun(ctx context.Context, runID int64) ([]store.Artifact, error) + LogPathFor(runID int64) string +} + +// Janitor owns the ticker goroutine. Start/Stop are idempotent; Stop +// waits for the in-flight pass to finish so tests can assert post-state. +type Janitor struct { + cfg Config + s Stores + stop chan struct{} + wg sync.WaitGroup + mu sync.Mutex + running bool +} + +func New(cfg Config, s Stores) *Janitor { + if cfg.Interval <= 0 { + cfg.Interval = time.Hour + } + return &Janitor{cfg: cfg, s: s, stop: make(chan struct{})} +} + +// Start launches the ticker. Retention zeros mean no cleanup is needed; +// in that case the ticker still runs but each Sweep is a no-op. 
+func (j *Janitor) Start(ctx context.Context) {
+	j.mu.Lock()
+	defer j.mu.Unlock()
+	if j.running {
+		return
+	}
+	j.running = true
+	// Every Start gets its own stop channel. Stop closes the channel, and
+	// a closed channel can be neither reused nor closed a second time, so
+	// sharing one channel across cycles would panic on the second Stop.
+	j.stop = make(chan struct{})
+	// Count the goroutine while still holding the lock so a concurrent
+	// Stop cannot get past wg.Wait before the loop is registered.
+	j.wg.Add(1)
+	go j.loop(ctx, j.stop)
+}
+
+// Stop signals the ticker goroutine to exit and blocks until it has,
+// including any sweep that is in flight. Calling Stop on a janitor that
+// is not running is a no-op.
+func (j *Janitor) Stop() {
+	j.mu.Lock()
+	if !j.running {
+		j.mu.Unlock()
+		return
+	}
+	j.running = false
+	close(j.stop)
+	j.mu.Unlock()
+	// Wait outside the lock so the mutex is not held for the duration of
+	// an in-flight sweep.
+	j.wg.Wait()
+}
+
+// loop runs one sweep immediately and then one per Interval tick until
+// ctx is cancelled or stop is closed. Sweep errors are logged, never
+// fatal: the next tick simply tries again.
+func (j *Janitor) loop(ctx context.Context, stop <-chan struct{}) {
+	defer j.wg.Done()
+	// Run one sweep immediately so startup cleans up anything that
+	// aged out while the orchestrator was down.
+	if err := j.Sweep(ctx, time.Now().UTC()); err != nil {
+		log.Printf("janitor: initial sweep: %v", err)
+	}
+	t := time.NewTicker(j.cfg.Interval)
+	defer t.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-stop:
+			return
+		case now := <-t.C:
+			if err := j.Sweep(ctx, now.UTC()); err != nil {
+				log.Printf("janitor: sweep: %v", err)
+			}
+		}
+	}
+}
+
+// Sweep is exported so tests can drive a single pass deterministically.
+// One query covers both classes of data. Its cutoff is derived from the
+// *longer* (more conservative) of the two retentions, so every run it
+// returns is past the retention of each enabled class. The trade-off is
+// that the class with the shorter retention is kept until the longer
+// one has elapsed. A retention <= 0 means "keep forever": that class is
+// never cleaned, and if both are <= 0 Sweep does nothing.
+//
+// Sweep returns an error if listing runs fails, or ctx.Err() if ctx is
+// cancelled between runs. Per-run cleanup failures are logged and do
+// not stop the pass.
+func (j *Janitor) Sweep(ctx context.Context, now time.Time) error {
+	pruneArtifacts := j.cfg.ArtifactRetention > 0
+	pruneLogs := j.cfg.LogRetention > 0
+	if !pruneArtifacts && !pruneLogs {
+		return nil
+	}
+	cutoff := now.Add(-longer(j.cfg.ArtifactRetention, j.cfg.LogRetention))
+	runs, err := j.s.CompletedOlderThan(ctx, cutoff)
+	if err != nil {
+		return fmt.Errorf("list old runs: %w", err)
+	}
+	for _, runID := range runs {
+		// Stop promptly on shutdown instead of logging one failed
+		// delete per remaining run.
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if pruneArtifacts {
+			j.cleanArtifacts(ctx, runID)
+		}
+		if pruneLogs {
+			j.cleanLog(runID)
+		}
+	}
+	return nil
+}
+
+// cleanArtifacts deletes the artifact rows for runID and then unlinks
+// each returned file. Files that are already gone are not an error;
+// other failures are logged and the remaining files are still tried.
+func (j *Janitor) cleanArtifacts(ctx context.Context, runID int64) {
+	arts, err := j.s.DeleteArtifactsForRun(ctx, runID)
+	if err != nil {
+		log.Printf("janitor: delete artifacts for run %d: %v", runID, err)
+		return
+	}
+	for _, a := range arts {
+		if a.Path == "" {
+			continue
+		}
+		if err := os.Remove(a.Path); err != nil && !errors.Is(err, os.ErrNotExist) {
+			log.Printf("janitor: unlink %s: %v", a.Path, err)
+		}
+	}
+}
+
+// cleanLog unlinks the log file for runID. An empty path from the store
+// or a file that is already gone is silently skipped.
+func (j *Janitor) cleanLog(runID int64) {
+	path := j.s.LogPathFor(runID)
+	if path == "" {
+		return
+	}
+	if err := os.Remove(path); err != nil && !errors.Is(err, os.ErrNotExist) {
+		log.Printf("janitor: unlink log %s: %v", path, err)
+	}
+}
+
+// longer returns the larger of the two durations.
+func longer(a, b time.Duration) time.Duration {
+	if a > b {
+		return a
+	}
+	return b
+}
diff --git a/internal/janitor/janitor_test.go b/internal/janitor/janitor_test.go
new file mode 100644
index 0000000..346c8f1
--- /dev/null
+++ b/internal/janitor/janitor_test.go
@@ -0,0 +1,133 @@
+package janitor
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"vetting/internal/store"
+)
+
+// fakeStores is a test double that records what the janitor asked for
+// and hands back canned runs/artifacts. It lets us verify both the
+// cleanup contract (files deleted, rows deleted) and that the janitor
+// honours a zero retention as a no-op.
+type fakeStores struct { + cutoffSeen time.Time + runsOlder []int64 + artifactsByID map[int64][]store.Artifact + deleted map[int64]bool + logs map[int64]string +} + +func (f *fakeStores) CompletedOlderThan(_ context.Context, cutoff time.Time) ([]int64, error) { + f.cutoffSeen = cutoff + return f.runsOlder, nil +} + +func (f *fakeStores) DeleteArtifactsForRun(_ context.Context, runID int64) ([]store.Artifact, error) { + if f.deleted == nil { + f.deleted = map[int64]bool{} + } + f.deleted[runID] = true + return f.artifactsByID[runID], nil +} + +func (f *fakeStores) LogPathFor(runID int64) string { return f.logs[runID] } + +func writeTempFile(t *testing.T, dir, name string) string { + t.Helper() + p := filepath.Join(dir, name) + if err := os.WriteFile(p, []byte("x"), 0o644); err != nil { + t.Fatalf("write %s: %v", p, err) + } + return p +} + +func TestSweepDeletesArtifactsAndLogs(t *testing.T) { + dir := t.TempDir() + p1 := writeTempFile(t, dir, "artifact-1.bin") + p2 := writeTempFile(t, dir, "artifact-2.json") + log1 := writeTempFile(t, dir, "run-1.log") + + s := &fakeStores{ + runsOlder: []int64{1}, + artifactsByID: map[int64][]store.Artifact{ + 1: {{ID: 10, RunID: 1, Path: p1}, {ID: 11, RunID: 1, Path: p2}}, + }, + logs: map[int64]string{1: log1}, + } + j := New(Config{ + ArtifactRetention: 24 * time.Hour, + LogRetention: 24 * time.Hour, + Interval: time.Minute, + }, s) + if err := j.Sweep(context.Background(), time.Now().UTC()); err != nil { + t.Fatalf("sweep: %v", err) + } + if !s.deleted[1] { + t.Fatalf("run 1 not passed to DeleteArtifactsForRun") + } + for _, p := range []string{p1, p2, log1} { + if _, err := os.Stat(p); !os.IsNotExist(err) { + t.Errorf("file %s still exists (err=%v)", p, err) + } + } +} + +func TestSweepIsNoopWhenRetentionsAreZero(t *testing.T) { + dir := t.TempDir() + p := writeTempFile(t, dir, "keep.bin") + s := &fakeStores{ + runsOlder: []int64{1}, + artifactsByID: map[int64][]store.Artifact{ + 1: {{ID: 10, RunID: 1, Path: p}}, + }, + 
logs: map[int64]string{1: p}, + } + j := New(Config{}, s) // all zero + if err := j.Sweep(context.Background(), time.Now().UTC()); err != nil { + t.Fatalf("sweep: %v", err) + } + if s.deleted[1] { + t.Fatalf("expected no deletion for zero retention") + } + if _, err := os.Stat(p); err != nil { + t.Fatalf("file should still exist: %v", err) + } +} + +func TestSweepSkipsMissingFilesGracefully(t *testing.T) { + s := &fakeStores{ + runsOlder: []int64{7}, + artifactsByID: map[int64][]store.Artifact{ + 7: {{ID: 99, RunID: 7, Path: "/nonexistent/path.bin"}}, + }, + logs: map[int64]string{7: "/nonexistent/run-7.log"}, + } + j := New(Config{ArtifactRetention: time.Hour, LogRetention: time.Hour}, s) + if err := j.Sweep(context.Background(), time.Now().UTC()); err != nil { + t.Fatalf("sweep: %v", err) + } + if !s.deleted[7] { + t.Fatalf("run 7 should have been processed") + } +} + +func TestSweepUsesTheLongerCutoff(t *testing.T) { + s := &fakeStores{} + j := New(Config{ + ArtifactRetention: 72 * time.Hour, + LogRetention: 24 * time.Hour, + }, s) + now := time.Date(2026, 4, 17, 12, 0, 0, 0, time.UTC) + if err := j.Sweep(context.Background(), now); err != nil { + t.Fatalf("sweep: %v", err) + } + want := now.Add(-72 * time.Hour) + if !s.cutoffSeen.Equal(want) { + t.Fatalf("cutoff = %v, want %v (the longer of the two retentions)", s.cutoffSeen, want) + } +} diff --git a/internal/logs/logs.go b/internal/logs/logs.go new file mode 100644 index 0000000..6f13971 --- /dev/null +++ b/internal/logs/logs.go @@ -0,0 +1,134 @@ +// Package logs owns per-run flat-file logs and their live SSE fan-out. +// A single Writer serialises writes for one run; a Hub keeps a cache +// per run so handlers can open/close freely without stepping on each +// other. Lines go to disk for persistence (reload + replay) and onto +// the events.Hub so the UI tile can tail live. 
+package logs
+
+import (
+	"fmt"
+	"html"
+	"log"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"time"
+
+	"vetting/internal/events"
+)
+
+// Line is one log entry for a run.
+type Line struct {
+	TS    time.Time
+	Level string // info|warn|error|debug
+	Text  string
+}
+
+// Writer serialises appends to a single run's log file and publishes
+// each line to the events hub.
+type Writer struct {
+	runID int64
+	mu    sync.Mutex
+	f     *os.File
+	hub   *events.Hub
+}
+
+// Hub owns the per-run Writers. The orchestrator creates one Hub at
+// startup and hands it to the api package.
+type Hub struct {
+	dir     string
+	events  *events.Hub
+	mu      sync.Mutex
+	writers map[int64]*Writer
+}
+
+// NewHub creates the log directory if needed and returns a Hub that
+// stores run logs in it and publishes lines to ev.
+func NewHub(dir string, ev *events.Hub) (*Hub, error) {
+	if err := os.MkdirAll(dir, 0o755); err != nil {
+		return nil, fmt.Errorf("mkdir log dir: %w", err)
+	}
+	return &Hub{dir: dir, events: ev, writers: map[int64]*Writer{}}, nil
+}
+
+// WriterFor returns a cached Writer, opening the file lazily. The file
+// is append-only; if an existing run's log is reopened (e.g. after a
+// restart) we append rather than truncate so nothing is lost.
+func (h *Hub) WriterFor(runID int64) (*Writer, error) {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	if w, ok := h.writers[runID]; ok {
+		return w, nil
+	}
+	// Single source of truth for the file name, shared with readers.
+	path := h.PathFor(runID)
+	f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
+	if err != nil {
+		return nil, fmt.Errorf("open %s: %w", path, err)
+	}
+	w := &Writer{runID: runID, f: f, hub: h.events}
+	h.writers[runID] = w
+	return w, nil
+}
+
+// Close closes all open run files. Called from main on shutdown so no
+// file handles are left open. The cache is reset to an empty map rather
+// than nil, so a WriterFor call that arrives after Close reopens the
+// file instead of panicking on a nil-map assignment.
+func (h *Hub) Close() {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	for id, w := range h.writers {
+		if err := w.Close(); err != nil {
+			log.Printf("logs: close run-%d: %v", id, err)
+		}
+	}
+	h.writers = map[int64]*Writer{}
+}
+
+// PathFor returns the on-disk path for a run's log; used by replay
+// handlers and the report generator.
+func (h *Hub) PathFor(runID int64) string { + return filepath.Join(h.dir, fmt.Sprintf("run-%d.log", runID)) +} + +// Append writes a line to disk and publishes an SSE event. Failures +// on disk log but don't block the SSE fan-out — the operator can still +// see the live tail even if disk IO is degraded. +func (w *Writer) Append(line Line) { + w.mu.Lock() + defer w.mu.Unlock() + if line.TS.IsZero() { + line.TS = time.Now().UTC() + } + if line.Level == "" { + line.Level = "info" + } + stamped := fmt.Sprintf("%s %5s %s\n", line.TS.Format(time.RFC3339Nano), strings.ToUpper(line.Level), line.Text) + if _, err := w.f.WriteString(stamped); err != nil { + log.Printf("logs: write run-%d: %v", w.runID, err) + } + if w.hub != nil { + w.hub.Publish(events.Event{ + Name: fmt.Sprintf("log-%d", w.runID), + Payload: renderLogSSE(line), + }) + } +} + +func (w *Writer) Close() error { + w.mu.Lock() + defer w.mu.Unlock() + if w.f == nil { + return nil + } + err := w.f.Close() + w.f = nil + return err +} + +// renderLogSSE returns an HTMX-compatible fragment. The tile contains +// a
: each event appends one +//
to it. +func renderLogSSE(l Line) string { + level := strings.ToLower(l.Level) + return fmt.Sprintf( + `
%s %s
`, + html.EscapeString(level), + html.EscapeString(l.TS.Format("15:04:05")), + html.EscapeString(l.Text), + ) +} diff --git a/internal/logs/logs_test.go b/internal/logs/logs_test.go new file mode 100644 index 0000000..5678747 --- /dev/null +++ b/internal/logs/logs_test.go @@ -0,0 +1,120 @@ +package logs_test + +import ( + "os" + "path/filepath" + "strings" + "testing" + "time" + + "vetting/internal/events" + "vetting/internal/logs" +) + +// TestAppendFansOutToSSE verifies the two guarantees of the log hub: +// (a) every line is persisted to the per-run file, and (b) every line +// is published as an SSE event with name log-. The UI relies on +// both — the file for reload replay, the event for live tail. +func TestAppendFansOutToSSE(t *testing.T) { + dir := t.TempDir() + hub := events.NewHub() + lh, err := logs.NewHub(dir, hub) + if err != nil { + t.Fatalf("NewHub: %v", err) + } + defer lh.Close() + + _, ch, cancel := hub.Subscribe() + defer cancel() + + w, err := lh.WriterFor(77) + if err != nil { + t.Fatalf("WriterFor: %v", err) + } + w.Append(logs.Line{Level: "info", Text: "hello from agent"}) + w.Append(logs.Line{Level: "error", Text: ""}) + + got := collect(ch, 3, 500*time.Millisecond) + // Filter out heartbeats that may sneak in. 
+ var logEvents []events.Event + for _, ev := range got { + if strings.HasPrefix(ev.Name, "log-") { + logEvents = append(logEvents, ev) + } + } + if len(logEvents) < 2 { + t.Fatalf("expected 2 log events, got %d (all=%+v)", len(logEvents), got) + } + for _, ev := range logEvents { + if ev.Name != "log-77" { + t.Fatalf("unexpected event name %q", ev.Name) + } + } + // XSS protection: raw ") { + t.Fatalf("disk log should keep raw text (unescaped): %q", text) + } + if !strings.Contains(text, "INFO") || !strings.Contains(text, "ERROR") { + t.Fatalf("disk log missing level prefix: %q", text) + } +} + +// TestWriterForIsCached verifies a second call returns the same Writer +// — otherwise parallel /log POSTs would race on file opens and possibly +// stomp on in-flight writes. +func TestWriterForIsCached(t *testing.T) { + hub := events.NewHub() + lh, err := logs.NewHub(t.TempDir(), hub) + if err != nil { + t.Fatalf("NewHub: %v", err) + } + defer lh.Close() + + w1, err := lh.WriterFor(1) + if err != nil { + t.Fatalf("WriterFor: %v", err) + } + w2, err := lh.WriterFor(1) + if err != nil { + t.Fatalf("WriterFor: %v", err) + } + if w1 != w2 { + t.Fatalf("Writer not cached: %p vs %p", w1, w2) + } +} + +// collect drains up to max events or bails after deadline. 
+func collect(ch <-chan events.Event, max int, deadline time.Duration) []events.Event { + out := []events.Event{} + timer := time.NewTimer(deadline) + defer timer.Stop() + for len(out) < max { + select { + case ev, ok := <-ch: + if !ok { + return out + } + out = append(out, ev) + case <-timer.C: + return out + } + } + return out +} diff --git a/internal/model/model.go b/internal/model/model.go new file mode 100644 index 0000000..e643336 --- /dev/null +++ b/internal/model/model.go @@ -0,0 +1,96 @@ +package model + +import "time" + +type Host struct { + ID int64 + Name string + MAC string + WoLBroadcastIP string + WoLPort int + ExpectedSpecYAML string + PDUConfigJSON string + IPMIConfigJSON string + Notes string + CreatedAt time.Time + UpdatedAt time.Time +} + +type RunState string + +const ( + StateRegistered RunState = "Registered" + StateQueued RunState = "Queued" + StateWaitingWoL RunState = "WaitingWoL" + StateBooting RunState = "Booting" + StateInventoryCheck RunState = "InventoryCheck" + StateSpecValidate RunState = "SpecValidate" + StateSMART RunState = "SMART" + StateCPUStress RunState = "CPUStress" + StateStorage RunState = "Storage" + StateNetwork RunState = "Network" + StateGPU RunState = "GPU" + StatePSU RunState = "PSU" + StateReporting RunState = "Reporting" + StateCompleted RunState = "Completed" + StateFailed RunState = "Failed" + StateFailedHolding RunState = "FailedHolding" + StateReleased RunState = "Released" +) + +type Run struct { + ID int64 + HostID int64 + State RunState + Result string + FailedStage string + NextBootTarget string + AgentTokenHash string + StartedAt time.Time + CompletedAt *time.Time + ReportPath string + HoldIP string + OverrideFlagsJSON string +} + +type StageState string + +const ( + StagePending StageState = "pending" + StageRunning StageState = "running" + StagePassed StageState = "passed" + StageFailed StageState = "failed" + StageSkipped StageState = "skipped" +) + +type Stage struct { + ID int64 + RunID int64 + Name 
string + Ordinal int + State StageState + StartedAt *time.Time + CompletedAt *time.Time + SummaryJSON string +} + +type Measurement struct { + ID int64 + RunID int64 + StageID *int64 + TS time.Time + Kind string + Key string + Value float64 + Unit string +} + +type SpecDiff struct { + ID int64 + RunID int64 + Field string + Expected string + Actual string + Severity string // critical|warning|info + Ignored bool +} diff --git a/internal/notify/build.go b/internal/notify/build.go new file mode 100644 index 0000000..f7d5d32 --- /dev/null +++ b/internal/notify/build.go @@ -0,0 +1,56 @@ +package notify + +import ( + "fmt" + "time" + + "vetting/internal/config" +) + +// BuildRegistry translates the config surface into a live Registry. +// Unknown notifier types produce an error so typos fail startup loudly +// rather than silently drop events. +func BuildRegistry(notifiers []config.Notifier, routes []config.Route) (*Registry, error) { + reg := NewRegistry(10 * time.Second) + for _, n := range notifiers { + switch n.Type { + case "": + continue // skip blank entries; useful for commented-out examples + case "ntfy": + reg.Register(NewNtfy(n.Name, n.Server, n.Topic)) + case "discord": + reg.Register(NewDiscord(n.Name, n.WebhookURL)) + case "smtp": + reg.Register(NewSMTP(n.Name, n.SMTP.Host, n.SMTP.Port, n.SMTP.From, n.SMTP.To)) + default: + return nil, fmt.Errorf("notify: unknown notifier type %q (name=%q)", n.Type, n.Name) + } + } + for _, r := range routes { + if r.Notifier == "" { + return nil, fmt.Errorf("notify: route has no notifier name") + } + reg.AddRoute(Route{ + MatchKind: toKinds(r.MatchKind), + MatchSeverity: toSeverities(r.MatchSeverity), + Notifier: r.Notifier, + }) + } + return reg, nil +} + +func toKinds(ss []string) []Kind { + out := make([]Kind, 0, len(ss)) + for _, s := range ss { + out = append(out, Kind(s)) + } + return out +} + +func toSeverities(ss []string) []Severity { + out := make([]Severity, 0, len(ss)) + for _, s := range ss { + out = 
append(out, Severity(s)) + } + return out +} diff --git a/internal/notify/discord.go b/internal/notify/discord.go new file mode 100644 index 0000000..896629a --- /dev/null +++ b/internal/notify/discord.go @@ -0,0 +1,87 @@ +package notify + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "strings" + "time" +) + +// DiscordNotifier posts to a Discord incoming webhook. Body is rendered +// as a single embed so Discord shows a colored sidebar matching event +// severity. Discord rejects empty content+embeds; we always include the +// embed so that never happens. +type DiscordNotifier struct { + NameStr string + WebhookURL string + HTTP *http.Client +} + +func NewDiscord(name, webhookURL string) *DiscordNotifier { + return &DiscordNotifier{ + NameStr: name, + WebhookURL: webhookURL, + HTTP: &http.Client{Timeout: 10 * time.Second}, + } +} + +func (d *DiscordNotifier) Name() string { return d.NameStr } + +type discordPayload struct { + Embeds []discordEmbed `json:"embeds"` +} + +type discordEmbed struct { + Title string `json:"title,omitempty"` + Description string `json:"description,omitempty"` + URL string `json:"url,omitempty"` + Color int `json:"color,omitempty"` +} + +func (d *DiscordNotifier) Send(ctx context.Context, ev Event) error { + if d.WebhookURL == "" { + return fmt.Errorf("discord: no webhook_url configured") + } + payload := discordPayload{Embeds: []discordEmbed{{ + Title: ev.Title, + Description: ev.Body, + URL: ev.URL, + Color: discordColor(ev.Severity), + }}} + buf, err := json.Marshal(payload) + if err != nil { + return err + } + req, err := http.NewRequestWithContext(ctx, http.MethodPost, d.WebhookURL, bytes.NewReader(buf)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + resp, err := d.HTTP.Do(req) + if err != nil { + return err + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode >= 300 { + b, _ := io.ReadAll(resp.Body) + return fmt.Errorf("discord: %d: %s", 
resp.StatusCode, strings.TrimSpace(string(b))) + } + return nil +} + +// discordColor returns the embed sidebar color for each severity. +// Values are standard Discord decimal color codes. +func discordColor(s Severity) int { + switch s { + case SeverityCritical: + return 0xE74C3C // red + case SeverityWarning: + return 0xF1C40F // yellow + default: + return 0x2ECC71 // green + } +} diff --git a/internal/notify/notify.go b/internal/notify/notify.go new file mode 100644 index 0000000..ca4b6fa --- /dev/null +++ b/internal/notify/notify.go @@ -0,0 +1,179 @@ +// Package notify owns outbound operator notifications. The orchestrator +// fires Events at well-known points (stage failure, hold opened, run +// completed, spec mismatch); a Registry matches each Event against +// config-declared routes and dispatches to the matching Notifiers. +// +// Delivery is fire-and-forget: a single HTTP/SMTP attempt per notifier +// with a bounded timeout. Failures are logged and nothing is persisted +// — on a solo LAN deployment the orchestrator UI is the source of truth +// and we don't want to build a durable queue for a convenience feature. +package notify + +import ( + "context" + "log" + "sync" + "time" +) + +// Kind enumerates the event types the orchestrator can fire. Names are +// stable: they appear in config files' match_kind lists. +type Kind string + +const ( + KindStageFailed Kind = "StageFailed" + KindSpecMismatch Kind = "SpecMismatch" + KindHoldingOpened Kind = "HoldingOpened" + KindRunCompleted Kind = "RunCompleted" +) + +// Severity is classification for filtering routes. "critical" pairs +// with StageFailed/SpecMismatch/HoldingOpened; RunCompleted uses "info". +type Severity string + +const ( + SeverityInfo Severity = "info" + SeverityWarning Severity = "warning" + SeverityCritical Severity = "critical" +) + +// Event is the payload passed to each Notifier's Send method. Title and +// Body are pre-rendered; notifiers shape them for their own transport +// (e.g. 
Discord embed vs SMTP body) but shouldn't re-compose semantics. +// +// URL links back to the orchestrator UI so a push notification can be +// clicked through for full context. +type Event struct { + Kind Kind + Severity Severity + RunID int64 + HostName string + Title string + Body string + URL string // optional; UI link for this run/host +} + +// Notifier is one delivery target. Implementations must not block on +// remote-side failure any longer than their own timeout — the Registry +// calls Send from a goroutine but still wants the goroutine to exit. +type Notifier interface { + Name() string + Send(ctx context.Context, ev Event) error +} + +// Route binds an event selector to a notifier name. A route matches an +// event when every non-empty field is satisfied; empty fields are wildcards. +type Route struct { + MatchKind []Kind + MatchSeverity []Severity + Notifier string // name of a registered Notifier +} + +// Registry holds notifiers + routes and fans events out. Safe for +// concurrent Dispatch. It's built once at startup from config. +type Registry struct { + notifiers map[string]Notifier + routes []Route + timeout time.Duration + + mu sync.Mutex // guards in-flight goroutine count (future-use metrics) +} + +// NewRegistry builds a Registry with its per-notification timeout budget. +// A zero timeout becomes 10s so tests and prod both get sane defaults. +func NewRegistry(timeout time.Duration) *Registry { + if timeout <= 0 { + timeout = 10 * time.Second + } + return &Registry{ + notifiers: map[string]Notifier{}, + timeout: timeout, + } +} + +// Register adds a Notifier. Re-registering a name overwrites silently — +// configs can shadow by listing the same name twice. +func (r *Registry) Register(n Notifier) { + if n == nil { + return + } + r.notifiers[n.Name()] = n +} + +// AddRoute appends a route rule. Order is preserved for deterministic +// multi-match dispatch. 
+func (r *Registry) AddRoute(rt Route) { + r.routes = append(r.routes, rt) +} + +// Dispatch finds every route matching ev and fires each targeted +// notifier on its own goroutine. Returns immediately — the caller does +// not wait on delivery. Errors are logged. +func (r *Registry) Dispatch(ev Event) { + targets := r.match(ev) + if len(targets) == 0 { + return + } + for _, n := range targets { + n := n + go func() { + ctx, cancel := context.WithTimeout(context.Background(), r.timeout) + defer cancel() + if err := n.Send(ctx, ev); err != nil { + log.Printf("notify: %s send(%s run=%d): %v", n.Name(), ev.Kind, ev.RunID, err) + } + }() + } +} + +// match walks the route table in order and returns the unique notifiers +// that should be fired for ev. Duplicates (same notifier named by two +// matching routes) collapse — the operator intent is delivery, not +// duplicate delivery. +func (r *Registry) match(ev Event) []Notifier { + seen := map[string]bool{} + out := []Notifier{} + for _, rt := range r.routes { + if !matchesKind(rt.MatchKind, ev.Kind) { + continue + } + if !matchesSeverity(rt.MatchSeverity, ev.Severity) { + continue + } + if seen[rt.Notifier] { + continue + } + n, ok := r.notifiers[rt.Notifier] + if !ok { + log.Printf("notify: route references unknown notifier %q", rt.Notifier) + continue + } + seen[rt.Notifier] = true + out = append(out, n) + } + return out +} + +func matchesKind(allow []Kind, got Kind) bool { + if len(allow) == 0 { + return true + } + for _, k := range allow { + if k == got { + return true + } + } + return false +} + +func matchesSeverity(allow []Severity, got Severity) bool { + if len(allow) == 0 { + return true + } + for _, s := range allow { + if s == got { + return true + } + } + return false +} diff --git a/internal/notify/notify_test.go b/internal/notify/notify_test.go new file mode 100644 index 0000000..3becd1a --- /dev/null +++ b/internal/notify/notify_test.go @@ -0,0 +1,268 @@ +package notify + +import ( + "context" + "io" + 
"net/http" + "net/http/httptest" + "net/smtp" + "strings" + "sync" + "sync/atomic" + "testing" + "time" +) + +// stubNotifier records every Send call; it's the test harness for +// Registry routing logic without hitting network. +type stubNotifier struct { + name string + calls []Event + mu sync.Mutex + failOn Kind // if non-empty, returns an error when ev.Kind == failOn +} + +func (s *stubNotifier) Name() string { return s.name } + +func (s *stubNotifier) Send(_ context.Context, ev Event) error { + s.mu.Lock() + s.calls = append(s.calls, ev) + s.mu.Unlock() + if s.failOn != "" && ev.Kind == s.failOn { + return errFake("forced failure") + } + return nil +} + +func (s *stubNotifier) seen() []Event { + s.mu.Lock() + defer s.mu.Unlock() + return append([]Event(nil), s.calls...) +} + +type errFake string + +func (e errFake) Error() string { return string(e) } + +// awaitCalls spins until every stub has the expected count or the +// deadline elapses — Dispatch uses goroutines so the test must wait. 
+func awaitCalls(t *testing.T, want map[*stubNotifier]int) { + t.Helper() + deadline := time.Now().Add(2 * time.Second) + for { + ok := true + for s, n := range want { + if len(s.seen()) < n { + ok = false + break + } + } + if ok { + return + } + if time.Now().After(deadline) { + for s, n := range want { + t.Errorf("notifier %q: got %d calls, want %d", s.name, len(s.seen()), n) + } + return + } + time.Sleep(5 * time.Millisecond) + } +} + +func TestRegistryRoutesByKind(t *testing.T) { + reg := NewRegistry(time.Second) + a := &stubNotifier{name: "fails-only"} + b := &stubNotifier{name: "everything"} + reg.Register(a) + reg.Register(b) + reg.AddRoute(Route{MatchKind: []Kind{KindStageFailed}, Notifier: "fails-only"}) + reg.AddRoute(Route{Notifier: "everything"}) + + reg.Dispatch(Event{Kind: KindStageFailed, Severity: SeverityCritical}) + reg.Dispatch(Event{Kind: KindRunCompleted, Severity: SeverityInfo}) + + awaitCalls(t, map[*stubNotifier]int{a: 1, b: 2}) + if got := a.seen()[0].Kind; got != KindStageFailed { + t.Fatalf("a got %q, want StageFailed", got) + } +} + +func TestRegistryRoutesBySeverity(t *testing.T) { + reg := NewRegistry(time.Second) + crit := &stubNotifier{name: "crit-only"} + reg.Register(crit) + reg.AddRoute(Route{MatchSeverity: []Severity{SeverityCritical}, Notifier: "crit-only"}) + + reg.Dispatch(Event{Kind: KindRunCompleted, Severity: SeverityInfo}) + reg.Dispatch(Event{Kind: KindHoldingOpened, Severity: SeverityCritical}) + + awaitCalls(t, map[*stubNotifier]int{crit: 1}) + if got := crit.seen()[0].Severity; got != SeverityCritical { + t.Fatalf("got severity %q, want critical", got) + } +} + +func TestRegistryDeduplicatesNotifiers(t *testing.T) { + reg := NewRegistry(time.Second) + n := &stubNotifier{name: "only"} + reg.Register(n) + // Two routes naming the same notifier — a single Dispatch should + // fire once, not twice. 
+ reg.AddRoute(Route{MatchKind: []Kind{KindStageFailed}, Notifier: "only"}) + reg.AddRoute(Route{MatchSeverity: []Severity{SeverityCritical}, Notifier: "only"}) + + reg.Dispatch(Event{Kind: KindStageFailed, Severity: SeverityCritical}) + + awaitCalls(t, map[*stubNotifier]int{n: 1}) +} + +func TestRegistryUnknownNotifierIsNoop(t *testing.T) { + reg := NewRegistry(time.Second) + reg.AddRoute(Route{Notifier: "does-not-exist"}) + // Should not panic or block. + reg.Dispatch(Event{Kind: KindRunCompleted}) +} + +func TestRegistryFailureDoesNotPoisonOthers(t *testing.T) { + reg := NewRegistry(time.Second) + bad := &stubNotifier{name: "bad", failOn: KindStageFailed} + good := &stubNotifier{name: "good"} + reg.Register(bad) + reg.Register(good) + reg.AddRoute(Route{Notifier: "bad"}) + reg.AddRoute(Route{Notifier: "good"}) + + reg.Dispatch(Event{Kind: KindStageFailed, Severity: SeverityCritical}) + + awaitCalls(t, map[*stubNotifier]int{bad: 1, good: 1}) +} + +func TestNtfyNotifierPOSTsBodyAndHeaders(t *testing.T) { + var captured *http.Request + var body string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + captured = r + b, _ := io.ReadAll(r.Body) + body = string(b) + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + n := NewNtfy("n", srv.URL, "vetting") + err := n.Send(context.Background(), Event{ + Kind: KindStageFailed, + Severity: SeverityCritical, + Title: "host-01 FAILED", + Body: "SMART failed", + URL: "https://vetting.example/reports/42", + }) + if err != nil { + t.Fatalf("send: %v", err) + } + if captured.Method != http.MethodPost { + t.Fatalf("method = %s, want POST", captured.Method) + } + if captured.URL.Path != "/vetting" { + t.Fatalf("path = %s, want /vetting", captured.URL.Path) + } + if got := captured.Header.Get("X-Title"); got != "host-01 FAILED" { + t.Fatalf("X-Title = %q", got) + } + if got := captured.Header.Get("X-Click"); got != "https://vetting.example/reports/42" { + t.Fatalf("X-Click = %q", 
got) + } + if got := captured.Header.Get("X-Priority"); got != "5" { + t.Fatalf("X-Priority = %q, want 5 for critical", got) + } + if body != "SMART failed" { + t.Fatalf("body = %q, want %q", body, "SMART failed") + } +} + +func TestNtfyNotifierNon2xxErrors(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.Error(w, "rate limited", http.StatusTooManyRequests) + })) + defer srv.Close() + + n := NewNtfy("n", srv.URL, "t") + err := n.Send(context.Background(), Event{Kind: KindRunCompleted, Body: "x"}) + if err == nil || !strings.Contains(err.Error(), "429") { + t.Fatalf("want 429 error, got %v", err) + } +} + +func TestDiscordNotifierPOSTsEmbed(t *testing.T) { + var body string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + b, _ := io.ReadAll(r.Body) + body = string(b) + w.WriteHeader(http.StatusNoContent) + })) + defer srv.Close() + + d := NewDiscord("d", srv.URL) + err := d.Send(context.Background(), Event{ + Kind: KindRunCompleted, + Severity: SeverityInfo, + Title: "host-01 passed", + Body: "all green", + URL: "https://vetting.example/reports/1", + }) + if err != nil { + t.Fatalf("send: %v", err) + } + // Body should be a JSON payload containing an embeds array with our + // title/description/URL. 
+ for _, want := range []string{`"embeds"`, `"host-01 passed"`, `"all green"`, `reports/1`} { + if !strings.Contains(body, want) { + t.Errorf("body missing %q: %s", want, body) + } + } +} + +func TestSMTPNotifierInvokesSendMail(t *testing.T) { + var called int32 + var gotAddr, gotFrom string + var gotTo []string + var gotMsg []byte + s := NewSMTP("s", "mail.example", 2525, "vetting@example", []string{"ops@example"}) + s.SendMailFn = func(addr string, _ smtp.Auth, from string, to []string, msg []byte) error { + atomic.AddInt32(&called, 1) + gotAddr, gotFrom, gotTo, gotMsg = addr, from, to, msg + return nil + } + err := s.Send(context.Background(), Event{ + Kind: KindStageFailed, Title: "subj", Body: "failure body", + URL: "https://vetting.example/reports/9", + }) + if err != nil { + t.Fatalf("send: %v", err) + } + if atomic.LoadInt32(&called) != 1 { + t.Fatal("SendMailFn not called") + } + if gotAddr != "mail.example:2525" { + t.Fatalf("addr = %q", gotAddr) + } + if gotFrom != "vetting@example" { + t.Fatalf("from = %q", gotFrom) + } + if len(gotTo) != 1 || gotTo[0] != "ops@example" { + t.Fatalf("to = %v", gotTo) + } + s1 := string(gotMsg) + for _, want := range []string{"Subject: subj", "failure body", "Link: https://vetting.example/reports/9"} { + if !strings.Contains(s1, want) { + t.Errorf("message missing %q", want) + } + } +} + +func TestSMTPNotifierRejectsIncompleteConfig(t *testing.T) { + s := &SMTPNotifier{NameStr: "s"} + if err := s.Send(context.Background(), Event{Kind: KindRunCompleted}); err == nil { + t.Fatal("want error, got nil") + } +} diff --git a/internal/notify/ntfy.go b/internal/notify/ntfy.go new file mode 100644 index 0000000..b27d79f --- /dev/null +++ b/internal/notify/ntfy.go @@ -0,0 +1,90 @@ +package notify + +import ( + "context" + "fmt" + "io" + "net/http" + "strings" + "time" +) + +// NtfyNotifier posts to ntfy.sh (or a self-hosted ntfy server). 
Message +// body is the plain text body; title and URL are passed via X-Title and +// X-Click headers so ntfy renders them as the push title + deep link. +type NtfyNotifier struct { + NameStr string + Server string // e.g. "https://ntfy.sh" or self-hosted + Topic string + HTTP *http.Client +} + +func NewNtfy(name, server, topic string) *NtfyNotifier { + if server == "" { + server = "https://ntfy.sh" + } + return &NtfyNotifier{ + NameStr: name, + Server: strings.TrimRight(server, "/"), + Topic: topic, + HTTP: &http.Client{Timeout: 10 * time.Second}, + } +} + +func (n *NtfyNotifier) Name() string { return n.NameStr } + +func (n *NtfyNotifier) Send(ctx context.Context, ev Event) error { + if n.Topic == "" { + return fmt.Errorf("ntfy: no topic configured") + } + url := n.Server + "/" + n.Topic + req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, strings.NewReader(ev.Body)) + if err != nil { + return err + } + if ev.Title != "" { + req.Header.Set("X-Title", ev.Title) + } + if ev.URL != "" { + req.Header.Set("X-Click", ev.URL) + } + req.Header.Set("X-Priority", priorityForSeverity(ev.Severity)) + req.Header.Set("X-Tags", ntfyTag(ev.Kind, ev.Severity)) + + resp, err := n.HTTP.Do(req) + if err != nil { + return err + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode >= 300 { + b, _ := io.ReadAll(resp.Body) + return fmt.Errorf("ntfy: %d: %s", resp.StatusCode, strings.TrimSpace(string(b))) + } + return nil +} + +// priorityForSeverity maps our severities to ntfy's 1–5 scale. "info" +// → 3 (default), warning → 4, critical → 5. 
+func priorityForSeverity(s Severity) string { + switch s { + case SeverityCritical: + return "5" + case SeverityWarning: + return "4" + default: + return "3" + } +} + +func ntfyTag(k Kind, s Severity) string { + switch { + case s == SeverityCritical: + return "rotating_light," + string(k) + case k == KindRunCompleted: + return "white_check_mark," + string(k) + case k == KindHoldingOpened: + return "construction," + string(k) + default: + return string(k) + } +} diff --git a/internal/notify/smtp.go b/internal/notify/smtp.go new file mode 100644 index 0000000..a96b667 --- /dev/null +++ b/internal/notify/smtp.go @@ -0,0 +1,81 @@ +package notify + +import ( + "context" + "fmt" + "net/smtp" + "strconv" + "strings" +) + +// SMTPNotifier sends a plaintext email. Authentication is left at zero +// (LAN-only relay assumed); if the configured server requires auth the +// Send call will return an error and the Registry will log it. +// +// SendMailFn is overridable so tests can capture the outgoing message +// without needing a live SMTP server. +type SMTPNotifier struct { + NameStr string + Host string + Port int + From string + To []string + SendMailFn func(addr string, a smtp.Auth, from string, to []string, msg []byte) error +} + +func NewSMTP(name, host string, port int, from string, to []string) *SMTPNotifier { + if port == 0 { + port = 25 + } + return &SMTPNotifier{ + NameStr: name, + Host: host, + Port: port, + From: from, + To: to, + SendMailFn: smtp.SendMail, + } +} + +func (s *SMTPNotifier) Name() string { return s.NameStr } + +func (s *SMTPNotifier) Send(ctx context.Context, ev Event) error { + if s.Host == "" || s.From == "" || len(s.To) == 0 { + return fmt.Errorf("smtp: incomplete config (host/from/to required)") + } + // We intentionally don't honour ctx here — net/smtp.SendMail doesn't + // accept a context; for a LAN relay with a short TCP timeout the + // Registry's goroutine will outlive the timeout but only by seconds. 
+ addr := s.Host + ":" + strconv.Itoa(s.Port) + msg := buildEmail(s.From, s.To, ev) + return s.SendMailFn(addr, nil, s.From, s.To, msg) +} + +// buildEmail produces an RFC 5322 minimal message. Body is plaintext; +// the URL is appended so the recipient can click through from a text +// mail client. No MIME for now — keeps it robust. +func buildEmail(from string, to []string, ev Event) []byte { + var b strings.Builder + b.WriteString("From: ") + b.WriteString(from) + b.WriteString("\r\n") + b.WriteString("To: ") + b.WriteString(strings.Join(to, ", ")) + b.WriteString("\r\n") + subject := ev.Title + if subject == "" { + subject = "[vetting] " + string(ev.Kind) + } + b.WriteString("Subject: ") + b.WriteString(subject) + b.WriteString("\r\n") + b.WriteString("Content-Type: text/plain; charset=UTF-8\r\n") + b.WriteString("\r\n") + b.WriteString(ev.Body) + if ev.URL != "" { + b.WriteString("\r\n\r\nLink: ") + b.WriteString(ev.URL) + } + b.WriteString("\r\n") + return []byte(b.String()) +} diff --git a/internal/orchestrator/dispatcher.go b/internal/orchestrator/dispatcher.go new file mode 100644 index 0000000..38c4951 --- /dev/null +++ b/internal/orchestrator/dispatcher.go @@ -0,0 +1,124 @@ +package orchestrator + +import ( + "context" + "log" + "time" + + "vetting/internal/model" + "vetting/internal/store" +) + +// Dispatcher picks Queued runs off the DB and drives them through +// WaitingWoL (sending a WoL packet). Concurrency is capped at Max. +// +// For Phase 2 the dispatcher's job ends at WaitingWoL; further +// transitions are driven by iPXE and agent callbacks. Phase 4+ will +// return here and shepherd each run through stage execution. 
+type Dispatcher struct { + Max int + Runs *store.Runs + Hosts *store.Hosts + Runner *Runner + + active chan struct{} + stop chan struct{} +} + +func NewDispatcher(max int, runs *store.Runs, hosts *store.Hosts, runner *Runner) *Dispatcher { + if max < 1 { + max = 1 + } + return &Dispatcher{ + Max: max, + Runs: runs, + Hosts: hosts, + Runner: runner, + active: make(chan struct{}, max), + stop: make(chan struct{}), + } +} + +func (d *Dispatcher) Start(ctx context.Context) { + go d.loop(ctx) +} + +func (d *Dispatcher) Stop() { + close(d.stop) +} + +func (d *Dispatcher) loop(ctx context.Context) { + t := time.NewTicker(2 * time.Second) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-d.stop: + return + case <-t.C: + d.pickNext(ctx) + } + } +} + +func (d *Dispatcher) pickNext(ctx context.Context) { + select { + case d.active <- struct{}{}: + default: + return // at capacity + } + released := false + defer func() { + if !released { + <-d.active + } + }() + + runs, err := d.Runs.Active(ctx) + if err != nil { + log.Printf("dispatcher: list active: %v", err) + return + } + + var queued *model.Run + inFlight := 0 + for i := range runs { + switch runs[i].State { + case model.StateQueued: + if queued == nil { + queued = &runs[i] + } + case model.StateWaitingWoL, model.StateBooting, model.StateInventoryCheck, + model.StateSpecValidate, model.StateSMART, model.StateCPUStress, + model.StateStorage, model.StateNetwork, model.StateGPU, + model.StatePSU, model.StateReporting: + inFlight++ + } + } + + if inFlight >= d.Max || queued == nil { + return + } + + host, err := d.Hosts.Get(ctx, queued.HostID) + if err != nil { + log.Printf("dispatcher: get host %d: %v", queued.HostID, err) + return + } + if _, err := d.Runner.Transition(ctx, queued.ID, TriggerDispatched); err != nil { + log.Printf("dispatcher: transition run %d: %v", queued.ID, err) + return + } + if err := SendWoL(host.MAC, host.WoLBroadcastIP, host.WoLPort); err != nil { + log.Printf("dispatcher: 
WoL run %d host %s: %v", queued.ID, host.Name, err) + // Stay in WaitingWoL; operator can retry or investigate. + return + } + log.Printf("dispatcher: WoL sent for run %d (host=%s mac=%s)", queued.ID, host.Name, host.MAC) + + // Slot stays reserved until the run leaves active (Phase 4+). + // Phase 2 lets the loop observe inFlight via DB state. + released = true + <-d.active +} diff --git a/internal/orchestrator/iperf.go b/internal/orchestrator/iperf.go new file mode 100644 index 0000000..9612ac6 --- /dev/null +++ b/internal/orchestrator/iperf.go @@ -0,0 +1,92 @@ +package orchestrator + +import ( + "context" + "errors" + "fmt" + "log" + "os" + "os/exec" + "strconv" + "sync" + "time" +) + +// IperfSupervisor runs a single `iperf3 -s` process under the +// orchestrator so the Network stage has a stable server to dial. Each +// run's Network test is sequential (stages are always serial), so one +// server process handles every host under test. +// +// Missing iperf3 binary is logged once and the supervisor becomes a +// no-op — the agent's Network stage will then fail to connect and skip +// cleanly via the stage's own error path. 
// IperfSupervisor runs a single `iperf3 -s` process under the
// orchestrator so the Network stage has a stable server to dial.
//
// A missing iperf3 binary is logged and recorded; Start then returns nil
// and the supervisor stays idle.
type IperfSupervisor struct {
	Port int // default 5201

	mu      sync.Mutex
	cmd     *exec.Cmd
	started bool
	fatal   error
	exited  chan struct{} // closed by wait once cmd has been reaped
}

// NewIperfSupervisor builds a supervisor for the given port; a
// non-positive port becomes 5201.
func NewIperfSupervisor(port int) *IperfSupervisor {
	if port <= 0 {
		port = 5201
	}
	return &IperfSupervisor{Port: port}
}

// Start launches `iperf3 -s -p <Port>` bound to ctx. It is a no-op when
// a process is already running. When iperf3 is not in PATH the condition
// is logged and Start returns nil. An error is returned only when the
// binary exists but cannot be started.
func (s *IperfSupervisor) Start(ctx context.Context) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		return nil
	}
	if _, err := exec.LookPath("iperf3"); err != nil {
		s.fatal = fmt.Errorf("iperf3 not in PATH: %w", err)
		log.Printf("iperf supervisor: %v (Network stage will fail to connect)", s.fatal)
		return nil
	}
	cmd := exec.CommandContext(ctx, "iperf3", "-s", "-p", strconv.Itoa(s.Port))
	if err := s.launch(cmd); err != nil {
		s.fatal = err
		return err
	}
	log.Printf("iperf supervisor: iperf3 -s -p %d (pid=%d)", s.Port, cmd.Process.Pid)
	return nil
}

// launch starts cmd, records it as the supervised process and spawns the
// single goroutine that reaps it. The caller must hold s.mu.
func (s *IperfSupervisor) launch(cmd *exec.Cmd) error {
	if err := cmd.Start(); err != nil {
		return err
	}
	exited := make(chan struct{})
	s.cmd = cmd
	s.exited = exited
	s.started = true
	go s.wait(cmd, exited)
	return nil
}

// Shutdown asks the subprocess to stop and waits up to timeout for it to
// exit. It returns nil when no process was started or when the process
// exited in time; otherwise it kills the process and returns an error.
//
// Shutdown never calls cmd.Wait itself: wait is the only reaper, and
// Shutdown observes the exit through the exited channel. Calling Wait
// from two goroutines on one exec.Cmd makes one of them fail at once,
// which previously could make Shutdown report success while the process
// was still running.
func (s *IperfSupervisor) Shutdown(timeout time.Duration) error {
	s.mu.Lock()
	cmd, exited := s.cmd, s.exited
	s.mu.Unlock()
	if cmd == nil || cmd.Process == nil || exited == nil {
		return nil
	}
	select {
	case <-exited:
		return nil // already gone
	default:
	}

	// os.Interrupt is cross-platform; on Linux it maps to SIGINT which
	// iperf3 handles gracefully. On Windows (dev only) it's a no-op and
	// we fall through to Kill after the timeout, so the error is ignored.
	_ = cmd.Process.Signal(os.Interrupt)

	timer := time.NewTimer(timeout)
	defer timer.Stop()
	select {
	case <-exited:
		return nil
	case <-timer.C:
		_ = cmd.Process.Kill() // best effort; wait reaps it afterwards
		return errors.New("iperf3 did not exit in time; killed")
	}
}

// wait reaps cmd, clears the started flag so Start may launch a fresh
// process, and then closes exited to release any pending Shutdown.
func (s *IperfSupervisor) wait(cmd *exec.Cmd, exited chan struct{}) {
	_ = cmd.Wait() // exit status is not acted on

	s.mu.Lock()
	s.started = false
	s.mu.Unlock()
	close(exited)
}
+func (r *Runner) StartStage(ctx context.Context, runID int64, name string) error { + if err := r.Stages.StartByName(ctx, runID, name); err != nil { + return err + } + run, err := r.Runs.Get(ctx, runID) + if err == nil { + r.publishTileUpdate(ctx, run.HostID) + } + return nil +} + +func (r *Runner) publishTileUpdate(ctx context.Context, hostID int64) { + host, err := r.Hosts.Get(ctx, hostID) + if err != nil { + log.Printf("publishTileUpdate: get host %d: %v", hostID, err) + return + } + latest, err := r.Runs.LatestForHost(ctx, hostID) + if err != nil { + log.Printf("publishTileUpdate: latest run: %v", err) + return + } + payload := renderTileSSE(ctx, *host, latest) + r.EventHub.Publish(events.Event{Name: fmt.Sprintf("tile-%d", hostID), Payload: payload}) +} + +// TileRenderer renders a single tile fragment. Registered at startup +// so the orchestrator package stays free of template / store-enrichment +// imports. The closure is expected to do any DB lookups itself (spec- +// diff count, hold-key path, …) before handing the data to the +// template package. +var TileRenderer func(ctx context.Context, host model.Host, latest *model.Run) string + +func renderTileSSE(ctx context.Context, host model.Host, latest *model.Run) string { + if TileRenderer == nil { + return fmt.Sprintf(`
state change
`, host.ID) + } + return TileRenderer(ctx, host, latest) +} + +// TouchHeartbeat is called on every agent heartbeat so the orchestrator +// can record last-seen; Phase 2 just logs, Phase 3+ will update a +// last_seen_at column. +func (r *Runner) TouchHeartbeat(runID int64) { + _ = runID + _ = time.Now() +} + +// Override re-enters a held stage after the operator has acknowledged +// the failure condition (e.g. wipe-probe override). It jumps +// FailedHolding → StateFor(failed_stage), clears the failed marker, and +// publishes a tile refresh so the UI drops the hold banner. +func (r *Runner) Override(ctx context.Context, runID int64, flagsJSON string) (model.RunState, error) { + run, err := r.Runs.Get(ctx, runID) + if err != nil { + return "", fmt.Errorf("get run: %w", err) + } + if run.FailedStage == "" { + return "", fmt.Errorf("override: run has no failed_stage") + } + next, err := NextForOverride(run.State, run.FailedStage) + if err != nil { + return "", err + } + if err := r.Runs.SetOverrideFlags(ctx, runID, flagsJSON); err != nil { + return "", fmt.Errorf("persist override flags: %w", err) + } + if err := r.Runs.SetState(ctx, runID, next); err != nil { + return "", fmt.Errorf("override transition: %w", err) + } + if err := r.Runs.ClearFailedStage(ctx, runID); err != nil { + log.Printf("override: clear failed_stage: %v", err) + } + log.Printf("run %d: %s -> %s (OperatorOverride stage=%s flags=%s)", runID, run.State, next, run.FailedStage, flagsJSON) + r.publishTileUpdate(ctx, run.HostID) + return next, nil +} diff --git a/internal/orchestrator/statemachine.go b/internal/orchestrator/statemachine.go new file mode 100644 index 0000000..d8921b6 --- /dev/null +++ b/internal/orchestrator/statemachine.go @@ -0,0 +1,129 @@ +package orchestrator + +import ( + "fmt" + + "vetting/internal/model" +) + +// Trigger is an event that drives a state transition. 
+type Trigger string + +const ( + TriggerStartRequested Trigger = "StartRequested" // user clicks Start Vetting + TriggerDispatched Trigger = "Dispatched" // dispatcher picked this run + TriggerPXEObserved Trigger = "PXEObserved" // iPXE fetched cmdline for MAC + TriggerAgentClaimed Trigger = "AgentClaimed" // agent POSTed /claim with valid token + TriggerStageFailed Trigger = "StageFailed" // a stage reported failure + TriggerStageCompleted Trigger = "StageCompleted" // a stage reported success → advance + TriggerAllStagesPassed Trigger = "AllStagesPassed" // final stage passed + TriggerOperatorReleased Trigger = "OperatorReleased" // user clicked Release on a held run + TriggerOperatorOverride Trigger = "OperatorOverride" // user overrode a held stage; re-enter it +) + +// stageStates maps the canonical stage name (from DefaultStageOrder) +// to the matching RunState. Named differently for historical reasons: +// the first stage is "Inventory" (stage row name) but the run state is +// "InventoryCheck". Later stages share a name with their state. +var stageStates = map[string]model.RunState{ + "Inventory": model.StateInventoryCheck, + "SpecValidate": model.StateSpecValidate, + "SMART": model.StateSMART, + "CPUStress": model.StateCPUStress, + "Storage": model.StateStorage, + "Network": model.StateNetwork, + "GPU": model.StateGPU, + "PSU": model.StatePSU, + "Reporting": model.StateReporting, +} + +// stageOrder is the sequence of RunStates the run walks through from +// first stage to Completed. Kept in sync with store.DefaultStageOrder. 
+var stageOrder = []model.RunState{ + model.StateInventoryCheck, + model.StateSpecValidate, + model.StateSMART, + model.StateCPUStress, + model.StateStorage, + model.StateNetwork, + model.StateGPU, + model.StatePSU, + model.StateReporting, +} + +type transition struct { + from []model.RunState + to model.RunState +} + +var table = map[Trigger]transition{ + TriggerStartRequested: {from: []model.RunState{model.StateRegistered}, to: model.StateQueued}, + TriggerDispatched: {from: []model.RunState{model.StateQueued}, to: model.StateWaitingWoL}, + TriggerPXEObserved: {from: []model.RunState{model.StateWaitingWoL, model.StateBooting}, to: model.StateBooting}, + TriggerAgentClaimed: {from: []model.RunState{model.StateBooting, model.StateWaitingWoL}, to: model.StateInventoryCheck}, + TriggerStageFailed: {from: allActiveStates(), to: model.StateFailedHolding}, + TriggerAllStagesPassed: {from: []model.RunState{model.StateReporting}, to: model.StateCompleted}, + TriggerOperatorReleased: {from: []model.RunState{model.StateFailedHolding}, to: model.StateReleased}, +} + +// Next computes the target state for a trigger against the current state. +// StageCompleted is handled specially: it advances through stageOrder. +func Next(current model.RunState, t Trigger) (model.RunState, error) { + if t == TriggerStageCompleted { + return nextStageState(current) + } + tr, ok := table[t] + if !ok { + return "", fmt.Errorf("unknown trigger %q", t) + } + for _, s := range tr.from { + if s == current { + return tr.to, nil + } + } + return "", fmt.Errorf("trigger %q not allowed from %q", t, current) +} + +// NextForOverride returns the state we should jump to when the operator +// overrides a held stage. It's separate from the generic table because +// the target depends on the failed_stage, not on the current state +// (which is always FailedHolding). 
+func NextForOverride(current model.RunState, failedStage string) (model.RunState, error) { + if current != model.StateFailedHolding { + return "", fmt.Errorf("override not allowed from %q", current) + } + s, ok := stageStates[failedStage] + if !ok { + return "", fmt.Errorf("override: unknown failed stage %q", failedStage) + } + return s, nil +} + +// StateForStage returns the RunState that corresponds to a stage name. +// Used by handlers that receive a stage name and want to guard against +// stale/out-of-order agent reports. +func StateForStage(name string) (model.RunState, bool) { + s, ok := stageStates[name] + return s, ok +} + +func nextStageState(current model.RunState) (model.RunState, error) { + for i, s := range stageOrder { + if s == current { + if i+1 >= len(stageOrder) { + return model.StateCompleted, nil + } + return stageOrder[i+1], nil + } + } + return "", fmt.Errorf("StageCompleted not valid from %q", current) +} + +func allActiveStates() []model.RunState { + return []model.RunState{ + model.StateQueued, model.StateWaitingWoL, model.StateBooting, + model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART, + model.StateCPUStress, model.StateStorage, model.StateNetwork, + model.StateGPU, model.StatePSU, model.StateReporting, + } +} diff --git a/internal/orchestrator/statemachine_test.go b/internal/orchestrator/statemachine_test.go new file mode 100644 index 0000000..33a68c1 --- /dev/null +++ b/internal/orchestrator/statemachine_test.go @@ -0,0 +1,67 @@ +package orchestrator_test + +import ( + "testing" + + "vetting/internal/model" + "vetting/internal/orchestrator" +) + +func TestNextForOverride(t *testing.T) { + tests := []struct { + name string + from model.RunState + failedStage string + want model.RunState + wantErr bool + }{ + {"storage override", model.StateFailedHolding, "Storage", model.StateStorage, false}, + {"smart override", model.StateFailedHolding, "SMART", model.StateSMART, false}, + {"inventory override", 
model.StateFailedHolding, "Inventory", model.StateInventoryCheck, false}, + {"unknown stage", model.StateFailedHolding, "NotAStage", "", true}, + {"not holding", model.StateStorage, "Storage", "", true}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got, err := orchestrator.NextForOverride(tc.from, tc.failedStage) + if tc.wantErr { + if err == nil { + t.Fatalf("expected error, got %q", got) + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q, want %q", got, tc.want) + } + }) + } +} + +func TestNextStageWalk(t *testing.T) { + // Walking StageCompleted from each stage should land on the next + // one in the canonical order, and from Reporting onto Completed. + chain := []model.RunState{ + model.StateInventoryCheck, + model.StateSpecValidate, + model.StateSMART, + model.StateCPUStress, + model.StateStorage, + model.StateNetwork, + model.StateGPU, + model.StatePSU, + model.StateReporting, + model.StateCompleted, + } + for i := 0; i < len(chain)-1; i++ { + got, err := orchestrator.Next(chain[i], orchestrator.TriggerStageCompleted) + if err != nil { + t.Fatalf("Next(%q): %v", chain[i], err) + } + if got != chain[i+1] { + t.Fatalf("Next(%q) = %q, want %q", chain[i], got, chain[i+1]) + } + } +} diff --git a/internal/orchestrator/tokens.go b/internal/orchestrator/tokens.go new file mode 100644 index 0000000..e4d6569 --- /dev/null +++ b/internal/orchestrator/tokens.go @@ -0,0 +1,26 @@ +package orchestrator + +import ( + "crypto/rand" + "crypto/sha256" + "encoding/hex" + "fmt" +) + +// IssueRunToken returns (plaintext, hashHex). The plaintext is passed +// to the host via the iPXE kernel cmdline; the hash is persisted in the +// runs table for later constant-time comparison. 
// IssueRunToken returns (plaintext, hashHex, error). The plaintext is 32
// random bytes from crypto/rand, hex-encoded to 64 characters; hashHex
// is HashRunToken(plaintext). An error is returned only when the random
// source fails, in which case both strings are empty.
func IssueRunToken() (string, string, error) {
	var raw [32]byte
	if _, err := rand.Read(raw[:]); err != nil {
		return "", "", fmt.Errorf("random: %w", err)
	}
	plain := hex.EncodeToString(raw[:])
	// Hash through HashRunToken so issuing and verifying can never
	// disagree on how a token is hashed.
	return plain, HashRunToken(plain), nil
}

// HashRunToken returns the hex-encoded SHA-256 of the plaintext token
// string (the hash covers the hex text, not the decoded bytes).
func HashRunToken(plain string) string {
	sum := sha256.Sum256([]byte(plain))
	return hex.EncodeToString(sum[:])
}
// Defaults applied by SendWoL when the caller leaves a field at its zero
// value.
const (
	defaultWoLPort      = 9                 // UDP "discard", the conventional Wake-on-LAN port
	defaultWoLBroadcast = "255.255.255.255" // limited broadcast
)

// SendWoL sends a Wake-on-LAN magic packet for mac (aa:bb:cc:dd:ee:ff)
// to broadcastIP:port over UDP.
//
// An empty broadcastIP falls back to 255.255.255.255 and a zero port to
// 9. Without these, an empty address dials the local host and a zero
// port targets port 0, so the wake packet never reaches the target
// machine. An error is returned for a malformed MAC or when the socket
// cannot be opened or written.
func SendWoL(mac, broadcastIP string, port int) error {
	macBytes, err := parseMAC(mac)
	if err != nil {
		return err
	}
	if broadcastIP == "" {
		broadcastIP = defaultWoLBroadcast
	}
	if port == 0 {
		port = defaultWoLPort
	}

	conn, err := net.Dial("udp", net.JoinHostPort(broadcastIP, strconv.Itoa(port)))
	if err != nil {
		return fmt.Errorf("dial wol: %w", err)
	}
	defer conn.Close()

	if _, err := conn.Write(magicPacket(macBytes)); err != nil {
		return fmt.Errorf("write wol: %w", err)
	}
	return nil
}

// magicPacket builds the Wake-on-LAN payload: six 0xFF bytes followed by
// the hardware address repeated sixteen times.
func magicPacket(mac []byte) []byte {
	packet := make([]byte, 0, 6+16*len(mac))
	for i := 0; i < 6; i++ {
		packet = append(packet, 0xff)
	}
	for i := 0; i < 16; i++ {
		packet = append(packet, mac...)
	}
	return packet
}

// parseMAC parses a colon-separated MAC of exactly six two-digit hex
// octets. Case and surrounding whitespace are ignored; any other form,
// including dash-separated addresses, is rejected.
func parseMAC(s string) ([]byte, error) {
	s = strings.ToLower(strings.TrimSpace(s))
	parts := strings.Split(s, ":")
	if len(parts) != 6 {
		return nil, fmt.Errorf("invalid MAC %q", s)
	}
	out := make([]byte, 6)
	for i, p := range parts {
		if len(p) != 2 {
			return nil, fmt.Errorf("invalid MAC octet %q", p)
		}
		b, err := hex.DecodeString(p)
		if err != nil {
			return nil, fmt.Errorf("invalid MAC %q: %w", s, err)
		}
		out[i] = b[0]
	}
	return out, nil
}
// SupervisorConfig holds the settings used to render the dnsmasq
// configuration and to launch the dnsmasq binary.
type SupervisorConfig struct {
	Enabled         bool
	Interface       string // e.g. "eth0"
	DHCPRange       string // e.g. "10.77.0.100,10.77.0.200,12h"
	OrchestratorURL string // baked into iPXE scripts
	RuntimeDir      string // writable dir for dnsmasq.conf and leases
	TFTPRoot        string // holds ipxe.efi, undionly.kpxe
	DNSMasqBin      string // path to dnsmasq binary (default: "dnsmasq")
}

// Supervisor manages a single dnsmasq child process: it renders the
// configuration, starts the process, signals it on registry changes, and
// stops it on shutdown. mu guards cmd and cancel, which are set once the
// process has been started.
type Supervisor struct {
	cfg    SupervisorConfig
	mu     sync.Mutex
	cmd    *exec.Cmd
	cancel context.CancelFunc
}

// NewSupervisor returns a Supervisor for cfg. An empty cfg.DNSMasqBin is
// replaced with "dnsmasq" so the binary is looked up on PATH. Nothing is
// started until Start is called.
func NewSupervisor(cfg SupervisorConfig) *Supervisor {
	sup := &Supervisor{cfg: cfg}
	if sup.cfg.DNSMasqBin == "" {
		sup.cfg.DNSMasqBin = "dnsmasq"
	}
	return sup
}
+func (s *Supervisor) Start(ctx context.Context, hosts []model.Host) error { + if !s.cfg.Enabled { + log.Printf("pxe: disabled in config — skipping dnsmasq") + return nil + } + if runtime.GOOS == "windows" { + return fmt.Errorf("dnsmasq supervision is not supported on Windows — run orchestrator on Linux") + } + if err := os.MkdirAll(s.cfg.RuntimeDir, 0o755); err != nil { + return fmt.Errorf("mkdir runtime: %w", err) + } + if err := s.writeConf(hosts); err != nil { + return err + } + subCtx, cancel := context.WithCancel(ctx) + s.mu.Lock() + s.cancel = cancel + s.mu.Unlock() + + confPath := filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf") + cmd := exec.CommandContext(subCtx, s.cfg.DNSMasqBin, + "--conf-file="+confPath, + "--no-daemon", + "--log-queries", + "--log-dhcp", + ) + cmd.Stdout = logWriter{prefix: "dnsmasq"} + cmd.Stderr = logWriter{prefix: "dnsmasq"} + if err := cmd.Start(); err != nil { + cancel() + return fmt.Errorf("start dnsmasq: %w", err) + } + s.mu.Lock() + s.cmd = cmd + s.mu.Unlock() + go func() { + if err := cmd.Wait(); err != nil && subCtx.Err() == nil { + log.Printf("dnsmasq exited: %v", err) + } + }() + return nil +} + +// Reload rewrites the conf with the latest host registry and sends +// SIGHUP. It will restart the subprocess if SIGHUP is unsupported +// (e.g. when running behind an OS that doesn't support it). +func (s *Supervisor) Reload(hosts []model.Host) error { + if !s.cfg.Enabled { + return nil + } + if err := s.writeConf(hosts); err != nil { + return err + } + s.mu.Lock() + cmd := s.cmd + s.mu.Unlock() + if cmd == nil || cmd.Process == nil { + return nil + } + if err := sighup(cmd.Process); err != nil { + return fmt.Errorf("sighup dnsmasq: %w", err) + } + return nil +} + +// Shutdown stops dnsmasq within the timeout. 
+func (s *Supervisor) Shutdown(timeout time.Duration) error { + if !s.cfg.Enabled { + return nil + } + s.mu.Lock() + cancel := s.cancel + cmd := s.cmd + s.mu.Unlock() + if cancel != nil { + cancel() + } + if cmd != nil && cmd.Process != nil { + done := make(chan struct{}) + go func() { + _, _ = cmd.Process.Wait() + close(done) + }() + select { + case <-done: + case <-time.After(timeout): + _ = cmd.Process.Kill() + } + } + return nil +} + +func (s *Supervisor) writeConf(hosts []model.Host) error { + tmpl, err := template.New("dnsmasq").Parse(dnsmasqTemplate) + if err != nil { + return err + } + conf := filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf") + tmp := conf + ".new" + f, err := os.Create(tmp) + if err != nil { + return fmt.Errorf("create conf: %w", err) + } + data := struct { + Cfg SupervisorConfig + Hosts []model.Host + }{s.cfg, hosts} + if err := tmpl.Execute(f, data); err != nil { + _ = f.Close() + return fmt.Errorf("render conf: %w", err) + } + if err := f.Sync(); err != nil { + _ = f.Close() + return err + } + if err := f.Close(); err != nil { + return err + } + if err := os.Rename(tmp, conf); err != nil { + return fmt.Errorf("rename conf: %w", err) + } + return nil +} + +// Exposed for the UI handlers to show operators what config is live. +func (s *Supervisor) ConfPath() string { + return filepath.Join(s.cfg.RuntimeDir, "dnsmasq.conf") +} + +type logWriter struct{ prefix string } + +func (w logWriter) Write(p []byte) (int, error) { + for _, line := range strings.Split(strings.TrimRight(string(p), "\n"), "\n") { + if line == "" { + continue + } + log.Printf("[%s] %s", w.prefix, line) + } + return len(p), nil +} + +// Allow package consumers to swap io.Writer for logs in tests. +var _ io.Writer = logWriter{} + +const dnsmasqTemplate = `# Generated by Vetting — do not hand-edit. +interface={{ .Cfg.Interface }} +bind-interfaces +port=0 +domain-needed +bogus-priv +no-resolv + +# MAC allowlist: dnsmasq only answers DHCP for MACs with a dhcp-host= below. 
+dhcp-ignore=tag:!known +{{- range .Hosts }} +dhcp-host={{ .MAC }},set:known +{{- end }} + +# DHCP range (broader subnet coverage is fine; allowlist above gates replies). +dhcp-range={{ .Cfg.DHCPRange }} + +# TFTP + HTTP boot (iPXE chainload). +enable-tftp +tftp-root={{ .Cfg.TFTPRoot }} + +# BIOS (undionly.kpxe) and UEFI (ipxe.efi) clients both get iPXE first, +# which then re-requests a per-MAC script from the orchestrator. +dhcp-match=set:bios,option:client-arch,0 +dhcp-match=set:efi64,option:client-arch,7 +dhcp-match=set:efi64,option:client-arch,9 + +# If the client is iPXE itself, send it the per-MAC HTTP script. +dhcp-match=set:ipxe,175 +dhcp-boot=tag:ipxe,{{ .Cfg.OrchestratorURL }}/ipxe/${mac} + +# Otherwise (first boot from ROM) chainload iPXE from TFTP. +dhcp-boot=tag:!ipxe,tag:bios,undionly.kpxe +dhcp-boot=tag:!ipxe,tag:efi64,ipxe.efi + +log-facility=- +` diff --git a/internal/pxe/ipxe.go b/internal/pxe/ipxe.go new file mode 100644 index 0000000..87454f5 --- /dev/null +++ b/internal/pxe/ipxe.go @@ -0,0 +1,88 @@ +package pxe + +import ( + "fmt" + "io" + "strings" + + "vetting/internal/model" +) + +// IPXEParams is everything an iPXE boot script needs. +// For Phase 2 the boot target is always "linux" — Memtest chain-load +// is not required because we replaced Memtest86+ with stress-ng under +// Linux (see plan §3.2). +type IPXEParams struct { + OrchestratorURL string // e.g. http://10.0.0.5:8080 + LiveKernelURL string // e.g. http://10.0.0.5:8080/live/vmlinuz + LiveInitrdURL string // e.g. http://10.0.0.5:8080/live/initrd.img + TLSCertFPR string // optional; empty = skip pin + RunID int64 + MAC string + Token string // plaintext, hashed on server side +} + +// BuildScript returns an iPXE script tailored for this run. +// iPXE scripts are plain text beginning with "#!ipxe". 
+func BuildScript(p IPXEParams) string { + cmdline := []string{ + "initrd=initrd.img", + fmt.Sprintf("vetting.orchestrator=%s", p.OrchestratorURL), + fmt.Sprintf("vetting.run_id=%d", p.RunID), + fmt.Sprintf("vetting.mac=%s", p.MAC), + fmt.Sprintf("vetting.token=%s", p.Token), + } + if p.TLSCertFPR != "" { + cmdline = append(cmdline, fmt.Sprintf("vetting.cert_fpr=%s", p.TLSCertFPR)) + } + // Reduce kernel log noise during the test run; keep loglevel high enough + // for boot failures to still show up on the console. + cmdline = append(cmdline, + "console=tty0", + "console=ttyS0,115200n8", + "ip=dhcp", + "quiet", + ) + + var b strings.Builder + fmt.Fprintln(&b, "#!ipxe") + fmt.Fprintf(&b, "echo Vetting run %d — booting live image for %s\n", p.RunID, p.MAC) + fmt.Fprintf(&b, "kernel %s %s\n", p.LiveKernelURL, strings.Join(cmdline, " ")) + fmt.Fprintf(&b, "initrd %s\n", p.LiveInitrdURL) + fmt.Fprintln(&b, "boot") + return b.String() +} + +// NotRegisteredScript is served for unknown MACs. The MAC allowlist +// at the dnsmasq level should prevent this from ever being reachable, +// but it exists as belt-and-braces. +func NotRegisteredScript(mac string) string { + return fmt.Sprintf("#!ipxe\necho MAC %s not registered for vetting — halting.\nshell\n", mac) +} + +// NoActiveRunScript is served when a registered MAC PXE-boots but has +// no currently active run. The host is told to shut down rather than +// loop forever. +func NoActiveRunScript(mac string) string { + return fmt.Sprintf("#!ipxe\necho MAC %s has no active run — powering off in 10s.\nsleep 10\npoweroff\n", mac) +} + +// Used by handlers to compose URLs; exposed for tests. +func BuildLiveURLs(base string) (kernel, initrd string) { + base = strings.TrimRight(base, "/") + return base + "/live/vmlinuz", base + "/live/initrd.img" +} + +// WriteNotFound is a small convenience so handlers can return a shell +// script error directly to iPXE without cluttering handlers with a +// mime-type dance. 
// WriteNotFound writes the NotRegisteredScript for mac to w so a handler
// can answer an unknown MAC with an iPXE script in one call. The result of
// the write is deliberately discarded: nothing is returned to the caller,
// and setting response headers or a status code is left to the handler.
func WriteNotFound(w io.Writer, mac string) {
	_, _ = w.Write([]byte(NotRegisteredScript(mac)))
}

// ScriptMarker is the "#!ipxe" line that opens every script produced by
// this package; iPXE uses it to recognise a response as a script.
const ScriptMarker = "#!ipxe"

// State returns run.State converted to a plain string, for use in log
// lines. It takes the whole Run because the iPXE handler has already
// looked it up.
func State(run model.Run) string { return string(run.State) }
// sighup delivers SIGHUP to p. This is the implementation for non-Windows
// builds; any error from Process.Signal is returned unchanged.
func sighup(p *os.Process) error {
	return p.Signal(syscall.SIGHUP)
}
+type Data struct { + GeneratedAt time.Time + Run model.Run + Host model.Host + Stages []model.Stage + SpecDiffs []model.SpecDiff + Aggregates []Aggregate // flattened measurement summary; see Aggregate +} + +// Aggregate is a per (kind, key) summary of a run's measurements. Min/ +// Max/Avg are populated from the Measurement rows; Unit mirrors the raw +// sample unit so the HTML can show "52.5 °C" etc. +type Aggregate struct { + Kind string + Key string + Unit string + Count int + Min float64 + Max float64 + Avg float64 +} + +// AggregateMeasurements collapses a flat []Measurement into per-(kind, +// key) summaries, sorted first by kind then by key so the HTML renders +// deterministically. +func AggregateMeasurements(rows []model.Measurement) []Aggregate { + type bucket struct { + unit string + count int + min, max float64 + sum float64 + } + buckets := map[string]*bucket{} + keyOf := func(m model.Measurement) string { return m.Kind + "\x00" + m.Key } + for _, m := range rows { + k := keyOf(m) + b, ok := buckets[k] + if !ok { + b = &bucket{unit: m.Unit, min: math.Inf(1), max: math.Inf(-1)} + buckets[k] = b + } + b.count++ + b.sum += m.Value + if m.Value < b.min { + b.min = m.Value + } + if m.Value > b.max { + b.max = m.Value + } + } + out := make([]Aggregate, 0, len(buckets)) + for _, m := range rows { + k := keyOf(m) + b, ok := buckets[k] + if !ok { + continue + } + // Emit once per bucket; delete to dedupe. + delete(buckets, k) + out = append(out, Aggregate{ + Kind: m.Kind, + Key: m.Key, + Unit: b.unit, + Count: b.count, + Min: b.min, + Max: b.max, + Avg: b.sum / float64(b.count), + }) + } + sort.Slice(out, func(i, j int) bool { + if out[i].Kind != out[j].Kind { + return out[i].Kind < out[j].Kind + } + return out[i].Key < out[j].Key + }) + return out +} + +// RenderHTML produces the self-contained report HTML. 
+func RenderHTML(d Data) ([]byte, error) { + var buf bytes.Buffer + if err := reportTmpl.Execute(&buf, d); err != nil { + return nil, fmt.Errorf("report: render: %w", err) + } + return buf.Bytes(), nil +} + +var reportTmpl = template.Must(template.New("report").Funcs(template.FuncMap{ + "fmt4": func(f float64) string { return fmt.Sprintf("%.4g", f) }, + "fmtTime": func(t time.Time) string { return t.UTC().Format(time.RFC3339) }, + "fmtTimep": func(t *time.Time) string { if t == nil { return "—" }; return t.UTC().Format(time.RFC3339) }, + "resultBadge": func(s model.StageState) string { + switch s { + case model.StagePassed: + return "pass" + case model.StageFailed: + return "fail" + case model.StageSkipped: + return "skip" + default: + return "pend" + } + }, +}).Parse(htmlTemplate)) + +// Single-string template kept next to the code so the package stays +// self-contained. CSS is inlined; no external assets. +const htmlTemplate = ` + + + +Vetting report — {{.Host.Name}} run {{.Run.ID}} + + + +

{{.Host.Name}} — run {{.Run.ID}}

+
State: {{.Run.State}}{{if ne .Run.Result ""}} · result: {{.Run.Result}}{{end}} · generated {{fmtTime .GeneratedAt}}
+ +
+

Host

+ + + + + {{if .Host.Notes}}{{end}} +
Name{{.Host.Name}}
MAC{{.Host.MAC}}
WoL{{.Host.WoLBroadcastIP}}:{{.Host.WoLPort}}
Notes{{.Host.Notes}}
+
+ +
+

Run

+ + + + + + {{if .Run.FailedStage}}{{end}} + {{if .Run.ReportPath}}{{end}} +
Run ID{{.Run.ID}}
State{{.Run.State}}
Started{{fmtTime .Run.StartedAt}}
Completed{{fmtTimep .Run.CompletedAt}}
Failed stage{{.Run.FailedStage}}
JSON report{{.Run.ReportPath}}
+
+ +
+

Stages

+ + + + {{range .Stages}} + + + + + + + {{end}} + +
StageStateStartedCompleted
{{.Name}}{{.State}}{{fmtTimep .StartedAt}}{{fmtTimep .CompletedAt}}
+
+ +
+

Spec diffs ({{len .SpecDiffs}})

+{{if .SpecDiffs}} + + + + {{range .SpecDiffs}} + + + + + + + {{end}} + +
FieldExpectedActualSeverity
{{.Field}}{{.Expected}}{{.Actual}}{{.Severity}}
+{{else}} +

No differences between expected and actual hardware.

+{{end}} +
+ +
+

Measurements ({{len .Aggregates}} series)

+{{if .Aggregates}} + + + + {{range .Aggregates}} + + + + + + + + + + {{end}} + +
KindKeySamplesMinAvgMaxUnit
{{.Kind}}{{.Key}}{{.Count}}{{fmt4 .Min}}{{fmt4 .Avg}}{{fmt4 .Max}}{{.Unit}}
+{{else}} +

No measurements recorded.

+{{end}} +
// Spec is the operator-written expected hardware description for a host,
// parsed from YAML by Parse. Every section is optional: a nil pointer or
// empty slice means "no expectation" and Diff skips that section.
type Spec struct {
	CPU    *CPUSpec    `yaml:"cpu,omitempty"`
	Memory *MemorySpec `yaml:"memory,omitempty"`
	Disks  []DiskSpec  `yaml:"disks,omitempty"`
	NICs   []NICSpec   `yaml:"nics,omitempty"`
	GPUs   []GPUSpec   `yaml:"gpus,omitempty"`
}

// CPUSpec describes a CPU. In an expected spec, an empty Model or a
// LogicalCores of zero disables the corresponding check.
type CPUSpec struct {
	// Model is compared case-insensitively; the expected value may be a
	// substring of the actual model string.
	Model        string `json:"model,omitempty" yaml:"model,omitempty"`
	LogicalCores int    `json:"logical_cores,omitempty" yaml:"logical_cores,omitempty"`
}

// MemorySpec describes installed memory. Diff tolerates a difference of up
// to 2 GiB between expected and actual TotalGiB.
type MemorySpec struct {
	TotalGiB int `json:"total_gib,omitempty" yaml:"total_gib,omitempty"`
}

// DiskSpec describes one disk. Disks are matched by Serial
// (case-insensitive); entries without a serial are ignored by Diff. SizeGB
// is checked with a tolerance of 1 when it is set in the expected spec.
type DiskSpec struct {
	Serial string `json:"serial,omitempty" yaml:"serial,omitempty"`
	SizeGB int    `json:"size_gb,omitempty" yaml:"size_gb,omitempty"`
}

// NICSpec describes one network interface. NICs are matched by MAC
// (case-insensitive); SpeedGbps is compared only when both the expected and
// the actual value are greater than zero.
type NICSpec struct {
	MAC       string `json:"mac,omitempty" yaml:"mac,omitempty"`
	SpeedGbps int    `json:"speed_gbps,omitempty" yaml:"speed_gbps,omitempty"`
}

// GPUSpec describes one GPU by its model string.
type GPUSpec struct {
	Model string `json:"model,omitempty" yaml:"model,omitempty"`
}
+type Inventory struct { + CPU CPUSpec `json:"cpu" yaml:"cpu"` + Memory MemorySpec `json:"memory" yaml:"memory"` + Disks []DiskSpec `json:"disks" yaml:"disks"` + NICs []NICSpec `json:"nics" yaml:"nics"` + GPUs []GPUSpec `json:"gpus" yaml:"gpus"` +} + +// Parse reads expected-spec YAML. Empty YAML parses to a zero Spec and +// yields an empty diff — i.e. "no expectations" is a legal stance. +func Parse(src string) (*Spec, error) { + var s Spec + if err := yaml.Unmarshal([]byte(src), &s); err != nil { + return nil, fmt.Errorf("parse spec yaml: %w", err) + } + return &s, nil +} + +// Diff returns the per-field differences with severity. Phase 3 rule: +// every present-expected-field-that-mismatches is critical. Missing +// expected fields are skipped (not info-logged) so the diff list stays +// focused on real problems. +func Diff(expected *Spec, actual *Inventory) []model.SpecDiff { + if expected == nil { + return nil + } + out := []model.SpecDiff{} + + if expected.CPU != nil { + if expected.CPU.Model != "" { + if !cpuModelMatches(expected.CPU.Model, actual.CPU.Model) { + out = append(out, diff("cpu.model", expected.CPU.Model, actual.CPU.Model)) + } + } + if expected.CPU.LogicalCores > 0 && expected.CPU.LogicalCores != actual.CPU.LogicalCores { + out = append(out, diff("cpu.logical_cores", itoa(expected.CPU.LogicalCores), itoa(actual.CPU.LogicalCores))) + } + } + + if expected.Memory != nil && expected.Memory.TotalGiB > 0 { + // Allow ±2 GiB tolerance: BIOS-reserved, kernel, reporting + // quantization. A dead 16 GiB stick will still surface. + if absInt(expected.Memory.TotalGiB-actual.Memory.TotalGiB) > 2 { + out = append(out, diff("memory.total_gib", itoa(expected.Memory.TotalGiB), itoa(actual.Memory.TotalGiB))) + } + } + + out = append(out, diffDisks(expected.Disks, actual.Disks)...) + out = append(out, diffNICs(expected.NICs, actual.NICs)...) + out = append(out, diffGPUs(expected.GPUs, actual.GPUs)...) 
+ + return out +} + +func diffDisks(expected, actual []DiskSpec) []model.SpecDiff { + if len(expected) == 0 { + return nil + } + actualBySerial := map[string]DiskSpec{} + for _, d := range actual { + if d.Serial != "" { + actualBySerial[strings.ToLower(d.Serial)] = d + } + } + var out []model.SpecDiff + seen := map[string]bool{} + for _, exp := range expected { + if exp.Serial == "" { + continue + } + key := strings.ToLower(exp.Serial) + seen[key] = true + got, ok := actualBySerial[key] + if !ok { + out = append(out, diff("disks["+exp.Serial+"].present", "true", "false")) + continue + } + if exp.SizeGB > 0 && absInt(exp.SizeGB-got.SizeGB) > 1 { + out = append(out, diff("disks["+exp.Serial+"].size_gb", itoa(exp.SizeGB), itoa(got.SizeGB))) + } + } + // Extra disks on the host that operator didn't declare are flagged: + // a leftover USB stick could be a destructive-test target we'd + // rather the operator know about. + for _, got := range actual { + if got.Serial == "" { + continue + } + if !seen[strings.ToLower(got.Serial)] { + out = append(out, diff("disks[unexpected "+got.Serial+"]", "", "present")) + } + } + return out +} + +func diffNICs(expected, actual []NICSpec) []model.SpecDiff { + if len(expected) == 0 { + return nil + } + actualByMAC := map[string]NICSpec{} + for _, n := range actual { + if n.MAC != "" { + actualByMAC[strings.ToLower(n.MAC)] = n + } + } + var out []model.SpecDiff + for _, exp := range expected { + if exp.MAC == "" { + continue + } + got, ok := actualByMAC[strings.ToLower(exp.MAC)] + if !ok { + out = append(out, diff("nics["+exp.MAC+"].present", "true", "false")) + continue + } + if exp.SpeedGbps > 0 && got.SpeedGbps > 0 && exp.SpeedGbps != got.SpeedGbps { + out = append(out, diff("nics["+exp.MAC+"].speed_gbps", itoa(exp.SpeedGbps), itoa(got.SpeedGbps))) + } + } + return out +} + +func diffGPUs(expected, actual []GPUSpec) []model.SpecDiff { + if len(expected) == 0 { + return nil + } + // GPU matching is by model string. 
// cpuModelMatches reports whether the actual CPU model satisfies the
// expected one. Both strings are trimmed and lowercased first; the match
// succeeds when they are equal or when the expected text occurs anywhere
// inside the actual text, so an operator can write "E5-2680 v4" for a
// kernel string such as "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz".
func cpuModelMatches(expected, actual string) bool {
	normalize := func(s string) string {
		return strings.ToLower(strings.TrimSpace(s))
	}
	want, got := normalize(expected), normalize(actual)
	if want == got {
		return true
	}
	return strings.Contains(got, want)
}
// diff builds the SpecDiff row for one mismatching field. Severity is
// always "critical"; ID, RunID and Ignored are left at their zero values.
func diff(field, expected, actual string) model.SpecDiff {
	return model.SpecDiff{
		Field:    field,
		Expected: expected,
		Actual:   actual,
		Severity: "critical",
	}
}

// absInt returns the absolute value of n.
func absInt(n int) int {
	if n < 0 {
		return -n
	}
	return n
}

// itoa formats n in base 10.
func itoa(n int) string { return fmt.Sprintf("%d", n) }
500}}} + act := &Inventory{Disks: []DiskSpec{{Serial: "A", SizeGB: 1000}, {Serial: "C", SizeGB: 32}}} + d := Diff(exp, act) + // Expect: disk B missing, disk C unexpected. + got := map[string]bool{} + for _, row := range d { + got[row.Field] = true + } + if !got["disks[B].present"] { + t.Fatalf("expected disks[B].present critical; got %+v", d) + } + if !got["disks[unexpected C]"] { + t.Fatalf("expected disks[unexpected C] critical; got %+v", d) + } +} + +func TestDiffDisksSerialCaseInsensitive(t *testing.T) { + exp := &Spec{Disks: []DiskSpec{{Serial: "wd-abc123", SizeGB: 1000}}} + act := &Inventory{Disks: []DiskSpec{{Serial: "WD-ABC123", SizeGB: 1000}}} + if d := Diff(exp, act); len(d) != 0 { + t.Fatalf("serial compare must be case-insensitive, got %+v", d) + } +} + +func TestDiffNICMAC(t *testing.T) { + exp := &Spec{NICs: []NICSpec{{MAC: "aa:bb:cc:dd:ee:ff", SpeedGbps: 10}}} + act := &Inventory{NICs: []NICSpec{{MAC: "aa:bb:cc:dd:ee:ff", SpeedGbps: 1}}} + d := Diff(exp, act) + if len(d) != 1 || d[0].Field != "nics[aa:bb:cc:dd:ee:ff].speed_gbps" { + t.Fatalf("expected speed mismatch, got %+v", d) + } +} + +func TestDiffGPUCount(t *testing.T) { + exp := &Spec{GPUs: []GPUSpec{{Model: "NVIDIA RTX 3090"}, {Model: "NVIDIA RTX 3090"}}} + act := &Inventory{GPUs: []GPUSpec{{Model: "nvidia rtx 3090"}}} + d := Diff(exp, act) + if len(d) != 1 || d[0].Field != "gpus[nvidia rtx 3090].count" { + t.Fatalf("expected GPU count critical, got %+v", d) + } +} + +func TestParseValidYAML(t *testing.T) { + src := ` +cpu: + model: "E5-2680 v4" + logical_cores: 28 +memory: + total_gib: 128 +disks: + - serial: A + size_gb: 1000 +` + s, err := Parse(src) + if err != nil { + t.Fatalf("Parse: %v", err) + } + if s.CPU == nil || s.CPU.LogicalCores != 28 { + t.Fatalf("cpu not parsed: %+v", s) + } + if len(s.Disks) != 1 || s.Disks[0].Serial != "A" { + t.Fatalf("disks not parsed: %+v", s) + } +} + +func TestDiffSeverityAlwaysCritical(t *testing.T) { + exp := &Spec{CPU: &CPUSpec{LogicalCores: 8}} + 
act := &Inventory{CPU: CPUSpec{LogicalCores: 4}} + d := Diff(exp, act) + var got []model.SpecDiff = d + for _, row := range got { + if row.Severity != "critical" { + t.Fatalf("phase-3 rule: every diff is critical; got %q for %s", row.Severity, row.Field) + } + } +} diff --git a/internal/store/artifacts.go b/internal/store/artifacts.go new file mode 100644 index 0000000..a33aa80 --- /dev/null +++ b/internal/store/artifacts.go @@ -0,0 +1,126 @@ +package store + +import ( + "context" + "database/sql" + "fmt" + + "vetting/internal/model" +) + +type Artifact struct { + ID int64 + RunID int64 + StageID *int64 + Kind string // inventory|spec_diff|hold_key|report|log|fio|iperf|smart + Path string + SHA256 string + SizeBytes int64 +} + +type Artifacts struct { + DB *sql.DB +} + +func (a *Artifacts) Create(ctx context.Context, art Artifact) (int64, error) { + res, err := a.DB.ExecContext(ctx, ` + INSERT INTO artifacts(run_id, stage_id, kind, path, sha256, size_bytes) + VALUES(?,?,?,?,?,?) + `, art.RunID, nullInt64(art.StageID), art.Kind, art.Path, art.SHA256, art.SizeBytes) + if err != nil { + return 0, fmt.Errorf("insert artifact: %w", err) + } + return res.LastInsertId() +} + +// DeleteForRun removes every artifact row for a run. Returns the rows +// that were deleted so the caller can unlink the on-disk files. Used by +// the janitor; ordinary flow treats artifacts as append-only. +func (a *Artifacts) DeleteForRun(ctx context.Context, runID int64) ([]Artifact, error) { + arts, err := a.ListForRun(ctx, runID) + if err != nil { + return nil, err + } + if _, err := a.DB.ExecContext(ctx, `DELETE FROM artifacts WHERE run_id = ?`, runID); err != nil { + return nil, fmt.Errorf("delete artifacts for run %d: %w", runID, err) + } + return arts, nil +} + +func (a *Artifacts) ListForRun(ctx context.Context, runID int64) ([]Artifact, error) { + rows, err := a.DB.QueryContext(ctx, ` + SELECT id, run_id, stage_id, kind, path, sha256, size_bytes + FROM artifacts WHERE run_id = ? 
ORDER BY id + `, runID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []Artifact + for rows.Next() { + var ar Artifact + var stageID sql.NullInt64 + if err := rows.Scan(&ar.ID, &ar.RunID, &stageID, &ar.Kind, &ar.Path, &ar.SHA256, &ar.SizeBytes); err != nil { + return nil, err + } + if stageID.Valid { + v := stageID.Int64 + ar.StageID = &v + } + out = append(out, ar) + } + return out, rows.Err() +} + +type SpecDiffs struct { + DB *sql.DB +} + +func (s *SpecDiffs) ReplaceForRun(ctx context.Context, runID int64, diffs []model.SpecDiff) error { + tx, err := s.DB.BeginTx(ctx, nil) + if err != nil { + return err + } + defer func() { _ = tx.Rollback() }() + if _, err := tx.ExecContext(ctx, `DELETE FROM spec_diffs WHERE run_id = ?`, runID); err != nil { + return err + } + for _, d := range diffs { + if _, err := tx.ExecContext(ctx, ` + INSERT INTO spec_diffs(run_id, field, expected, actual, severity, ignored) + VALUES(?,?,?,?,?,?) + `, runID, d.Field, d.Expected, d.Actual, d.Severity, 0); err != nil { + return err + } + } + return tx.Commit() +} + +func (s *SpecDiffs) ListForRun(ctx context.Context, runID int64) ([]model.SpecDiff, error) { + rows, err := s.DB.QueryContext(ctx, ` + SELECT id, run_id, field, COALESCE(expected,''), COALESCE(actual,''), severity, ignored + FROM spec_diffs WHERE run_id = ? 
// nullInt64 converts an optional int64 into a SQL argument: a nil pointer
// becomes an untyped nil (stored as NULL), anything else the pointed-to
// value.
func nullInt64(p *int64) any {
	if p != nil {
		return *p
	}
	return nil
}
+ `, in.Name, in.MAC, in.WoLBroadcastIP, in.WoLPort, in.ExpectedSpecYAML, nullIfEmpty(in.PDUConfigJSON), nullIfEmpty(in.IPMIConfigJSON), in.Notes) + if err != nil { + return 0, fmt.Errorf("insert host: %w", err) + } + return res.LastInsertId() +} + +func (h *Hosts) List(ctx context.Context) ([]model.Host, error) { + rows, err := h.DB.QueryContext(ctx, ` + SELECT id, name, mac, wol_broadcast_ip, wol_port, expected_spec_yaml, + COALESCE(pdu_config_json,''), COALESCE(ipmi_config_json,''), + notes, created_at, updated_at + FROM hosts + ORDER BY name COLLATE NOCASE + `) + if err != nil { + return nil, fmt.Errorf("list hosts: %w", err) + } + defer rows.Close() + + var out []model.Host + for rows.Next() { + var host model.Host + if err := rows.Scan(&host.ID, &host.Name, &host.MAC, &host.WoLBroadcastIP, &host.WoLPort, + &host.ExpectedSpecYAML, &host.PDUConfigJSON, &host.IPMIConfigJSON, + &host.Notes, &host.CreatedAt, &host.UpdatedAt); err != nil { + return nil, fmt.Errorf("scan host: %w", err) + } + out = append(out, host) + } + return out, rows.Err() +} + +func (h *Hosts) Get(ctx context.Context, id int64) (*model.Host, error) { + row := h.DB.QueryRowContext(ctx, ` + SELECT id, name, mac, wol_broadcast_ip, wol_port, expected_spec_yaml, + COALESCE(pdu_config_json,''), COALESCE(ipmi_config_json,''), + notes, created_at, updated_at + FROM hosts WHERE id = ? 
+ `, id) + var host model.Host + err := row.Scan(&host.ID, &host.Name, &host.MAC, &host.WoLBroadcastIP, &host.WoLPort, + &host.ExpectedSpecYAML, &host.PDUConfigJSON, &host.IPMIConfigJSON, + &host.Notes, &host.CreatedAt, &host.UpdatedAt) + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrNotFound + } + if err != nil { + return nil, fmt.Errorf("get host: %w", err) + } + return &host, nil +} + +func (h *Hosts) Delete(ctx context.Context, id int64) error { + res, err := h.DB.ExecContext(ctx, `DELETE FROM hosts WHERE id = ?`, id) + if err != nil { + return fmt.Errorf("delete host: %w", err) + } + n, _ := res.RowsAffected() + if n == 0 { + return ErrNotFound + } + return nil +} + +func normalizeMAC(m string) string { + return strings.ToLower(strings.TrimSpace(m)) +} + +func nullIfEmpty(s string) any { + if s == "" { + return nil + } + return s +} diff --git a/internal/store/measurements.go b/internal/store/measurements.go new file mode 100644 index 0000000..023cb77 --- /dev/null +++ b/internal/store/measurements.go @@ -0,0 +1,85 @@ +package store + +import ( + "context" + "database/sql" + "fmt" + "time" + + "vetting/internal/model" +) + +// Measurements persists timestamped numeric samples: temps, fan speeds, +// PSU voltages, fio IOPS, iperf throughput, SMART attributes. The schema +// stores (kind, key, value, unit) so Phase 5 reports can group freely +// without new tables per source. +type Measurements struct { + DB *sql.DB +} + +func (m *Measurements) Create(ctx context.Context, in model.Measurement) (int64, error) { + if in.TS.IsZero() { + in.TS = time.Now().UTC() + } + res, err := m.DB.ExecContext(ctx, ` + INSERT INTO measurements(run_id, stage_id, ts, kind, key, value, unit) + VALUES(?,?,?,?,?,?,?) + `, in.RunID, nullInt64(in.StageID), in.TS, in.Kind, in.Key, in.Value, in.Unit) + if err != nil { + return 0, fmt.Errorf("insert measurement: %w", err) + } + return res.LastInsertId() +} + +// CreateBatch inserts a batch in one transaction. 
The sensor endpoint +// hands us ~5–20 samples per tick; a single commit keeps SQLite happy. +func (m *Measurements) CreateBatch(ctx context.Context, rows []model.Measurement) error { + if len(rows) == 0 { + return nil + } + tx, err := m.DB.BeginTx(ctx, nil) + if err != nil { + return err + } + defer func() { _ = tx.Rollback() }() + now := time.Now().UTC() + for _, r := range rows { + if r.TS.IsZero() { + r.TS = now + } + if _, err := tx.ExecContext(ctx, ` + INSERT INTO measurements(run_id, stage_id, ts, kind, key, value, unit) + VALUES(?,?,?,?,?,?,?) + `, r.RunID, nullInt64(r.StageID), r.TS, r.Kind, r.Key, r.Value, r.Unit); err != nil { + return fmt.Errorf("insert measurement: %w", err) + } + } + return tx.Commit() +} + +// ListForRun returns all measurements for a run. Callers filter by kind +// in memory; the row count is small per run (≈thousands). +func (m *Measurements) ListForRun(ctx context.Context, runID int64) ([]model.Measurement, error) { + rows, err := m.DB.QueryContext(ctx, ` + SELECT id, run_id, stage_id, ts, kind, key, value, COALESCE(unit,'') + FROM measurements WHERE run_id = ? 
ORDER BY ts, id + `, runID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []model.Measurement + for rows.Next() { + var meas model.Measurement + var stageID sql.NullInt64 + if err := rows.Scan(&meas.ID, &meas.RunID, &stageID, &meas.TS, &meas.Kind, &meas.Key, &meas.Value, &meas.Unit); err != nil { + return nil, err + } + if stageID.Valid { + v := stageID.Int64 + meas.StageID = &v + } + out = append(out, meas) + } + return out, rows.Err() +} diff --git a/internal/store/runs.go b/internal/store/runs.go new file mode 100644 index 0000000..70c8e14 --- /dev/null +++ b/internal/store/runs.go @@ -0,0 +1,226 @@ +package store + +import ( + "context" + "database/sql" + "errors" + "fmt" + "time" + + "vetting/internal/model" +) + +type Runs struct { + DB *sql.DB +} + +func (r *Runs) Create(ctx context.Context, hostID int64, tokenHash string) (int64, error) { + now := time.Now().UTC() + res, err := r.DB.ExecContext(ctx, ` + INSERT INTO runs(host_id, state, agent_token_hash, next_boot_target, started_at) + VALUES(?,?,?,?,?) + `, hostID, string(model.StateQueued), tokenHash, "linux", now) + if err != nil { + return 0, fmt.Errorf("insert run: %w", err) + } + return res.LastInsertId() +} + +func (r *Runs) SetState(ctx context.Context, runID int64, state model.RunState) error { + _, err := r.DB.ExecContext(ctx, `UPDATE runs SET state = ? WHERE id = ?`, string(state), runID) + return err +} + +// RotateTokenHash replaces the stored token hash. Called on each iPXE +// fetch so only the most-recently-booted agent can claim the run. +func (r *Runs) RotateTokenHash(ctx context.Context, runID int64, hash string) error { + _, err := r.DB.ExecContext(ctx, `UPDATE runs SET agent_token_hash = ? WHERE id = ?`, hash, runID) + return err +} + +// SetHoldIP records the agent's LAN IP so the UI can show the ssh +// command. Called when the agent POSTs /hold. 
+func (r *Runs) SetHoldIP(ctx context.Context, runID int64, ip string) error { + _, err := r.DB.ExecContext(ctx, `UPDATE runs SET hold_ip = ? WHERE id = ?`, ip, runID) + return err +} + +// SetFailedStage records which stage tripped the run; used by the tile +// and by reports. Does not change state. +func (r *Runs) SetFailedStage(ctx context.Context, runID int64, stage string) error { + _, err := r.DB.ExecContext(ctx, `UPDATE runs SET failed_stage = ? WHERE id = ?`, stage, runID) + return err +} + +// ClearFailedStage wipes the failed_stage marker. Called when the +// operator overrides a stage and the run re-enters the pipeline. +func (r *Runs) ClearFailedStage(ctx context.Context, runID int64) error { + _, err := r.DB.ExecContext(ctx, `UPDATE runs SET failed_stage = NULL WHERE id = ?`, runID) + return err +} + +// SetOverrideFlags persists the operator's override decisions (JSON blob +// like `{"wipe":true}`). Passed back to the agent on the next heartbeat +// so it can resume the held stage with the gate bypassed. +func (r *Runs) SetOverrideFlags(ctx context.Context, runID int64, flagsJSON string) error { + _, err := r.DB.ExecContext(ctx, `UPDATE runs SET override_flags_json = ? WHERE id = ?`, flagsJSON, runID) + return err +} + +// MarkFailed moves the run to StateFailedHolding with result 'fail', +// recording the failed stage, the hold IP and completed_at (now, UTC) +// in a single UPDATE. +func (r *Runs) MarkFailed(ctx context.Context, runID int64, failedStage, holdIP string) error { + now := time.Now().UTC() + _, err := r.DB.ExecContext(ctx, ` + UPDATE runs SET state = ?, result = 'fail', failed_stage = ?, hold_ip = ?, completed_at = ? + WHERE id = ? + `, string(model.StateFailedHolding), failedStage, holdIP, now, runID) + return err +} + +// MarkCompleted moves the run to StateCompleted with result 'pass', +// recording the report path and completed_at (now, UTC) in a single UPDATE. +func (r *Runs) MarkCompleted(ctx context.Context, runID int64, reportPath string) error { + now := time.Now().UTC() + _, err := r.DB.ExecContext(ctx, ` + UPDATE runs SET state = ?, result = 'pass', report_path = ?, completed_at = ? + WHERE id = ? 
+ `, string(model.StateCompleted), reportPath, now, runID) + return err +} + +func (r *Runs) Get(ctx context.Context, id int64) (*model.Run, error) { + row := r.DB.QueryRowContext(ctx, ` + SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''), + COALESCE(next_boot_target,''), agent_token_hash, started_at, + completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''), + COALESCE(override_flags_json,'') + FROM runs WHERE id = ? + `, id) + var run model.Run + var completedAt sql.NullTime + err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, + &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, + &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON) + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrNotFound + } + if err != nil { + return nil, fmt.Errorf("get run: %w", err) + } + if completedAt.Valid { + run.CompletedAt = &completedAt.Time + } + return &run, nil +} + +// LatestForHost returns the run with the highest id for a host, or +// (nil, nil) if the host has no runs. Unlike Get, a missing row is not +// reported as ErrNotFound. NULL text columns are read as empty strings and +// CompletedAt stays nil while completed_at is NULL. +func (r *Runs) LatestForHost(ctx context.Context, hostID int64) (*model.Run, error) { + row := r.DB.QueryRowContext(ctx, ` + SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''), + COALESCE(next_boot_target,''), agent_token_hash, started_at, + completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''), + COALESCE(override_flags_json,'') + FROM runs WHERE host_id = ? + ORDER BY id DESC LIMIT 1 + `, hostID) + var run model.Run + var completedAt sql.NullTime + err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, + &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, + &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON) + if errors.Is(err, sql.ErrNoRows) { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("latest run: %w", err) + } + if completedAt.Valid { + run.CompletedAt = &completedAt.Time + } + return &run, nil +} + +// Active returns every run whose state is not Completed or Released, +// ordered by id. FailedHolding runs are therefore included. 
+func (r *Runs) Active(ctx context.Context) ([]model.Run, error) { + rows, err := r.DB.QueryContext(ctx, ` + SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''), + COALESCE(next_boot_target,''), agent_token_hash, started_at, + completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''), + COALESCE(override_flags_json,'') + FROM runs + WHERE state NOT IN ('Completed','Released') + ORDER BY id + `) + if err != nil { + return nil, err + } + defer rows.Close() + var out []model.Run + for rows.Next() { + var run model.Run + var completedAt sql.NullTime + if err := rows.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, + &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, + &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON); err != nil { + return nil, err + } + if completedAt.Valid { + run.CompletedAt = &completedAt.Time + } + out = append(out, run) + } + return out, rows.Err() +} + +// CompletedOlderThan returns run IDs for terminal (Completed/Released/ +// FailedHolding) runs whose completed_at is older than cutoff. Runs with +// a NULL completed_at fall back to started_at so a stuck run doesn't get +// garbage-collected out from under its own logs. Used by the janitor. +// IDs are returned in ascending order. +func (r *Runs) CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error) { + rows, err := r.DB.QueryContext(ctx, ` + SELECT id FROM runs + WHERE state IN ('Completed','Released','FailedHolding') + AND COALESCE(completed_at, started_at) < ? + ORDER BY id + `, cutoff) + if err != nil { + return nil, err + } + defer rows.Close() + var out []int64 + for rows.Next() { + var id int64 + if err := rows.Scan(&id); err != nil { + return nil, err + } + out = append(out, id) + } + return out, rows.Err() +} + +// FindActiveByMAC returns the highest-id run not in Completed/Released for +// the host with the given MAC, or (nil, nil) if the MAC is unknown or the +// host has no such run. +// NOTE(review): mac is matched verbatim against hosts.mac, while Hosts.Create +// stores MACs via normalizeMAC (trimmed, lowercased); callers presumably have +// to pass the same form - confirm. 
+func (r *Runs) FindActiveByMAC(ctx context.Context, mac string) (*model.Run, error) { + row := r.DB.QueryRowContext(ctx, ` + SELECT r.id, r.host_id, r.state, COALESCE(r.result,''), COALESCE(r.failed_stage,''), + COALESCE(r.next_boot_target,''), r.agent_token_hash, r.started_at, + r.completed_at, COALESCE(r.report_path,''), COALESCE(r.hold_ip,''), + COALESCE(r.override_flags_json,'') + FROM runs r + JOIN hosts h ON h.id = r.host_id + WHERE h.mac = ? AND r.state NOT IN ('Completed','Released') + ORDER BY r.id DESC LIMIT 1 + `, mac) + var run model.Run + var completedAt sql.NullTime + err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, + &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, + &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON) + if errors.Is(err, sql.ErrNoRows) { + return nil, nil + } + if err != nil { + return nil, err + } + if completedAt.Valid { + run.CompletedAt = &completedAt.Time + } + return &run, nil +} diff --git a/internal/store/stages.go b/internal/store/stages.go new file mode 100644 index 0000000..63189e6 --- /dev/null +++ b/internal/store/stages.go @@ -0,0 +1,91 @@ +package store + +import ( + "context" + "database/sql" + "fmt" + "time" + + "vetting/internal/model" +) + +type Stages struct { + DB *sql.DB +} + +// DefaultStageOrder is the canonical sequence for every run. Phase 2 only +// reaches Inventory; later phases add more executors but the list is fixed. +var DefaultStageOrder = []string{ + "Inventory", + "SpecValidate", + "SMART", + "CPUStress", + "Storage", + "Network", + "GPU", + "PSU", + "Reporting", +} + +// Seed inserts one StagePending row per DefaultStageOrder entry for the +// given run, using the slice index as the ordinal. All inserts share one +// transaction, so a failed insert leaves no partial stage list behind. 
+func (s *Stages) Seed(ctx context.Context, runID int64) error { + tx, err := s.DB.BeginTx(ctx, nil) + if err != nil { + return err + } + defer func() { _ = tx.Rollback() }() + for i, name := range DefaultStageOrder { + if _, err := tx.ExecContext(ctx, + `INSERT INTO stages(run_id, name, ordinal, state) VALUES(?,?,?,?)`, + runID, name, i, string(model.StagePending)); err != nil { + return fmt.Errorf("seed stage %s: %w", name, err) + } + } + return tx.Commit() +} + +func (s *Stages) ListForRun(ctx context.Context, runID int64) ([]model.Stage, error) { + rows, err := s.DB.QueryContext(ctx, ` + SELECT id, run_id, name, ordinal, state, started_at, completed_at, COALESCE(summary_json,'') + FROM stages WHERE run_id = ? ORDER BY ordinal + `, runID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []model.Stage + for rows.Next() { + var st model.Stage + var started, completed sql.NullTime + if err := rows.Scan(&st.ID, &st.RunID, &st.Name, &st.Ordinal, &st.State, + &started, &completed, &st.SummaryJSON); err != nil { + return nil, err + } + if started.Valid { + st.StartedAt = &started.Time + } + if completed.Valid { + st.CompletedAt = &completed.Time + } + out = append(out, st) + } + return out, rows.Err() +} + +func (s *Stages) StartByName(ctx context.Context, runID int64, name string) error { + now := time.Now().UTC() + _, err := s.DB.ExecContext(ctx, ` + UPDATE stages SET state = ?, started_at = ? + WHERE run_id = ? AND name = ? + `, string(model.StageRunning), now, runID, name) + return err +} + +func (s *Stages) CompleteByName(ctx context.Context, runID int64, name string, state model.StageState, summaryJSON string) error { + now := time.Now().UTC() + _, err := s.DB.ExecContext(ctx, ` + UPDATE stages SET state = ?, completed_at = ?, summary_json = ? + WHERE run_id = ? AND name = ? 
+ `, string(state), now, nullIfEmpty(summaryJSON), runID, name) + return err +} diff --git a/internal/store/store_test.go b/internal/store/store_test.go new file mode 100644 index 0000000..d012d33 --- /dev/null +++ b/internal/store/store_test.go @@ -0,0 +1,229 @@ +package store_test + +import ( + "context" + "path/filepath" + "testing" + + "vetting/internal/db" + "vetting/internal/model" + "vetting/internal/store" +) + +func newDB(t *testing.T) *store.Runs { + t.Helper() + path := filepath.Join(t.TempDir(), "vetting.db") + conn, err := db.Open(path) + if err != nil { + t.Fatalf("open db: %v", err) + } + t.Cleanup(func() { _ = conn.Close() }) + return &store.Runs{DB: conn} +} + +// seedRun inserts a host + a run and returns (hostID, runID). Every +// subsequent store test builds on this so run_id foreign keys resolve. +func seedRun(t *testing.T, runs *store.Runs) (int64, int64) { + t.Helper() + hosts := &store.Hosts{DB: runs.DB} + hostID, err := hosts.Create(context.Background(), model.Host{ + Name: "t-host", + MAC: "aa:bb:cc:dd:ee:ff", + WoLBroadcastIP: "10.0.0.255", + WoLPort: 9, + ExpectedSpecYAML: "memory:\n total_gib: 16\n", + }) + if err != nil { + t.Fatalf("create host: %v", err) + } + runID, err := runs.Create(context.Background(), hostID, "deadbeef") + if err != nil { + t.Fatalf("create run: %v", err) + } + return hostID, runID +} + +func TestArtifactsRoundtrip(t *testing.T) { + runs := newDB(t) + _, runID := seedRun(t, runs) + arts := &store.Artifacts{DB: runs.DB} + + id, err := arts.Create(context.Background(), store.Artifact{ + RunID: runID, + Kind: "inventory", + Path: "/var/artifacts/run-1/inventory.json", + SHA256: "abc123", + SizeBytes: 42, + }) + if err != nil { + t.Fatalf("Create: %v", err) + } + if id == 0 { + t.Fatalf("expected non-zero id") + } + + // Hold key on the same run — ListForRun should return both in + // insertion order and TileEnricher picks the hold_key row. 
+ if _, err := arts.Create(context.Background(), store.Artifact{ + RunID: runID, Kind: "hold_key", Path: "/var/artifacts/run-1/hold.key", SHA256: "def456", SizeBytes: 400, + }); err != nil { + t.Fatalf("Create hold_key: %v", err) + } + + list, err := arts.ListForRun(context.Background(), runID) + if err != nil { + t.Fatalf("ListForRun: %v", err) + } + if len(list) != 2 { + t.Fatalf("ListForRun returned %d, want 2", len(list)) + } + if list[0].Kind != "inventory" || list[1].Kind != "hold_key" { + t.Fatalf("unexpected order: %+v", list) + } + if list[1].Path != "/var/artifacts/run-1/hold.key" { + t.Fatalf("hold_key path lost: %q", list[1].Path) + } +} + +func TestSpecDiffsReplaceForRun(t *testing.T) { + runs := newDB(t) + _, runID := seedRun(t, runs) + sd := &store.SpecDiffs{DB: runs.DB} + ctx := context.Background() + + // First write: three diffs. + err := sd.ReplaceForRun(ctx, runID, []model.SpecDiff{ + {RunID: runID, Field: "cpu.model", Expected: "Xeon", Actual: "EPYC", Severity: "critical"}, + {RunID: runID, Field: "memory.total_gib", Expected: "16", Actual: "8", Severity: "critical"}, + {RunID: runID, Field: "note", Expected: "", Actual: "dusty", Severity: "info"}, + }) + if err != nil { + t.Fatalf("ReplaceForRun: %v", err) + } + + list, err := sd.ListForRun(ctx, runID) + if err != nil { + t.Fatalf("ListForRun: %v", err) + } + if len(list) != 3 { + t.Fatalf("got %d rows, want 3", len(list)) + } + + // Second write replaces, doesn't append — otherwise a re-run would + // double-count spec diffs and the tile badge would grow without bound. 
+ err = sd.ReplaceForRun(ctx, runID, []model.SpecDiff{ + {RunID: runID, Field: "cpu.model", Expected: "Xeon", Actual: "Xeon Gold", Severity: "info"}, + }) + if err != nil { + t.Fatalf("second ReplaceForRun: %v", err) + } + list, err = sd.ListForRun(ctx, runID) + if err != nil { + t.Fatalf("ListForRun after replace: %v", err) + } + if len(list) != 1 { + t.Fatalf("expected 1 row after replace, got %d", len(list)) + } + if list[0].Severity != "info" { + t.Fatalf("expected severity info, got %q", list[0].Severity) + } +} + +func TestMeasurementsBatchAndList(t *testing.T) { + runs := newDB(t) + _, runID := seedRun(t, runs) + meas := &store.Measurements{DB: runs.DB} + ctx := context.Background() + + err := meas.CreateBatch(ctx, []model.Measurement{ + {RunID: runID, Kind: "thermal", Key: "cpu", Value: 52.5, Unit: "C"}, + {RunID: runID, Kind: "iperf", Key: "throughput_mbps", Value: 940.1, Unit: "Mbps"}, + {RunID: runID, Kind: "psu", Key: "in0", Value: 12.04, Unit: "V"}, + }) + if err != nil { + t.Fatalf("CreateBatch: %v", err) + } + + // Zero-length batch must be a no-op, not an error. 
+ if err := meas.CreateBatch(ctx, nil); err != nil { + t.Fatalf("empty CreateBatch: %v", err) + } + + rows, err := meas.ListForRun(ctx, runID) + if err != nil { + t.Fatalf("ListForRun: %v", err) + } + if len(rows) != 3 { + t.Fatalf("got %d rows, want 3", len(rows)) + } + foundIperf := false + for _, r := range rows { + if r.Kind == "iperf" && r.Key == "throughput_mbps" && r.Value > 900 { + foundIperf = true + } + } + if !foundIperf { + t.Fatalf("iperf row missing or wrong value: %+v", rows) + } +} + +func TestRunsOverrideFlagsAndClearFailedStage(t *testing.T) { + runs := newDB(t) + _, runID := seedRun(t, runs) + ctx := context.Background() + + if err := runs.SetFailedStage(ctx, runID, "Storage"); err != nil { + t.Fatalf("SetFailedStage: %v", err) + } + if err := runs.SetOverrideFlags(ctx, runID, `{"wipe":true}`); err != nil { + t.Fatalf("SetOverrideFlags: %v", err) + } + run, err := runs.Get(ctx, runID) + if err != nil { + t.Fatalf("Get: %v", err) + } + if run.OverrideFlagsJSON != `{"wipe":true}` { + t.Fatalf("OverrideFlagsJSON = %q, want {\"wipe\":true}", run.OverrideFlagsJSON) + } + if run.FailedStage != "Storage" { + t.Fatalf("FailedStage = %q, want Storage", run.FailedStage) + } + if err := runs.ClearFailedStage(ctx, runID); err != nil { + t.Fatalf("ClearFailedStage: %v", err) + } + run, err = runs.Get(ctx, runID) + if err != nil { + t.Fatalf("Get after clear: %v", err) + } + if run.FailedStage != "" { + t.Fatalf("FailedStage not cleared: %q", run.FailedStage) + } + // override_flags_json should persist across ClearFailedStage so the + // agent can still read it on its next heartbeat. 
+ if run.OverrideFlagsJSON != `{"wipe":true}` { + t.Fatalf("OverrideFlagsJSON lost after ClearFailedStage: %q", run.OverrideFlagsJSON) + } +} + +func TestRunsHoldAndFailedStage(t *testing.T) { + runs := newDB(t) + _, runID := seedRun(t, runs) + ctx := context.Background() + + if err := runs.SetHoldIP(ctx, runID, "10.0.0.42"); err != nil { + t.Fatalf("SetHoldIP: %v", err) + } + if err := runs.SetFailedStage(ctx, runID, "SpecValidate"); err != nil { + t.Fatalf("SetFailedStage: %v", err) + } + run, err := runs.Get(ctx, runID) + if err != nil { + t.Fatalf("Get: %v", err) + } + if run.HoldIP != "10.0.0.42" { + t.Fatalf("HoldIP = %q, want 10.0.0.42", run.HoldIP) + } + if run.FailedStage != "SpecValidate" { + t.Fatalf("FailedStage = %q, want SpecValidate", run.FailedStage) + } +} diff --git a/internal/web/embed.go b/internal/web/embed.go new file mode 100644 index 0000000..3347a00 --- /dev/null +++ b/internal/web/embed.go @@ -0,0 +1,6 @@ +package web + +import "embed" + +//go:embed static/* +var Static embed.FS diff --git a/internal/web/static/app.css b/internal/web/static/app.css new file mode 100644 index 0000000..88cd6c0 --- /dev/null +++ b/internal/web/static/app.css @@ -0,0 +1,210 @@ +:root { + --bg: #0f1115; + --bg-elev: #171a21; + --bg-elev-2: #1f232c; + --border: #2a2f3a; + --text: #e5e8ef; + --text-dim: #9aa2b1; + --accent: #6aa9ff; + --accent-strong: #3c82f6; + --success: #35c27b; + --warn: #e4a94b; + --danger: #e56466; + --radius: 8px; + --font: system-ui, -apple-system, "Segoe UI", Roboto, sans-serif; + --mono: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; +} + +* { box-sizing: border-box; } + +html, body { + margin: 0; + padding: 0; + background: var(--bg); + color: var(--text); + font: 15px/1.45 var(--font); +} + +a { color: var(--accent); text-decoration: none; } +a:hover { text-decoration: underline; } + +.topbar { + display: flex; + align-items: center; + gap: 24px; + padding: 12px 24px; + border-bottom: 1px solid var(--border); + background: 
var(--bg-elev); +} +.topbar .brand { font-weight: 700; letter-spacing: .2px; } +.topbar nav { display: flex; gap: 16px; flex: 1; } +.topbar nav a { color: var(--text-dim); } +.topbar nav a:hover { color: var(--text); text-decoration: none; } +.topbar .session { display: flex; align-items: center; gap: 12px; } +.topbar .heartbeat { color: var(--text-dim); font-family: var(--mono); font-size: 12px; } +.topbar .logout-form { margin: 0; } + +main { max-width: 1280px; margin: 0 auto; padding: 24px; } + +button, .button, .button-secondary { + appearance: none; + font: inherit; + padding: 8px 14px; + border-radius: var(--radius); + border: 1px solid var(--border); + background: var(--bg-elev-2); + color: var(--text); + cursor: pointer; + text-decoration: none; + display: inline-block; +} +button:hover, .button:hover { border-color: var(--accent); } +button:disabled { opacity: .5; cursor: not-allowed; } +button.danger { border-color: var(--danger); color: var(--danger); background: transparent; } +button.danger:hover { background: rgba(229,100,102,.1); } +.button-secondary { background: transparent; } + +.error { + background: rgba(229,100,102,.12); + border: 1px solid var(--danger); + color: var(--danger); + padding: 10px 14px; + border-radius: var(--radius); + margin-bottom: 16px; +} + +.dashboard-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 20px; +} +.dashboard-header h1 { font-size: 20px; margin: 0; } + +.empty { + text-align: center; + padding: 48px 24px; + border: 1px dashed var(--border); + border-radius: var(--radius); + color: var(--text-dim); +} +.empty .button { margin-top: 12px; } + +.tile-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(320px, 1fr)); + gap: 16px; +} + +.tile { + background: var(--bg-elev); + border: 1px solid var(--border); + border-radius: var(--radius); + padding: 16px; + display: flex; + flex-direction: column; + gap: 12px; +} +.tile-head { display: flex; 
justify-content: space-between; align-items: center; } +.tile-name { font-weight: 600; } +.tile-status { font-size: 12px; color: var(--text-dim); text-transform: uppercase; letter-spacing: .5px; } +.tile-idle .tile-status { color: var(--text-dim); } + +.tile-meta { display: grid; grid-template-columns: 1fr 1fr; gap: 4px 16px; margin: 0; font-size: 13px; } +.tile-meta div { display: flex; justify-content: space-between; align-items: baseline; } +.tile-meta dt { color: var(--text-dim); } +.tile-meta dd { margin: 0; font-family: var(--mono); } + +.tile-actions { display: flex; gap: 8px; } +.tile-actions .inline { margin: 0; flex: 0; } + +.tile-meta dd.bad { color: var(--danger); } + +.tile-hold { + background: rgba(229,100,102,.08); + border: 1px solid rgba(229,100,102,.35); + border-radius: var(--radius); + padding: 8px 10px; + display: flex; + flex-direction: column; + gap: 4px; +} +.tile-hold .hold-title { + font-size: 12px; + color: var(--danger); + text-transform: uppercase; + letter-spacing: .5px; +} +.tile-hold .hold-ssh { + font-family: var(--mono); + font-size: 12px; + color: var(--text); + word-break: break-all; + user-select: all; +} + +.tile-log { + background: #0b0d12; + border: 1px solid var(--border); + border-radius: var(--radius); + padding: 8px 10px; + font-family: var(--mono); + font-size: 12px; + color: var(--text-dim); + max-height: 160px; + overflow-y: auto; + display: flex; + flex-direction: column; + gap: 2px; +} +.tile-log:empty { display: none; } +.tile-log .log-line { white-space: pre-wrap; } +.tile-log .log-warn { color: var(--warn); } +.tile-log .log-error { color: var(--danger); } + +.tile-fail { border-color: rgba(229,100,102,.6); } +.tile-pass { border-color: rgba(53,194,123,.5); } +.tile-active { border-color: var(--accent); } + +.form-wrap { max-width: 640px; } +.form-wrap h1 { font-size: 20px; } + +.host-form { display: flex; flex-direction: column; gap: 14px; } +.host-form label { display: flex; flex-direction: column; gap: 4px; 
color: var(--text-dim); font-size: 13px; } +.host-form input, +.host-form textarea { + font: inherit; + font-family: var(--mono); + color: var(--text); + background: var(--bg-elev); + border: 1px solid var(--border); + border-radius: var(--radius); + padding: 8px 10px; +} +.host-form textarea { resize: vertical; min-height: 96px; } +.host-form .grid-2 { display: grid; grid-template-columns: 2fr 1fr; gap: 14px; } +.host-form .actions { display: flex; gap: 10px; margin-top: 4px; } + +.login-card { + max-width: 360px; + margin: 12vh auto; + padding: 28px; + background: var(--bg-elev); + border: 1px solid var(--border); + border-radius: var(--radius); +} +.login-card h1 { margin: 0 0 16px; font-size: 22px; } +.login-card label { display: flex; flex-direction: column; gap: 4px; color: var(--text-dim); font-size: 13px; } +.login-card input { + font: inherit; + color: var(--text); + background: var(--bg-elev-2); + border: 1px solid var(--border); + border-radius: var(--radius); + padding: 10px; + margin-bottom: 12px; +} +.login-card button { width: 100%; background: var(--accent-strong); border-color: var(--accent-strong); color: #fff; } +.login-card button:hover { background: var(--accent); border-color: var(--accent); } + +body.bare main { max-width: none; } diff --git a/internal/web/templates/dashboard.templ b/internal/web/templates/dashboard.templ new file mode 100644 index 0000000..7b12481 --- /dev/null +++ b/internal/web/templates/dashboard.templ @@ -0,0 +1,36 @@ +package templates + +import "vetting/internal/model" + +// TileData pairs a host with its latest run and the derived fields the +// tile needs to render: spec-diff count (server-side diff result) and +// the on-disk path to the hold-key artifact when the run is holding. +type TileData struct { + Host model.Host + Latest *model.Run + SpecDiffCritical int + HoldKeyPath string +} + +templ Dashboard(tiles []TileData) { + @Layout("Dashboard") { +
+
+

Registered hosts

+ Register host +
+ if len(tiles) == 0 { +
+

No hosts registered yet.

+ Register your first host +
+ } else { +
+ for _, t := range tiles { + @HostTile(t) + } +
+ } +
+ } +} diff --git a/internal/web/templates/dashboard_templ.go b/internal/web/templates/dashboard_templ.go new file mode 100644 index 0000000..40f8d2e --- /dev/null +++ b/internal/web/templates/dashboard_templ.go @@ -0,0 +1,95 @@ +// Code generated by templ - DO NOT EDIT. + +// templ: version: v0.3.1001 +package templates + +//lint:file-ignore SA4006 This context is only used if a nested component is present. + +import "github.com/a-h/templ" +import templruntime "github.com/a-h/templ/runtime" + +import "vetting/internal/model" + +// TileData pairs a host with its latest run and the derived fields the +// tile needs to render: spec-diff count (server-side diff result) and +// the on-disk path to the hold-key artifact when the run is holding. +type TileData struct { + Host model.Host + Latest *model.Run + SpecDiffCritical int + HoldKeyPath string +} + +func Dashboard(tiles []TileData) templ.Component { + return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) { + templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context + if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil { + return templ_7745c5c3_CtxErr + } + templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W) + if !templ_7745c5c3_IsBuffer { + defer func() { + templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer) + if templ_7745c5c3_Err == nil { + templ_7745c5c3_Err = templ_7745c5c3_BufErr + } + }() + } + ctx = templ.InitializeContext(ctx) + templ_7745c5c3_Var1 := templ.GetChildren(ctx) + if templ_7745c5c3_Var1 == nil { + templ_7745c5c3_Var1 = templ.NopComponent + } + ctx = templ.ClearChildren(ctx) + templ_7745c5c3_Var2 := templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) { + templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context + 
templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W) + if !templ_7745c5c3_IsBuffer { + defer func() { + templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer) + if templ_7745c5c3_Err == nil { + templ_7745c5c3_Err = templ_7745c5c3_BufErr + } + }() + } + ctx = templ.InitializeContext(ctx) + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "

Registered hosts

Register host
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + if len(tiles) == 0 { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 2, "

No hosts registered yet.

Register your first host
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } else { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + for _, t := range tiles { + templ_7745c5c3_Err = HostTile(t).Render(ctx, templ_7745c5c3_Buffer) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + return nil + }) + templ_7745c5c3_Err = Layout("Dashboard").Render(templ.WithChildren(ctx, templ_7745c5c3_Var2), templ_7745c5c3_Buffer) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + return nil + }) +} + +var _ = templruntime.GeneratedTemplate diff --git a/internal/web/templates/host_tile.templ b/internal/web/templates/host_tile.templ new file mode 100644 index 0000000..aab4b11 --- /dev/null +++ b/internal/web/templates/host_tile.templ @@ -0,0 +1,144 @@ +package templates + +import ( + "bytes" + "context" + "fmt" + "vetting/internal/model" +) + +// HostTile renders a single dashboard card. It's the SSE-swap target +// for per-host tile refreshes (`tile-N`) and contains a per-run log +// pane (`log-M`) whose live tail is appended by the events hub. +templ HostTile(t TileData) { +
+
+
{ t.Host.Name }
+
{ tileStatus(t.Latest) }
+
+
+
+
MAC
+
{ t.Host.MAC }
+
+
+
WoL
+
{ fmt.Sprintf("%s:%d", t.Host.WoLBroadcastIP, t.Host.WoLPort) }
+
+ if t.Latest != nil && t.Latest.FailedStage != "" { +
+
Failed at
+
{ t.Latest.FailedStage }
+
+ } + if t.SpecDiffCritical > 0 { +
+
Spec diffs
+
{ fmt.Sprintf("%d critical", t.SpecDiffCritical) }
+
+ } +
+ if t.Latest != nil && t.Latest.State == model.StateFailedHolding && t.Latest.HoldIP != "" { +
+
Host is holding — SSH available
+ { sshInvocation(t.HoldKeyPath, t.Latest.HoldIP) } +
+ } + if t.Latest != nil { +
+ } +
+ if canStart(t.Latest) { +
+ +
+ } else { + + } + if canOverrideWipe(t.Latest) { +
+ +
+ } + if hasReport(t.Latest) { + View report + } +
+ +
+
+
+} + +func canOverrideWipe(r *model.Run) bool { + if r == nil { + return false + } + return r.State == model.StateFailedHolding && r.FailedStage == "Storage" +} + +// hasReport is true once the reporting stage has produced an HTML +// artifact. We cheat slightly: Completed runs always have one, and +// that's the only state in which the tile wants to surface a link. +func hasReport(r *model.Run) bool { + return r != nil && r.State == model.StateCompleted +} + +func canStart(r *model.Run) bool { + if r == nil { + return true + } + switch r.State { + case model.StateCompleted, model.StateReleased, model.StateFailedHolding: + return true + } + return false +} + +func tileStatus(r *model.Run) string { + if r == nil { + return "Idle" + } + return string(r.State) +} + +func tileMood(r *model.Run) string { + if r == nil { + return "idle" + } + switch r.State { + case model.StateCompleted: + return "pass" + case model.StateFailed, model.StateFailedHolding: + return "fail" + case model.StateReleased: + return "idle" + } + return "active" +} + +func sshInvocation(keyPath, ip string) string { + if keyPath == "" { + return "ssh root@" + ip + " (hold key not yet recorded)" + } + return fmt.Sprintf("ssh -i %s root@%s", keyPath, ip) +} + +// RenderTileString renders a single tile fragment so the orchestrator +// can publish it over SSE without threading a context through every +// event publisher. +func RenderTileString(t TileData) string { + var buf bytes.Buffer + _ = HostTile(t).Render(context.Background(), &buf) + return buf.String() +} diff --git a/internal/web/templates/host_tile_templ.go b/internal/web/templates/host_tile_templ.go new file mode 100644 index 0000000..f8cb765 --- /dev/null +++ b/internal/web/templates/host_tile_templ.go @@ -0,0 +1,385 @@ +// Code generated by templ - DO NOT EDIT. + +// templ: version: v0.3.1001 +package templates + +//lint:file-ignore SA4006 This context is only used if a nested component is present. 
+ +import "github.com/a-h/templ" +import templruntime "github.com/a-h/templ/runtime" + +import ( + "bytes" + "context" + "fmt" + "vetting/internal/model" +) + +// HostTile renders a single dashboard card. It's the SSE-swap target +// for per-host tile refreshes (`tile-N`) and contains a per-run log +// pane (`log-M`) whose live tail is appended by the events hub. +func HostTile(t TileData) templ.Component { + return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) { + templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context + if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil { + return templ_7745c5c3_CtxErr + } + templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W) + if !templ_7745c5c3_IsBuffer { + defer func() { + templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer) + if templ_7745c5c3_Err == nil { + templ_7745c5c3_Err = templ_7745c5c3_BufErr + } + }() + } + ctx = templ.InitializeContext(ctx) + templ_7745c5c3_Var1 := templ.GetChildren(ctx) + if templ_7745c5c3_Var1 == nil { + templ_7745c5c3_Var1 = templ.NopComponent + } + ctx = templ.ClearChildren(ctx) + var templ_7745c5c3_Var2 = []any{"tile", "tile-" + tileMood(t.Latest)} + templ_7745c5c3_Err = templ.RenderCSSItems(ctx, templ_7745c5c3_Buffer, templ_7745c5c3_Var2...) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var6 string + templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(t.Host.Name) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 21, Col: 39} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var7 string + templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(tileStatus(t.Latest)) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 22, Col: 50} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 6, "
MAC
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var8 string + templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(t.Host.MAC) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 27, Col: 20} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 7, "
WoL
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var9 string + templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%s:%d", t.Host.WoLBroadcastIP, t.Host.WoLPort)) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 31, Col: 69} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 8, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + if t.Latest != nil && t.Latest.FailedStage != "" { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 9, "
Failed at
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var10 string + templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(t.Latest.FailedStage) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 36, Col: 31} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 10, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } + if t.SpecDiffCritical > 0 { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 11, "
Spec diffs
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var11 string + templ_7745c5c3_Var11, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d critical", t.SpecDiffCritical)) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 42, Col: 69} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var11)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 12, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 13, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + if t.Latest != nil && t.Latest.State == model.StateFailedHolding && t.Latest.HoldIP != "" { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 14, "
Host is holding — SSH available
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var12 string + templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(sshInvocation(t.HoldKeyPath, t.Latest.HoldIP)) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 49, Col: 74} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 15, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } + if t.Latest != nil { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 16, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 19, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + if canStart(t.Latest) { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 20, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } else { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 22, " ") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } + if canOverrideWipe(t.Latest) { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 23, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } + if hasReport(t.Latest) { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 25, "View report") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 27, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + return nil + }) +} + +func canOverrideWipe(r *model.Run) bool { + if r == nil { + return false + } + return r.State == model.StateFailedHolding && r.FailedStage == "Storage" +} + +// hasReport is true once the reporting stage has produced an HTML +// artifact. We cheat slightly: Completed runs always have one, and +// that's the only state in which the tile wants to surface a link. +func hasReport(r *model.Run) bool { + return r != nil && r.State == model.StateCompleted +} + +func canStart(r *model.Run) bool { + if r == nil { + return true + } + switch r.State { + case model.StateCompleted, model.StateReleased, model.StateFailedHolding: + return true + } + return false +} + +func tileStatus(r *model.Run) string { + if r == nil { + return "Idle" + } + return string(r.State) +} + +func tileMood(r *model.Run) string { + if r == nil { + return "idle" + } + switch r.State { + case model.StateCompleted: + return "pass" + case model.StateFailed, model.StateFailedHolding: + return "fail" + case model.StateReleased: + return "idle" + } + return "active" +} + +func sshInvocation(keyPath, ip string) string { + if keyPath == "" { + return "ssh root@" + ip + " (hold key not yet recorded)" + } + return fmt.Sprintf("ssh -i %s root@%s", keyPath, ip) +} + +// RenderTileString renders a single tile fragment so the orchestrator +// can publish it over SSE without threading a context through every +// event publisher. +func RenderTileString(t TileData) string { + var buf bytes.Buffer + _ = HostTile(t).Render(context.Background(), &buf) + return buf.String() +} + +var _ = templruntime.GeneratedTemplate diff --git a/internal/web/templates/layout.templ b/internal/web/templates/layout.templ new file mode 100644 index 0000000..aa36f7e --- /dev/null +++ b/internal/web/templates/layout.templ @@ -0,0 +1,50 @@ +package templates + +templ Layout(title string) { + + + + + + { title } — Vetting + + + + + +
+
Vetting
+ +
+ · +
+ +
+
+
+
+ { children... } +
+ + +} + +templ BareLayout(title string) { + + + + + + { title } — Vetting + + + +
+ { children... } +
+ + +} diff --git a/internal/web/templates/layout_templ.go b/internal/web/templates/layout_templ.go new file mode 100644 index 0000000..bf4ac34 --- /dev/null +++ b/internal/web/templates/layout_templ.go @@ -0,0 +1,111 @@ +// Code generated by templ - DO NOT EDIT. + +// templ: version: v0.3.1001 +package templates + +//lint:file-ignore SA4006 This context is only used if a nested component is present. + +import "github.com/a-h/templ" +import templruntime "github.com/a-h/templ/runtime" + +func Layout(title string) templ.Component { + return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) { + templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context + if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil { + return templ_7745c5c3_CtxErr + } + templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W) + if !templ_7745c5c3_IsBuffer { + defer func() { + templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer) + if templ_7745c5c3_Err == nil { + templ_7745c5c3_Err = templ_7745c5c3_BufErr + } + }() + } + ctx = templ.InitializeContext(ctx) + templ_7745c5c3_Var1 := templ.GetChildren(ctx) + if templ_7745c5c3_Var1 == nil { + templ_7745c5c3_Var1 = templ.NopComponent + } + ctx = templ.ClearChildren(ctx) + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var2 string + templ_7745c5c3_Var2, templ_7745c5c3_Err = templ.JoinStringErrs(title) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/layout.templ`, Line: 9, Col: 17} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var2)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = 
templruntime.WriteString(templ_7745c5c3_Buffer, 2, " — Vetting
Vetting
·
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templ_7745c5c3_Var1.Render(ctx, templ_7745c5c3_Buffer) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + return nil + }) +} + +func BareLayout(title string) templ.Component { + return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) { + templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context + if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil { + return templ_7745c5c3_CtxErr + } + templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W) + if !templ_7745c5c3_IsBuffer { + defer func() { + templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer) + if templ_7745c5c3_Err == nil { + templ_7745c5c3_Err = templ_7745c5c3_BufErr + } + }() + } + ctx = templ.InitializeContext(ctx) + templ_7745c5c3_Var3 := templ.GetChildren(ctx) + if templ_7745c5c3_Var3 == nil { + templ_7745c5c3_Var3 = templ.NopComponent + } + ctx = templ.ClearChildren(ctx) + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var4 string + templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(title) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/layout.templ`, Line: 41, Col: 17} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 5, " — Vetting
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templ_7745c5c3_Var3.Render(ctx, templ_7745c5c3_Buffer) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 6, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + return nil + }) +} + +var _ = templruntime.GeneratedTemplate diff --git a/internal/web/templates/login.templ b/internal/web/templates/login.templ new file mode 100644 index 0000000..8dbd3d4 --- /dev/null +++ b/internal/web/templates/login.templ @@ -0,0 +1,20 @@ +package templates + +templ Login(errMsg, next string) { + @BareLayout("Sign in") { + + } +} diff --git a/internal/web/templates/login_templ.go b/internal/web/templates/login_templ.go new file mode 100644 index 0000000..046d1eb --- /dev/null +++ b/internal/web/templates/login_templ.go @@ -0,0 +1,94 @@ +// Code generated by templ - DO NOT EDIT. + +// templ: version: v0.3.1001 +package templates + +//lint:file-ignore SA4006 This context is only used if a nested component is present. + +import "github.com/a-h/templ" +import templruntime "github.com/a-h/templ/runtime" + +func Login(errMsg, next string) templ.Component { + return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) { + templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context + if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil { + return templ_7745c5c3_CtxErr + } + templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W) + if !templ_7745c5c3_IsBuffer { + defer func() { + templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer) + if templ_7745c5c3_Err == nil { + templ_7745c5c3_Err = templ_7745c5c3_BufErr + } + }() + } + ctx = templ.InitializeContext(ctx) + templ_7745c5c3_Var1 := templ.GetChildren(ctx) + if templ_7745c5c3_Var1 == nil { + templ_7745c5c3_Var1 = templ.NopComponent + } + ctx = templ.ClearChildren(ctx) + templ_7745c5c3_Var2 := templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) { + templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, 
templ_7745c5c3_Input.Context + templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W) + if !templ_7745c5c3_IsBuffer { + defer func() { + templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer) + if templ_7745c5c3_Err == nil { + templ_7745c5c3_Err = templ_7745c5c3_BufErr + } + }() + } + ctx = templ.InitializeContext(ctx) + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "

Vetting

") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + if errMsg != "" { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 2, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var3 string + templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(errMsg) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/login.templ`, Line: 8, Col: 31} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + return nil + }) + templ_7745c5c3_Err = BareLayout("Sign in").Render(templ.WithChildren(ctx, templ_7745c5c3_Var2), templ_7745c5c3_Buffer) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + return nil + }) +} + +var _ = templruntime.GeneratedTemplate diff --git a/internal/web/templates/registration.templ b/internal/web/templates/registration.templ new file mode 100644 index 0000000..414dd18 --- /dev/null +++ b/internal/web/templates/registration.templ @@ -0,0 +1,61 @@ +package templates + +type RegistrationForm struct { + Name string + MAC string + WoLBroadcastIP string + WoLPort string + ExpectedSpecYAML string + Notes string + Error string +} + +templ Registration(form RegistrationForm) { + @Layout("Register host") { +
+

Register host

+ if form.Error != "" { +
{ form.Error }
+ } +
+ + +
+ + +
+ + +
+ + Cancel +
+
+
+ } +} + +func defaultPort(v string) string { + if v == "" { + return "9" + } + return v +} diff --git a/internal/web/templates/registration_templ.go b/internal/web/templates/registration_templ.go new file mode 100644 index 0000000..78db794 --- /dev/null +++ b/internal/web/templates/registration_templ.go @@ -0,0 +1,176 @@ +// Code generated by templ - DO NOT EDIT. + +// templ: version: v0.3.1001 +package templates + +//lint:file-ignore SA4006 This context is only used if a nested component is present. + +import "github.com/a-h/templ" +import templruntime "github.com/a-h/templ/runtime" + +type RegistrationForm struct { + Name string + MAC string + WoLBroadcastIP string + WoLPort string + ExpectedSpecYAML string + Notes string + Error string +} + +func Registration(form RegistrationForm) templ.Component { + return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) { + templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context + if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil { + return templ_7745c5c3_CtxErr + } + templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W) + if !templ_7745c5c3_IsBuffer { + defer func() { + templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer) + if templ_7745c5c3_Err == nil { + templ_7745c5c3_Err = templ_7745c5c3_BufErr + } + }() + } + ctx = templ.InitializeContext(ctx) + templ_7745c5c3_Var1 := templ.GetChildren(ctx) + if templ_7745c5c3_Var1 == nil { + templ_7745c5c3_Var1 = templ.NopComponent + } + ctx = templ.ClearChildren(ctx) + templ_7745c5c3_Var2 := templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) { + templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context + templ_7745c5c3_Buffer, templ_7745c5c3_IsBuffer := templruntime.GetBuffer(templ_7745c5c3_W) + if 
!templ_7745c5c3_IsBuffer { + defer func() { + templ_7745c5c3_BufErr := templruntime.ReleaseBuffer(templ_7745c5c3_Buffer) + if templ_7745c5c3_Err == nil { + templ_7745c5c3_Err = templ_7745c5c3_BufErr + } + }() + } + ctx = templ.InitializeContext(ctx) + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 1, "

Register host

") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + if form.Error != "" { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 2, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var3 string + templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(form.Error) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 18, Col: 35} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 3, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 4, "
Cancel
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + return nil + }) + templ_7745c5c3_Err = Layout("Register host").Render(templ.WithChildren(ctx, templ_7745c5c3_Var2), templ_7745c5c3_Buffer) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + return nil + }) +} + +func defaultPort(v string) string { + if v == "" { + return "9" + } + return v +} + +var _ = templruntime.GeneratedTemplate diff --git a/live-image/Makefile b/live-image/Makefile new file mode 100644 index 0000000..0508fbb --- /dev/null +++ b/live-image/Makefile @@ -0,0 +1,32 @@ +# live-image/Makefile — builds the Debian live image that PXE-booted +# hosts land in. Requires a Linux host (or WSL) with mkosi installed. +# On native Windows this Makefile short-circuits with a clear message. + +ifeq ($(OS),Windows_NT) +UNAME_S := Windows +else +UNAME_S := $(shell uname -s) +endif + +REPO_ROOT := $(abspath ..) +AGENT_BIN := $(REPO_ROOT)/bin/vetting-agent.linux-amd64 + +.PHONY: all check-linux agent clean +all: check-linux agent + mkosi --force build + +agent: $(AGENT_BIN) + +$(AGENT_BIN): + cd $(REPO_ROOT) && GOOS=linux GOARCH=amd64 go build -o $(AGENT_BIN) ./cmd/vetting-agent + +check-linux: +ifneq ($(UNAME_S),Linux) + @echo "ERROR: live-image must be built on Linux (you're on $(UNAME_S))." + @echo "Run 'wsl make -C live-image all' from Windows instead." + @exit 1 +endif + @command -v mkosi >/dev/null 2>&1 || { echo "ERROR: mkosi not installed. Try: apt install mkosi"; exit 1; } + +clean: + rm -rf build mkosi.output mkosi.cache diff --git a/live-image/README.md b/live-image/README.md new file mode 100644 index 0000000..e6985e3 --- /dev/null +++ b/live-image/README.md @@ -0,0 +1,36 @@ +# Vetting live image + +Debian-based Linux live image that PXE-booted hosts drop into. Runs the +`vetting-agent` binary under systemd and reaches back to the orchestrator +over HTTP+SSE. + +## Building + +Must be built on Linux (or WSL). 
On Windows: + +```sh +wsl make -C live-image all +``` + +On Linux: + +```sh +make -C live-image all +``` + +This produces `live-image/build/vmlinuz` and `live-image/build/initrd.img`. +Copy (or symlink) them into the directory configured as `pxe.live_dir` in +`deploy/vetting.yaml`; the orchestrator serves them at `/live/*`. + +## iPXE binaries + +The dnsmasq supervisor expects `ipxe.efi` and `undionly.kpxe` to live in +`pxe.tftp_root`. Fetch the latest release binaries from +https://boot.ipxe.org and drop them in that directory. The Makefile does +not download them automatically so their SHA256 can be operator-verified. + +## WSL prerequisites (Windows dev) + +```sh +sudo apt install mkosi debootstrap squashfs-tools dosfstools +``` diff --git a/live-image/mkosi.conf b/live-image/mkosi.conf new file mode 100644 index 0000000..8ad1098 --- /dev/null +++ b/live-image/mkosi.conf @@ -0,0 +1,40 @@ +# Vetting live image (Phase 2 skeleton). +# +# Produces a Debian-based rootfs plus a kernel image, ready to be served +# over HTTP to iPXE. Note: Format=directory (below) emits the rootfs as a +# plain directory tree; squashfs packaging is not configured in this file. +# The image is deliberately small: only what the agent needs to run +# Phase 2 (the Hello / Claim / Heartbeat loop). Phase 4+ adds smartctl, +# stress-ng, fio, iperf3, etc. + +[Distribution] +Distribution=debian +Release=bookworm +Repositories=main + +[Output] +Format=directory +Output=build + +[Content] +Bootable=yes +BuildPackages= +Packages= + systemd + systemd-sysv + udev + linux-image-amd64 + live-boot + iproute2 + iputils-ping + openssh-server + ca-certificates + curl + dmidecode + pciutils + usbutils + +# Phase 4 will add: smartmontools stress-ng fio iperf3 lshw lm-sensors + +[Host] +# Copy the prebuilt Go agent in from the repo root via postinst. 
diff --git a/live-image/mkosi.postinst b/live-image/mkosi.postinst
new file mode 100644
index 0000000..09e5e18
--- /dev/null
+++ b/live-image/mkosi.postinst
@@ -0,0 +1,34 @@
+#!/bin/sh
+# mkosi postinst: install the vetting-agent binary and its systemd unit
+# into the image. The binary must already be built for linux-amd64 at
+# repo root under bin/vetting-agent.linux-amd64 (the top-level Makefile
+# does this via `make agent-linux`).
+set -eu
+
+# BUILDROOT (the image root being populated) is expected from mkosi; fail
+# with a clear message rather than a bare "parameter not set".
+: "${BUILDROOT:?BUILDROOT is not set; run this script via mkosi}"
+
+# SRCDIR is taken to be the live-image/ directory (the unit path below
+# only resolves there). Fall back to the current directory when unset.
+SRC_DIR="${SRCDIR:-.}"
+
+# The agent binary lives at <repo root>/bin, i.e. one level above
+# live-image/ (matches AGENT_BIN in live-image/Makefile).
+AGENT_BIN="$SRC_DIR/../bin/vetting-agent.linux-amd64"
+UNIT_SRC="$SRC_DIR/mkosi.skeleton/etc/systemd/system/vetting-agent.service"
+UNIT_DIR="$BUILDROOT/etc/systemd/system"
+
+if [ ! -f "$AGENT_BIN" ]; then
+    echo "ERROR: agent binary not found at $AGENT_BIN" >&2
+    exit 1
+fi
+
+install -D -m 0755 "$AGENT_BIN" "$BUILDROOT/usr/local/sbin/vetting-agent"
+install -D -m 0644 "$UNIT_SRC" "$UNIT_DIR/vetting-agent.service"
+
+# Enable the unit at boot. Create the .wants directory first: ln does
+# not create missing parent directories.
+mkdir -p "$UNIT_DIR/multi-user.target.wants"
+ln -sf /etc/systemd/system/vetting-agent.service \
+    "$UNIT_DIR/multi-user.target.wants/vetting-agent.service"
diff --git a/live-image/mkosi.skeleton/etc/systemd/system/vetting-agent.service b/live-image/mkosi.skeleton/etc/systemd/system/vetting-agent.service
new file mode 100644
index 0000000..d4a2a14
--- /dev/null
+++ b/live-image/mkosi.skeleton/etc/systemd/system/vetting-agent.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=Vetting hardware-validation agent
+# Wait until networking is minimally up (the agent itself retries
+# dial failures, but no point hammering before DHCP finishes).
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+ExecStart=/usr/local/sbin/vetting-agent
+Restart=on-failure
+RestartSec=5s
+# The agent reads /proc/cmdline; it needs no extra env.
+StandardOutput=journal+console +StandardError=journal+console + +[Install] +WantedBy=multi-user.target diff --git a/test/e2e/qemu_test.go b/test/e2e/qemu_test.go new file mode 100644 index 0000000..52a42a8 --- /dev/null +++ b/test/e2e/qemu_test.go @@ -0,0 +1,225 @@ +//go:build e2e + +// Package e2e exercises the orchestrator end-to-end against a real QEMU +// VM PXE-booting from the orchestrator-supervised dnsmasq into the +// mkosi-built live image. +// +// This test is gated behind the `e2e` build tag because: +// - it requires root (for bridge + qemu-system-x86_64 network setup), +// - it needs a pre-built live image at live-image/out/{vmlinuz,initrd.img}, +// - it only runs on Linux (mkosi + qemu-kvm). +// +// Run with: +// +// sudo go test -tags=e2e -run TestQEMUFullRun ./test/e2e/... +// +// See docs/operations.md for the manual QEMU invocation equivalent. +package e2e + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "testing" + "time" +) + +// Tunables — overridable via env for CI, defaults match the manual +// setup documented in docs/operations.md. +var ( + bridgeName = envOr("VETTING_E2E_BRIDGE", "br-vetting") + liveKernel = envOr("VETTING_E2E_KERNEL", "live-image/out/vmlinuz") + liveInitrd = envOr("VETTING_E2E_INITRD", "live-image/out/initrd.img") + testMAC = envOr("VETTING_E2E_MAC", "52:54:00:12:34:56") + publicURL = envOr("VETTING_E2E_URL", "http://10.77.0.1:8080") + // Overall budget for the run to reach Completed. Stage timeouts in + // the config should be tuned down for E2E to well under this. + runBudget = 10 * time.Minute +) + +func envOr(k, d string) string { + if v := os.Getenv(k); v != "" { + return v + } + return d +} + +// TestQEMUFullRun boots a QEMU VM against a running orchestrator and +// waits for the Run state to reach Completed. 
+// +// Preconditions (test skips unless all are true): +// - Linux host +// - Running as root (bridge networking + qemu-kvm) +// - `qemu-system-x86_64` on PATH +// - Live image built (kernel + initrd exist) +// - An orchestrator is already running at $VETTING_E2E_URL with a +// host registered for $VETTING_E2E_MAC and a run already queued +// (start the run via the UI before invoking this test, or via the +// orchestrator's /hosts/{id}/start endpoint). +// +// The test exercises the real PXE path. It does NOT embed its own +// orchestrator because dnsmasq needs CAP_NET_ADMIN and the test binary +// should stay focused on the "did the run complete?" assertion. +func TestQEMUFullRun(t *testing.T) { + if runtime.GOOS != "linux" { + t.Skip("E2E test requires Linux") + } + if os.Geteuid() != 0 { + t.Skip("E2E test requires root (sudo go test -tags=e2e ...)") + } + if _, err := exec.LookPath("qemu-system-x86_64"); err != nil { + t.Skip("qemu-system-x86_64 not on PATH") + } + if _, err := os.Stat(liveKernel); err != nil { + t.Skipf("live kernel missing at %s (run `make live-image`)", liveKernel) + } + if _, err := os.Stat(liveInitrd); err != nil { + t.Skipf("live initrd missing at %s", liveInitrd) + } + if err := pingOrchestrator(publicURL); err != nil { + t.Skipf("orchestrator not reachable at %s: %v", publicURL, err) + } + + runID, err := findQueuedRunForMAC(publicURL, testMAC) + if err != nil { + t.Fatalf("no queued run for %s: %v (register the host and click Start Vetting first)", testMAC, err) + } + t.Logf("driving run %d for MAC %s", runID, testMAC) + + disk, cleanup := makeThrowawayDisk(t) + defer cleanup() + + qemuCtx, cancel := context.WithTimeout(context.Background(), runBudget) + defer cancel() + + cmd := exec.CommandContext(qemuCtx, "qemu-system-x86_64", + "-enable-kvm", "-cpu", "host", "-smp", "4", "-m", "4096", + "-netdev", "bridge,id=n0,br="+bridgeName, + "-device", "virtio-net-pci,netdev=n0,mac="+testMAC, + "-drive", "file="+disk+",format=raw,if=virtio", 
+ "-boot", "n", "-serial", "file:"+filepath.Join(os.TempDir(), fmt.Sprintf("vetting-e2e-%d.serial", runID)), + "-display", "none", + ) + cmd.Stdout = testLogger{t} + cmd.Stderr = testLogger{t} + if err := cmd.Start(); err != nil { + t.Fatalf("start qemu: %v", err) + } + defer func() { + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + + // Poll the orchestrator until the run reaches a terminal state. + poll := time.NewTicker(5 * time.Second) + defer poll.Stop() + for { + select { + case <-qemuCtx.Done(): + t.Fatalf("run %d did not complete within %s", runID, runBudget) + case <-poll.C: + state, err := getRunState(publicURL, runID) + if err != nil { + t.Logf("poll state: %v (will retry)", err) + continue + } + t.Logf("run %d state = %s", runID, state) + switch state { + case "Completed": + return // green path + case "FailedHolding", "Failed", "Released": + t.Fatalf("run %d ended in non-success state %q", runID, state) + } + } + } +} + +// ---- helpers ------------------------------------------------------------ + +func pingOrchestrator(url string) error { + req, err := http.NewRequest(http.MethodGet, url+"/login", nil) + if err != nil { + return err + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode >= 500 { + return fmt.Errorf("status %d", resp.StatusCode) + } + return nil +} + +// findQueuedRunForMAC hits a hypothetical /api/v1/runs?mac=... debug +// endpoint. Since Phase 6 doesn't add that endpoint (orchestrator stays +// browser-session-gated for UI routes), we fall back to requiring the +// caller to set VETTING_E2E_RUN_ID if the orchestrator hasn't been +// extended with a debug listing. This is a pragmatic hack — the E2E +// harness is developer-facing and the alternative would be scraping +// HTML. 
func findQueuedRunForMAC(baseURL, mac string) (int64, error) {
	// baseURL and mac are unused until a lookup API exists; the run ID
	// comes solely from VETTING_E2E_RUN_ID.
	raw := strings.TrimSpace(os.Getenv("VETTING_E2E_RUN_ID"))
	if raw == "" {
		return 0, fmt.Errorf("set VETTING_E2E_RUN_ID (no debug API for MAC lookup yet)")
	}
	// Sscanf("%d") stops at the first non-digit, so "12abc" would be
	// silently accepted as 12. Insist on digits only.
	if strings.Trim(raw, "0123456789") != "" {
		return 0, fmt.Errorf("VETTING_E2E_RUN_ID %q is not a run ID (want decimal digits only)", raw)
	}
	var id int64
	if _, err := fmt.Sscanf(raw, "%d", &id); err != nil {
		return 0, fmt.Errorf("parse VETTING_E2E_RUN_ID %q: %w", raw, err)
	}
	return id, nil
}

// reportClient is the HTTP client used by getRunState. It has a request
// timeout so a stalled orchestrator cannot block the poll loop, and it
// does not follow redirects so a redirect to another page is never
// mistaken for a 200 from the report route itself.
var reportClient = &http.Client{
	Timeout: 10 * time.Second,
	CheckRedirect: func(*http.Request, []*http.Request) error {
		return http.ErrUseLastResponse
	},
}

// getRunState reads the run's current state via the report route's
// fall-through: /reports/{id} returns 404 until Completed, which gives
// us a cheap terminal-check without a JSON API. For intermediate
// states we need a debug endpoint — deliberately left as a TODO so
// the test doesn't depend on an API surface that isn't stable.
//
// It returns "Completed" for a 200, "InProgress" for a 404, and an
// error for transport failures or any other status. When
// VETTING_E2E_COOKIE is set its value is sent verbatim as the Cookie
// header (so it must be in name=value form) to get past the session
// gate. A redirect is reported as "auth required".
// NOTE(review): that assumes a session-gated route answers with a
// redirect to the login page — confirm against the orchestrator.
func getRunState(baseURL string, runID int64) (string, error) {
	req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("%s/reports/%d", baseURL, runID), nil)
	if err != nil {
		return "", err
	}
	if cookie := os.Getenv("VETTING_E2E_COOKIE"); cookie != "" {
		req.Header.Set("Cookie", cookie)
	}

	resp, err := reportClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	// Drain the body so the underlying connection can be reused.
	_, _ = io.Copy(io.Discard, resp.Body)

	switch code := resp.StatusCode; {
	case code == http.StatusOK:
		// Proxy: a 200 from /reports/{id} means the run is Completed.
		return "Completed", nil
	case code == http.StatusUnauthorized || code == http.StatusForbidden:
		return "", fmt.Errorf("auth required; set VETTING_E2E_COOKIE")
	case code >= 300 && code < 400:
		return "", fmt.Errorf("auth required (redirected with status %d); set VETTING_E2E_COOKIE", code)
	case code == http.StatusNotFound:
		return "InProgress", nil
	default:
		return "", fmt.Errorf("unexpected %d", code)
	}
}

// makeThrowawayDisk creates a sparse 4 GiB raw disk image inside the
// test's temporary directory and returns its path together with a
// cleanup function that removes the file. The image is created with the
// standard library (a raw image is just a file of the requested size),
// so the helper does not depend on qemu-img being installed. Any
// failure aborts the test via t.Fatalf.
func makeThrowawayDisk(t *testing.T) (string, func()) {
	t.Helper()
	const diskSize = 4 << 30 // 4 GiB, same size the "4G" argument produced

	path := filepath.Join(t.TempDir(), "test-disk.img")
	f, err := os.Create(path)
	if err != nil {
		t.Fatalf("create disk image: %v", err)
	}
	if err := f.Truncate(diskSize); err != nil {
		_ = f.Close() // already failing; the truncate error is the one to report
		t.Fatalf("size disk image %s: %v", path, err)
	}
	if err := f.Close(); err != nil {
		t.Fatalf("close disk image %s: %v", path, err)
	}
	return path, func() { _ = os.Remove(path) }
}

// testLogger lets exec.Cmd write into the test's log stream so QEMU's
// stderr shows up with the test name, not as an orphaned blob.
+type testLogger struct{ t *testing.T } + +func (w testLogger) Write(p []byte) (int, error) { + w.t.Logf("qemu: %s", strings.TrimRight(string(p), "\r\n")) + return len(p), nil +} + +// Compile-time reminder: json is imported so future expansions can +// parse the orchestrator's response bodies when a debug API lands. +var _ = json.Marshal diff --git a/tools/gen-admin-password/main.go b/tools/gen-admin-password/main.go new file mode 100644 index 0000000..d4f3f3e --- /dev/null +++ b/tools/gen-admin-password/main.go @@ -0,0 +1,21 @@ +package main + +import ( + "fmt" + "os" + + "vetting/internal/auth" +) + +func main() { + if len(os.Args) != 2 { + fmt.Fprintln(os.Stderr, "usage: gen-admin-password ") + os.Exit(2) + } + hash, err := auth.BcryptHash(os.Args[1]) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + fmt.Println(hash) +}