Initial commit: full Phases 1-6 implementation
CI / Lint + build + test (push) Has been cancelled

Post-repair hardware validation pipeline for Proxmox cluster hosts.
Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq
PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
+136
View File
@@ -0,0 +1,136 @@
#!/usr/bin/env bash
# install.sh — one-shot installer for the vetting orchestrator on a
# Proxmox LXC (or any Debian/Ubuntu host).
#
# What it does:
# 1. apt-installs runtime dependencies (dnsmasq, iperf3, ca-certs).
# 2. Creates the `vetting` system user with /var/lib/vetting homedir.
# 3. Copies the pre-built `vetting` binary into /usr/local/bin.
# 4. Installs the systemd unit into /etc/systemd/system and the example
# config into /etc/vetting (existing configs are preserved).
# 5. Reminds the operator to edit the config and set a bcrypt
# password before enabling the service — we don't auto-start
# because a placeholder password would just refuse to boot.
#
# What it deliberately does NOT do:
# - Build the orchestrator (this script assumes you ran
# `make orchestrator-linux` beforehand so that ../bin/vetting-linux-amd64
# exists relative to this script; otherwise pass --binary to locate it).
# - Install the live image or TFTP payloads — those are separate,
# since most operators want to build them from a pinned CI artifact
# rather than on the LXC itself.
#
# Usage:
# sudo ./install.sh [--binary PATH] [--config-dir /etc/vetting]
#
# Strict mode: abort on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Values the operator may override on the command line
# (--binary / --config-dir). An empty BINARY means "auto-detect".
BINARY=''
CONFIG_DIR='/etc/vetting'

# Fixed install locations and the account the service runs as.
STATE_DIR='/var/lib/vetting'
LOG_DIR='/var/log/vetting'
SERVICE_USER='vetting'
#######################################
# Print the command-line help text.
# Globals:   none (reads $0 for the program name)
# Arguments: none
# Outputs:   the help text on stdout; callers redirect as needed
#######################################
usage() {
  printf '%s\n' \
    "Usage: $0 [--binary PATH] [--config-dir DIR]" \
    '--binary PATH Path to a pre-built vetting binary (default:' \
    'auto-detect ../bin/vetting-linux-amd64 relative to' \
    'this script).' \
    '--config-dir DIR Where to install vetting.yaml + systemd unit drop' \
    '(default: /etc/vetting).' \
    '-h, --help Print this message.'
}
#######################################
# Parse command-line options into the BINARY and CONFIG_DIR globals.
# Globals:   BINARY (written), CONFIG_DIR (written)
# Arguments: the script's own arguments ("$@")
# Outputs:   help text on stdout for -h/--help; diagnostics on stderr
# Exits:     0 after printing help, 2 on an unknown option or on an
#            option that is missing its value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --binary | --config-dir)
        # Both options take a value. Check for it explicitly: under
        # `set -u` a bare "$2" would abort with bash's "unbound variable"
        # message instead of telling the operator what went wrong.
        if [[ $# -lt 2 ]]; then
          echo "option $1 requires a value" >&2
          usage
          exit 2
        fi
        if [[ "$1" == "--binary" ]]; then
          BINARY="$2"
        else
          CONFIG_DIR="$2"
        fi
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "unknown arg: $1" >&2
        usage
        exit 2
        ;;
    esac
  done
}
parse_args "$@"
# Refuse to continue unless running with an effective UID of 0.
if (( EUID != 0 )); then
  printf '%s\n' "install.sh must be run as root (try: sudo $0)" >&2
  exit 1
fi
# Resolve the directory holding this script and its parent (the repo
# root), both as absolute paths, so lookups work from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# No --binary given: take the first candidate that is an executable
# regular file. `-x` alone is not enough — it is also true for any
# searchable directory, which would pass validation here and only fail
# later at the `install` step.
if [[ -z "${BINARY}" ]]; then
  for cand in \
    "${REPO_ROOT}/bin/vetting-linux-amd64" \
    "${REPO_ROOT}/bin/vetting" \
    "${SCRIPT_DIR}/vetting"; do
    if [[ -f "${cand}" && -x "${cand}" ]]; then
      BINARY="${cand}"
      break
    fi
  done
fi

# Validate whichever path we ended up with (auto-detected or passed via
# --binary) before touching the system.
if [[ -z "${BINARY}" || ! -f "${BINARY}" || ! -x "${BINARY}" ]]; then
  echo "could not find a vetting binary to install; pass --binary PATH or run 'make orchestrator-linux' first" >&2
  exit 1
fi
echo "==> installing runtime dependencies"
# Keep apt from prompting; this script is meant to run unattended.
export DEBIAN_FRONTEND=noninteractive
runtime_pkgs=(ca-certificates dnsmasq iperf3)
apt-get update -qq
apt-get install -y --no-install-recommends "${runtime_pkgs[@]}"
echo "==> creating ${SERVICE_USER} user"
# Only create the account when it is missing, so re-running the
# installer leaves an existing user untouched.
if id -u "${SERVICE_USER}" >/dev/null 2>&1; then
  : # account already present
else
  # -r: system account, -d: home directory, -s: login shell.
  useradd -r -d "${STATE_DIR}" -s /usr/sbin/nologin "${SERVICE_USER}"
fi
echo "==> preparing directories"
# State and log directories belong to the service account; the config
# directory keeps install(1)'s default ownership.
for owned_dir in "${STATE_DIR}" "${LOG_DIR}"; do
  install -d -m 0755 -o "${SERVICE_USER}" -g "${SERVICE_USER}" "${owned_dir}"
done
install -d -m 0755 "${CONFIG_DIR}"

echo "==> installing binary"
install -m 0755 "${BINARY}" /usr/local/bin/vetting
echo "==> installing config and systemd unit"
# Never overwrite an existing config: it carries the operator's password
# hash and session secret. A fresh install gets the example, readable by
# root and the service group only.
if [[ ! -f "${CONFIG_DIR}/vetting.yaml" ]]; then
  install -m 0640 -o root -g "${SERVICE_USER}" \
    "${SCRIPT_DIR}/vetting.example.yaml" \
    "${CONFIG_DIR}/vetting.yaml"
  echo " -> installed default config at ${CONFIG_DIR}/vetting.yaml"
else
  echo " -> preserving existing ${CONFIG_DIR}/vetting.yaml"
fi
install -m 0644 "${SCRIPT_DIR}/vetting.service" /etc/systemd/system/vetting.service

# The shipped unit starts the service with
# `--config /etc/vetting/vetting.yaml`. When --config-dir points
# elsewhere, override ExecStart from a drop-in so the service reads the
# config that was just installed; otherwise it would start against a
# path that does not exist. The unit file itself stays pristine.
# Limitation: the path is written inside double quotes without further
# escaping, so directories containing `"`, `\`, `%` or `$` are not
# supported.
dropin_dir="/etc/systemd/system/vetting.service.d"
dropin_file="${dropin_dir}/10-config-dir.conf"
# systemd needs an absolute path; CONFIG_DIR may have been given
# relative to the current directory. The directory already exists here.
config_dir_abs="$(cd "${CONFIG_DIR}" && pwd)"
if [[ "${config_dir_abs}" != "/etc/vetting" ]]; then
  install -d -m 0755 "${dropin_dir}"
  # The empty ExecStart= clears the unit's command before the
  # replacement is added.
  printf '%s\n' \
    "# Written by install.sh: --config-dir was set to ${config_dir_abs}." \
    "[Service]" \
    "ExecStart=" \
    "ExecStart=/usr/local/bin/vetting --config \"${config_dir_abs}/vetting.yaml\"" \
    > "${dropin_file}"
  chmod 0644 "${dropin_file}"
  echo " -> wrote ${dropin_file} (service will read ${config_dir_abs}/vetting.yaml)"
else
  # Back on the default location: drop any override left by an earlier
  # run with a custom --config-dir.
  rm -f -- "${dropin_file}"
fi
# The orchestrator supervises its own dnsmasq for DHCP/TFTP, so the
# distro-managed unit must not compete for the same sockets. Operators
# who still need the distro dnsmasq can re-enable it later on a disjoint
# listen address.
dnsmasq_unit="dnsmasq"
if systemctl is-enabled --quiet "${dnsmasq_unit}" 2>/dev/null; then
  printf '%s\n' "==> disabling distro dnsmasq (orchestrator supervises its own)"
  systemctl disable --now "${dnsmasq_unit}"
fi

# Make systemd pick up the unit file installed above.
systemctl daemon-reload
# Final operator instructions. The service is deliberately left disabled
# until the config has been edited.
printf '%s\n' \
  "vetting is installed but not yet enabled." \
  "Next steps:" \
  "1. Edit ${CONFIG_DIR}/vetting.yaml and set:" \
  "- auth.admin_password_bcrypt (run: vetting gen-admin-password YOURPW)" \
  "- auth.session_secret_hex (run: openssl rand -hex 32)" \
  "- server.public_url (the URL you'll browse to)" \
  "- pxe.* if you want PXE boot support" \
  "- notifiers + routes (optional)" \
  "2. Start the service:" \
  "systemctl enable --now vetting" \
  "3. Watch the logs:" \
  "journalctl -fu vetting"
+89
View File
@@ -0,0 +1,89 @@
# Example configuration for the vetting orchestrator. Copy it to
# vetting.yaml and replace the placeholder credentials under `auth`
# before starting the service.

# HTTP server settings.
server:
# Address the server binds to (loopback only in this example).
bind: "127.0.0.1:8080"
# Base URL the orchestrator is reachable at from the operator's
# browser. Used as the click-through link in notifications, so it
# should be the *external* URL (e.g. https://vetting.lan:8443),
# not the bind address.
public_url: "http://127.0.0.1:8080"
# TLS is off in this example; the certificate and key paths are empty.
tls:
enabled: false
cert_file: ""
key_file: ""
# NOTE(review): the database, artifact and log paths below are relative.
# They presumably resolve against the process's working directory —
# confirm. The shipped systemd unit only permits writes under
# /var/lib/vetting and /var/log/vetting, so a service install should
# point these at (or resolve them into) one of those directories.
database:
path: "./var/vetting.db"
artifacts:
dir: "./var/artifacts"
# Days to keep per-run artifact files (report.html, report.json, fio,
# iperf, inventory.json, hold keys). DB rows are preserved. 0 = forever.
retention_days: 30
logs:
dir: "./var/logs"
# Days to keep per-run log files. 0 = forever.
retention_days: 30
janitor:
# Interval between cleanup sweeps, in minutes. 0 defaults to 60.
interval_minutes: 60
auth:
# bcrypt hash of your admin password. The value below is a placeholder
# and must be replaced with a real hash.
# Generate via: ./bin/gen-admin-password "your-password"
# NOTE(review): install.sh's next-steps text says
# `vetting gen-admin-password YOURPW` instead — confirm which command is
# current.
admin_password_bcrypt: "$2a$10$REPLACE_ME_WITH_A_REAL_BCRYPT_HASH_0123456789abcdefABCDEFxx"
# Random 32-byte hex string used to sign session cookies. The all-zero
# value below is a placeholder; replace it.
# Generate via: openssl rand -hex 32 (or use PowerShell equivalent)
session_secret_hex: "0000000000000000000000000000000000000000000000000000000000000000"
# Presumably the lifetime of a login session, in hours — confirm.
session_ttl_hours: 24
dispatcher:
# Presumably the maximum number of runs executing at once — confirm.
max_concurrent_runs: 3
# PXE boot support; disabled in this example.
# Fields below are populated in later phases and ignored in Phase 1.
# NOTE(review): the "ignored in Phase 1" remark may be stale now that
# later phases have landed — confirm and update.
pxe:
enabled: false
interface: "" # e.g. "eth0"
dhcp_range: "" # e.g. "10.77.0.100,10.77.0.200,12h"
orchestrator_url: "" # e.g. "http://10.77.0.1:8080"
tftp_root: "" # holds ipxe.efi + undionly.kpxe
live_dir: "" # holds vmlinuz + initrd.img; served at /live/*
# Notifications fire on StageFailed, SpecMismatch, HoldingOpened,
# RunCompleted. Declare one or more notifiers and route each event
# kind (and optionally severity) to a notifier by name. Delivery is
# fire-and-forget (one attempt per event, logged on failure).
#
# Example (uncomment and fill in):
#
# notifiers:
# - name: ops-ntfy
# type: ntfy
# server: https://ntfy.sh
# topic: vetting-YOUR-TOPIC
# - name: ops-discord
# type: discord
# webhook_url: https://discord.com/api/webhooks/XXX/YYY
# - name: ops-email
# type: smtp
# smtp:
# host: mail.lan
# port: 25
# from: vetting@lan.local
# to: [ops@lan.local]
#
# routes:
# # Critical events (failures / holds) fire on all three channels.
# - match_severity: [critical]
# notifier: ops-ntfy
# - match_severity: [critical]
# notifier: ops-discord
# - match_severity: [critical]
# notifier: ops-email
# # RunCompleted is informational — push to ntfy only.
# - match_kind: [RunCompleted]
# notifier: ops-ntfy
# Empty lists: no notifiers or routes are configured in this example.
notifiers: []
routes: []
+53
View File
@@ -0,0 +1,53 @@
# systemd unit for the vetting orchestrator. Runs the binary as the
# unprivileged `vetting` user with a small capability set and a mostly
# read-only filesystem view.
[Unit]
Description=Vetting orchestrator (post-repair hardware validation)
Documentation=https://github.com/your-org/vetting
After=network-online.target
Wants=network-online.target
# Start rate limiting: give up after 5 start attempts within 60 seconds
# so persistent startup errors surface instead of looping forever.
# These are [Unit] settings; StartLimitIntervalSec= is not a recognised
# key in [Service] and is ignored there, so the limit must live here.
StartLimitBurst=5
StartLimitIntervalSec=60
[Service]
Type=simple
User=vetting
Group=vetting
# Without this the service runs from "/", which ProtectSystem=strict
# makes read-only. Relative paths in vetting.yaml (the example config
# uses ./var/...) then resolve under the writable state directory —
# assuming the orchestrator resolves them against its working
# directory. Absolute paths in the config are unaffected.
WorkingDirectory=/var/lib/vetting
ExecStart=/usr/local/bin/vetting --config /etc/vetting/vetting.yaml
# The orchestrator embeds dnsmasq and sends raw WoL broadcasts. Rather
# than run as root, grant just the caps we need:
# CAP_NET_BIND_SERVICE — if the operator binds :443 or :80
# CAP_NET_RAW — WoL magic packet via DGRAM broadcast; not
# strictly required when using UDP broadcast to
# 255.255.255.255 on port 9, but safer to carry
# so custom ports work.
# CAP_NET_ADMIN — dnsmasq needs this to create the DHCP socket
# and to bind to a specific interface.
AmbientCapabilities=CAP_NET_BIND_SERVICE CAP_NET_RAW CAP_NET_ADMIN
CapabilityBoundingSet=CAP_NET_BIND_SERVICE CAP_NET_RAW CAP_NET_ADMIN
# Filesystem: the orchestrator needs to write to /var/lib/vetting and
# /var/log/vetting. Everything else is read-only.
ReadWritePaths=/var/lib/vetting /var/log/vetting
ProtectSystem=strict
ProtectHome=true
NoNewPrivileges=true
PrivateTmp=true
PrivateDevices=true
ProtectControlGroups=true
ProtectKernelTunables=true
ProtectKernelModules=true
RestrictSUIDSGID=true
RestrictNamespaces=true
LockPersonality=true
# Restart policy — recover from transient failures after a 5 second
# pause; the start limit in [Unit] stops the retries when the service
# keeps failing.
Restart=on-failure
RestartSec=5
# Logs go to journald; the orchestrator's own per-run log files are
# written wherever logs.dir in vetting.yaml points.
StandardOutput=journal
StandardError=journal
[Install]
WantedBy=multi-user.target