Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
@@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env bash
# install.sh — one-shot installer for the vetting orchestrator on a
# Proxmox LXC (or any Debian/Ubuntu host).
#
# What it does:
#   1. apt-installs runtime dependencies (dnsmasq, iperf3, ca-certs).
#   2. Creates the `vetting` system user with /var/lib/vetting homedir.
#   3. Copies the pre-built `vetting` binary into /usr/local/bin.
#   4. Installs the example config into /etc/vetting (never overwriting
#      an existing vetting.yaml) and the systemd unit into
#      /etc/systemd/system.
#   5. Disables the distro's own dnsmasq service so that only the
#      orchestrator-supervised instance owns DHCP/TFTP.
#   6. Reminds the operator to edit the config and set a bcrypt
#      password before enabling the service — we don't auto-start
#      because a placeholder password would just refuse to boot.
#
# What it deliberately does NOT do:
#   - Build the orchestrator (this script assumes you ran
#     `make orchestrator-linux` beforehand and that
#     ../bin/vetting-linux-amd64 exists relative to this script, or
#     that you pass --binary to locate it).
#   - Install the live image or TFTP payloads — those are separate,
#     since most operators want to build them from a pinned CI artifact
#     rather than on the LXC itself.
#
# Usage:
#   sudo ./install.sh [--binary PATH] [--config-dir /etc/vetting]
#
set -euo pipefail

# Defaults. BINARY and CONFIG_DIR can be overridden from the command
# line; the remaining values are fixed and must stay in sync with the
# paths hard-coded in vetting.service.
BINARY=""
CONFIG_DIR="/etc/vetting"
STATE_DIR="/var/lib/vetting"
LOG_DIR="/var/log/vetting"
SERVICE_USER="vetting"

#######################################
# Print command-line help.
# Globals:   none ($0 is used as the program name)
# Arguments: none
# Outputs:   usage text on stdout
#######################################
usage() {
  cat <<EOF
Usage: $0 [--binary PATH] [--config-dir DIR]

  --binary PATH       Path to a pre-built vetting binary (default:
                      auto-detect ../bin/vetting-linux-amd64 relative to
                      this script).
  --config-dir DIR    Where to install vetting.yaml (default: /etc/vetting).
                      The systemd unit is always installed to
                      /etc/systemd/system and reads
                      /etc/vetting/vetting.yaml.
  -h, --help          Print this message.
EOF
}

#######################################
# Parse command-line flags.
# Globals:   BINARY, CONFIG_DIR (written)
# Arguments: the script's own "$@"
# Outputs:   usage on stdout for --help; diagnostics and usage on
#            stderr for bad input
# Returns:   exits 0 after --help, exits 2 on an unknown flag or a flag
#            that is missing its value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --binary | --config-dir)
        # Without this guard a trailing flag would hit "$2" under
        # `set -u` and die with an opaque "unbound variable" error.
        if [[ $# -lt 2 || -z "$2" ]]; then
          echo "option $1 requires a value" >&2
          usage >&2
          exit 2
        fi
        if [[ "$1" == "--binary" ]]; then
          BINARY="$2"
        else
          CONFIG_DIR="$2"
        fi
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "unknown arg: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"

# Everything below (apt, useradd, writes under /etc and /usr/local)
# needs root, so bail out early with a hint instead of failing halfway.
if [[ $EUID -ne 0 ]]; then
  echo "install.sh must be run as root (try: sudo $0)" >&2
  exit 1
fi

# Resolve the directory holding this script and its parent (the repo
# root when run from a checkout); the bundled config/unit files and the
# auto-detected binary are looked up relative to these.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# No --binary given: take the first candidate that is an executable
# regular file. `-f` matters because `-x` alone is also true for
# directories.
if [[ -z "${BINARY}" ]]; then
  for cand in \
    "${REPO_ROOT}/bin/vetting-linux-amd64" \
    "${REPO_ROOT}/bin/vetting" \
    "${SCRIPT_DIR}/vetting"; do
    if [[ -f "${cand}" && -x "${cand}" ]]; then
      BINARY="${cand}"
      break
    fi
  done
fi

# Covers both "nothing auto-detected" and "bad --binary path".
if [[ -z "${BINARY}" || ! -f "${BINARY}" || ! -x "${BINARY}" ]]; then
  echo "could not find a vetting binary to install; pass --binary PATH or run 'make orchestrator-linux' first" >&2
  exit 1
fi

echo "==> installing runtime dependencies"
# noninteractive keeps debconf from prompting during an unattended run.
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y --no-install-recommends \
  ca-certificates dnsmasq iperf3

echo "==> creating ${SERVICE_USER} user"
# Idempotent: skip creation when the account already exists.
if ! id -u "${SERVICE_USER}" >/dev/null 2>&1; then
  # --user-group: the `install -g "${SERVICE_USER}"` calls later in this
  # script need a group of the same name; ask for it explicitly rather
  # than relying on USERGROUPS_ENAB in /etc/login.defs.
  useradd --system \
    --user-group \
    --home-dir "${STATE_DIR}" \
    --shell /usr/sbin/nologin \
    "${SERVICE_USER}"
fi

echo "==> preparing directories"
# State and log directories belong to the service user; the config
# directory stays root-owned.
install -d -m 0755 -o "${SERVICE_USER}" -g "${SERVICE_USER}" "${STATE_DIR}"
install -d -m 0755 -o "${SERVICE_USER}" -g "${SERVICE_USER}" "${LOG_DIR}"
install -d -m 0755 "${CONFIG_DIR}"

echo "==> installing binary"
install -m 0755 "${BINARY}" /usr/local/bin/vetting

echo "==> installing config and systemd unit"
# Never clobber an operator-edited config on re-run.
if [[ ! -f "${CONFIG_DIR}/vetting.yaml" ]]; then
  # root:vetting 0640 — readable by the service, not world-readable,
  # since the file holds the password hash and session secret.
  install -m 0640 -o root -g "${SERVICE_USER}" \
    "${SCRIPT_DIR}/vetting.example.yaml" \
    "${CONFIG_DIR}/vetting.yaml"
  echo " -> installed default config at ${CONFIG_DIR}/vetting.yaml"
else
  echo " -> preserving existing ${CONFIG_DIR}/vetting.yaml"
fi
install -m 0644 "${SCRIPT_DIR}/vetting.service" /etc/systemd/system/vetting.service

# The shipped unit hard-codes --config /etc/vetting/vetting.yaml, so a
# custom --config-dir would otherwise yield a service that silently
# reads a different (or missing) file.
if [[ "${CONFIG_DIR%/}" != "/etc/vetting" ]]; then
  echo "warning: vetting.service starts with --config /etc/vetting/vetting.yaml;" >&2
  echo "         edit ExecStart in /etc/systemd/system/vetting.service (or add a drop-in)" >&2
  echo "         to point at ${CONFIG_DIR}/vetting.yaml before enabling the service." >&2
fi

# Disable the distro's dnsmasq so only the orchestrator-supervised
# instance owns DHCP/TFTP. Operators who want to keep dnsmasq for
# something else can re-enable it after configuring a disjoint listen
# address.
# is-active is checked as well: a dnsmasq that is running but not
# enabled would still hold the ports until the next reboot.
if systemctl is-enabled --quiet dnsmasq 2>/dev/null \
  || systemctl is-active --quiet dnsmasq 2>/dev/null; then
  echo "==> disabling distro dnsmasq (orchestrator supervises its own)"
  systemctl disable --now dnsmasq
fi

# Make systemd pick up the freshly installed vetting.service.
systemctl daemon-reload

# Closing instructions. The service is intentionally left disabled
# until the operator has replaced the placeholder credentials.
cat <<EOF

vetting is installed but not yet enabled.

Next steps:
  1. Edit ${CONFIG_DIR}/vetting.yaml and set:
       - auth.admin_password_bcrypt  (run: vetting gen-admin-password YOURPW)
       - auth.session_secret_hex     (run: openssl rand -hex 32)
       - server.public_url           (the URL you'll browse to)
       - pxe.*                       if you want PXE boot support
       - notifiers + routes          (optional)
  2. Start the service:
       systemctl enable --now vetting
  3. Watch the logs:
       journalctl -fu vetting

EOF
@@ -0,0 +1,89 @@
|
||||
server:
  bind: "127.0.0.1:8080"
  # Base URL the orchestrator is reachable at from the operator's
  # browser. Used as the click-through link in notifications, so it
  # should be the *external* URL (e.g. https://vetting.lan:8443),
  # not the bind address.
  public_url: "http://127.0.0.1:8080"
  tls:
    enabled: false
    cert_file: ""
    key_file: ""

# NOTE: the database, artifact and log paths below are relative.
# Under the bundled vetting.service only /var/lib/vetting and
# /var/log/vetting are writable (ProtectSystem=strict), so for a
# systemd install point these at absolute paths below those
# directories. (Presumably relative paths resolve against the
# process working directory — confirm against the orchestrator.)
database:
  path: "./var/vetting.db"

artifacts:
  dir: "./var/artifacts"
  # Days to keep per-run artifact files (report.html, report.json, fio,
  # iperf, inventory.json, hold keys). DB rows are preserved. 0 = forever.
  retention_days: 30

logs:
  dir: "./var/logs"
  # Days to keep per-run log files. 0 = forever.
  retention_days: 30

janitor:
  # Interval between cleanup sweeps, in minutes. 0 defaults to 60.
  interval_minutes: 60

auth:
  # bcrypt hash of your admin password.
  # Generate via: ./bin/gen-admin-password "your-password"
  admin_password_bcrypt: "$2a$10$REPLACE_ME_WITH_A_REAL_BCRYPT_HASH_0123456789abcdefABCDEFxx"
  # Random 32-byte hex string used to sign session cookies.
  # Generate via: openssl rand -hex 32 (or use PowerShell equivalent)
  session_secret_hex: "0000000000000000000000000000000000000000000000000000000000000000"
  session_ttl_hours: 24

dispatcher:
  max_concurrent_runs: 3

# Fields below are populated in later phases and ignored in Phase 1.

pxe:
  enabled: false
  interface: ""          # e.g. "eth0"
  dhcp_range: ""         # e.g. "10.77.0.100,10.77.0.200,12h"
  orchestrator_url: ""   # e.g. "http://10.77.0.1:8080"
  tftp_root: ""          # holds ipxe.efi + undionly.kpxe
  live_dir: ""           # holds vmlinuz + initrd.img; served at /live/*

# Notifications fire on StageFailed, SpecMismatch, HoldingOpened,
# RunCompleted. Declare one or more notifiers and route each event
# kind (and optionally severity) to a notifier by name. Delivery is
# fire-and-forget (one attempt per event, logged on failure).
#
# Example (uncomment and fill in):
#
# notifiers:
#   - name: ops-ntfy
#     type: ntfy
#     server: https://ntfy.sh
#     topic: vetting-YOUR-TOPIC
#   - name: ops-discord
#     type: discord
#     webhook_url: https://discord.com/api/webhooks/XXX/YYY
#   - name: ops-email
#     type: smtp
#     smtp:
#       host: mail.lan
#       port: 25
#       from: vetting@lan.local
#       to: [ops@lan.local]
#
# routes:
#   # Critical events (failures / holds) fire on all three channels.
#   - match_severity: [critical]
#     notifier: ops-ntfy
#   - match_severity: [critical]
#     notifier: ops-discord
#   - match_severity: [critical]
#     notifier: ops-email
#   # RunCompleted is informational — push to ntfy only.
#   - match_kind: [RunCompleted]
#     notifier: ops-ntfy

notifiers: []
routes: []
@@ -0,0 +1,53 @@
|
||||
[Unit]
Description=Vetting orchestrator (post-repair hardware validation)
Documentation=https://github.com/your-org/vetting
After=network-online.target
Wants=network-online.target

# Start rate limiting is a [Unit] setting. StartLimitIntervalSec= is not
# recognised inside [Service], so keeping it there would leave the
# default window in effect and, combined with RestartSec=5, the limit
# would never trip. With these values a service that keeps failing at
# startup stops being restarted after 5 attempts within 60 seconds.
StartLimitIntervalSec=60
StartLimitBurst=5

[Service]
Type=simple
User=vetting
Group=vetting
# Without this a system service runs with "/" as its working directory,
# which is read-only under ProtectSystem=strict. The shipped example
# config uses relative ./var/... paths; this assumes the orchestrator
# resolves them against its working directory — confirm.
WorkingDirectory=/var/lib/vetting
ExecStart=/usr/local/bin/vetting --config /etc/vetting/vetting.yaml

# The orchestrator embeds dnsmasq and sends raw WoL broadcasts. Rather
# than run as root, grant just the caps we need:
#   CAP_NET_BIND_SERVICE — if the operator binds :443 or :80
#   CAP_NET_RAW          — WoL magic packet via DGRAM broadcast; not
#                          strictly required when using UDP broadcast to
#                          255.255.255.255 on port 9, but safer to carry
#                          so custom ports work.
#   CAP_NET_ADMIN        — dnsmasq needs this to create the DHCP socket
#                          and to bind to a specific interface.
AmbientCapabilities=CAP_NET_BIND_SERVICE CAP_NET_RAW CAP_NET_ADMIN
CapabilityBoundingSet=CAP_NET_BIND_SERVICE CAP_NET_RAW CAP_NET_ADMIN

# Filesystem: the orchestrator needs to write to /var/lib/vetting and
# /var/log/vetting. Everything else is read-only.
ReadWritePaths=/var/lib/vetting /var/log/vetting
ProtectSystem=strict
ProtectHome=true
NoNewPrivileges=true
PrivateTmp=true
PrivateDevices=true
ProtectControlGroups=true
ProtectKernelTunables=true
ProtectKernelModules=true
RestrictSUIDSGID=true
RestrictNamespaces=true
LockPersonality=true

# Restart policy — crash out loudly on startup errors, but recover from
# transient failures. The matching start rate limit is configured in
# [Unit].
Restart=on-failure
RestartSec=5

# Logs go to journald; the orchestrator's own per-run log files live
# under /var/log/vetting regardless.
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=multi-user.target
Reference in New Issue
Block a user