From a01db639525d78b176988dd6c6a68d7cbf95358f Mon Sep 17 00:00:00 2001 From: josh Date: Mon, 20 Apr 2026 19:56:39 -0400 Subject: [PATCH] feat(install): auto-heal pxe.interface/pxe.subnet against the host A stale /etc/vetting/vetting.yaml (e.g. pxe.interface=eth1 after an LXC rebuild renamed the NIC to eth0) blocks vetting.service startup with "pxe.interface 'eth1' not found on host", requiring the operator to ssh in and hand-edit the yaml after every rebuild. install.sh now validates the pxe block against the host's actual network state on every install/upgrade run. If pxe.enabled is true and pxe.interface doesn't exist (or pxe.subnet is missing/malformed), the script auto-detects the primary NIC via the default route, reads its subnet from the kernel-scope route, and patches both values in place. Valid configs are left exactly as the operator had them; fresh installs with pxe.enabled=false skip the check entirely. The one-liner install/update is now self-healing for the most common stale-config failure mode. --- deploy/install.sh | 91 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/deploy/install.sh b/deploy/install.sh index b7f37ec..76cbab5 100755 --- a/deploy/install.sh +++ b/deploy/install.sh @@ -77,6 +77,94 @@ if [[ $EUID -ne 0 ]]; then exit 1 fi +# heal_pxe_config: make sure /etc/vetting/vetting.yaml's pxe.interface +# and pxe.subnet reference things that actually exist on this host. Stale +# values (common after an LXC rebuild renames the NIC, or after pxe-setup +# was pointed at a NIC that later got removed) block vetting.service +# startup with "pxe.interface X not found on host". +# +# Only runs when pxe.enabled is true — a disabled pxe block doesn't gate +# startup. Only rewrites fields that are currently invalid; a good +# interface/subnet pair is preserved exactly as the operator had it. 
# ---------------------------------------------------------------------------
# heal_pxe_config and helpers (added to deploy/install.sh).
#
# Placement in install.sh, per the patch hunks: the function definitions go
# right after the EUID root check and before SCRIPT_DIR/REPO_ROOT are
# computed; the call site goes after the dnsmasq disable step and
# immediately before `systemctl daemon-reload`.
# ---------------------------------------------------------------------------

#######################################
# Print the value of one key inside the top-level `pxe:` mapping.
# Minimal line-oriented reader (not a YAML parser): handles unquoted
# values, "double" and 'single' quoted values, and trailing `# comments`
# on unquoted values. Prints an empty line if the key has no value and
# nothing at all if the key is absent.
# Arguments: $1 - path to the yaml file
#            $2 - key name (e.g. enabled, interface, subnet)
# Outputs:   the value on stdout
#######################################
_heal_pxe_read() {
  awk -v key="$2" -v sq="'" '
    /^pxe:/ { in_pxe = 1; next }
    in_pxe && /^[A-Za-z_][A-Za-z0-9_]*:/ { in_pxe = 0 }
    in_pxe {
      re = "^[[:space:]]+" key ":[[:space:]]*"
      if ($0 ~ re) {
        line = $0
        sub(re, "", line)
        if (match(line, /^"[^"]*"/) || match(line, "^" sq "[^" sq "]*" sq)) {
          print substr(line, RSTART + 1, RLENGTH - 2); exit
        }
        sub(/[[:space:]]*#.*$/, "", line)
        gsub(/^[[:space:]]+|[[:space:]]+$/, "", line)
        print line; exit
      }
    }
  ' "$1"
}

#######################################
# Succeed only for a syntactically valid IPv4 CIDR (a.b.c.d/n) whose
# octets are <= 255 and whose prefix length is <= 32.
# Arguments: $1 - candidate subnet string (may be empty)
# Returns:   0 if valid, 1 otherwise
#######################################
_heal_pxe_subnet_ok() {
  local cidr="$1" octet
  [[ "${cidr}" =~ ^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})/([0-9]{1,2})$ ]] \
    || return 1
  for octet in "${BASH_REMATCH[@]:1:4}"; do
    # 10# forces base 10 so "010" is not read as octal.
    (( 10#${octet} <= 255 )) || return 1
  done
  (( 10#${BASH_REMATCH[5]} <= 32 ))
}

#######################################
# Emit a copy of the yaml file with pxe.interface and/or pxe.subnet
# replaced. Only the fields flagged for fixing are touched; every other
# line is passed through byte-for-byte. A flagged key that does not exist
# in the pxe block is appended to the block, using the indentation of the
# block's first key (two spaces if the block has no keys).
# Environment (read by awk via ENVIRON, so values are never re-parsed):
#   IFACE, SUBNET           - replacement values
#   FIX_IFACE, FIX_SUBNET   - 1 to rewrite that field, 0 to leave it alone
# Arguments: $1 - path to the yaml file
# Outputs:   rewritten file on stdout
#######################################
_heal_pxe_rewrite() {
  awk '
    function emit(k, v, lead) { print lead k ": \"" v "\"" }
    function add_missing() {
      if (fix_if) { emit("interface", ENVIRON["IFACE"], indent); fix_if = 0 }
      if (fix_sn) { emit("subnet", ENVIRON["SUBNET"], indent); fix_sn = 0 }
    }
    BEGIN {
      fix_if = ENVIRON["FIX_IFACE"] + 0
      fix_sn = ENVIRON["FIX_SUBNET"] + 0
      indent = "  "
    }
    /^pxe:/ { in_pxe = 1; print; next }
    in_pxe && /^[A-Za-z_][A-Za-z0-9_]*:/ { add_missing(); in_pxe = 0 }
    in_pxe && match($0, /^[[:space:]]+/) {
      pad = substr($0, 1, RLENGTH)
      rest = substr($0, RLENGTH + 1)
      if (!have_indent && rest ~ /^[A-Za-z_]/) { indent = pad; have_indent = 1 }
      if (fix_if && rest ~ /^interface:/) { emit("interface", ENVIRON["IFACE"], pad); fix_if = 0; next }
      if (fix_sn && rest ~ /^subnet:/) { emit("subnet", ENVIRON["SUBNET"], pad); fix_sn = 0; next }
    }
    { print }
    END { if (in_pxe) add_missing() }
  ' "$1"
}

#######################################
# heal_pxe_config: make sure the config's pxe.interface and pxe.subnet
# reference things that actually exist on this host. Stale values (e.g.
# after an LXC rebuild renames the NIC) block vetting.service startup
# with "pxe.interface X not found on host".
#
# Behaviour:
#   * No-op if the file is missing or pxe.enabled is not exactly "true".
#   * No-op if the interface exists (`ip link show`) and the subnet is a
#     valid IPv4 CIDR.
#   * An invalid interface is replaced by the default-route NIC.
#   * An invalid/missing subnet is replaced by the kernel-scope link route
#     of the interface that will be in the file (the operator's own
#     interface when that one is valid, otherwise the detected one).
#   * Fields that are already valid are left exactly as written.
#   * If a needed value cannot be detected, a warning is printed and the
#     file is left untouched.
#
# Arguments: $1 - path to vetting.yaml
# Outputs:   progress lines on stdout, warnings/errors on stderr
# Returns:   0 normally (including "nothing to do" and "could not
#            detect"); 1 only if rewriting the file itself failed.
#######################################
heal_pxe_config() {
  local config="$1"
  [[ -f "${config}" ]] || return 0

  local enabled cur_iface cur_subnet
  enabled="$(_heal_pxe_read "${config}" enabled)"
  # A disabled pxe block does not gate startup, so leave it alone.
  [[ "${enabled}" == "true" ]] || return 0
  cur_iface="$(_heal_pxe_read "${config}" interface)"
  cur_subnet="$(_heal_pxe_read "${config}" subnet)"

  local iface_ok=0 subnet_ok=0
  if [[ -n "${cur_iface}" ]] && ip link show "${cur_iface}" >/dev/null 2>&1; then
    iface_ok=1
  fi
  if _heal_pxe_subnet_ok "${cur_subnet}"; then
    subnet_ok=1
  fi
  if (( iface_ok && subnet_ok )); then
    return 0
  fi

  local iface_to_write="${cur_iface}" subnet_to_write="${cur_subnet}"
  if (( ! iface_ok )); then
    # Take the token after "dev" rather than a fixed column: routes
    # without a gateway ("default dev eth0 ...") have no "via <gw>" pair.
    # Detection failure is tolerated on purpose; it is reported below.
    iface_to_write="$(
      ip -4 -o route show default 2>/dev/null \
        | awk '{ for (i = 1; i < NF; i++) if ($i == "dev") { print $(i + 1); exit } }'
    )" || iface_to_write=""
  fi
  if (( ! subnet_ok )); then
    subnet_to_write=""
    if [[ -n "${iface_to_write}" ]]; then
      # Read the subnet from the interface that will end up in the file,
      # not blindly from the default-route NIC.
      subnet_to_write="$(
        ip -4 -o route show dev "${iface_to_write}" proto kernel scope link 2>/dev/null \
          | awk '{ print $1; exit }'
      )" || subnet_to_write=""
    fi
  fi

  if [[ -z "${iface_to_write}" || -z "${subnet_to_write}" ]]; then
    echo "WARN: pxe is enabled in ${config} but pxe.interface=${cur_iface:-<unset>} / pxe.subnet=${cur_subnet:-<unset>} is stale," >&2
    echo "      and no replacement could be auto-detected from this host's routes. Edit the file manually before starting." >&2
    return 0
  fi

  if (( ! iface_ok )); then
    echo "==> pxe.interface \"${cur_iface}\" is not present on this host; auto-patching to \"${iface_to_write}\""
  fi
  if (( ! subnet_ok )); then
    echo "==> pxe.subnet \"${cur_subnet:-}\" is missing/invalid; auto-patching to \"${subnet_to_write}\""
  fi

  # Create the temp file next to the config so the final mv is a rename
  # on the same filesystem (atomic), not a cross-device copy.
  local tmp
  if ! tmp="$(mktemp "${config}.XXXXXX")"; then
    echo "ERROR: could not create a temporary file next to ${config}" >&2
    return 1
  fi

  if IFACE="${iface_to_write}" SUBNET="${subnet_to_write}" \
     FIX_IFACE="$(( ! iface_ok ))" FIX_SUBNET="$(( ! subnet_ok ))" \
     _heal_pxe_rewrite "${config}" > "${tmp}" \
     && chown --reference="${config}" "${tmp}" \
     && chmod --reference="${config}" "${tmp}" \
     && mv -f -- "${tmp}" "${config}"; then
    return 0
  fi

  # Any failed step leaves the original config untouched; drop the temp.
  rm -f -- "${tmp}"
  echo "ERROR: could not rewrite ${config}; pxe settings were left unchanged" >&2
  return 1
}

# Call site: runs after the dnsmasq disable step and immediately before
# `systemctl daemon-reload`, so a (re)started vetting.service sees the
# healed values.
echo "==> validating pxe config against this host's interfaces"
heal_pxe_config "${CONFIG_DIR}/vetting.yaml"