23c689aa5b
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
116 lines
4.7 KiB
YAML
116 lines
4.7 KiB
YAML
server:
  # Listen address. The default binds to loopback only; switch to
  # "0.0.0.0:8080" (or similar) after TLS is set up or a reverse proxy
  # sits in front of the service.
  bind: '127.0.0.1:8080'
  # URL at which the operator's browser reaches the orchestrator;
  # notifications use it as their click-through link.
  public_url: 'http://127.0.0.1:8080'
  tls:
    enabled: false
    cert_file: ''
    key_file: ''
|
|
|
|
database:
  # Filesystem path of the database file.
  path: '/var/lib/vetting/vetting.db'
|
|
|
|
artifacts:
  dir: '/var/lib/vetting/artifacts'
  # Number of days per-run artifact files (report.html, report.json,
  # fio, iperf, inventory.json, hold keys) are kept; 0 keeps them
  # forever. DB rows are preserved.
  retention_days: 30
|
|
|
|
logs:
  dir: '/var/log/vetting'
  # Number of days per-run log files are kept; 0 keeps them forever.
  retention_days: 30
|
|
|
|
janitor:
  # Minutes between cleanup sweeps; a value of 0 falls back to 60.
  interval_minutes: 60
|
|
|
|
dispatcher:
  # Presumably the cap on runs executing at the same time — confirm
  # against the dispatcher implementation.
  max_concurrent_runs: 3
|
|
|
|
pxe:
  enabled: false
  # LAN NIC, e.g. "eth0".
  interface: ''
  # LAN CIDR, e.g. "192.168.1.0/24". dnsmasq runs in proxy-DHCP mode
  # scoped to this subnet, coexisting with the LAN's existing DHCP
  # server.
  subnet: ''
  # e.g. "http://192.168.1.135:8080"
  orchestrator_url: ''
  # Holds ipxe.efi + undionly.kpxe.
  tftp_root: '/var/lib/vetting/tftp'
  # Holds vmlinuz + initrd.img; served at /live/*.
  live_dir: '/var/lib/vetting/live'
|
|
|
|
agent:
  # Directory containing the vetting-agent-linux-amd64 binary, which is
  # served at /assets/vetting-agent-linux-amd64. install.sh places the
  # binary here.
  asset_dir: '/var/lib/vetting/assets'
|
|
|
|
# Notification events: StageFailed, SpecMismatch, HoldingOpened,
# RunCompleted. Declare one or more notifiers, then route each event
# kind (and optionally severity) to a notifier by name. Delivery is
# fire-and-forget: one attempt per event, with failures logged.
#
# Example (uncomment and fill in):
#
# notifiers:
#   - name: ops-ntfy
#     type: ntfy
#     server: https://ntfy.sh
#     topic: vetting-YOUR-TOPIC
#   - name: ops-discord
#     type: discord
#     webhook_url: https://discord.com/api/webhooks/XXX/YYY
#   - name: ops-email
#     type: smtp
#     smtp:
#       host: mail.lan
#       port: 25
#       from: vetting@lan.local
#       to: [ops@lan.local]
#
# routes:
#   - match_severity: [critical]
#     notifier: ops-ntfy
#   - match_kind: [RunCompleted]
#     notifier: ops-ntfy

notifiers: []
routes: []
|
|
|
|
# Vetting pipeline shared defaults. Every profile (quick/deep/soak)
# walks the same stage list; only per-stage durations differ.
# Thresholds apply to every profile — critical breaches fail a run
# regardless of which profile the operator picked.
vetting:
  # Stage order shared by all profiles. Firmware and Burn are included
  # so the list matches the documented 11-stage pipeline:
  # Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage →
  # Network → Burn → GPU → PSU → Reporting.
  # NOTE(review): confirm the orchestrator accepts the stage names
  # "Firmware" and "Burn" exactly as spelled here.
  stages:
    - Inventory
    - Firmware
    - SpecValidate
    - SMART
    - CPUStress
    - Storage
    - Network
    - Burn
    - GPU
    - PSU
    - Reporting
  # One rule per line, kept in flow style so the table stays scannable.
  # String values that begin with "*" or "+" are quoted; numeric limits
  # are left unquoted.
  thresholds:
    - { stage: "*", kind: temp, key: "cpu/*", op: lt, value: 92, unit: C, severity: critical }
    - { stage: PSU, kind: psu_volt, key: "+12V", op: within_pct, value: 5, nominal: 12.0, severity: critical }
    - { stage: PSU, kind: psu_volt, key: "+5V", op: within_pct, value: 5, nominal: 5.0, severity: critical }
    - { stage: PSU, kind: psu_volt, key: "+3.3V", op: within_pct, value: 5, nominal: 3.3, severity: critical }
    - { stage: Storage, kind: fio_p99_us, key: "*", op: lt, value: 50000, severity: warning }
    - { stage: Network, kind: iperf, key: throughput_mbps, op: gte, value: 900, severity: critical }
    - { stage: Network, kind: nic_retrans, key: "*/rate", op: lt, value: 0.001, severity: warning }
    - { stage: CPUStress, kind: edac_ue, key: "*", op: lte, value: 0, severity: critical }
    - { stage: CPUStress, kind: mce, key: "*", op: lte, value: 0, severity: critical }
|
|
|
|
# Per-profile overrides: stage timeouts plus per-stage default knobs.
# NOTE(review): indentation was lost in the copy this was restored
# from — confirm whether `profiles` is a top-level key or belongs
# under `vetting`, and indent accordingly.
profiles:
  quick:
    stage_timeouts:
      CPUStress: 5m
      Storage: 5m
      Network: 2m
    defaults:
      cpustress:
        cpu_pass: 2m
        mem_pass: 2m
        edac_poll: 10s
      storage:
        mode: fio_sample
        fio_size: 1GiB
        fio_time: 3m
        fio_bs: 4k
        fio_rw: randrw
        verify: md5
      network:
        duration: 60s
  deep:
    # NOTE(review): the CPUStress timeout (2h) equals cpu_pass +
    # mem_pass (60m + 60m). If the two passes run back-to-back this
    # leaves no headroom — confirm.
    stage_timeouts:
      CPUStress: 2h
      Storage: 4h
      Network: 35m
    defaults:
      cpustress:
        cpu_pass: 60m
        mem_pass: 60m
        edac_poll: 10s
      storage:
        mode: full_disk
        fio_time: 2h
        fio_bs: 4k
        fio_rw: randrw
        verify: md5
      network:
        duration: 30m
  soak:
    # Starts from the deep profile; only the keys below are restated.
    inherit: deep
    stage_timeouts:
      CPUStress: 14h
      Storage: 8h
      Network: 2h30m
    defaults:
      cpustress:
        cpu_pass: 12h
      storage:
        mode: full_disk
        fio_time: 6h
      network:
        duration: 2h
|