server:
  # Loopback-only by default; change to "0.0.0.0:8080" (or similar) once
  # you've wired up TLS or fronted the service with a reverse proxy.
  bind: "127.0.0.1:8080"
  # Base URL the orchestrator is reachable at from the operator's
  # browser. Used as the click-through link in notifications.
  public_url: "http://127.0.0.1:8080"
  tls:
    enabled: false
    cert_file: ""
    key_file: ""

database:
  path: "/var/lib/vetting/vetting.db"

artifacts:
  dir: "/var/lib/vetting/artifacts"
  # Days to keep per-run artifact files (report.html, report.json, fio,
  # iperf, inventory.json, hold keys). DB rows are preserved. 0 = forever.
  retention_days: 30

logs:
  dir: "/var/log/vetting"
  # Days to keep per-run log files. 0 = forever.
  retention_days: 30

janitor:
  # Interval between cleanup sweeps, in minutes. 0 defaults to 60.
  interval_minutes: 60

dispatcher:
  max_concurrent_runs: 3

pxe:
  enabled: false
  # LAN NIC, e.g. "eth0".
  interface: ""
  # LAN CIDR, e.g. "192.168.1.0/24"; dnsmasq runs in proxy-DHCP mode
  # scoped to this subnet, coexisting with the LAN's existing DHCP server.
  subnet: ""
  # e.g. "http://192.168.1.135:8080"
  orchestrator_url: ""
  # Holds ipxe.efi + undionly.kpxe.
  tftp_root: "/var/lib/vetting/tftp"
  # Holds vmlinuz + initrd.img; served at /live/*.
  live_dir: "/var/lib/vetting/live"

agent:
  # Directory holding vetting-agent-linux-amd64, served at
  # /assets/vetting-agent-linux-amd64. install.sh drops the binary here.
  asset_dir: "/var/lib/vetting/assets"

# Notifications fire on StageFailed, SpecMismatch, HoldingOpened,
# RunCompleted. Declare one or more notifiers and route each event
# kind (and optionally severity) to a notifier by name. Delivery is
# fire-and-forget (one attempt per event, logged on failure).
#
# Example (uncomment and fill in):
#
# notifiers:
#   - name: ops-ntfy
#     type: ntfy
#     server: https://ntfy.sh
#     topic: vetting-YOUR-TOPIC
#   - name: ops-discord
#     type: discord
#     webhook_url: https://discord.com/api/webhooks/XXX/YYY
#   - name: ops-email
#     type: smtp
#     smtp:
#       host: mail.lan
#       port: 25
#       from: vetting@lan.local
#       to: [ops@lan.local]
#
# routes:
#   - match_severity: [critical]
#     notifier: ops-ntfy
#   - match_kind: [RunCompleted]
#     notifier: ops-ntfy
notifiers: []
routes: []

# Vetting pipeline shared defaults. Every profile (quick/deep/soak)
# walks the same stage list; only per-stage durations differ.
# Thresholds apply to every profile — critical breaches fail a run
# regardless of which profile the operator picked.
vetting:
  stages: [Inventory, SpecValidate, SMART, CPUStress, Storage, Network, GPU, PSU, Reporting]

  # One rule per line. Glob-style values ("*", "cpu/*", "*/rate") and
  # keys starting with "+" are quoted so YAML does not read them as
  # aliases or numbers.
  thresholds:
    - { stage: "*", kind: temp, key: "cpu/*", op: lt, value: 92, unit: C, severity: critical }
    - { stage: PSU, kind: psu_volt, key: "+12V", op: within_pct, value: 5, nominal: 12.0, severity: critical }
    - { stage: PSU, kind: psu_volt, key: "+5V", op: within_pct, value: 5, nominal: 5.0, severity: critical }
    - { stage: PSU, kind: psu_volt, key: "+3.3V", op: within_pct, value: 5, nominal: 3.3, severity: critical }
    - { stage: Storage, kind: fio_p99_us, key: "*", op: lt, value: 50000, severity: warning }
    - { stage: Network, kind: iperf, key: throughput_mbps, op: gte, value: 900, severity: critical }
    - { stage: Network, kind: nic_retrans, key: "*/rate", op: lt, value: 0.001, severity: warning }
    - { stage: CPUStress, kind: edac_ue, key: "*", op: lte, value: 0, severity: critical }
    - { stage: CPUStress, kind: mce, key: "*", op: lte, value: 0, severity: critical }

  profiles:
    quick:
      stage_timeouts: { CPUStress: 5m, Storage: 5m, Network: 2m }
      defaults:
        cpustress: { cpu_pass: 2m, mem_pass: 2m, edac_poll: 10s }
        storage: { mode: fio_sample, fio_size: 1GiB, fio_time: 3m, fio_bs: 4k, fio_rw: randrw, verify: md5 }
        network: { duration: 60s }

    deep:
      # NOTE(review): if cpu_pass and mem_pass run back to back, 60m + 60m
      # equals the 2h CPUStress timeout with no headroom (quick leaves 1m
      # of slack: 2m + 2m against 5m) — confirm this is intended.
      stage_timeouts: { CPUStress: 2h, Storage: 4h, Network: 35m }
      defaults:
        cpustress: { cpu_pass: 60m, mem_pass: 60m, edac_poll: 10s }
        storage: { mode: full_disk, fio_time: 2h, fio_bs: 4k, fio_rw: randrw, verify: md5 }
        network: { duration: 30m }

    soak:
      # Starts from the deep profile; only the keys listed below are
      # restated here. Presumably unlisted keys (mem_pass, edac_poll,
      # fio_bs, ...) come from deep — verify against the loader.
      inherit: deep
      stage_timeouts: { CPUStress: 14h, Storage: 8h, Network: 2h30m }
      defaults:
        cpustress: { cpu_pass: 12h }
        storage: { mode: full_disk, fio_time: 6h }
        network: { duration: 2h }