server:
  bind: "127.0.0.1:8080"
  # Base URL the orchestrator is reachable at from the operator's
  # browser. Used as the click-through link in notifications, so it
  # should be the *external* URL (e.g. https://vetting.lan:8443),
  # not the bind address.
  public_url: "http://127.0.0.1:8080"
  tls:
    enabled: false
    cert_file: ""
    key_file: ""

database:
  path: "./var/vetting.db"

artifacts:
  dir: "./var/artifacts"
  # Days to keep per-run artifact files (report.html, report.json, fio,
  # iperf, inventory.json, hold keys). DB rows are preserved. 0 = forever.
  retention_days: 30

logs:
  dir: "./var/logs"
  # Days to keep per-run log files. 0 = forever.
  retention_days: 30

janitor:
  # Interval between cleanup sweeps. 0 defaults to 60.
  interval_minutes: 60

dispatcher:
  max_concurrent_runs: 3

# Fields below are populated in later phases and ignored in Phase 1.
pxe:
  enabled: false
  interface: ""  # LAN NIC, e.g. "eth0"
  subnet: ""  # LAN CIDR, e.g. "192.168.1.0/24"; proxy-DHCP scope
  orchestrator_url: ""  # e.g. "http://192.168.1.135:8080"
  tftp_root: ""  # holds ipxe.efi + undionly.kpxe
  live_dir: ""  # holds vmlinuz + initrd.img; served at /live/*

agent:
  # Directory containing vetting-agent-linux-amd64. The quick-register
  # one-liner downloads from /assets/vetting-agent-linux-amd64 and
  # installs it as a systemd service so the host keeps heartbeating.
  # Leave empty to disable the /assets/* route.
  asset_dir: "./var/assets"

# Notifications fire on StageFailed, SpecMismatch, HoldingOpened,
# RunCompleted. Declare one or more notifiers and route each event
# kind (and optionally severity) to a notifier by name. Delivery is
# fire-and-forget (one attempt per event, logged on failure).
#
# Example (uncomment and fill in):
#
#   notifiers:
#     - name: ops-ntfy
#       type: ntfy
#       server: https://ntfy.sh
#       topic: vetting-YOUR-TOPIC
#     - name: ops-discord
#       type: discord
#       webhook_url: https://discord.com/api/webhooks/XXX/YYY
#     - name: ops-email
#       type: smtp
#       smtp:
#         host: mail.lan
#         port: 25
#         from: vetting@lan.local
#         to: [ops@lan.local]
#
#   routes:
#     # Critical events (failures / holds) fire on all three channels.
#     - match_severity: [critical]
#       notifier: ops-ntfy
#     - match_severity: [critical]
#       notifier: ops-discord
#     - match_severity: [critical]
#       notifier: ops-email
#     # RunCompleted is informational — push to ntfy only.
#     - match_kind: [RunCompleted]
#       notifier: ops-ntfy
notifiers: []
routes: []

# Vetting pipeline shared defaults. Every profile (quick/deep/soak)
# walks the same stage list; only per-stage durations differ.
# Thresholds here apply to every profile — a 92°C CPU fails a
# 2-minute quick run and a 12-hour soak run alike.
vetting:
  stages: [Inventory, SpecValidate, SMART, CPUStress, Storage, Network, GPU, PSU, Reporting]

  thresholds:
    - { stage: "*", kind: temp, key: "cpu/*", op: lt, value: 92, unit: C, severity: critical }
    - { stage: PSU, kind: psu_volt, key: "+12V", op: within_pct, value: 5, nominal: 12.0, severity: critical }
    - { stage: PSU, kind: psu_volt, key: "+5V", op: within_pct, value: 5, nominal: 5.0, severity: critical }
    - { stage: PSU, kind: psu_volt, key: "+3.3V", op: within_pct, value: 5, nominal: 3.3, severity: critical }
    - { stage: Storage, kind: fio_p99_us, key: "*", op: lt, value: 50000, severity: warning }
    - { stage: Network, kind: iperf, key: throughput_mbps, op: gte, value: 900, severity: critical }
    - { stage: Network, kind: nic_retrans, key: "*/rate", op: lt, value: 0.001, severity: warning }
    - { stage: CPUStress, kind: edac_ue, key: "*", op: lte, value: 0, severity: critical }
    - { stage: CPUStress, kind: mce, key: "*", op: lte, value: 0, severity: critical }

  # Per-profile durations + probe knobs. Only the *durations* scale across
  # profiles — every profile exercises every probe and gate. Quick is a
  # ~10-minute same-day sanity check; deep is the 8–12 h overnight soak;
  # soak is the opt-in 36–40 h extreme run.
  #
  # NOTE(review): `profiles` is assumed to nest under `vetting` — confirm
  # against the config loader's schema.
  profiles:
    quick:
      stage_timeouts:
        CPUStress: 5m
        Storage: 5m
        Network: 2m
      defaults:
        cpustress: { cpu_pass: 2m, mem_pass: 2m, edac_poll: 10s }
        storage: { mode: fio_sample, fio_size: 1GiB, fio_time: 3m, fio_bs: 4k, fio_rw: randrw, verify: md5 }
        network: { duration: 60s }

    deep:
      stage_timeouts:
        CPUStress: 2h
        Storage: 4h
        Network: 35m
      defaults:
        # NOTE(review): cpu_pass + mem_pass (60m + 60m) equals the 2h
        # CPUStress stage timeout above. If the two passes run back to
        # back there is no headroom, unlike quick and soak — confirm.
        cpustress: { cpu_pass: 60m, mem_pass: 60m, edac_poll: 10s }
        storage: { mode: full_disk, fio_time: 2h, fio_bs: 4k, fio_rw: randrw, verify: md5 }
        network: { duration: 30m }

    soak:
      inherit: deep
      stage_timeouts:
        CPUStress: 14h
        Storage: 8h
        Network: 2h30m
      defaults:
        cpustress: { cpu_pass: 12h }
        storage: { mode: full_disk, fio_time: 6h }
        network: { duration: 2h }