# Vetting/deploy/vetting.production.yaml

server:
  # Listen address. The default is loopback-only; switch to
  # '0.0.0.0:8080' (or similar) after enabling TLS or placing the
  # service behind a reverse proxy.
  bind: '127.0.0.1:8080'
  # Base URL at which the operator's browser reaches the orchestrator.
  # Notifications use it for their click-through link.
  public_url: 'http://127.0.0.1:8080'
  # TLS is disabled here, and both file paths are left empty.
  tls:
    enabled: false
    cert_file: ''
    key_file: ''

database:
  # Filesystem path of the database file.
  path: '/var/lib/vetting/vetting.db'

artifacts:
  # Directory holding per-run artifact files.
  dir: '/var/lib/vetting/artifacts'
  # Retention, in days, for per-run artifact files (report.html,
  # report.json, fio, iperf, inventory.json, hold keys). DB rows are
  # preserved regardless. 0 = keep forever.
  retention_days: 30

logs:
  # Directory holding per-run log files.
  dir: '/var/log/vetting'
  # Retention, in days, for per-run log files. 0 = keep forever.
  retention_days: 30

janitor:
  # Minutes between cleanup sweeps. A value of 0 falls back to the
  # default of 60.
  interval_minutes: 60

dispatcher:
  # NOTE(review): presumably the cap on vetting runs executing at the
  # same time — confirm against the dispatcher implementation.
  max_concurrent_runs: 3

pxe:
  enabled: false
  # LAN NIC, e.g. 'eth0'.
  interface: ''
  # LAN CIDR, e.g. '192.168.1.0/24'. dnsmasq runs in proxy-DHCP mode
  # scoped to this subnet, coexisting with the LAN's existing DHCP
  # server.
  subnet: ''
  # e.g. 'http://192.168.1.135:8080'.
  orchestrator_url: ''
  # Holds ipxe.efi + undionly.kpxe.
  tftp_root: '/var/lib/vetting/tftp'
  # Holds vmlinuz + initrd.img; served at /live/*.
  live_dir: '/var/lib/vetting/live'

agent:
  # Directory containing vetting-agent-linux-amd64, which is served at
  # /assets/vetting-agent-linux-amd64. install.sh places the binary
  # here.
  asset_dir: '/var/lib/vetting/assets'

# Notifications fire on four event kinds: StageFailed, SpecMismatch,
# HoldingOpened, and RunCompleted. Declare one or more notifiers, then
# route events to a notifier by name, matching on event kind and/or
# severity. Delivery is fire-and-forget: one attempt per event, with
# failures logged.
#
# Example (uncomment and fill in):
#
# notifiers:
#   - name: ops-ntfy
#     type: ntfy
#     server: https://ntfy.sh
#     topic: vetting-YOUR-TOPIC
#   - name: ops-discord
#     type: discord
#     webhook_url: https://discord.com/api/webhooks/XXX/YYY
#   - name: ops-email
#     type: smtp
#     smtp:
#       host: mail.lan
#       port: 25
#       from: vetting@lan.local
#       to: [ops@lan.local]
#
# routes:
#   - match_severity: [critical]
#     notifier: ops-ntfy
#   - match_kind: [RunCompleted]
#     notifier: ops-ntfy

# Defaults: no notifiers and no routes are declared.
notifiers: []
routes: []

# Vetting pipeline shared defaults. Every profile (quick/deep/soak)
# walks the same stage list; only per-stage timeouts and stage
# parameters (durations, storage mode) differ.
# Thresholds apply to every profile — critical breaches fail a run
# regardless of which profile the operator picked.
vetting:
  # Stage list shared by all profiles.
  stages:
    - Inventory
    - SpecValidate
    - SMART
    - CPUStress
    - Storage
    - Network
    - GPU
    - PSU
    - Reporting
  # One entry per check. NOTE(review): '*' in stage/key presumably acts
  # as a wildcard — confirm against the threshold matcher.
  thresholds:
    # CPU temperature.
    - stage: '*'
      kind: temp
      key: 'cpu/*'
      op: lt
      value: 92
      unit: C
      severity: critical
    # PSU voltage rails, checked with within_pct against a nominal.
    - stage: PSU
      kind: psu_volt
      key: '+12V'
      op: within_pct
      value: 5
      nominal: 12.0
      severity: critical
    - stage: PSU
      kind: psu_volt
      key: '+5V'
      op: within_pct
      value: 5
      nominal: 5.0
      severity: critical
    - stage: PSU
      kind: psu_volt
      key: '+3.3V'
      op: within_pct
      value: 5
      nominal: 3.3
      severity: critical
    # Storage latency.
    - stage: Storage
      kind: fio_p99_us
      key: '*'
      op: lt
      value: 50000
      severity: warning
    # Network throughput and retransmit rate.
    - stage: Network
      kind: iperf
      key: throughput_mbps
      op: gte
      value: 900
      severity: critical
    - stage: Network
      kind: nic_retrans
      key: '*/rate'
      op: lt
      value: 0.001
      severity: warning
    # Error counters during CPU stress; both are checked as lte 0.
    - stage: CPUStress
      kind: edac_ue
      key: '*'
      op: lte
      value: 0
      severity: critical
    - stage: CPUStress
      kind: mce
      key: '*'
      op: lte
      value: 0
      severity: critical

# Per-profile stage timeouts and stage parameter defaults.
profiles:
  # Shortest profile: minute-scale passes and a sampled storage test.
  quick:
    stage_timeouts:
      CPUStress: 5m
      Storage: 5m
      Network: 2m
    defaults:
      cpustress:
        cpu_pass: 2m
        mem_pass: 2m
        edac_poll: 10s
      storage:
        mode: fio_sample
        fio_size: 1GiB
        fio_time: 3m
        fio_bs: 4k
        fio_rw: randrw
        verify: md5
      network:
        duration: 60s
  # Hour-scale passes with a full-disk storage test.
  # NOTE(review): the CPUStress timeout (2h) equals cpu_pass + mem_pass
  # (60m + 60m). If the two passes run back to back this leaves no
  # headroom, unlike quick (2m + 2m against 5m) — confirm.
  deep:
    stage_timeouts:
      CPUStress: 2h
      Storage: 4h
      Network: 35m
    defaults:
      cpustress:
        cpu_pass: 60m
        mem_pass: 60m
        edac_poll: 10s
      storage:
        mode: full_disk
        fio_time: 2h
        fio_bs: 4k
        fio_rw: randrw
        verify: md5
      network:
        duration: 30m
  # Longest profile.
  # NOTE(review): `inherit: deep` presumably means keys not set here
  # fall back to the deep profile's values — confirm merge semantics.
  soak:
    inherit: deep
    stage_timeouts:
      CPUStress: 14h
      Storage: 8h
      Network: 2h30m
    defaults:
      cpustress:
        cpu_pass: 12h
      storage:
        mode: full_disk
        fio_time: 6h
      network:
        duration: 2h