# HTTP server settings for the orchestrator.
server:
  # host:port the server binds to. The shipped value is loopback-only.
  bind: "127.0.0.1:8080"
  # Base URL the orchestrator is reachable at from the operator's
  # browser. Used as the click-through link in notifications, so it
  # should be the *external* URL (e.g. https://vetting.lan:8443),
  # not the bind address.
  # NOTE(review): the shipped value below is the loopback address, so
  # notification links presumably only resolve on the orchestrator host
  # until this is changed — confirm before deploying.
  public_url: "http://127.0.0.1:8080"
  # TLS is disabled as shipped and both file paths are empty.
  # NOTE(review): presumably cert_file and key_file must both be set
  # when enabled is true — confirm against the loader.
  tls:
    enabled: false
    cert_file: ""
    key_file: ""

# Path to the database file.
# NOTE(review): relative paths presumably resolve against the process
# working directory — confirm.
database:
  path: "./var/vetting.db"

# Per-run artifact storage.
artifacts:
  # Directory holding the per-run artifact files.
  dir: "./var/artifacts"
  # Days to keep per-run artifact files (report.html, report.json, fio,
  # iperf, inventory.json, hold keys). DB rows are preserved. 0 = forever.
  retention_days: 30

# Per-run log storage.
logs:
  # Directory holding the per-run log files.
  dir: "./var/logs"
  # Days to keep per-run log files. 0 = forever.
  retention_days: 30

# Periodic cleanup.
# NOTE(review): presumably this is what enforces the retention_days
# settings of artifacts and logs — confirm.
janitor:
  # Minutes between cleanup sweeps. 0 falls back to the default of 60.
  interval_minutes: 60

# Run scheduling.
dispatcher:
  # Cap on the number of runs in flight at once (going by the key name;
  # confirm the exact semantics in the dispatcher).
  max_concurrent_runs: 3

# Fields below are populated in later phases and ignored in Phase 1.

# PXE boot support. Disabled as shipped; every other field is an empty
# string and needs a site-specific value (see the inline examples).
pxe:
  enabled: false
  interface: ""                          # LAN NIC, e.g. "eth0"
  subnet: ""                             # LAN CIDR, e.g. "192.168.1.0/24"; proxy-DHCP scope
  orchestrator_url: ""                   # e.g. "http://192.168.1.135:8080"
  tftp_root: ""                          # holds ipxe.efi + undionly.kpxe
  live_dir: ""                           # holds vmlinuz + initrd.img; served at /live/*

# Agent binary distribution.
agent:
  # Directory containing vetting-agent-linux-amd64. The quick-register
  # one-liner downloads from /assets/vetting-agent-linux-amd64 and
  # installs it as a systemd service so the host keeps heartbeating.
  # Leave empty to disable the /assets/* route.
  asset_dir: "./var/assets"

# Notifications fire on StageFailed, SpecMismatch, HoldingOpened, and
# RunCompleted. Declare one or more notifiers, then route events to a
# notifier by name, matching on event kind and/or severity. Delivery is
# fire-and-forget (one attempt per event, logged on failure).
#
# Example (uncomment and fill in):
#
# notifiers:
#   - name: ops-ntfy
#     type: ntfy
#     server: https://ntfy.sh
#     topic: vetting-YOUR-TOPIC
#   - name: ops-discord
#     type: discord
#     webhook_url: https://discord.com/api/webhooks/XXX/YYY
#   - name: ops-email
#     type: smtp
#     smtp:
#       host: mail.lan
#       port: 25
#       from: vetting@lan.local
#       to: [ops@lan.local]
#
# routes:
#   # Critical events (failures / holds) fire on all three channels.
#   - match_severity: [critical]
#     notifier: ops-ntfy
#   - match_severity: [critical]
#     notifier: ops-discord
#   - match_severity: [critical]
#     notifier: ops-email
#   # RunCompleted is informational — push to ntfy only.
#   - match_kind: [RunCompleted]
#     notifier: ops-ntfy

# Shipped configuration: both lists are empty, so no notifier is
# declared and no route is defined.
notifiers: []
routes: []

# Vetting pipeline shared defaults. Every profile (quick/deep/soak)
# walks the same stage list; only per-stage durations differ.
# Thresholds here apply to every profile — a 92°C CPU fails a
# 2-minute quick run and a 12-hour soak run alike.
vetting:
  # Stage list shared by every profile.
  stages:
    - Inventory
    - SpecValidate
    - SMART
    - CPUStress
    - Storage
    - Network
    - GPU
    - PSU
    - Reporting
  # One entry per gate. Each entry names a stage, a measurement kind and
  # key, a comparison op with its value (plus unit or nominal where
  # given), and a severity of critical or warning.
  thresholds:
    # CPU temperature: readings must be lt 92 C. The stage is the "*"
    # wildcard rather than a single named stage.
    - stage: "*"
      kind: temp
      key: "cpu/*"
      op: lt
      value: 92
      unit: C
      severity: critical

    # PSU rails: within_pct 5 of each rail's nominal voltage.
    - stage: PSU
      kind: psu_volt
      key: "+12V"
      op: within_pct
      value: 5
      nominal: 12.0
      severity: critical
    - stage: PSU
      kind: psu_volt
      key: "+5V"
      op: within_pct
      value: 5
      nominal: 5.0
      severity: critical
    - stage: PSU
      kind: psu_volt
      key: "+3.3V"
      op: within_pct
      value: 5
      nominal: 3.3
      severity: critical

    # Storage: fio p99 must be lt 50000 (the kind name suggests
    # microseconds, i.e. 50 ms — confirm). Warning only.
    - stage: Storage
      kind: fio_p99_us
      key: "*"
      op: lt
      value: 50000
      severity: warning

    # Network: iperf throughput_mbps must be gte 900 (critical); NIC
    # retransmit rate must be lt 0.001 (warning).
    - stage: Network
      kind: iperf
      key: throughput_mbps
      op: gte
      value: 900
      severity: critical
    - stage: Network
      kind: nic_retrans
      key: "*/rate"
      op: lt
      value: 0.001
      severity: warning

    # CPUStress: edac_ue and mce values must be lte 0, so any positive
    # value is critical.
    - stage: CPUStress
      kind: edac_ue
      key: "*"
      op: lte
      value: 0
      severity: critical
    - stage: CPUStress
      kind: mce
      key: "*"
      op: lte
      value: 0
      severity: critical

# Per-profile durations + probe knobs. Only the *durations* scale across
# profiles — every profile exercises every probe and gate. Quick is a
# ~10-minute same-day sanity check; deep is the 8–12 h overnight run;
# soak is the opt-in 36–40 h extreme run.
profiles:
  # quick: minute-scale timeouts and passes; samples storage with a
  # 1GiB fio run instead of the full disk.
  quick:
    stage_timeouts:
      CPUStress: 5m
      Storage: 5m
      Network: 2m
    defaults:
      cpustress:
        cpu_pass: 2m
        mem_pass: 2m
        edac_poll: 10s
      storage:
        mode: fio_sample
        fio_size: 1GiB
        fio_time: 3m
        fio_bs: 4k
        fio_rw: randrw
        verify: md5
      network:
        duration: 60s

  # deep: hour-scale timeouts and passes; full-disk storage mode.
  deep:
    stage_timeouts:
      # NOTE(review): cpu_pass + mem_pass below add up to exactly 2h. If
      # the two passes run back to back this leaves no headroom, unlike
      # quick (4m of work in a 5m timeout) — confirm how the passes are
      # scheduled.
      CPUStress: 2h
      Storage: 4h
      Network: 35m
    defaults:
      cpustress:
        cpu_pass: 60m
        mem_pass: 60m
        edac_poll: 10s
      storage:
        mode: full_disk
        fio_time: 2h
        fio_bs: 4k
        fio_rw: randrw
        verify: md5
      network:
        duration: 30m

  # soak: starts from deep and lengthens the timeouts and passes.
  soak:
    # NOTE(review): soak does not restate mem_pass, edac_poll, fio_bs,
    # fio_rw or verify, so it presumably relies on inherit merging
    # defaults key by key with deep — confirm the merge is not shallow.
    inherit: deep
    stage_timeouts:
      CPUStress: 14h
      Storage: 8h
      Network: 2h30m
    defaults:
      cpustress:
        cpu_pass: 12h
      storage:
        mode: full_disk
        fio_time: 6h
      network:
        duration: 2h