23c689aa5b
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
139 lines
5.2 KiB
YAML
139 lines
5.2 KiB
YAML
server:
  # Address the server binds to.
  bind: '127.0.0.1:8080'
  # Base URL at which the operator's browser can reach the orchestrator.
  # Notifications use it as their click-through link, so set it to the
  # *external* URL (e.g. https://vetting.lan:8443) rather than the bind
  # address.
  public_url: 'http://127.0.0.1:8080'
  # TLS is off by default; the certificate and key paths are left empty.
  tls:
    enabled: false
    cert_file: ''
    key_file: ''
|
||
|
||
database:
  # Path of the database file.
  path: './var/vetting.db'
|
||
|
||
artifacts:
  dir: './var/artifacts'
  # Number of days per-run artifact files (report.html, report.json, fio,
  # iperf, inventory.json, hold keys) are kept. DB rows are preserved.
  # 0 = keep forever.
  retention_days: 30
|
||
|
||
logs:
  dir: './var/logs'
  # Number of days per-run log files are kept. 0 = keep forever.
  retention_days: 30
|
||
|
||
janitor:
  # Minutes between cleanup sweeps. A value of 0 falls back to 60.
  interval_minutes: 60
|
||
|
||
dispatcher:
  # Cap on the number of runs in flight at once (per the key name).
  max_concurrent_runs: 3
|
||
|
||
# Fields below are populated in later phases and ignored in Phase 1.
|
||
|
||
pxe:
  enabled: false
  # LAN NIC, e.g. "eth0".
  interface: ''
  # LAN CIDR, e.g. "192.168.1.0/24"; this is the proxy-DHCP scope.
  subnet: ''
  # e.g. "http://192.168.1.135:8080".
  orchestrator_url: ''
  # Directory holding ipxe.efi + undionly.kpxe.
  tftp_root: ''
  # Directory holding vmlinuz + initrd.img; served at /live/*.
  live_dir: ''
|
||
|
||
agent:
  # Directory that holds vetting-agent-linux-amd64. The quick-register
  # one-liner fetches the binary from /assets/vetting-agent-linux-amd64
  # and installs it as a systemd service so the host keeps heartbeating.
  # An empty value disables the /assets/* route.
  asset_dir: './var/assets'
|
||
|
||
# Notifications fire on StageFailed, SpecMismatch, HoldingOpened, and
# RunCompleted. Declare one or more notifiers and route each event
# kind (and optionally severity) to a notifier by name. Delivery is
# fire-and-forget (one attempt per event, logged on failure).
#
# Example (uncomment and fill in):
#
# notifiers:
#   - name: ops-ntfy
#     type: ntfy
#     server: https://ntfy.sh
#     topic: vetting-YOUR-TOPIC
#   - name: ops-discord
#     type: discord
#     webhook_url: https://discord.com/api/webhooks/XXX/YYY
#   - name: ops-email
#     type: smtp
#     smtp:
#       host: mail.lan
#       port: 25
#       from: vetting@lan.local
#       to: [ops@lan.local]
#
# routes:
#   # Critical events (failures / holds) fire on all three channels.
#   - match_severity: [critical]
#     notifier: ops-ntfy
#   - match_severity: [critical]
#     notifier: ops-discord
#   - match_severity: [critical]
#     notifier: ops-email
#   # RunCompleted is informational — push to ntfy only.
#   - match_kind: [RunCompleted]
#     notifier: ops-ntfy
|
||
|
||
# No notifiers or routes are configured by default: both lists are empty.
notifiers: []
routes: []
|
||
|
||
# Vetting pipeline shared defaults. Every profile (quick/deep/soak)
|
||
# walks the same stage list; only per-stage durations differ.
|
||
# Thresholds here apply to every profile — a 92°C CPU fails a
# ~10-minute quick run and a 36-hour soak run alike.
|
||
vetting:
  # Stage order walked by every profile:
  # Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage →
  # Network → Burn → GPU → PSU → Reporting.
  # Firmware and Burn were missing from this list, leaving it at 9 of the
  # 11 stages.
  # NOTE(review): confirm the loader accepts the stage names "Firmware"
  # and "Burn" exactly as spelled here.
  stages:
    - Inventory
    - Firmware
    - SpecValidate
    - SMART
    - CPUStress
    - Storage
    - Network
    - Burn
    - GPU
    - PSU
    - Reporting
  # Each entry names a stage ("*" = any), a sensor kind/key, and the
  # comparison (`op` against `value`) a reading is checked with.
  # NOTE(review): entries appear to describe the *passing* condition
  # (e.g. `lt 92` so that a 92 C reading is a breach) — confirm against
  # the evaluator.
  thresholds:
    - stage: '*'
      kind: temp
      key: 'cpu/*'
      op: lt
      value: 92
      unit: C
      severity: critical
    - stage: PSU
      kind: psu_volt
      key: '+12V'
      op: within_pct
      value: 5
      nominal: 12.0
      severity: critical
    - stage: PSU
      kind: psu_volt
      key: '+5V'
      op: within_pct
      value: 5
      nominal: 5.0
      severity: critical
    - stage: PSU
      kind: psu_volt
      key: '+3.3V'
      op: within_pct
      value: 5
      nominal: 3.3
      severity: critical
    - stage: Storage
      kind: fio_p99_us
      key: '*'
      op: lt
      value: 50000
      severity: warning
    - stage: Network
      kind: iperf
      key: throughput_mbps
      op: gte
      value: 900
      severity: critical
    - stage: Network
      kind: nic_retrans
      key: '*/rate'
      op: lt
      value: 0.001
      severity: warning
    - stage: CPUStress
      kind: edac_ue
      key: '*'
      op: lte
      value: 0
      severity: critical
    - stage: CPUStress
      kind: mce
      key: '*'
      op: lte
      value: 0
      severity: critical
|
||
|
||
# Per-profile durations + probe knobs. Only the *durations* scale across
# profiles — every profile exercises every probe and gate. Quick is a
# ~10-minute same-day sanity check; deep is the 8–12 h overnight run;
# soak is the opt-in 36–40 h extreme run.
|
||
profiles:
  # Each profile sets per-stage timeouts plus the default knobs handed to
  # the CPUStress, Storage and Network probes.
  quick:
    stage_timeouts:
      CPUStress: 5m
      Storage: 5m
      Network: 2m
    defaults:
      cpustress:
        cpu_pass: 2m
        mem_pass: 2m
        edac_poll: 10s
      storage:
        mode: fio_sample
        fio_size: 1GiB
        fio_time: 3m
        fio_bs: 4k
        fio_rw: randrw
        verify: md5
      network:
        duration: 60s

  deep:
    # NOTE(review): the CPUStress timeout (2h) equals cpu_pass + mem_pass
    # (60m + 60m). If the two passes run back to back there is no
    # headroom before the stage times out — confirm against the stage
    # implementation.
    stage_timeouts:
      CPUStress: 2h
      Storage: 4h
      Network: 35m
    defaults:
      cpustress:
        cpu_pass: 60m
        mem_pass: 60m
        edac_poll: 10s
      storage:
        mode: full_disk
        fio_time: 2h
        fio_bs: 4k
        fio_rw: randrw
        verify: md5
      network:
        duration: 30m

  soak:
    # Starts from `deep` and lists only the values it overrides.
    # NOTE(review): whether unlisted nested keys (mem_pass, edac_poll,
    # fio_bs, fio_rw, verify) carry over depends on how the loader merges
    # `inherit` — confirm.
    inherit: deep
    stage_timeouts:
      CPUStress: 14h
      Storage: 8h
      Network: 2h30m
    defaults:
      cpustress:
        cpu_pass: 12h
      storage:
        mode: full_disk
        fio_time: 6h
      network:
        duration: 2h
|