deep profile + threshold gating + firmware stage + Burn super-stage
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled

Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
+130 -16
View File
@@ -9,19 +9,27 @@ import (
"strconv"
"strings"
"time"
"vetting/agent/probes"
)
// NetworkConfig is what the agent passes to Network: the orchestrator's
// iperf3 server address and port. We derive host from OrchestratorURL.
// iperf3 server address, port, and the per-profile duration.
type NetworkConfig struct {
OrchestratorURL string
IperfPort int // 0 = 5201
Duration time.Duration
}
// Network runs iperf3 against the orchestrator's bundled server. Records
// bandwidth as a measurement; fails if iperf3 is missing, the server
// isn't reachable, or throughput is zero.
// Network runs iperf3 against the orchestrator's bundled server for
// the profile-configured duration. Records throughput as a measurement;
// records per-interface rx/tx error-rate deltas as nic_retrans samples
// so the server-side threshold gate (`nic_retrans rate < 0.001`) fires
// on a flaky PHY or a wire that drops half its packets under load.
//
// Failure cases: iperf3 missing, server unreachable, zero throughput.
// Zero throughput is treated as a hard failure — an iperf that finished
// cleanly but pushed zero bytes is indistinguishable from a bad run.
func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
if _, err := exec.LookPath("iperf3"); err != nil {
// Live image ships iperf3; absence means packaging regression.
@@ -51,6 +59,11 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
duration = 10 * time.Second
}
// Snapshot /proc/net/dev before the test so we can attribute any
// error-count growth to *this stage's* traffic. The same snapshot
// taken after iperf returns is the end of the window.
netStart := indexNetDev(probes.NetDev())
args := []string{
"-c", host,
"-p", strconv.Itoa(port),
@@ -72,7 +85,7 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
Extras: map[string]any{"stderr_tail": tailLines(string(out), 20)},
}
}
mbps, parsed, err := parseIperfJSON(out)
mbps, retrans, bytesSent, parsed, err := parseIperfJSON(out)
if err != nil {
d.Error("Network: parse iperf3 output: " + err.Error())
return Outcome{
@@ -82,12 +95,58 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
Extras: map[string]any{"raw": string(out)},
}
}
netEnd := indexNetDev(probes.NetDev())
netDelta := diffNetDev(netStart, netEnd)
samples := []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}}
// iperf-derived retrans rate: retrans_count / packet_count_estimate.
// TCP typical MTU 1500; payload ~1460. We divide bytes by 1460 to
// approximate packets. This keeps the rate bounded in [0, 1].
if bytesSent > 0 {
packets := float64(bytesSent) / 1460.0
if packets > 0 {
samples = append(samples, Sample{
Kind: "nic_retrans",
Key: "iperf/rate",
Value: float64(retrans) / packets,
Unit: "rate",
})
}
}
// Per-interface error-rate deltas. A flaky cable typically surfaces
// as tx_errs or tx_drop on the originating interface, not inside
// iperf's own tally.
for iface, delta := range netDelta {
if delta.TxBytes > 0 {
packets := float64(delta.TxBytes) / 1460.0
if packets > 0 {
rate := float64(delta.TxErrs+delta.TxDrop) / packets
samples = append(samples, Sample{
Kind: "nic_retrans", Key: iface + "/rate", Value: rate, Unit: "rate",
})
}
}
// Diagnostic raw counts so the report can show which interface
// bled. These don't fire a threshold today but are useful for
// post-mortem.
samples = append(samples,
Sample{Kind: "nic_errs", Key: iface + "/rx", Value: float64(delta.RxErrs + delta.RxDrop), Unit: "count"},
Sample{Kind: "nic_errs", Key: iface + "/tx", Value: float64(delta.TxErrs + delta.TxDrop), Unit: "count"},
)
}
if d.Sensor != nil {
_ = d.Sensor(ctx, []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}})
_ = d.Sensor(ctx, samples)
}
extras := map[string]any{
"throughput_mbps": mbps,
"retransmits": retrans,
"bytes_sent": bytesSent,
"net_delta": netDelta,
"iperf_end": parsed,
}
if mbps <= 0 {
@@ -98,14 +157,55 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
Extras: extras,
}
}
d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps", mbps))
d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps (retransmits=%d)", mbps, retrans))
return Outcome{
Passed: true,
Summary: fmt.Sprintf("%.1f Mbps to %s", mbps, host),
Summary: fmt.Sprintf("%.1f Mbps to %s (retransmits=%d)", mbps, host, retrans),
Extras: extras,
}
}
// indexNetDev flattens a NetDev slice into a map keyed by interface
// name so diffNetDev can pair start/end by name without O(n²) scans.
func indexNetDev(snaps []probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
out := map[string]probes.NetDevSnapshot{}
for _, s := range snaps {
out[s.Iface] = s
}
return out
}
// diffNetDev computes end start for each interface present in both
// snapshots. An interface that dropped away mid-run is dropped from
// the result (can't compute a delta). Underflow (end < start, rare
// after a counter reset) is clamped to 0.
func diffNetDev(start, end map[string]probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
out := map[string]probes.NetDevSnapshot{}
for iface, e := range end {
s, ok := start[iface]
if !ok {
continue
}
out[iface] = probes.NetDevSnapshot{
Iface: iface,
RxBytes: subU64(e.RxBytes, s.RxBytes),
RxErrs: subU64(e.RxErrs, s.RxErrs),
RxDrop: subU64(e.RxDrop, s.RxDrop),
TxBytes: subU64(e.TxBytes, s.TxBytes),
TxErrs: subU64(e.TxErrs, s.TxErrs),
TxDrop: subU64(e.TxDrop, s.TxDrop),
}
}
return out
}
func subU64(a, b uint64) uint64 {
if a < b {
return 0
}
return a - b
}
// deriveHost pulls the hostname out of an https://host:port base URL.
func deriveHost(raw string) (string, error) {
if raw == "" {
@@ -119,18 +219,22 @@ func deriveHost(raw string) (string, error) {
return strings.TrimSpace(h), nil
}
// parseIperfJSON pulls end.sum_sent.bits_per_second out of iperf3 -J.
// Returns (Mbps, full-json-map, err).
func parseIperfJSON(b []byte) (float64, map[string]any, error) {
// parseIperfJSON pulls end.sum_sent.bits_per_second and retransmits out
// of iperf3 -J. Returns (Mbps, retransmits, bytes_sent, full-end-map, err).
func parseIperfJSON(b []byte) (float64, int64, int64, map[string]any, error) {
var top map[string]any
if err := json.Unmarshal(b, &top); err != nil {
return 0, nil, err
return 0, 0, 0, nil, err
}
end, ok := top["end"].(map[string]any)
if !ok {
return 0, top, fmt.Errorf("missing end")
return 0, 0, 0, nil, fmt.Errorf("missing end")
}
// iperf3 reports either sum_sent (when -R not set) or sum_received.
// Pull the first sum that carries bits_per_second; retransmits +
// bytes live there too for TCP.
var mbps float64
var retrans int64
var bytesSent int64
for _, key := range []string{"sum_sent", "sum_received", "sum"} {
sum, ok := end[key].(map[string]any)
if !ok {
@@ -140,7 +244,17 @@ func parseIperfJSON(b []byte) (float64, map[string]any, error) {
if !ok {
continue
}
return bps / 1_000_000, end, nil
mbps = bps / 1_000_000
if r, ok := sum["retransmits"].(float64); ok {
retrans = int64(r)
}
if bs, ok := sum["bytes"].(float64); ok {
bytesSent = int64(bs)
}
break
}
return 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
if mbps == 0 {
return 0, 0, 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
}
return mbps, retrans, bytesSent, end, nil
}