Files
josh 23c689aa5b
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled
deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00

261 lines
7.6 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package tests
import (
"context"
"encoding/json"
"fmt"
"net/url"
"os/exec"
"strconv"
"strings"
"time"
"vetting/agent/probes"
)
// NetworkConfig is what the agent passes to Network: the orchestrator's
// iperf3 server address, port, and the per-profile duration.
type NetworkConfig struct {
// OrchestratorURL is the orchestrator base URL. Network uses only the
// hostname parsed out of it as the iperf3 server address; an empty or
// unparseable value makes Network skip the stage.
OrchestratorURL string
// IperfPort is the iperf3 server port. Zero selects the default, 5201.
IperfPort int // 0 = 5201
// Duration is how long the iperf3 run should last. Values <= 0 fall
// back to 10 seconds inside Network.
Duration time.Duration
}
// Network runs iperf3 against the orchestrator's bundled server for
// the profile-configured duration. Records throughput as a measurement;
// records per-interface rx/tx error-rate deltas as nic_retrans samples
// so the server-side threshold gate (`nic_retrans rate < 0.001`) fires
// on a flaky PHY or a wire that drops half its packets under load.
//
// Defaults: cfg.IperfPort == 0 means 5201; cfg.Duration <= 0 means 10s.
// The duration is rounded up to whole seconds because iperf3's -t flag
// takes an integer and treats 0 as "no time limit".
//
// Outcomes:
//   - iperf3 not on PATH: failed, Extras["reason"] = "iperf3_missing".
//   - no host derivable from cfg.OrchestratorURL: passed but marked
//     skipped, Extras["reason"] = "no_host".
//   - iperf3 exits non-zero or is killed by the timeout: failed, with
//     the tails of stderr and stdout in Extras.
//   - iperf3 output cannot be parsed: failed, raw output in Extras.
//   - zero throughput: failed — an iperf that finished cleanly but
//     pushed zero bytes is indistinguishable from a bad run.
//   - otherwise: passed, with throughput/retransmit details in Extras.
func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
	if _, err := exec.LookPath("iperf3"); err != nil {
		// Live image ships iperf3; absence means packaging regression.
		d.Error("Network: iperf3 not found — live image is missing required tool")
		return Outcome{
			Passed:  false,
			Message: "iperf3 binary missing from live image",
			Summary: "failed (iperf3 missing)",
			Extras:  map[string]any{"reason": "iperf3_missing"},
		}
	}

	host, err := deriveHost(cfg.OrchestratorURL)
	if err != nil || host == "" {
		d.Warn("Network: can't derive orchestrator host from URL — skipping stage")
		return Outcome{
			Passed:  true,
			Summary: "skipped (no orchestrator host)",
			Extras:  map[string]any{"skipped": true, "reason": "no_host"},
		}
	}

	port := cfg.IperfPort
	if port == 0 {
		port = 5201
	}
	duration := cfg.Duration
	if duration <= 0 {
		duration = 10 * time.Second
	}
	// iperf3 -t only accepts whole seconds. Round up rather than truncate:
	// truncating a sub-second duration would pass "-t 0", which iperf3
	// interprets as "run without a time limit", and the run would only
	// end when the context timeout below kills it.
	seconds := int(duration / time.Second)
	if duration%time.Second != 0 {
		seconds++
	}

	// Snapshot /proc/net/dev before the test so we can attribute any
	// error-count growth to *this stage's* traffic. The same snapshot
	// taken after iperf returns is the end of the window.
	netStart := indexNetDev(probes.NetDev())

	args := []string{
		"-c", host,
		"-p", strconv.Itoa(port),
		"-t", strconv.Itoa(seconds),
		"-J", // JSON output
	}
	d.Info(fmt.Sprintf("Network: iperf3 -c %s -p %d -t %s", host, port, duration))

	// Allow 30s of grace on top of the requested run time for connection
	// setup and teardown before the client is killed.
	runCtx, cancel := context.WithTimeout(ctx, time.Duration(seconds)*time.Second+30*time.Second)
	defer cancel()

	cmd := exec.CommandContext(runCtx, "iperf3", args...)
	out, err := cmd.Output()
	if err != nil {
		// Output returns stdout in `out`; because cmd.Stderr is unset it
		// stores the captured stderr on the *exec.ExitError instead.
		// Report both: in -J mode iperf3 usually describes the failure
		// inside the JSON on stdout, so stderr alone may be empty.
		var stderr string
		if exitErr, ok := err.(*exec.ExitError); ok {
			stderr = string(exitErr.Stderr)
		}
		d.Error("Network: iperf3 client failed: " + err.Error())
		return Outcome{
			Passed:  false,
			Message: "iperf3 client error: " + err.Error(),
			Summary: "iperf3 failed",
			Extras: map[string]any{
				"stderr_tail": tailLines(stderr, 20),
				"stdout_tail": tailLines(string(out), 20),
			},
		}
	}

	mbps, retrans, bytesSent, parsed, err := parseIperfJSON(out)
	if err != nil {
		d.Error("Network: parse iperf3 output: " + err.Error())
		return Outcome{
			Passed:  false,
			Message: "parse iperf3 json: " + err.Error(),
			Summary: "parse error",
			Extras:  map[string]any{"raw": string(out)},
		}
	}

	netEnd := indexNetDev(probes.NetDev())
	netDelta := diffNetDev(netStart, netEnd)

	samples := []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}}

	// iperf-derived retrans rate: retrans_count / packet_count_estimate.
	// TCP typical MTU 1500; payload ~1460. We divide bytes by 1460 to
	// approximate packets. Skipped when no bytes were sent so we never
	// divide by zero.
	if bytesSent > 0 {
		packets := float64(bytesSent) / 1460.0
		samples = append(samples, Sample{
			Kind:  "nic_retrans",
			Key:   "iperf/rate",
			Value: float64(retrans) / packets,
			Unit:  "rate",
		})
	}

	// Per-interface error-rate deltas. A flaky cable typically surfaces
	// as tx_errs or tx_drop on the originating interface, not inside
	// iperf's own tally. Map iteration order is unspecified, so the
	// per-interface samples are appended in no particular order.
	for iface, delta := range netDelta {
		if delta.TxBytes > 0 {
			packets := float64(delta.TxBytes) / 1460.0
			samples = append(samples, Sample{
				Kind:  "nic_retrans",
				Key:   iface + "/rate",
				Value: float64(delta.TxErrs+delta.TxDrop) / packets,
				Unit:  "rate",
			})
		}
		// Diagnostic raw counts so the report can show which interface
		// bled. These don't fire a threshold today but are useful for
		// post-mortem.
		samples = append(samples,
			Sample{Kind: "nic_errs", Key: iface + "/rx", Value: float64(delta.RxErrs + delta.RxDrop), Unit: "count"},
			Sample{Kind: "nic_errs", Key: iface + "/tx", Value: float64(delta.TxErrs + delta.TxDrop), Unit: "count"},
		)
	}

	if d.Sensor != nil {
		// NOTE(review): the Sensor result is deliberately discarded, so a
		// failed sample upload does not fail the stage — confirm that is
		// the intended best-effort behavior.
		_ = d.Sensor(ctx, samples)
	}

	extras := map[string]any{
		"throughput_mbps": mbps,
		"retransmits":     retrans,
		"bytes_sent":      bytesSent,
		"net_delta":       netDelta,
		"iperf_end":       parsed,
	}
	if mbps <= 0 {
		return Outcome{
			Passed:  false,
			Message: "iperf3 reported zero throughput",
			Summary: "zero throughput",
			Extras:  extras,
		}
	}

	d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps (retransmits=%d)", mbps, retrans))
	return Outcome{
		Passed:  true,
		Summary: fmt.Sprintf("%.1f Mbps to %s (retransmits=%d)", mbps, host, retrans),
		Extras:  extras,
	}
}
// indexNetDev builds a lookup table from interface name to its snapshot
// so diffNetDev can match start and end readings by name directly. If
// the same interface name appears more than once, the last entry wins.
func indexNetDev(snaps []probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
	byIface := make(map[string]probes.NetDevSnapshot, len(snaps))
	for i := range snaps {
		byIface[snaps[i].Iface] = snaps[i]
	}
	return byIface
}
// diffNetDev computes end - start for each interface present in both
// snapshots. An interface that appears in only one of the two maps
// (it vanished or came up mid-run) is left out of the result because
// no delta can be computed for it. A counter that went backwards
// (end < start, e.g. after a counter reset) is clamped to 0 by subU64.
func diffNetDev(start, end map[string]probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
	deltas := make(map[string]probes.NetDevSnapshot)
	for name, after := range end {
		before, seen := start[name]
		if seen {
			var d probes.NetDevSnapshot
			d.Iface = name
			d.RxBytes = subU64(after.RxBytes, before.RxBytes)
			d.RxErrs = subU64(after.RxErrs, before.RxErrs)
			d.RxDrop = subU64(after.RxDrop, before.RxDrop)
			d.TxBytes = subU64(after.TxBytes, before.TxBytes)
			d.TxErrs = subU64(after.TxErrs, before.TxErrs)
			d.TxDrop = subU64(after.TxDrop, before.TxDrop)
			deltas[name] = d
		}
	}
	return deltas
}
// subU64 returns a - b, saturating at 0 instead of wrapping around when
// b is larger than a.
func subU64(a, b uint64) uint64 {
	if a >= b {
		return a - b
	}
	return 0
}
// deriveHost extracts the bare hostname (no port, no IPv6 brackets)
// from a base URL such as https://host:port. An empty input or a URL
// that fails to parse yields an error; a URL that parses but carries no
// host yields an empty string with a nil error.
func deriveHost(raw string) (string, error) {
	if len(raw) == 0 {
		return "", fmt.Errorf("empty url")
	}
	parsed, parseErr := url.Parse(raw)
	if parseErr != nil {
		return "", parseErr
	}
	return strings.TrimSpace(parsed.Hostname()), nil
}
// parseIperfJSON extracts the headline numbers from iperf3 -J output.
//
// It looks under the top-level "end" object for the first of sum_sent,
// sum_received, sum that carries a numeric bits_per_second, and reads
// retransmits and bytes from that same object (both default to 0 when
// absent, as they are for non-TCP runs).
//
// Returns (Mbps, retransmits, bytes, full end map, err). A reported
// bits_per_second of exactly 0 is a valid result, not an error: the
// caller decides how to treat a zero-throughput run. An error is
// returned only for invalid JSON, a missing "end" object, or when none
// of the candidate sums carries bits_per_second.
func parseIperfJSON(b []byte) (float64, int64, int64, map[string]any, error) {
	var top map[string]any
	if err := json.Unmarshal(b, &top); err != nil {
		return 0, 0, 0, nil, err
	}
	end, ok := top["end"].(map[string]any)
	if !ok {
		return 0, 0, 0, nil, fmt.Errorf("missing end")
	}

	// Pull the first sum that carries bits_per_second; retransmits +
	// bytes live there too for TCP. Returning from inside the loop keeps
	// "field present with value 0" distinct from "field not found".
	for _, key := range []string{"sum_sent", "sum_received", "sum"} {
		sum, ok := end[key].(map[string]any)
		if !ok {
			continue
		}
		bps, ok := sum["bits_per_second"].(float64)
		if !ok {
			continue
		}
		var retrans, bytesSent int64
		if r, ok := sum["retransmits"].(float64); ok {
			retrans = int64(r)
		}
		if bs, ok := sum["bytes"].(float64); ok {
			bytesSent = int64(bs)
		}
		return bps / 1_000_000, retrans, bytesSent, end, nil
	}
	return 0, 0, 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
}