deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+130
-16
@@ -9,19 +9,27 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"vetting/agent/probes"
|
||||
)
|
||||
|
||||
// NetworkConfig is what the agent passes to Network: the orchestrator's
|
||||
// iperf3 server address and port. We derive host from OrchestratorURL.
|
||||
// iperf3 server address, port, and the per-profile duration.
|
||||
type NetworkConfig struct {
	// OrchestratorURL is the orchestrator's base URL; the iperf3 server
	// host is derived from it.
	OrchestratorURL string
	// IperfPort is the iperf3 server port. 0 means the default, 5201.
	IperfPort int
	// Duration is the per-profile length of the iperf3 run.
	Duration time.Duration
}
|
||||
|
||||
// Network runs iperf3 against the orchestrator's bundled server. Records
|
||||
// bandwidth as a measurement; fails if iperf3 is missing, the server
|
||||
// isn't reachable, or throughput is zero.
|
||||
// Network runs iperf3 against the orchestrator's bundled server for
|
||||
// the profile-configured duration. Records throughput as a measurement;
|
||||
// records per-interface rx/tx error-rate deltas as nic_retrans samples
|
||||
// so the server-side threshold gate (`nic_retrans rate < 0.001`) fires
|
||||
// on a flaky PHY or a wire that drops half its packets under load.
|
||||
//
|
||||
// Failure cases: iperf3 missing, server unreachable, zero throughput.
|
||||
// Zero throughput is treated as a hard failure — an iperf that finished
|
||||
// cleanly but pushed zero bytes is indistinguishable from a bad run.
|
||||
func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
|
||||
if _, err := exec.LookPath("iperf3"); err != nil {
|
||||
// Live image ships iperf3; absence means packaging regression.
|
||||
@@ -51,6 +59,11 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
|
||||
duration = 10 * time.Second
|
||||
}
|
||||
|
||||
// Snapshot /proc/net/dev before the test so we can attribute any
|
||||
// error-count growth to *this stage's* traffic. The same snapshot
|
||||
// taken after iperf returns is the end of the window.
|
||||
netStart := indexNetDev(probes.NetDev())
|
||||
|
||||
args := []string{
|
||||
"-c", host,
|
||||
"-p", strconv.Itoa(port),
|
||||
@@ -72,7 +85,7 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
|
||||
Extras: map[string]any{"stderr_tail": tailLines(string(out), 20)},
|
||||
}
|
||||
}
|
||||
mbps, parsed, err := parseIperfJSON(out)
|
||||
mbps, retrans, bytesSent, parsed, err := parseIperfJSON(out)
|
||||
if err != nil {
|
||||
d.Error("Network: parse iperf3 output: " + err.Error())
|
||||
return Outcome{
|
||||
@@ -82,12 +95,58 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
|
||||
Extras: map[string]any{"raw": string(out)},
|
||||
}
|
||||
}
|
||||
|
||||
netEnd := indexNetDev(probes.NetDev())
|
||||
netDelta := diffNetDev(netStart, netEnd)
|
||||
|
||||
samples := []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}}
|
||||
|
||||
// iperf-derived retrans rate: retrans_count / packet_count_estimate.
|
||||
// TCP typical MTU 1500; payload ~1460. We divide bytes by 1460 to
|
||||
// approximate packets. This keeps the rate bounded in [0, 1].
|
||||
if bytesSent > 0 {
|
||||
packets := float64(bytesSent) / 1460.0
|
||||
if packets > 0 {
|
||||
samples = append(samples, Sample{
|
||||
Kind: "nic_retrans",
|
||||
Key: "iperf/rate",
|
||||
Value: float64(retrans) / packets,
|
||||
Unit: "rate",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Per-interface error-rate deltas. A flaky cable typically surfaces
|
||||
// as tx_errs or tx_drop on the originating interface, not inside
|
||||
// iperf's own tally.
|
||||
for iface, delta := range netDelta {
|
||||
if delta.TxBytes > 0 {
|
||||
packets := float64(delta.TxBytes) / 1460.0
|
||||
if packets > 0 {
|
||||
rate := float64(delta.TxErrs+delta.TxDrop) / packets
|
||||
samples = append(samples, Sample{
|
||||
Kind: "nic_retrans", Key: iface + "/rate", Value: rate, Unit: "rate",
|
||||
})
|
||||
}
|
||||
}
|
||||
// Diagnostic raw counts so the report can show which interface
|
||||
// bled. These don't fire a threshold today but are useful for
|
||||
// post-mortem.
|
||||
samples = append(samples,
|
||||
Sample{Kind: "nic_errs", Key: iface + "/rx", Value: float64(delta.RxErrs + delta.RxDrop), Unit: "count"},
|
||||
Sample{Kind: "nic_errs", Key: iface + "/tx", Value: float64(delta.TxErrs + delta.TxDrop), Unit: "count"},
|
||||
)
|
||||
}
|
||||
|
||||
if d.Sensor != nil {
|
||||
_ = d.Sensor(ctx, []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}})
|
||||
_ = d.Sensor(ctx, samples)
|
||||
}
|
||||
|
||||
extras := map[string]any{
|
||||
"throughput_mbps": mbps,
|
||||
"retransmits": retrans,
|
||||
"bytes_sent": bytesSent,
|
||||
"net_delta": netDelta,
|
||||
"iperf_end": parsed,
|
||||
}
|
||||
if mbps <= 0 {
|
||||
@@ -98,14 +157,55 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps", mbps))
|
||||
d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps (retransmits=%d)", mbps, retrans))
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: fmt.Sprintf("%.1f Mbps to %s", mbps, host),
|
||||
Summary: fmt.Sprintf("%.1f Mbps to %s (retransmits=%d)", mbps, host, retrans),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
|
||||
// indexNetDev flattens a NetDev slice into a map keyed by interface
|
||||
// name so diffNetDev can pair start/end by name without O(n²) scans.
|
||||
func indexNetDev(snaps []probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
|
||||
out := map[string]probes.NetDevSnapshot{}
|
||||
for _, s := range snaps {
|
||||
out[s.Iface] = s
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// diffNetDev computes end − start for each interface present in both
|
||||
// snapshots. An interface that dropped away mid-run is dropped from
|
||||
// the result (can't compute a delta). Underflow (end < start, rare
|
||||
// after a counter reset) is clamped to 0.
|
||||
func diffNetDev(start, end map[string]probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
|
||||
out := map[string]probes.NetDevSnapshot{}
|
||||
for iface, e := range end {
|
||||
s, ok := start[iface]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
out[iface] = probes.NetDevSnapshot{
|
||||
Iface: iface,
|
||||
RxBytes: subU64(e.RxBytes, s.RxBytes),
|
||||
RxErrs: subU64(e.RxErrs, s.RxErrs),
|
||||
RxDrop: subU64(e.RxDrop, s.RxDrop),
|
||||
TxBytes: subU64(e.TxBytes, s.TxBytes),
|
||||
TxErrs: subU64(e.TxErrs, s.TxErrs),
|
||||
TxDrop: subU64(e.TxDrop, s.TxDrop),
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// subU64 returns a − b, saturating at 0 instead of wrapping around when
// b is larger than a.
func subU64(a, b uint64) uint64 {
	if a >= b {
		return a - b
	}
	return 0
}
|
||||
|
||||
// deriveHost pulls the hostname out of an https://host:port base URL.
|
||||
func deriveHost(raw string) (string, error) {
|
||||
if raw == "" {
|
||||
@@ -119,18 +219,22 @@ func deriveHost(raw string) (string, error) {
|
||||
return strings.TrimSpace(h), nil
|
||||
}
|
||||
|
||||
// parseIperfJSON pulls end.sum_sent.bits_per_second out of iperf3 -J.
|
||||
// Returns (Mbps, full-json-map, err).
|
||||
func parseIperfJSON(b []byte) (float64, map[string]any, error) {
|
||||
// parseIperfJSON pulls end.sum_sent.bits_per_second and retransmits out
|
||||
// of iperf3 -J. Returns (Mbps, retransmits, bytes_sent, full-end-map, err).
|
||||
func parseIperfJSON(b []byte) (float64, int64, int64, map[string]any, error) {
|
||||
var top map[string]any
|
||||
if err := json.Unmarshal(b, &top); err != nil {
|
||||
return 0, nil, err
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
end, ok := top["end"].(map[string]any)
|
||||
if !ok {
|
||||
return 0, top, fmt.Errorf("missing end")
|
||||
return 0, 0, 0, nil, fmt.Errorf("missing end")
|
||||
}
|
||||
// iperf3 reports either sum_sent (when -R not set) or sum_received.
|
||||
// Pull the first sum that carries bits_per_second; retransmits +
|
||||
// bytes live there too for TCP.
|
||||
var mbps float64
|
||||
var retrans int64
|
||||
var bytesSent int64
|
||||
for _, key := range []string{"sum_sent", "sum_received", "sum"} {
|
||||
sum, ok := end[key].(map[string]any)
|
||||
if !ok {
|
||||
@@ -140,7 +244,17 @@ func parseIperfJSON(b []byte) (float64, map[string]any, error) {
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
return bps / 1_000_000, end, nil
|
||||
mbps = bps / 1_000_000
|
||||
if r, ok := sum["retransmits"].(float64); ok {
|
||||
retrans = int64(r)
|
||||
}
|
||||
if bs, ok := sum["bytes"].(float64); ok {
|
||||
bytesSent = int64(bs)
|
||||
}
|
||||
break
|
||||
}
|
||||
return 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
|
||||
if mbps == 0 {
|
||||
return 0, 0, 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
|
||||
}
|
||||
return mbps, retrans, bytesSent, end, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user