deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -0,0 +1,192 @@
+package tests
+
+import (
+	"encoding/json"
+	"testing"
+
+	"vetting/agent/probes"
+)
+
+// TestParseIperfJSON_SumSent confirms we pull throughput, retransmits,
+// and bytes_sent from end.sum_sent. Real iperf3 -J output nests these
+// three under end.sum_sent for TCP streams.
+func TestParseIperfJSON_SumSent(t *testing.T) {
+	raw := `{
+		"end": {
+			"sum_sent": {
+				"bits_per_second": 950000000,
+				"retransmits": 42,
+				"bytes": 1187500000
+			}
+		}
+	}`
+	mbps, retrans, bytesSent, _, err := parseIperfJSON([]byte(raw))
+	if err != nil {
+		t.Fatalf("parseIperfJSON: %v", err)
+	}
+	if mbps != 950 {
+		t.Errorf("mbps = %v, want 950", mbps)
+	}
+	if retrans != 42 {
+		t.Errorf("retransmits = %d, want 42", retrans)
+	}
+	if bytesSent != 1187500000 {
+		t.Errorf("bytesSent = %d, want 1187500000", bytesSent)
+	}
+}
+
+// TestParseIperfJSON_MissingEnd fails cleanly when iperf returned
+// something without an end block (partial/aborted run).
+func TestParseIperfJSON_MissingEnd(t *testing.T) {
+	raw := `{"start": {}}`
+	if _, _, _, _, err := parseIperfJSON([]byte(raw)); err == nil {
+		t.Errorf("expected error on iperf output missing end block")
+	}
+}
+
+// TestParseIperfJSON_ZeroBps returns an error so the stage can fail
+// fast. A successful-exit iperf that pushed zero bits is indistinguishable
+// from a broken run and must not pass.
+func TestParseIperfJSON_ZeroBps(t *testing.T) {
+	raw := `{"end": {"sum_sent": {"bits_per_second": 0}}}`
+	if _, _, _, _, err := parseIperfJSON([]byte(raw)); err == nil {
+		t.Errorf("expected error when bits_per_second is 0")
+	}
+}
+
+// TestParseIperfJSON_FallsBackToSumReceived: UDP tests and some edge
+// cases don't populate sum_sent. The parser walks sum_sent → sum_received
+// → sum and picks the first that has a throughput number.
+func TestParseIperfJSON_FallsBackToSumReceived(t *testing.T) {
+	raw := `{
+		"end": {
+			"sum_received": {"bits_per_second": 500000000}
+		}
+	}`
+	mbps, _, _, _, err := parseIperfJSON([]byte(raw))
+	if err != nil {
+		t.Fatalf("parseIperfJSON: %v", err)
+	}
+	if mbps != 500 {
+		t.Errorf("mbps = %v, want 500", mbps)
+	}
+}
+
+// TestDiffNetDev_HappyPath checks the plain end-minus-start case for an
+// interface present in both snapshots: eth0 receives 1000 bytes, sends
+// 10000 bytes, and picks up 3 tx errors over the window.
+func TestDiffNetDev_HappyPath(t *testing.T) {
+	before := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", RxBytes: 1000, RxErrs: 0, TxBytes: 5000, TxErrs: 1},
+	}
+	after := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", RxBytes: 2000, RxErrs: 0, TxBytes: 15000, TxErrs: 4},
+	}
+	eth0, found := diffNetDev(before, after)["eth0"]
+	if !found {
+		t.Fatalf("eth0 missing from diff output")
+	}
+	// Each counter is compared independently so one failure does not
+	// hide the others.
+	if eth0.RxBytes != 1000 {
+		t.Errorf("RxBytes delta=%d, want 1000", eth0.RxBytes)
+	}
+	if eth0.TxBytes != 10000 {
+		t.Errorf("TxBytes delta=%d, want 10000", eth0.TxBytes)
+	}
+	if eth0.TxErrs != 3 {
+		t.Errorf("TxErrs delta=%d, want 3", eth0.TxErrs)
+	}
+}
+
+// TestDiffNetDev_InterfaceVanished checks that an interface seen in the
+// start snapshot but absent from the end snapshot is left out of the
+// diff, rather than reported with a negative or stale value, while the
+// surviving interface is still diffed normally.
+func TestDiffNetDev_InterfaceVanished(t *testing.T) {
+	before := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", TxBytes: 1000},
+		"eth1": {Iface: "eth1", TxBytes: 500},
+	}
+	after := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", TxBytes: 2000},
+	}
+	diff := diffNetDev(before, after)
+	if _, present := diff["eth1"]; present {
+		t.Errorf("eth1 should have been dropped (gone at end)")
+	}
+	if sent := diff["eth0"].TxBytes; sent != 1000 {
+		t.Errorf("eth0 TxBytes delta=%d, want 1000", sent)
+	}
+}
+
+// TestDiffNetDev_CounterReset checks the case where the end counters
+// are lower than the start counters (for example after a kernel restart
+// or a 32-bit counter wrapping). The delta must be clamped to 0 instead
+// of underflowing a uint64.
+func TestDiffNetDev_CounterReset(t *testing.T) {
+	before := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", TxBytes: 9999, TxErrs: 5},
+	}
+	after := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", TxBytes: 100, TxErrs: 0},
+	}
+	diff := diffNetDev(before, after)
+	if sent := diff["eth0"].TxBytes; sent != 0 {
+		t.Errorf("reset TxBytes delta=%d, want 0 (clamped)", sent)
+	}
+	if errs := diff["eth0"].TxErrs; errs != 0 {
+		t.Errorf("reset TxErrs delta=%d, want 0 (clamped)", errs)
+	}
+}
+
+// TestDeriveHost: orchestrator URL → host extraction is how the agent
+// picks the iperf3 server target. Handles both https://host and
+// https://host:port shapes.
+func TestDeriveHost(t *testing.T) {
+	cases := []struct {
+		raw  string
+		want string
+	}{
+		{"https://orch.local", "orch.local"},
+		{"https://orch.local:8443", "orch.local"},
+		{"http://10.0.0.5:8080", "10.0.0.5"},
+	}
+	for _, c := range cases {
+		got, err := deriveHost(c.raw)
+		if err != nil {
+			t.Errorf("deriveHost(%q) error: %v", c.raw, err)
+			continue
+		}
+		if got != c.want {
+			t.Errorf("deriveHost(%q) = %q, want %q", c.raw, got, c.want)
+		}
+	}
+}
+
+// TestDeriveHost_Empty requires deriveHost to report an error for an
+// empty URL string.
+func TestDeriveHost_Empty(t *testing.T) {
+	_, err := deriveHost("")
+	if err != nil {
+		return // empty input was rejected, as required
+	}
+	t.Errorf("deriveHost(\"\") should error")
+}
+
+// TestParseIperfJSON_ParsesEndMap confirms the full end map is returned
+// so extras can show every field iperf produced, not just the three we
+// extract by hand.
+func TestParseIperfJSON_ParsesEndMap(t *testing.T) {
+	raw := `{
+		"end": {
+			"sum_sent": {"bits_per_second": 1000000, "retransmits": 0, "bytes": 125000},
+			"cpu_utilization_percent": {"host_total": 12.3}
+		}
+	}`
+	_, _, _, endMap, err := parseIperfJSON([]byte(raw))
+	if err != nil {
+		t.Fatalf("parseIperfJSON: %v", err)
+	}
+	if endMap == nil {
+		t.Fatalf("endMap is nil")
+	}
+	// Sanity: both keys round-trip via json.
+	b, _ := json.Marshal(endMap)
+	if len(b) == 0 {
+		t.Errorf("endMap marshaled to empty")
+	}
+}