deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -0,0 +1,218 @@
+package tests
+
+import (
+	"encoding/json"
+	"testing"
+	"time"
+)
+
+// TestParseFioJSON_ATAReadWrite confirms we pull IOPS, BW, and p99
+// latency from both read and write sides. P99 is read from clat_ns and
+// converted ns → us (the unit we emit to the threshold evaluator).
+func TestParseFioJSON_ATAReadWrite(t *testing.T) {
+	raw := `{
+		"jobs": [{
+			"read":  {"iops": 1234.5, "bw": 5000, "clat_ns": {"percentile": {"99.000000": 250000}}},
+			"write": {"iops": 432.1,  "bw": 2000, "clat_ns": {"percentile": {"99.000000": 500000}}}
+		}]
+	}`
+	r, err := parseFioJSON([]byte(raw))
+	if err != nil {
+		t.Fatalf("parseFioJSON: %v", err)
+	}
+	if r.ReadIOPS != 1234.5 {
+		t.Errorf("ReadIOPS = %v, want 1234.5", r.ReadIOPS)
+	}
+	if r.WriteIOPS != 432.1 {
+		t.Errorf("WriteIOPS = %v, want 432.1", r.WriteIOPS)
+	}
+	if r.ReadBWKBps != 5000 {
+		t.Errorf("ReadBWKBps = %v, want 5000", r.ReadBWKBps)
+	}
+	// 250000 ns → 250 us
+	if r.ReadP99Us != 250 {
+		t.Errorf("ReadP99Us = %v, want 250", r.ReadP99Us)
+	}
+	// 500000 ns → 500 us
+	if r.WriteP99Us != 500 {
+		t.Errorf("WriteP99Us = %v, want 500", r.WriteP99Us)
+	}
+}
+
+// TestParseFioJSON_ReadOnlyJob: if only one side has p99 populated the
+// other stays zero (not emitted as a sample). Mirrors a randread job.
+func TestParseFioJSON_ReadOnlyJob(t *testing.T) {
+	raw := `{
+		"jobs": [{
+			"read":  {"iops": 1000, "bw": 4000, "clat_ns": {"percentile": {"99.000000": 100000}}},
+			"write": {"iops": 0, "bw": 0}
+		}]
+	}`
+	r, err := parseFioJSON([]byte(raw))
+	if err != nil {
+		t.Fatalf("parseFioJSON: %v", err)
+	}
+	if r.WriteP99Us != 0 {
+		t.Errorf("WriteP99Us = %v on read-only job, want 0", r.WriteP99Us)
+	}
+	if r.ReadP99Us != 100 {
+		t.Errorf("ReadP99Us = %v, want 100", r.ReadP99Us)
+	}
+}
+
+// TestParseFioJSON_NoJobs fails rather than reporting zeroes silently.
+// An empty jobs array means fio didn't run anything.
+func TestParseFioJSON_NoJobs(t *testing.T) {
+	raw := `{"jobs": []}`
+	if _, err := parseFioJSON([]byte(raw)); err == nil {
+		t.Errorf("expected error on empty jobs array")
+	}
+}
+
+// TestExtractSMARTAttrs_ATA picks attributes out of ata_smart_attributes.table
+// when present. Attributes outside the whitelist drop out silently.
+func TestExtractSMARTAttrs_ATA(t *testing.T) {
+	raw := map[string]any{}
+	smartJSON := `{
+		"ata_smart_attributes": {
+			"table": [
+				{"name": "Reallocated_Sector_Ct",   "raw": {"value": 7}},
+				{"name": "Current_Pending_Sector",  "raw": {"value": 3}},
+				{"name": "Spin_Retry_Count",        "raw": {"value": 99}}
+			]
+		}
+	}`
+	if err := json.Unmarshal([]byte(smartJSON), &raw); err != nil {
+		t.Fatalf("unmarshal fixture: %v", err)
+	}
+	out := extractSMARTAttrs(raw)
+	if out["Reallocated_Sector_Ct"] != 7 {
+		t.Errorf("Reallocated_Sector_Ct = %v, want 7", out["Reallocated_Sector_Ct"])
+	}
+	if out["Current_Pending_Sector"] != 3 {
+		t.Errorf("Current_Pending_Sector = %v, want 3", out["Current_Pending_Sector"])
+	}
+	if _, ok := out["Spin_Retry_Count"]; ok {
+		t.Errorf("Spin_Retry_Count should not appear (not in whitelist)")
+	}
+}
+
+// TestExtractSMARTAttrs_NVMe picks media_errors and friends from the
+// nvme health log shape, which is a flat map at the top of the JSON.
+func TestExtractSMARTAttrs_NVMe(t *testing.T) {
+	raw := map[string]any{}
+	smartJSON := `{
+		"nvme_smart_health_information_log": {
+			"media_errors": 2,
+			"num_err_log_entries": 15,
+			"percentage_used": 7,
+			"temperature": 42
+		}
+	}`
+	if err := json.Unmarshal([]byte(smartJSON), &raw); err != nil {
+		t.Fatalf("unmarshal fixture: %v", err)
+	}
+	out := extractSMARTAttrs(raw)
+	if out["media_errors"] != 2 {
+		t.Errorf("media_errors = %v, want 2", out["media_errors"])
+	}
+	if out["num_err_log_entries"] != 15 {
+		t.Errorf("num_err_log_entries = %v, want 15", out["num_err_log_entries"])
+	}
+	if out["percentage_used"] != 7 {
+		t.Errorf("percentage_used = %v, want 7", out["percentage_used"])
+	}
+	if _, ok := out["temperature"]; ok {
+		t.Errorf("temperature should not appear (not in whitelist)")
+	}
+}
+
+// TestDiffSMARTAttrs checks the per-device, per-attribute delta
+// (end minus start) produced by diffSMARTAttrs. Attributes present in
+// both snapshots must yield their difference; an attribute that exists
+// only in the end snapshot must not show up in the result.
+func TestDiffSMARTAttrs(t *testing.T) {
+	before := smartAttrMap{
+		"/dev/sda": {"Reallocated_Sector_Ct": 5, "Current_Pending_Sector": 0},
+	}
+	after := smartAttrMap{
+		"/dev/sda": {"Reallocated_Sector_Ct": 8, "Current_Pending_Sector": 2, "UDMA_CRC_Error_Count": 1},
+	}
+	delta := diffSMARTAttrs(before, after)
+	if got := delta["/dev/sda"]["Reallocated_Sector_Ct"]; got != 3 {
+		t.Errorf("Reallocated_Sector_Ct delta = %v, want 3", got)
+	}
+	if got := delta["/dev/sda"]["Current_Pending_Sector"]; got != 2 {
+		t.Errorf("Current_Pending_Sector delta = %v, want 2", got)
+	}
+	// No start value for this attribute, so no delta is expected.
+	if _, present := delta["/dev/sda"]["UDMA_CRC_Error_Count"]; present {
+		t.Errorf("UDMA_CRC_Error_Count should not appear (missing at start)")
+	}
+}
+
+// TestDiffSMARTAttrs_DeviceNewAtEnd checks that a device appearing only
+// in the end snapshot is omitted from the diff, since there is no start
+// value to subtract from.
+func TestDiffSMARTAttrs_DeviceNewAtEnd(t *testing.T) {
+	before := smartAttrMap{}
+	after := smartAttrMap{
+		"/dev/sda": {"Reallocated_Sector_Ct": 10},
+	}
+	delta := diffSMARTAttrs(before, after)
+	if _, present := delta["/dev/sda"]; present {
+		t.Errorf("/dev/sda should drop from diff when absent at start")
+	}
+}
+
+// TestResolveFioOpts_Defaults passes a zero-valued StorageKnobs to
+// resolveFioOpts and pins every default the resolver is expected to
+// fill in: fio_sample mode, 1GiB size, 3-minute runtime, 4k blocks,
+// randrw pattern and md5 verification. A caller that supplies no knobs
+// therefore still gets a bounded, fully specified fio configuration.
+func TestResolveFioOpts_Defaults(t *testing.T) {
+	opts := resolveFioOpts(StorageKnobs{})
+	if got := opts.Mode; got != "fio_sample" {
+		t.Errorf("Mode = %q, want fio_sample", got)
+	}
+	if got := opts.Size; got != "1GiB" {
+		t.Errorf("Size = %q, want 1GiB", got)
+	}
+	if got := opts.Runtime; got != 3*time.Minute {
+		t.Errorf("Runtime = %v, want 3m", got)
+	}
+	if got := opts.BS; got != "4k" {
+		t.Errorf("BS = %q, want 4k", got)
+	}
+	if got := opts.RW; got != "randrw" {
+		t.Errorf("RW = %q, want randrw", got)
+	}
+	if got := opts.Verify; got != "md5" {
+		t.Errorf("Verify = %q, want md5", got)
+	}
+}
+
+// TestResolveFioOpts_FullDiskOverride sets Mode, FioTime, FioBS and
+// FioRW on StorageKnobs and checks that resolveFioOpts carries each of
+// them through to the resolved options, while Verify, which is left
+// unset, still resolves to md5.
+func TestResolveFioOpts_FullDiskOverride(t *testing.T) {
+	opts := resolveFioOpts(StorageKnobs{
+		Mode:    "full_disk",
+		FioTime: 2 * time.Hour,
+		FioBS:   "64k",
+		FioRW:   "write",
+	})
+	if got := opts.Mode; got != "full_disk" {
+		t.Errorf("Mode = %q, want full_disk", got)
+	}
+	if got := opts.Runtime; got != 2*time.Hour {
+		t.Errorf("Runtime = %v, want 2h", got)
+	}
+	if got := opts.BS; got != "64k" {
+		t.Errorf("BS = %q, want 64k", got)
+	}
+	if got := opts.RW; got != "write" {
+		t.Errorf("RW = %q, want write", got)
+	}
+	// No verify knob was supplied, so the default is expected here.
+	if got := opts.Verify; got != "md5" {
+		t.Errorf("Verify = %q, want md5 (default)", got)
+	}
+}