deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,218 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestParseFioJSON_ATAReadWrite confirms we pull IOPS, BW, and p99
|
||||
// latency from both read and write sides. P99 is read from clat_ns and
|
||||
// converted ns → us (the unit we emit to the threshold evaluator).
|
||||
func TestParseFioJSON_ATAReadWrite(t *testing.T) {
|
||||
raw := `{
|
||||
"jobs": [{
|
||||
"read": {"iops": 1234.5, "bw": 5000, "clat_ns": {"percentile": {"99.000000": 250000}}},
|
||||
"write": {"iops": 432.1, "bw": 2000, "clat_ns": {"percentile": {"99.000000": 500000}}}
|
||||
}]
|
||||
}`
|
||||
r, err := parseFioJSON([]byte(raw))
|
||||
if err != nil {
|
||||
t.Fatalf("parseFioJSON: %v", err)
|
||||
}
|
||||
if r.ReadIOPS != 1234.5 {
|
||||
t.Errorf("ReadIOPS = %v, want 1234.5", r.ReadIOPS)
|
||||
}
|
||||
if r.WriteIOPS != 432.1 {
|
||||
t.Errorf("WriteIOPS = %v, want 432.1", r.WriteIOPS)
|
||||
}
|
||||
if r.ReadBWKBps != 5000 {
|
||||
t.Errorf("ReadBWKBps = %v, want 5000", r.ReadBWKBps)
|
||||
}
|
||||
// 250000 ns → 250 us
|
||||
if r.ReadP99Us != 250 {
|
||||
t.Errorf("ReadP99Us = %v, want 250", r.ReadP99Us)
|
||||
}
|
||||
// 500000 ns → 500 us
|
||||
if r.WriteP99Us != 500 {
|
||||
t.Errorf("WriteP99Us = %v, want 500", r.WriteP99Us)
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseFioJSON_ReadOnlyJob: if only one side has p99 populated the
|
||||
// other stays zero (not emitted as a sample). Mirrors a randread job.
|
||||
func TestParseFioJSON_ReadOnlyJob(t *testing.T) {
|
||||
raw := `{
|
||||
"jobs": [{
|
||||
"read": {"iops": 1000, "bw": 4000, "clat_ns": {"percentile": {"99.000000": 100000}}},
|
||||
"write": {"iops": 0, "bw": 0}
|
||||
}]
|
||||
}`
|
||||
r, err := parseFioJSON([]byte(raw))
|
||||
if err != nil {
|
||||
t.Fatalf("parseFioJSON: %v", err)
|
||||
}
|
||||
if r.WriteP99Us != 0 {
|
||||
t.Errorf("WriteP99Us = %v on read-only job, want 0", r.WriteP99Us)
|
||||
}
|
||||
if r.ReadP99Us != 100 {
|
||||
t.Errorf("ReadP99Us = %v, want 100", r.ReadP99Us)
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseFioJSON_NoJobs fails rather than reporting zeroes silently.
|
||||
// An empty jobs array means fio didn't run anything.
|
||||
func TestParseFioJSON_NoJobs(t *testing.T) {
|
||||
raw := `{"jobs": []}`
|
||||
if _, err := parseFioJSON([]byte(raw)); err == nil {
|
||||
t.Errorf("expected error on empty jobs array")
|
||||
}
|
||||
}
|
||||
|
||||
// TestExtractSMARTAttrs_ATA picks attributes out of ata_smart_attributes.table
|
||||
// when present. Attributes outside the whitelist drop out silently.
|
||||
func TestExtractSMARTAttrs_ATA(t *testing.T) {
|
||||
raw := map[string]any{}
|
||||
smartJSON := `{
|
||||
"ata_smart_attributes": {
|
||||
"table": [
|
||||
{"name": "Reallocated_Sector_Ct", "raw": {"value": 7}},
|
||||
{"name": "Current_Pending_Sector", "raw": {"value": 3}},
|
||||
{"name": "Spin_Retry_Count", "raw": {"value": 99}}
|
||||
]
|
||||
}
|
||||
}`
|
||||
if err := json.Unmarshal([]byte(smartJSON), &raw); err != nil {
|
||||
t.Fatalf("unmarshal fixture: %v", err)
|
||||
}
|
||||
out := extractSMARTAttrs(raw)
|
||||
if out["Reallocated_Sector_Ct"] != 7 {
|
||||
t.Errorf("Reallocated_Sector_Ct = %v, want 7", out["Reallocated_Sector_Ct"])
|
||||
}
|
||||
if out["Current_Pending_Sector"] != 3 {
|
||||
t.Errorf("Current_Pending_Sector = %v, want 3", out["Current_Pending_Sector"])
|
||||
}
|
||||
if _, ok := out["Spin_Retry_Count"]; ok {
|
||||
t.Errorf("Spin_Retry_Count should not appear (not in whitelist)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestExtractSMARTAttrs_NVMe picks media_errors and friends from the
|
||||
// nvme health log shape, which is a flat map at the top of the JSON.
|
||||
func TestExtractSMARTAttrs_NVMe(t *testing.T) {
|
||||
raw := map[string]any{}
|
||||
smartJSON := `{
|
||||
"nvme_smart_health_information_log": {
|
||||
"media_errors": 2,
|
||||
"num_err_log_entries": 15,
|
||||
"percentage_used": 7,
|
||||
"temperature": 42
|
||||
}
|
||||
}`
|
||||
if err := json.Unmarshal([]byte(smartJSON), &raw); err != nil {
|
||||
t.Fatalf("unmarshal fixture: %v", err)
|
||||
}
|
||||
out := extractSMARTAttrs(raw)
|
||||
if out["media_errors"] != 2 {
|
||||
t.Errorf("media_errors = %v, want 2", out["media_errors"])
|
||||
}
|
||||
if out["num_err_log_entries"] != 15 {
|
||||
t.Errorf("num_err_log_entries = %v, want 15", out["num_err_log_entries"])
|
||||
}
|
||||
if out["percentage_used"] != 7 {
|
||||
t.Errorf("percentage_used = %v, want 7", out["percentage_used"])
|
||||
}
|
||||
if _, ok := out["temperature"]; ok {
|
||||
t.Errorf("temperature should not appear (not in whitelist)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestDiffSMARTAttrs: end − start per (device, attr). Only attrs in
|
||||
// both snapshots yield a delta; any disappearing attribute just drops
|
||||
// out instead of showing a misleading negative.
|
||||
func TestDiffSMARTAttrs(t *testing.T) {
|
||||
start := smartAttrMap{
|
||||
"/dev/sda": {"Reallocated_Sector_Ct": 5, "Current_Pending_Sector": 0},
|
||||
}
|
||||
end := smartAttrMap{
|
||||
"/dev/sda": {"Reallocated_Sector_Ct": 8, "Current_Pending_Sector": 2, "UDMA_CRC_Error_Count": 1},
|
||||
}
|
||||
out := diffSMARTAttrs(start, end)
|
||||
if out["/dev/sda"]["Reallocated_Sector_Ct"] != 3 {
|
||||
t.Errorf("Reallocated_Sector_Ct delta = %v, want 3", out["/dev/sda"]["Reallocated_Sector_Ct"])
|
||||
}
|
||||
if out["/dev/sda"]["Current_Pending_Sector"] != 2 {
|
||||
t.Errorf("Current_Pending_Sector delta = %v, want 2", out["/dev/sda"]["Current_Pending_Sector"])
|
||||
}
|
||||
if _, ok := out["/dev/sda"]["UDMA_CRC_Error_Count"]; ok {
|
||||
t.Errorf("UDMA_CRC_Error_Count should not appear (missing at start)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestDiffSMARTAttrs_DeviceNewAtEnd: a device only present in the end
|
||||
// snapshot (drive hot-plugged mid-run, or SMART read succeeded only at
|
||||
// end) is dropped from the diff — no start baseline to subtract from.
|
||||
func TestDiffSMARTAttrs_DeviceNewAtEnd(t *testing.T) {
|
||||
start := smartAttrMap{}
|
||||
end := smartAttrMap{
|
||||
"/dev/sda": {"Reallocated_Sector_Ct": 10},
|
||||
}
|
||||
out := diffSMARTAttrs(start, end)
|
||||
if _, ok := out["/dev/sda"]; ok {
|
||||
t.Errorf("/dev/sda should drop from diff when absent at start")
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveFioOpts_Defaults: zero-valued knobs resolve to the quick
|
||||
// profile's fio_sample shape. Any stage that's missing per-profile
|
||||
// knobs (legacy claim response, test harness) still has coherent
|
||||
// bounded defaults — we won't accidentally fall into unbounded writes.
|
||||
func TestResolveFioOpts_Defaults(t *testing.T) {
|
||||
o := resolveFioOpts(StorageKnobs{})
|
||||
if o.Mode != "fio_sample" {
|
||||
t.Errorf("Mode = %q, want fio_sample", o.Mode)
|
||||
}
|
||||
if o.Size != "1GiB" {
|
||||
t.Errorf("Size = %q, want 1GiB", o.Size)
|
||||
}
|
||||
if o.Runtime != 3*time.Minute {
|
||||
t.Errorf("Runtime = %v, want 3m", o.Runtime)
|
||||
}
|
||||
if o.BS != "4k" {
|
||||
t.Errorf("BS = %q, want 4k", o.BS)
|
||||
}
|
||||
if o.RW != "randrw" {
|
||||
t.Errorf("RW = %q, want randrw", o.RW)
|
||||
}
|
||||
if o.Verify != "md5" {
|
||||
t.Errorf("Verify = %q, want md5", o.Verify)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveFioOpts_FullDiskOverride confirms the deep/soak shape
|
||||
// round-trips. FioTime as 2h overrides the 3-minute default.
|
||||
func TestResolveFioOpts_FullDiskOverride(t *testing.T) {
|
||||
k := StorageKnobs{
|
||||
Mode: "full_disk",
|
||||
FioTime: 2 * time.Hour,
|
||||
FioBS: "64k",
|
||||
FioRW: "write",
|
||||
}
|
||||
o := resolveFioOpts(k)
|
||||
if o.Mode != "full_disk" {
|
||||
t.Errorf("Mode = %q, want full_disk", o.Mode)
|
||||
}
|
||||
if o.Runtime != 2*time.Hour {
|
||||
t.Errorf("Runtime = %v, want 2h", o.Runtime)
|
||||
}
|
||||
if o.BS != "64k" {
|
||||
t.Errorf("BS = %q, want 64k", o.BS)
|
||||
}
|
||||
if o.RW != "write" {
|
||||
t.Errorf("RW = %q, want write", o.RW)
|
||||
}
|
||||
// Verify should fall back to md5 default since knob was empty.
|
||||
if o.Verify != "md5" {
|
||||
t.Errorf("Verify = %q, want md5 (default)", o.Verify)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user