Files
josh 23c689aa5b
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled
deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00

219 lines
6.8 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package tests
import (
"encoding/json"
"testing"
"time"
)
// TestParseFioJSON_ATAReadWrite feeds parseFioJSON a single job that
// carries both read and write statistics and checks the IOPS, read
// bandwidth, and p99 latency it reports. The fixture's clat_ns
// percentiles are in nanoseconds; the assertions expect the parsed
// result to hold them in microseconds.
func TestParseFioJSON_ATAReadWrite(t *testing.T) {
	const fioOutput = `{
"jobs": [{
"read": {"iops": 1234.5, "bw": 5000, "clat_ns": {"percentile": {"99.000000": 250000}}},
"write": {"iops": 432.1, "bw": 2000, "clat_ns": {"percentile": {"99.000000": 500000}}}
}]
}`

	res, err := parseFioJSON([]byte(fioOutput))
	if err != nil {
		t.Fatalf("parseFioJSON: %v", err)
	}

	// Throughput counters are expected to pass through unchanged.
	if got := res.ReadIOPS; got != 1234.5 {
		t.Errorf("ReadIOPS = %v, want 1234.5", got)
	}
	if got := res.WriteIOPS; got != 432.1 {
		t.Errorf("WriteIOPS = %v, want 432.1", got)
	}
	// NOTE(review): the fixture's write-side bw (2000) is not asserted
	// anywhere in this test; only the read side is checked.
	if got := res.ReadBWKBps; got != 5000 {
		t.Errorf("ReadBWKBps = %v, want 5000", got)
	}

	// Latency: 250000 ns on the read side should surface as 250 us,
	// and 500000 ns on the write side as 500 us.
	if got := res.ReadP99Us; got != 250 {
		t.Errorf("ReadP99Us = %v, want 250", got)
	}
	if got := res.WriteP99Us; got != 500 {
		t.Errorf("WriteP99Us = %v, want 500", got)
	}
}
// TestParseFioJSON_ReadOnlyJob covers a job whose write side has no
// clat_ns block at all (the shape a randread job produces). The write
// p99 must stay at zero while the read p99 is still converted from
// nanoseconds to microseconds.
func TestParseFioJSON_ReadOnlyJob(t *testing.T) {
	const fioOutput = `{
"jobs": [{
"read": {"iops": 1000, "bw": 4000, "clat_ns": {"percentile": {"99.000000": 100000}}},
"write": {"iops": 0, "bw": 0}
}]
}`

	res, err := parseFioJSON([]byte(fioOutput))
	if err != nil {
		t.Fatalf("parseFioJSON: %v", err)
	}

	// No write percentiles in the fixture, so nothing should be reported.
	if got := res.WriteP99Us; got != 0 {
		t.Errorf("WriteP99Us = %v on read-only job, want 0", got)
	}
	// 100000 ns → 100 us
	if got := res.ReadP99Us; got != 100 {
		t.Errorf("ReadP99Us = %v, want 100", got)
	}
}
// TestParseFioJSON_NoJobs requires parseFioJSON to return an error for
// an empty jobs array instead of handing back an all-zero result: with
// no jobs listed, fio did not run anything worth reporting.
func TestParseFioJSON_NoJobs(t *testing.T) {
	_, err := parseFioJSON([]byte(`{"jobs": []}`))
	if err != nil {
		return
	}
	t.Errorf("expected error on empty jobs array")
}
// TestExtractSMARTAttrs_ATA decodes a fixture shaped like smartctl's
// ata_smart_attributes.table and checks which attributes
// extractSMARTAttrs returns: Reallocated_Sector_Ct and
// Current_Pending_Sector must come back with their raw values, while
// Spin_Retry_Count must be absent from the result.
func TestExtractSMARTAttrs_ATA(t *testing.T) {
	const fixture = `{
"ata_smart_attributes": {
"table": [
{"name": "Reallocated_Sector_Ct", "raw": {"value": 7}},
{"name": "Current_Pending_Sector", "raw": {"value": 3}},
{"name": "Spin_Retry_Count", "raw": {"value": 99}}
]
}
}`

	decoded := make(map[string]any)
	if err := json.Unmarshal([]byte(fixture), &decoded); err != nil {
		t.Fatalf("unmarshal fixture: %v", err)
	}

	attrs := extractSMARTAttrs(decoded)

	if got := attrs["Reallocated_Sector_Ct"]; got != 7 {
		t.Errorf("Reallocated_Sector_Ct = %v, want 7", got)
	}
	if got := attrs["Current_Pending_Sector"]; got != 3 {
		t.Errorf("Current_Pending_Sector = %v, want 3", got)
	}
	// Attributes the extractor does not track are dropped without error.
	if _, present := attrs["Spin_Retry_Count"]; present {
		t.Errorf("Spin_Retry_Count should not appear (not in whitelist)")
	}
}
// TestExtractSMARTAttrs_NVMe decodes a fixture shaped like the
// nvme_smart_health_information_log object (a flat key/value map) and
// checks that media_errors, num_err_log_entries, and percentage_used
// are extracted while temperature is left out of the result.
func TestExtractSMARTAttrs_NVMe(t *testing.T) {
	const fixture = `{
"nvme_smart_health_information_log": {
"media_errors": 2,
"num_err_log_entries": 15,
"percentage_used": 7,
"temperature": 42
}
}`

	decoded := make(map[string]any)
	if err := json.Unmarshal([]byte(fixture), &decoded); err != nil {
		t.Fatalf("unmarshal fixture: %v", err)
	}

	attrs := extractSMARTAttrs(decoded)

	if got := attrs["media_errors"]; got != 2 {
		t.Errorf("media_errors = %v, want 2", got)
	}
	if got := attrs["num_err_log_entries"]; got != 15 {
		t.Errorf("num_err_log_entries = %v, want 15", got)
	}
	if got := attrs["percentage_used"]; got != 7 {
		t.Errorf("percentage_used = %v, want 7", got)
	}
	// Health-log fields the extractor does not track must not leak through.
	if _, present := attrs["temperature"]; present {
		t.Errorf("temperature should not appear (not in whitelist)")
	}
}
// TestDiffSMARTAttrs checks that diffSMARTAttrs reports end minus start
// for each (device, attribute) pair. Only attributes present in both
// snapshots are expected to yield a delta; an attribute that exists on
// just one side must be omitted rather than reported with a misleading
// value.
func TestDiffSMARTAttrs(t *testing.T) {
	before := smartAttrMap{
		"/dev/sda": {"Reallocated_Sector_Ct": 5, "Current_Pending_Sector": 0},
	}
	after := smartAttrMap{
		"/dev/sda": {"Reallocated_Sector_Ct": 8, "Current_Pending_Sector": 2, "UDMA_CRC_Error_Count": 1},
	}

	delta := diffSMARTAttrs(before, after)
	sda := delta["/dev/sda"]

	// 8 - 5
	if got := sda["Reallocated_Sector_Ct"]; got != 3 {
		t.Errorf("Reallocated_Sector_Ct delta = %v, want 3", got)
	}
	// 2 - 0
	if got := sda["Current_Pending_Sector"]; got != 2 {
		t.Errorf("Current_Pending_Sector delta = %v, want 2", got)
	}
	// Present only in the end snapshot, so there is no baseline to diff.
	if _, present := sda["UDMA_CRC_Error_Count"]; present {
		t.Errorf("UDMA_CRC_Error_Count should not appear (missing at start)")
	}
}
// TestDiffSMARTAttrs_DeviceNewAtEnd covers a device that shows up only
// in the end snapshot (for example a drive hot-plugged mid-run, or a
// SMART read that succeeded only at the end). With no start baseline to
// subtract from, the device must not appear in the diff at all.
func TestDiffSMARTAttrs_DeviceNewAtEnd(t *testing.T) {
	onlyAtEnd := smartAttrMap{
		"/dev/sda": {"Reallocated_Sector_Ct": 10},
	}

	delta := diffSMARTAttrs(smartAttrMap{}, onlyAtEnd)

	if _, present := delta["/dev/sda"]; present {
		t.Errorf("/dev/sda should drop from diff when absent at start")
	}
}
// TestResolveFioOpts_Defaults passes zero-valued StorageKnobs to
// resolveFioOpts and pins the defaults it must fall back to: the
// fio_sample mode with a 1GiB size, a 3-minute runtime, 4k blocks,
// randrw, and md5 verification. A stage that arrives without
// per-profile knobs (legacy claim response, test harness) therefore
// still gets a bounded workload rather than unbounded writes.
func TestResolveFioOpts_Defaults(t *testing.T) {
	opts := resolveFioOpts(StorageKnobs{})

	if got := opts.Mode; got != "fio_sample" {
		t.Errorf("Mode = %q, want fio_sample", got)
	}
	if got := opts.Size; got != "1GiB" {
		t.Errorf("Size = %q, want 1GiB", got)
	}
	if got := opts.Runtime; got != 3*time.Minute {
		t.Errorf("Runtime = %v, want 3m", got)
	}
	if got := opts.BS; got != "4k" {
		t.Errorf("BS = %q, want 4k", got)
	}
	if got := opts.RW; got != "randrw" {
		t.Errorf("RW = %q, want randrw", got)
	}
	if got := opts.Verify; got != "md5" {
		t.Errorf("Verify = %q, want md5", got)
	}
}
// TestResolveFioOpts_FullDiskOverride confirms the deep/soak shape
// round-trips. FioTime as 2h overrides the 3-minute default.
func TestResolveFioOpts_FullDiskOverride(t *testing.T) {
k := StorageKnobs{
Mode: "full_disk",
FioTime: 2 * time.Hour,
FioBS: "64k",
FioRW: "write",
}
o := resolveFioOpts(k)
if o.Mode != "full_disk" {
t.Errorf("Mode = %q, want full_disk", o.Mode)
}
if o.Runtime != 2*time.Hour {
t.Errorf("Runtime = %v, want 2h", o.Runtime)
}
if o.BS != "64k" {
t.Errorf("BS = %q, want 64k", o.BS)
}
if o.RW != "write" {
t.Errorf("RW = %q, want write", o.RW)
}
// Verify should fall back to md5 default since knob was empty.
if o.Verify != "md5" {
t.Errorf("Verify = %q, want md5 (default)", o.Verify)
}
}