deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,144 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestSensorMux_CloseFlushesBuffer confirms Close() empties the
|
||||
// pending buffer through the HTTP client before returning. Without
|
||||
// this guarantee a Burn run would drop the last 2 s of samples when
|
||||
// the stage tears down, which is exactly the window that contains the
|
||||
// peak-load PSU / thermal readings we care about.
|
||||
func TestSensorMux_CloseFlushesBuffer(t *testing.T) {
|
||||
var batches int32
|
||||
var totalSamples int32
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if !strings.HasSuffix(r.URL.Path, "/sensor") {
|
||||
t.Errorf("unexpected path %s", r.URL.Path)
|
||||
}
|
||||
body, _ := io.ReadAll(r.Body)
|
||||
var env struct {
|
||||
Samples []SensorSample `json:"samples"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &env); err != nil {
|
||||
t.Errorf("decode: %v", err)
|
||||
}
|
||||
atomic.AddInt32(&batches, 1)
|
||||
atomic.AddInt32(&totalSamples, int32(len(env.Samples)))
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
c := &Client{
|
||||
BaseURL: srv.URL,
|
||||
RunID: 1,
|
||||
Token: "t",
|
||||
HTTP: srv.Client(),
|
||||
}
|
||||
mux := NewSensorMux(context.Background(), c)
|
||||
mux.Send([]SensorSample{
|
||||
{Kind: "temp", Key: "cpu/0", Value: 72.5, Unit: "C"},
|
||||
{Kind: "psu_volt", Key: "+12V", Value: 12.05, Unit: "V"},
|
||||
})
|
||||
mux.Send([]SensorSample{
|
||||
{Kind: "mce", Key: "0", Value: 0, Unit: "count"},
|
||||
})
|
||||
mux.Close()
|
||||
|
||||
if got := atomic.LoadInt32(&totalSamples); got != 3 {
|
||||
t.Errorf("expected 3 samples flushed, got %d across %d batch(es)", got, atomic.LoadInt32(&batches))
|
||||
}
|
||||
if atomic.LoadInt32(&batches) == 0 {
|
||||
t.Errorf("expected at least one batch HTTP post")
|
||||
}
|
||||
}
|
||||
|
||||
// TestSensorMux_ChunksOversizedBatch verifies flushChunks splits a
|
||||
// single oversized input into maxBatch-sized HTTP requests. The plan's
|
||||
// Burn stage can legitimately push a single input larger than the cap
|
||||
// (e.g. a workload goroutine dumping a backlog), and a single giant
|
||||
// POST would defeat the point of the multiplexer.
|
||||
func TestSensorMux_ChunksOversizedBatch(t *testing.T) {
|
||||
var batchSizes []int
|
||||
var mu sync.Mutex
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
body, _ := io.ReadAll(r.Body)
|
||||
var env struct {
|
||||
Samples []SensorSample `json:"samples"`
|
||||
}
|
||||
_ = json.Unmarshal(body, &env)
|
||||
mu.Lock()
|
||||
batchSizes = append(batchSizes, len(env.Samples))
|
||||
mu.Unlock()
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
c := &Client{BaseURL: srv.URL, RunID: 1, Token: "t", HTTP: srv.Client()}
|
||||
mux := NewSensorMux(context.Background(), c)
|
||||
|
||||
// One input with 1200 samples → expect chunks of 500 + 500 + 200
|
||||
// given the default maxBatch of 500.
|
||||
big := make([]SensorSample, 1200)
|
||||
for i := range big {
|
||||
big[i] = SensorSample{Kind: "burn/throughput_mbps", Key: "eth0", Value: float64(i), Unit: "Mbps"}
|
||||
}
|
||||
mux.Send(big)
|
||||
mux.Close()
|
||||
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
total := 0
|
||||
for _, n := range batchSizes {
|
||||
total += n
|
||||
if n > 500 {
|
||||
t.Errorf("batch size %d exceeds maxBatch=500", n)
|
||||
}
|
||||
}
|
||||
if total != 1200 {
|
||||
t.Errorf("sum of batch sizes = %d, want 1200 (sizes=%v)", total, batchSizes)
|
||||
}
|
||||
if len(batchSizes) < 3 {
|
||||
t.Errorf("expected at least 3 chunks for a 1200-sample input, got %d (%v)", len(batchSizes), batchSizes)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSensorMux_EmptyAndNilSafe covers the defensive guards around
|
||||
// Send(nil) / Send([]) / a nil *SensorMux. Callers with conditional
|
||||
// sample lists (storage probe that skipped a disk, GPU stage with no
|
||||
// devices) should be able to call Send unconditionally without adding
|
||||
// their own nil check.
|
||||
func TestSensorMux_EmptyAndNilSafe(t *testing.T) {
|
||||
var batches int32
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||
atomic.AddInt32(&batches, 1)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
// Nil receiver must be a no-op.
|
||||
var nilMux *SensorMux
|
||||
nilMux.Send([]SensorSample{{Kind: "x", Key: "y"}})
|
||||
nilMux.Close()
|
||||
|
||||
c := &Client{BaseURL: srv.URL, RunID: 1, Token: "t", HTTP: srv.Client()}
|
||||
mux := NewSensorMux(context.Background(), c)
|
||||
mux.Send(nil)
|
||||
mux.Send([]SensorSample{})
|
||||
mux.Close()
|
||||
|
||||
// Give any spurious goroutine a chance to surprise us.
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
if atomic.LoadInt32(&batches) != 0 {
|
||||
t.Errorf("empty/nil Send must not produce HTTP batches, got %d", atomic.LoadInt32(&batches))
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user