23c689aa5b
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
145 lines
5.2 KiB
Go
145 lines
5.2 KiB
Go
// Package tests contains the per-stage executors the agent runs on the
// host under test. Each stage implements Runner, is called with a
// Context that carries the client + forwarder + run params, and returns
// an Outcome that the caller POSTs to /result.
|
|
package tests
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"time"
|
|
)
|
|
|
|
// Outcome is what a stage returns; it maps directly to the /result body.
//
//   - Passed reports whether the stage succeeded. A skipped stage still
//     counts as a pass but surfaces in the tile summary so operators can
//     see e.g. "GPU: skipped (no VGA device)".
//     NOTE(review): earlier wording keyed this on len(Skipped)>0, but
//     Outcome declares no Skipped field; presumably the skip reason
//     travels in Summary or Extras — confirm against the stages.
//   - Message is only used on failure; the UI displays it in the log.
//   - Summary is the short human-readable line; MarshalSummary emits it
//     under the "summary" key when it is non-empty.
//   - Extras is merged into the posted summary so stages can add their
//     own shape (e.g. Storage returns per-disk probe results).
//   - SubSteps carries agent-authored sub-step rows (CPU/Memory passes,
//     per-disk SMART, per-device GPU, …). Empty for stages with no
//     natural breakdown; persisted verbatim by the /result handler.
type Outcome struct {
	Passed   bool
	Message  string
	Summary  string          // short human-readable one-liner
	Extras   map[string]any  // merged into posted summary JSON
	SubSteps []SubStepReport // agent-authored granular rows
}

// SubStepReport is one entry a stage contributes to its sub-step list.
//
// The row's ordinal is assigned from the order entries appear in the
// slice, which is why there is no Ordinal field for the agent to set.
// State is derived from Passed/Skipped: Skipped wins if set, otherwise
// Passed selects between passed and failed. StartedAt/CompletedAt are
// required so the UI can order rows and slice the stage log by time
// window.
type SubStepReport struct {
	Name        string
	Passed      bool
	Skipped     bool // takes precedence over Passed when deriving state
	StartedAt   time.Time
	CompletedAt time.Time
	SummaryJSON json.RawMessage // pre-encoded per-row summary payload
}

// MarshalSummary builds the summary JSON body POSTed to /result.
//
// Stages accumulate fields via Extras; this helper copies them into a
// fresh map (so the caller's Extras is never mutated), adds "summary"
// (the human-readable line) when Summary is non-empty, and serializes
// the result. A non-empty Summary overrides any "summary" key already
// present in Extras. A nil or empty Extras with an empty Summary yields
// the empty object "{}".
//
// The returned error is whatever json.Marshal reports, e.g. when Extras
// holds a value that cannot be encoded.
func (o Outcome) MarshalSummary() (json.RawMessage, error) {
	// +1 leaves room for the "summary" key without a regrow.
	body := make(map[string]any, len(o.Extras)+1)
	for k, v := range o.Extras {
		body[k] = v
	}
	if o.Summary != "" {
		body["summary"] = o.Summary
	}
	return json.Marshal(body)
}
|
|
|
|
// Deps bundles what stages need without pulling in the whole agent.
|
|
// Logger methods print to stdout + forward to the orchestrator; Sensor
|
|
// drops numeric samples; OverrideFlags carries operator-set bypasses.
|
|
//
|
|
// CPUStressKnobs / StorageKnobs / NetworkKnobs are Phase-2 profile
|
|
// knobs. Zero-valued fields mean "fall back to the compile-time
|
|
// default" — that keeps the stages runnable even when the runner can't
|
|
// materialize a profile (tests, legacy orchestrator, etc).
|
|
type Deps struct {
|
|
Info func(string)
|
|
Warn func(string)
|
|
Error func(string)
|
|
Sensor func(ctx context.Context, samples []Sample) error
|
|
OverrideWipe bool
|
|
NonDestructive bool // skip wipe-probe + writes in Storage
|
|
ExpectedDisks []ExpectedDisk // serials + sizes from host.expected_spec
|
|
StageTimeout time.Duration
|
|
CPUStressKnobs CPUStressKnobs
|
|
StorageKnobs StorageKnobs
|
|
NetworkKnobs NetworkKnobs
|
|
BurnKnobs BurnKnobs
|
|
// LookPath is the unit-test seam for swapping a real external
|
|
// binary (stress-ng, fio, iperf3, dmidecode, …) for a fake. When
|
|
// nil the stage falls back to os/exec.LookPath — production and
|
|
// existing tests keep working unchanged. Tests under
|
|
// agent/tests/fakes/ populate this to redirect lookups to a built
|
|
// fake binary in a tempdir.
|
|
LookPath func(name string) (string, error)
|
|
}
|
|
|
|
// CPUStressKnobs parameterizes the CPUStress stage. Zero durations fall
// back to the package's compile-time defaults (cpuPassDuration etc).
type CPUStressKnobs struct {
	CPUPass  time.Duration // length of the CPU pass
	MemPass  time.Duration // length of the memory pass
	EDACPoll time.Duration // EDAC poll setting; presumably the polling interval — confirm in the stage
}
|
|
|
|
// StorageKnobs parameterizes the Storage stage. Mode picks between
// "fio_sample" (bounded tempfile inside the device, quick profile) and
// "full_disk" (whole-device write verify, deep/soak). Empty strings
// fall back to the stage's safe defaults.
type StorageKnobs struct {
	Mode    string        // "fio_sample" or "full_disk"
	FioSize string        // fio size setting, kept as a string
	FioTime time.Duration // fio time setting
	FioBS   string        // fio block-size setting, kept as a string
	FioRW   string        // fio read/write pattern setting
	Verify  string        // verification setting — presumably fio's --verify method; confirm in the stage
}
|
|
|
|
// NetworkKnobs parameterizes the Network stage.
type NetworkKnobs struct {
	// Duration is the stage's duration knob. NOTE(review): a zero value
	// presumably falls back to a compile-time default, matching the
	// convention stated on Deps — confirm in the Network stage.
	Duration time.Duration
}
|
|
|
|
// BurnKnobs parameterizes the Burn super-stage, whose sub-workloads run
// concurrently inside a single window.
type BurnKnobs struct {
	// Duration is the total Burn window.
	Duration time.Duration

	// CPUWorkers is "all" (runtime.NumCPU) or a numeric string.
	CPUWorkers string

	// MemPct is a percentage of MemAvailable to allocate for the memory
	// burner (clamped 0-90 by the stage).
	MemPct int

	// FioOnSpare gates the storage sub-workload: true = fio runs against
	// the allow-listed disks for the same window; false = skip fio.
	FioOnSpare bool

	// IperfParallel feeds iperf3 -P to generate sustained NIC load.
	IperfParallel int
}
|
|
|
|
// Sample mirrors the server's SensorSample but lives in the tests
// package so probe code doesn't import internal/api.
type Sample struct {
	Kind  string  // sample category
	Key   string  // identifier within the category
	Value float64 // numeric reading
	Unit  string  // unit label for Value
}
|
|
|
|
// ExpectedDisk is the subset of internal/spec.DiskSpec that Storage
// needs: a device allowlist keyed on serial.
type ExpectedDisk struct {
	Serial string // disk serial; the allowlist key
	SizeGB int    // expected size in GB, per the field name
}
|