deep profile + threshold gating + firmware stage + Burn super-stage
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+133
-24
@@ -26,6 +26,7 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
@@ -71,7 +72,10 @@ func Run(ctx context.Context, p *bootstate.Params) error {
|
||||
}
|
||||
fwd.info(fmt.Sprintf("claimed run; stages=%v current_state=%s", claim.Stages, claim.CurrentState))
|
||||
|
||||
go thermalSidecar(ctx, c, fwd)
|
||||
mux := NewSensorMux(ctx, c)
|
||||
defer mux.Close()
|
||||
|
||||
go thermalSidecar(ctx, mux, fwd)
|
||||
|
||||
hbCh := make(chan HeartbeatResponse, 4)
|
||||
go heartbeatLoop(ctx, c, fwd, hbCh)
|
||||
@@ -101,7 +105,7 @@ func Run(ctx context.Context, p *bootstate.Params) error {
|
||||
default:
|
||||
}
|
||||
fwd.info("stage: starting " + nextStage)
|
||||
outcome := runStageCancellable(ctx, nextStage, claim, fwd, c, overrideFlags{})
|
||||
outcome := runStageCancellable(ctx, nextStage, claim, fwd, c, mux, overrideFlags{})
|
||||
if outcome.Cancelled {
|
||||
fwd.warn("stage cancelled by operator; posting result and exiting")
|
||||
_, _ = postResult(ctx, c, nextStage, outcome)
|
||||
@@ -119,7 +123,7 @@ func Run(ctx context.Context, p *bootstate.Params) error {
|
||||
return err
|
||||
}
|
||||
// Park and wait for an override directive.
|
||||
return waitForOverride(ctx, c, fwd, hbCh, claim)
|
||||
return waitForOverride(ctx, c, fwd, mux, hbCh, claim)
|
||||
}
|
||||
if resp.NextState == "Completed" || resp.NextState == "" {
|
||||
fwd.info("pipeline complete")
|
||||
@@ -144,10 +148,10 @@ func Run(ctx context.Context, p *bootstate.Params) error {
|
||||
// it runs the inventory probe and passes the result as the /result body
|
||||
// (the orchestrator persists it as an artifact). Every other stage
|
||||
// returns a tests.Outcome which postResult marshals generically.
|
||||
func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, ovr overrideFlags) stageOutcome {
|
||||
func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, mux *SensorMux, ovr overrideFlags) stageOutcome {
|
||||
fwd.SetStage(stage)
|
||||
defer fwd.ClearStage()
|
||||
deps := newDeps(ctx, c, fwd, ovr, claim)
|
||||
deps := newDeps(ctx, c, fwd, mux, ovr, claim, stage)
|
||||
switch stage {
|
||||
case "Inventory":
|
||||
fwd.info("Inventory: probing host hardware")
|
||||
@@ -163,6 +167,25 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF
|
||||
},
|
||||
Inventory: inv,
|
||||
}
|
||||
case "Firmware":
|
||||
fwd.info("Firmware: probing firmware versions")
|
||||
snaps, warns := probes.Firmware(ctx)
|
||||
for _, w := range warns {
|
||||
fwd.warn(w)
|
||||
}
|
||||
summary := firmwareSummary(snaps)
|
||||
fwd.info("Firmware: " + summary)
|
||||
return stageOutcome{
|
||||
Outcome: tests.Outcome{
|
||||
Passed: true,
|
||||
Summary: summary,
|
||||
Extras: map[string]any{
|
||||
"warnings": warns,
|
||||
"snapshots": len(snaps),
|
||||
},
|
||||
},
|
||||
Firmware: snaps,
|
||||
}
|
||||
case "SMART":
|
||||
return stageOutcome{Outcome: tests.SMART(ctx, deps)}
|
||||
case "CPUStress":
|
||||
@@ -170,10 +193,19 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF
|
||||
case "Storage":
|
||||
return stageOutcome{Outcome: tests.Storage(ctx, deps)}
|
||||
case "Network":
|
||||
duration := deps.NetworkKnobs.Duration
|
||||
if duration <= 0 {
|
||||
duration = 10 * time.Second
|
||||
}
|
||||
return stageOutcome{Outcome: tests.Network(ctx, deps, tests.NetworkConfig{
|
||||
OrchestratorURL: c.BaseURL,
|
||||
IperfPort: claim.IperfPort,
|
||||
Duration: 10 * time.Second,
|
||||
Duration: duration,
|
||||
})}
|
||||
case "Burn":
|
||||
return stageOutcome{Outcome: tests.Burn(ctx, deps, tests.BurnConfig{
|
||||
OrchestratorURL: c.BaseURL,
|
||||
IperfPort: claim.IperfPort,
|
||||
})}
|
||||
case "GPU":
|
||||
return stageOutcome{Outcome: tests.GPU(ctx, deps)}
|
||||
@@ -188,8 +220,9 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF
|
||||
|
||||
type stageOutcome struct {
|
||||
Outcome tests.Outcome
|
||||
Inventory *spec.Inventory // only for Inventory stage
|
||||
Cancelled bool // set when the stage was cut short by operator cancel
|
||||
Inventory *spec.Inventory // only for Inventory stage
|
||||
Firmware []probes.FirmwareSnapshot // only for Firmware stage
|
||||
Cancelled bool // set when the stage was cut short by operator cancel
|
||||
}
|
||||
|
||||
// runStageCancellable wraps runStage in a per-stage context so the
|
||||
@@ -197,14 +230,14 @@ type stageOutcome struct {
|
||||
// is currently running. If the derived context was cancelled while the
|
||||
// stage executed, the outcome is rewritten as a cancellation record so
|
||||
// the orchestrator has something to persist.
|
||||
func runStageCancellable(parent context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, ovr overrideFlags) stageOutcome {
|
||||
func runStageCancellable(parent context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, mux *SensorMux, ovr overrideFlags) stageOutcome {
|
||||
stageCtx, cancel := context.WithCancel(parent)
|
||||
stageCancel.Store(cancel)
|
||||
defer func() {
|
||||
cancel()
|
||||
stageCancel.Store(context.CancelFunc(nil))
|
||||
}()
|
||||
out := runStage(stageCtx, stage, claim, fwd, c, ovr)
|
||||
out := runStage(stageCtx, stage, claim, fwd, c, mux, ovr)
|
||||
// If the parent is still live but the stage ctx was cancelled, the
|
||||
// operator fired a cancel — mark the outcome so the caller can exit
|
||||
// the pipeline cleanly. Plain ctx-cancel on ctx.Done (e.g. shutdown)
|
||||
@@ -235,7 +268,7 @@ type overrideFlags struct {
|
||||
Wipe bool `json:"wipe"`
|
||||
}
|
||||
|
||||
func newDeps(ctx context.Context, c *Client, fwd *logForwarder, ovr overrideFlags, claim *ClaimResponse) tests.Deps {
|
||||
func newDeps(ctx context.Context, c *Client, fwd *logForwarder, mux *SensorMux, ovr overrideFlags, claim *ClaimResponse, stage string) tests.Deps {
|
||||
var expected []tests.ExpectedDisk
|
||||
for _, e := range claim.ExpectedDisks {
|
||||
expected = append(expected, tests.ExpectedDisk{Serial: e.Serial, SizeGB: e.SizeGB})
|
||||
@@ -247,17 +280,73 @@ func newDeps(ctx context.Context, c *Client, fwd *logForwarder, ovr overrideFlag
|
||||
OverrideWipe: ovr.Wipe,
|
||||
NonDestructive: claim.NonDestructive,
|
||||
ExpectedDisks: expected,
|
||||
StageTimeout: 2 * time.Minute,
|
||||
Sensor: func(ctx context.Context, samples []tests.Sample) error {
|
||||
StageTimeout: stageTimeout(claim, stage),
|
||||
CPUStressKnobs: tests.CPUStressKnobs{
|
||||
CPUPass: parseDur(claim.StageConfig.CPUStress.CPUPass),
|
||||
MemPass: parseDur(claim.StageConfig.CPUStress.MemPass),
|
||||
EDACPoll: parseDur(claim.StageConfig.CPUStress.EDACPoll),
|
||||
},
|
||||
StorageKnobs: tests.StorageKnobs{
|
||||
Mode: claim.StageConfig.Storage.Mode,
|
||||
FioSize: claim.StageConfig.Storage.FioSize,
|
||||
FioTime: parseDur(claim.StageConfig.Storage.FioTime),
|
||||
FioBS: claim.StageConfig.Storage.FioBS,
|
||||
FioRW: claim.StageConfig.Storage.FioRW,
|
||||
Verify: claim.StageConfig.Storage.Verify,
|
||||
},
|
||||
NetworkKnobs: tests.NetworkKnobs{
|
||||
Duration: parseDur(claim.StageConfig.Network.Duration),
|
||||
},
|
||||
BurnKnobs: tests.BurnKnobs{
|
||||
Duration: parseDur(claim.StageConfig.Burn.Duration),
|
||||
CPUWorkers: claim.StageConfig.Burn.CPUWorkers,
|
||||
MemPct: claim.StageConfig.Burn.MemPct,
|
||||
FioOnSpare: claim.StageConfig.Burn.FioOnSpare,
|
||||
IperfParallel: claim.StageConfig.Burn.IperfParallel,
|
||||
},
|
||||
Sensor: func(_ context.Context, samples []tests.Sample) error {
|
||||
out := make([]SensorSample, 0, len(samples))
|
||||
for _, s := range samples {
|
||||
out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit})
|
||||
}
|
||||
return c.Sensor(ctx, out)
|
||||
mux.Send(out)
|
||||
return nil
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// stageTimeout reads claim.StageConfig.StageTimeouts[stage] and falls
|
||||
// back to 2 minutes (the pre-Phase-2 default). Malformed entries log and
|
||||
// fall back — we'd rather run the stage than refuse on a typo.
|
||||
func stageTimeout(claim *ClaimResponse, stage string) time.Duration {
|
||||
if claim == nil || claim.StageConfig.StageTimeouts == nil {
|
||||
return 2 * time.Minute
|
||||
}
|
||||
raw, ok := claim.StageConfig.StageTimeouts[stage]
|
||||
if !ok || raw == "" {
|
||||
return 2 * time.Minute
|
||||
}
|
||||
d, err := time.ParseDuration(raw)
|
||||
if err != nil || d <= 0 {
|
||||
return 2 * time.Minute
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// parseDur is the permissive duration parser for the knob wire shape.
// It accepts anything time.ParseDuration accepts, ignoring leading and
// trailing whitespace. Empty/blank strings, parse failures, and negative
// durations all yield 0 so callers can treat a zero value as "use the
// compile-time default" without a nil-check dance.
func parseDur(s string) time.Duration {
	// Trim first: a padded value like " 30s" would otherwise fail to
	// parse and silently discard the operator's configured knob.
	s = strings.TrimSpace(s)
	if s == "" {
		return 0
	}
	d, err := time.ParseDuration(s)
	if err != nil || d < 0 {
		return 0
	}
	return d
}
|
||||
|
||||
// postResult marshals stageOutcome for the /result endpoint. The
|
||||
// Inventory shape is special-cased: it includes the inventory blob so
|
||||
// the orchestrator can persist it and run server-side spec diff.
|
||||
@@ -276,6 +365,9 @@ func postResult(ctx context.Context, c *Client, stage string, s stageOutcome) (*
|
||||
if s.Inventory != nil {
|
||||
body["inventory"] = s.Inventory
|
||||
}
|
||||
if len(s.Firmware) > 0 {
|
||||
body["firmware"] = s.Firmware
|
||||
}
|
||||
if len(s.Outcome.SubSteps) > 0 {
|
||||
wire := make([]SubStepReport, 0, len(s.Outcome.SubSteps))
|
||||
for _, ss := range s.Outcome.SubSteps {
|
||||
@@ -304,7 +396,7 @@ func stageForState(state string) string {
|
||||
switch state {
|
||||
case "InventoryCheck":
|
||||
return "Inventory"
|
||||
case "SMART", "CPUStress", "Storage", "Network", "GPU", "PSU":
|
||||
case "Firmware", "SMART", "CPUStress", "Storage", "Network", "Burn", "GPU", "PSU":
|
||||
return state
|
||||
}
|
||||
// SpecValidate and Reporting are orchestrator-owned; we never see
|
||||
@@ -315,7 +407,7 @@ func stageForState(state string) string {
|
||||
// waitForOverride parks the agent in FailedHolding. It listens for a
|
||||
// heartbeat directive that tells it to retry a stage (e.g. Storage
|
||||
// with wipe-override armed) and re-enters runStage from that point.
|
||||
func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-chan HeartbeatResponse, claim *ClaimResponse) error {
|
||||
func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, mux *SensorMux, hb <-chan HeartbeatResponse, claim *ClaimResponse) error {
|
||||
fwd.info("holding: awaiting operator decision (heartbeat directive or ctx cancel)")
|
||||
for {
|
||||
select {
|
||||
@@ -333,7 +425,7 @@ func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-cha
|
||||
if len(cmd.OverrideFlags) > 0 {
|
||||
_ = json.Unmarshal(cmd.OverrideFlags, &ovr)
|
||||
}
|
||||
outcome := runStageCancellable(ctx, cmd.Stage, claim, fwd, c, ovr)
|
||||
outcome := runStageCancellable(ctx, cmd.Stage, claim, fwd, c, mux, ovr)
|
||||
if outcome.Cancelled {
|
||||
fwd.warn("stage cancelled by operator; posting result and exiting")
|
||||
_, _ = postResult(ctx, c, cmd.Stage, outcome)
|
||||
@@ -362,7 +454,7 @@ func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-cha
|
||||
default:
|
||||
}
|
||||
fwd.info("stage: starting " + nextStage)
|
||||
out := runStageCancellable(ctx, nextStage, claim, fwd, c, overrideFlags{})
|
||||
out := runStageCancellable(ctx, nextStage, claim, fwd, c, mux, overrideFlags{})
|
||||
if out.Cancelled {
|
||||
fwd.warn("stage cancelled by operator; posting result and exiting")
|
||||
_, _ = postResult(ctx, c, nextStage, out)
|
||||
@@ -417,11 +509,32 @@ func inventorySummary(inv *spec.Inventory) string {
|
||||
len(inv.Disks), len(inv.NICs), len(inv.GPUs))
|
||||
}
|
||||
|
||||
// firmwareSummary renders the one-liner surfaced in the stage tile:
|
||||
// per-component counts so an operator can see "bios=1 nic=2 nvme_fw=1"
|
||||
// without opening the report.
|
||||
func firmwareSummary(snaps []probes.FirmwareSnapshot) string {
|
||||
counts := map[string]int{}
|
||||
for _, s := range snaps {
|
||||
counts[s.Component]++
|
||||
}
|
||||
if len(counts) == 0 {
|
||||
return "no firmware readable"
|
||||
}
|
||||
keys := []string{"bios", "bmc", "nic", "hba", "nvme_fw", "microcode"}
|
||||
parts := make([]string, 0, len(keys))
|
||||
for _, k := range keys {
|
||||
if n := counts[k]; n > 0 {
|
||||
parts = append(parts, fmt.Sprintf("%s=%d", k, n))
|
||||
}
|
||||
}
|
||||
return strings.Join(parts, " ")
|
||||
}
|
||||
|
||||
// thermalSidecar posts a batch of /sys/class/hwmon samples every 5s.
|
||||
// Idempotent: a dead sensor just drops out of the next batch. Errors
|
||||
// are logged but never fatal — we'd rather have a run with partial
|
||||
// thermal data than kill the agent over an I/O hiccup.
|
||||
func thermalSidecar(ctx context.Context, c *Client, fwd *logForwarder) {
|
||||
func thermalSidecar(ctx context.Context, mux *SensorMux, fwd *logForwarder) {
|
||||
t := time.NewTicker(5 * time.Second)
|
||||
defer t.Stop()
|
||||
for {
|
||||
@@ -437,11 +550,7 @@ func thermalSidecar(ctx context.Context, c *Client, fwd *logForwarder) {
|
||||
for _, s := range samples {
|
||||
out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit})
|
||||
}
|
||||
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
if err := c.Sensor(sendCtx, out); err != nil {
|
||||
fwd.warn("thermal sidecar: " + err.Error())
|
||||
}
|
||||
cancel()
|
||||
mux.Send(out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user