Files
Vetting/agent/tests/cpustress.go
T
josh 27098fc7ed
CI / Lint + build + test (push) Successful in 1m23s
Release / release (push) Successful in 6m2s
cpustress+orchestrator: serial CPU/RAM passes + silent-skip guard
Orion's run (log 20:49 → 20:54) shipped GREEN while silently skipping
CPUStress. Two compounding bugs:

1. CPUStress ran --cpu N AND --vm N --vm-bytes 90% concurrently.
   On a 4-core 8 GiB N95, that's 360% RAM overcommit; the OOM-killer
   fired, usually on the agent itself. Replaced with two sequential
   passes — CPU (all methods, --verify) for 3 min, then RAM (--vm 1,
   --vm-bytes capped to MemAvailable − 1.5 GiB, floor 256 MiB, --verify)
   for 3 min. Each pass now also asserts elapsed ≥ target − 2s so a
   premature clean exit counts as failure instead of a silent pass.

2. On systemd-restart after the OOM, the agent hardcoded nextStage :=
   "Inventory" and re-ran it. The orchestrator's /result handler
   advances run state via TriggerStageCompleted against the *current*
   RunState, not against body.Stage — so an Inventory result posted
   while the run was in StateCPUStress silently advanced CPUStress →
   Storage and marked CPUStress passed without it ever running.

Two-layer defense for #2:
- agent-side: /claim response now carries current_state; agent resumes
  at the matching stage on a re-claim (happy path).
- server-side: new TriggerStageMismatch + StageNameForState helper
  backstop. If body.Stage doesn't match the run's current stage, /result
  parks the run in FailedHolding with failed_stage labeled
  "<got> (expected <expected>)" and returns 409.

Other stages audited for similar unbounded concurrency — none found;
only CPUStress was unsafe.

Tests:
- cpustress_test.go — parseMemAvailable parses real meminfo, errors on
  missing/malformed; cap calc hits floor on tiny boxes, uses 1.5 GiB
  headroom on normal/huge boxes.
- statemachine_test.go — TriggerStageMismatch lands at FailedHolding
  from every stage state and is rejected from pre-stage/terminal
  states; StageNameForState round-trips the stageStates map.
- agent_handlers_test.go — TestResult_RejectsMismatchedStage proves
  the Orion scenario now 409s + FailedHolding; TestResult_AcceptsMatchingStage
  proves the guard doesn't break the happy path.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 17:29:13 -04:00

253 lines
7.6 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package tests
import (
"bufio"
"context"
"fmt"
"io"
"os"
"os/exec"
"runtime"
"strconv"
"strings"
"time"
)
// CPUStress runs stress-ng as two passes, strictly one after the
// other. The previous shape (--cpu N AND --vm N --vm-bytes 90%
// concurrently) OOM-killed the agent itself on small hosts: 4 workers
// x 90% of an 8GiB box is 360% overcommit, and the kernel killed
// stress-ng / agent / whatever the OOM scorer picked. Running the
// passes serially means only one stressor is live at a time, and the
// RAM cap is computed from MemAvailable with a 1.5GiB headroom
// reserve, keeping the kernel + agent + log buffers alive.
//
// Other stages were audited at the same time (SMART, Storage,
// Network, GPU, PSU, Inventory, SpecValidate, Reporting) — none had
// the CPUStress pattern of unbounded concurrency, so they're
// unchanged.
//
// Pass 1 — CPU only, one worker per core, all methods, 3min. --verify
// re-runs the ALU work and diffs against known-good outputs so a
// silent miscomputation (rowhammered register, flaky bus) still fails
// the stage.
//
// Pass 2 — RAM only, single worker, 3min. --vm-bytes is
// MemAvailable - 1.5GiB. If that figure is below the 256MiB floor the
// stage fails outright instead of running a pass too small to mean
// anything. --vm-keep reuses the same mapping across iterations so we
// hit every page repeatedly within the window.
//
// Each pass also asserts elapsed >= (target - 2s). A premature clean
// exit (stress-ng killed by a signal, workload bailed quietly) counts
// as a failure instead of falsely passing on exit-0.
//
// The returned Outcome's Extras carries "cores", the per-pass results
// ("cpu_pass", "mem_pass") and the memory sizing figures, to the extent
// they were computed before the stage finished or failed. The only
// exception is the missing-binary case, which reports just a "reason".
func CPUStress(ctx context.Context, d Deps) Outcome {
	if _, lookErr := exec.LookPath("stress-ng"); lookErr != nil {
		d.Error("CPUStress: stress-ng not found in PATH — live image is missing required tool")
		return Outcome{
			Passed:  false,
			Message: "stress-ng binary missing from live image",
			Summary: "failed (stress-ng missing)",
			Extras:  map[string]any{"reason": "stress_ng_missing"},
		}
	}

	workers := runtime.NumCPU()
	extras := map[string]any{"cores": workers}

	// fail wraps the shared extras map into a failing Outcome, so
	// everything recorded up to the point of failure is still reported.
	fail := func(message, summary string) Outcome {
		return Outcome{
			Passed:  false,
			Message: message,
			Summary: summary,
			Extras:  extras,
		}
	}

	// Pass 1: CPU.
	cpuArgs := []string{
		"--cpu", strconv.Itoa(workers),
		"--cpu-method", "all",
		"--timeout", durationSeconds(cpuPassDuration),
		"--metrics-brief",
		"--verify",
	}
	cpuResult := runStressPass(ctx, d, "CPU", cpuPassDuration, cpuArgs)
	extras["cpu_pass"] = cpuResult
	if !cpuResult.Passed {
		return fail(
			"CPU pass failed: "+cpuResult.Err,
			fmt.Sprintf("CPU pass failed after %ds", cpuResult.ElapsedSecs),
		)
	}

	// Pass 2: memory — only after CPU has demonstrated the box is
	// sane. The size is derived from /proc/meminfo so we never
	// overcommit.
	avail, memErr := memAvailableBytes()
	if memErr != nil {
		d.Error("CPUStress: read MemAvailable: " + memErr.Error())
		return fail("read MemAvailable: "+memErr.Error(), "failed (meminfo unreadable)")
	}

	vmBytes := avail - memHeadroomBytes
	extras["mem_available_bytes"] = avail
	extras["mem_bytes_cap"] = vmBytes
	extras["mem_headroom_bytes"] = int64(memHeadroomBytes)

	if vmBytes < memFloorBytes {
		refusal := fmt.Sprintf("MemAvailable=%d, below %d floor after %d headroom — refusing to run memory pass",
			avail, memFloorBytes, memHeadroomBytes)
		d.Error("CPUStress: " + refusal)
		return fail(refusal, "failed (insufficient free RAM for memory pass)")
	}

	memArgs := []string{
		"--vm", "1",
		"--vm-bytes", strconv.FormatInt(vmBytes, 10),
		"--vm-keep",
		"--timeout", durationSeconds(memPassDuration),
		"--metrics-brief",
		"--verify",
	}
	memResult := runStressPass(ctx, d, "memory", memPassDuration, memArgs)
	extras["mem_pass"] = memResult
	if !memResult.Passed {
		return fail(
			"memory pass failed: "+memResult.Err,
			fmt.Sprintf("memory pass failed after %ds", memResult.ElapsedSecs),
		)
	}

	return Outcome{
		Passed:  true,
		Summary: fmt.Sprintf("CPU+RAM PASSED (%d cores, %s cap)", workers, humanBytes(vmBytes)),
		Extras:  extras,
	}
}
// Tunables for the two CPUStress passes.
const (
	// cpuPassDuration and memPassDuration are the run lengths of the
	// CPU and memory passes; each is passed to stress-ng as --timeout.
	cpuPassDuration = 3 * time.Minute
	memPassDuration = 3 * time.Minute

	// memHeadroomBytes = 1.5 GiB (1536 MiB) reserved for kernel, agent,
	// log buffers, and whatever page cache is still live when the stage
	// starts. Conservative but keeps us off the OOM scorer.
	memHeadroomBytes int64 = 1536 << 20

	// memFloorBytes = 256 MiB. If MemAvailable - memHeadroomBytes drops
	// below this, we refuse to run the memory pass rather than
	// stressing a tiny window that tells us nothing.
	memFloorBytes int64 = 256 << 20

	// passSlack is how far short of its target duration a pass may
	// finish and still be accepted as having run to completion.
	passSlack = 2 * time.Second
)
// stressPass is the per-pass result embedded in CPUStress's Extras
// (under "cpu_pass" and "mem_pass"). Passed==true and Elapsed close to
// target is the only happy path. The zero value describes a failed
// pass, which is why runStressPass only sets Passed once every check
// has succeeded.
type stressPass struct {
// Passed is true only when stress-ng exited without error AND ran
// for at least target - passSlack.
Passed bool `json:"passed"`
// Err is the failure reason; left empty (and omitted from JSON) on
// success.
Err string `json:"err,omitempty"`
// ElapsedSecs is the measured wall-clock run time, rounded to the
// nearest second.
ElapsedSecs int `json:"elapsed_secs"`
// TargetSecs is the intended pass duration, rounded to the nearest
// second.
TargetSecs int `json:"target_secs"`
// OutputTail is the tail (at most 20 lines, via tailLines) of
// stress-ng's combined stdout and stderr; omitted from JSON when
// empty.
OutputTail string `json:"output_tail,omitempty"`
}
// runStressPass invokes stress-ng with args and validates both the
// exit status and the elapsed wall-clock time.
//
// label names the pass in log lines ("CPU", "memory"). target is the
// intended --timeout already encoded in args; we require
// elapsed >= target - passSlack so a premature-but-clean exit still
// counts as failure.
//
// stress-ng is expected to stop on its own at --timeout. As a backstop
// the process runs under a watchdog context of target + 30s and is
// killed when that expires or when ctx is cancelled. A kill issued by
// the agent reaches us as a plain "signal: killed" exit, which looks
// exactly like a kernel OOM kill, so the failure text says explicitly
// when the agent itself stopped the process.
//
// The returned stressPass always carries the elapsed/target seconds and
// the tail of the combined output; Passed is set only when every check
// succeeded, and Err holds the reason otherwise.
func runStressPass(ctx context.Context, d Deps, label string, target time.Duration, args []string) stressPass {
	d.Info(fmt.Sprintf("CPUStress: %s pass starting — stress-ng %s", label, strings.Join(args, " ")))

	// Grace period past --timeout before the agent kills stress-ng.
	const watchdogGrace = 30 * time.Second
	runCtx, cancel := context.WithTimeout(ctx, target+watchdogGrace)
	defer cancel()

	cmd := exec.CommandContext(runCtx, "stress-ng", args...)
	start := time.Now()
	out, err := cmd.CombinedOutput()
	elapsed := time.Since(start)

	res := stressPass{
		ElapsedSecs: int(elapsed.Round(time.Second).Seconds()),
		TargetSecs:  int(target.Round(time.Second).Seconds()),
		OutputTail:  tailLines(string(out), 20),
	}

	if err != nil {
		// Attribute the failure. The parent context is checked first:
		// when it is done, runCtx is done too, and "stage aborted" is
		// the more accurate description.
		switch {
		case ctx.Err() != nil:
			res.Err = fmt.Sprintf("aborted by agent, stage context done (%v): %v", ctx.Err(), err)
		case runCtx.Err() != nil:
			res.Err = fmt.Sprintf("stress-ng still running %s past its %s timeout, killed by agent watchdog: %v",
				watchdogGrace, target, err)
		default:
			res.Err = err.Error()
		}
		d.Error(fmt.Sprintf("CPUStress: %s pass failed after %s: %s",
			label, elapsed.Round(time.Second), res.Err))
		return res
	}

	if elapsed < target-passSlack {
		res.Err = fmt.Sprintf("stress-ng exited cleanly after %s; expected ≥ %s (premature exit — signal or broken workload)",
			elapsed.Round(time.Second), target-passSlack)
		d.Error("CPUStress: " + label + " pass " + res.Err)
		return res
	}

	res.Passed = true
	d.Info(fmt.Sprintf("CPUStress: %s pass PASSED in %s", label, elapsed.Round(time.Second)))
	return res
}
// memAvailableBytes opens /proc/meminfo and hands it to
// parseMemAvailable, returning the MemAvailable figure in bytes. The
// file access lives here and the parsing lives there so the parser can
// be tested against canned input without touching the real filesystem.
//
// An error from opening the file is returned as-is; any other error
// comes from parseMemAvailable.
func memAvailableBytes() (int64, error) {
	meminfo, openErr := os.Open("/proc/meminfo")
	if openErr != nil {
		return 0, openErr
	}
	// The handle is read-only, so a Close error is deliberately
	// discarded: it would tell us nothing about the value we parsed.
	defer func() {
		_ = meminfo.Close()
	}()

	return parseMemAvailable(meminfo)
}
// parseMemAvailable scans meminfo-style text from r line by line,
// looking for the first line that starts with "MemAvailable:". The
// second whitespace-separated field of that line is parsed as a
// base-10 integer and multiplied by 1024 (the value is taken to be in
// kB), giving bytes.
//
// Errors:
//   - the line has no second field: "malformed MemAvailable line"
//   - the second field is not an integer: "parse MemAvailable" wrapping
//     the strconv error
//   - reading r failed: the scanner's error, unchanged
//   - no such line before end of input: "MemAvailable not found"
func parseMemAvailable(r io.Reader) (int64, error) {
	const key = "MemAvailable:"

	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		entry := scanner.Text()
		if strings.HasPrefix(entry, key) {
			cols := strings.Fields(entry)
			if len(cols) <= 1 {
				return 0, fmt.Errorf("malformed MemAvailable line: %q", entry)
			}
			kib, convErr := strconv.ParseInt(cols[1], 10, 64)
			if convErr != nil {
				return 0, fmt.Errorf("parse MemAvailable: %w", convErr)
			}
			return kib * 1024, nil
		}
	}

	// The loop ends on EOF or on a read error; tell the two apart.
	if scanErr := scanner.Err(); scanErr != nil {
		return 0, scanErr
	}
	return 0, fmt.Errorf("MemAvailable not found in /proc/meminfo")
}
func durationSeconds(d time.Duration) string {
s := int(d.Seconds())
if s < 1 {
s = 1
}
return strconv.Itoa(s) + "s"
}
// tailLines returns the last n non-empty lines of s, joined with "\n",
// for the summary. A line counts as empty when it contains nothing but
// whitespace; such lines are dropped wherever they occur, so blank
// separators in the output do not use up the n-line budget. Lines that
// are kept are returned unmodified.
//
// If s has fewer than n non-empty lines, all of them are returned. An
// n of zero or less yields the empty string.
func tailLines(s string, n int) string {
	if n <= 0 {
		// Also guards the slice expression below against a negative n.
		return ""
	}

	var kept []string
	for _, line := range strings.Split(s, "\n") {
		if strings.TrimSpace(line) == "" {
			continue
		}
		kept = append(kept, line)
	}

	if len(kept) > n {
		kept = kept[len(kept)-n:]
	}
	return strings.Join(kept, "\n")
}
// humanBytes renders a byte count for log and summary text, picking
// the unit by magnitude:
//
//   - 1 GiB and above: GiB with one decimal place, e.g. "1.5 GiB"
//   - 1 MiB up to 1 GiB: whole MiB, truncated, e.g. "256 MiB"
//   - anything smaller (including negative values): the raw count,
//     e.g. "524288 B"
//
// There is deliberately no KiB tier here; sub-MiB values are shown in
// bytes.
func humanBytes(b int64) string {
	const (
		mebibyte int64 = 1 << 20
		gibibyte int64 = 1 << 30
	)

	if b >= gibibyte {
		return fmt.Sprintf("%.1f GiB", float64(b)/float64(gibibyte))
	}
	if b >= mebibyte {
		return fmt.Sprintf("%d MiB", b/mebibyte)
	}
	return fmt.Sprintf("%d B", b)
}