package tests import ( "bufio" "context" "fmt" "io" "os" "os/exec" "runtime" "strconv" "strings" "time" ) // CPUStress runs stress-ng as two serial passes. The previous shape // (--cpu N AND --vm N --vm-bytes 90% concurrently) OOM-killed the // agent itself on small hosts: 4 workers × 90% of an 8GiB box is 360% // overcommit, and the kernel killed stress-ng / agent / whatever the // OOM scorer picked. We flip it serial so only one stressor is live // at a time and the RAM cap is computed from MemAvailable with a // 1.5GiB headroom reserve, keeping the kernel + agent + log buffers // alive. // // Other stages were audited at the same time (SMART, Storage, // Network, GPU, PSU, Inventory, SpecValidate, Reporting) — none had // the CPUStress pattern of unbounded concurrency, so they're // unchanged. // // Pass 1 — CPU only, all methods, 3min. --verify re-runs the ALU // work and diffs against known-good outputs so a silent miscomputation // (rowhammered register, flaky bus) still fails the stage. // // Pass 2 — RAM only, single worker, 3min. --vm-bytes is // MemAvailable − 1.5GiB, floor 256MiB. --vm-keep reuses the same // mapping across iterations so we hit every page repeatedly within the // window. // // Each pass also asserts elapsed ≥ (target − 2s). A premature clean // exit (stress-ng killed by a signal, workload bailed quietly) now // counts as a failure instead of falsely passing on exit-0. 
func CPUStress(ctx context.Context, d Deps) Outcome { if _, err := exec.LookPath("stress-ng"); err != nil { d.Error("CPUStress: stress-ng not found in PATH — live image is missing required tool") return Outcome{ Passed: false, Message: "stress-ng binary missing from live image", Summary: "failed (stress-ng missing)", Extras: map[string]any{"reason": "stress_ng_missing"}, } } cores := runtime.NumCPU() extras := map[string]any{"cores": cores} // Pass 1: CPU cpu := runStressPass(ctx, d, "CPU", cpuPassDuration, []string{ "--cpu", strconv.Itoa(cores), "--cpu-method", "all", "--timeout", durationSeconds(cpuPassDuration), "--metrics-brief", "--verify", }) extras["cpu_pass"] = cpu if !cpu.Passed { return Outcome{ Passed: false, Message: "CPU pass failed: " + cpu.Err, Summary: fmt.Sprintf("CPU pass failed after %ds", cpu.ElapsedSecs), Extras: extras, } } // Pass 2: memory — only after CPU has demonstrated the box is // sane. Cap derived from /proc/meminfo so we never overcommit. avail, err := memAvailableBytes() if err != nil { d.Error("CPUStress: read MemAvailable: " + err.Error()) return Outcome{ Passed: false, Message: "read MemAvailable: " + err.Error(), Summary: "failed (meminfo unreadable)", Extras: extras, } } cap := avail - memHeadroomBytes extras["mem_available_bytes"] = avail extras["mem_bytes_cap"] = cap extras["mem_headroom_bytes"] = int64(memHeadroomBytes) if cap < memFloorBytes { msg := fmt.Sprintf("MemAvailable=%d, below %d floor after %d headroom — refusing to run memory pass", avail, memFloorBytes, memHeadroomBytes) d.Error("CPUStress: " + msg) return Outcome{ Passed: false, Message: msg, Summary: "failed (insufficient free RAM for memory pass)", Extras: extras, } } mem := runStressPass(ctx, d, "memory", memPassDuration, []string{ "--vm", "1", "--vm-bytes", strconv.FormatInt(cap, 10), "--vm-keep", "--timeout", durationSeconds(memPassDuration), "--metrics-brief", "--verify", }) extras["mem_pass"] = mem if !mem.Passed { return Outcome{ Passed: false, Message: 
"memory pass failed: " + mem.Err, Summary: fmt.Sprintf("memory pass failed after %ds", mem.ElapsedSecs), Extras: extras, } } return Outcome{ Passed: true, Summary: fmt.Sprintf("CPU+RAM PASSED (%d cores, %s cap)", cores, humanBytes(cap)), Extras: extras, } } const ( cpuPassDuration = 3 * time.Minute memPassDuration = 3 * time.Minute // memHeadroomBytes = 1.5 GiB reserved for kernel, agent, log // buffers, and whatever page cache is still live when the stage // starts. Conservative but keeps us off the OOM scorer. memHeadroomBytes int64 = 1610612736 // memFloorBytes — if MemAvailable − headroom drops below this, // we refuse to run the memory pass rather than stressing a tiny // window that tells us nothing. memFloorBytes int64 = 268435456 passSlack = 2 * time.Second ) // stressPass is the per-pass result embedded in CPUStress's Extras. // Passed==true and Elapsed close to target is the only happy path. type stressPass struct { Passed bool `json:"passed"` Err string `json:"err,omitempty"` ElapsedSecs int `json:"elapsed_secs"` TargetSecs int `json:"target_secs"` OutputTail string `json:"output_tail,omitempty"` } // runStressPass invokes stress-ng and validates both exit code and // elapsed time. Target is the intended --timeout; we require // elapsed ≥ target − passSlack so a premature-but-clean exit still // counts as failure. func runStressPass(ctx context.Context, d Deps, label string, target time.Duration, args []string) stressPass { d.Info(fmt.Sprintf("CPUStress: %s pass starting — stress-ng %s", label, strings.Join(args, " "))) runCtx, cancel := context.WithTimeout(ctx, target+30*time.Second) defer cancel() cmd := exec.CommandContext(runCtx, "stress-ng", args...) 
start := time.Now() out, err := cmd.CombinedOutput() elapsed := time.Since(start) res := stressPass{ ElapsedSecs: int(elapsed.Round(time.Second).Seconds()), TargetSecs: int(target.Round(time.Second).Seconds()), OutputTail: tailLines(string(out), 20), } if err != nil { res.Err = err.Error() d.Error(fmt.Sprintf("CPUStress: %s pass failed after %s: %s", label, elapsed.Round(time.Second), err.Error())) return res } if elapsed < target-passSlack { res.Err = fmt.Sprintf("stress-ng exited cleanly after %s; expected ≥ %s (premature exit — signal or broken workload)", elapsed.Round(time.Second), target-passSlack) d.Error("CPUStress: " + label + " pass " + res.Err) return res } res.Passed = true d.Info(fmt.Sprintf("CPUStress: %s pass PASSED in %s", label, elapsed.Round(time.Second))) return res } // memAvailableBytes reads /proc/meminfo and returns MemAvailable in // bytes. Split from parseMemAvailable so the parse step is testable // without touching the real filesystem. func memAvailableBytes() (int64, error) { f, err := os.Open("/proc/meminfo") if err != nil { return 0, err } defer func() { _ = f.Close() }() return parseMemAvailable(f) } func parseMemAvailable(r io.Reader) (int64, error) { sc := bufio.NewScanner(r) for sc.Scan() { line := sc.Text() if !strings.HasPrefix(line, "MemAvailable:") { continue } fields := strings.Fields(line) if len(fields) < 2 { return 0, fmt.Errorf("malformed MemAvailable line: %q", line) } kb, err := strconv.ParseInt(fields[1], 10, 64) if err != nil { return 0, fmt.Errorf("parse MemAvailable: %w", err) } return kb * 1024, nil } if err := sc.Err(); err != nil { return 0, err } return 0, fmt.Errorf("MemAvailable not found in /proc/meminfo") } func durationSeconds(d time.Duration) string { s := int(d.Seconds()) if s < 1 { s = 1 } return strconv.Itoa(s) + "s" } // tailLines returns the last n non-empty lines of s, for the summary. 
// Blank (empty or whitespace-only) lines are discarded before the last
// n lines are selected, so they never use up tail slots; surviving
// lines are returned unmodified, joined with "\n". A non-positive n
// yields the empty string.
func tailLines(s string, n int) string {
	// Guard first: a negative n would otherwise slice out of range.
	if n <= 0 {
		return ""
	}

	var kept []string
	for _, line := range strings.Split(s, "\n") {
		if strings.TrimSpace(line) == "" {
			continue
		}
		kept = append(kept, line)
	}
	if len(kept) > n {
		kept = kept[len(kept)-n:]
	}
	return strings.Join(kept, "\n")
}

// humanBytes renders a byte count for log and summary text using
// binary units: one decimal place in GiB at or above 1 GiB, whole
// (truncated) MiB at or above 1 MiB, and plain bytes below that.
func humanBytes(b int64) string {
	const (
		kib = 1024
		mib = 1024 * kib
		gib = 1024 * mib
	)
	if b >= gib {
		return fmt.Sprintf("%.1f GiB", float64(b)/float64(gib))
	}
	if b >= mib {
		return fmt.Sprintf("%d MiB", b/mib)
	}
	return fmt.Sprintf("%d B", b)
}