package tests import ( "context" "fmt" "os/exec" "runtime" "strconv" "strings" "time" ) // CPUStress runs stress-ng with CPU workers AND memory stressors. The // memory stressors take the place of a Memtest86+ pass — per the plan, // running under Linux gives us exit-code-based pass/fail and log // capture we can't get from Memtest without IPMI serial redirection. // // Non-zero exit = stress-ng aborted due to a failure (bit flip, OOM // kill, etc.) → stage fails. Exit 0 means the kernel returned sane // pages for the full duration, which is the Phase 4 health bar. func CPUStress(ctx context.Context, d Deps) Outcome { if _, err := exec.LookPath("stress-ng"); err != nil { d.Warn("CPUStress: stress-ng not found in PATH — skipping stage") return Outcome{ Passed: true, Summary: "skipped (stress-ng missing)", Extras: map[string]any{"skipped": true, "reason": "stress_ng_missing"}, } } // Timeout: Deps.StageTimeout may be zero in tests; default 2 min. timeout := d.StageTimeout if timeout <= 0 { timeout = 2 * time.Minute } cores := runtime.NumCPU() // --vm N allocates N worker processes each touching 90% of RAM. On // an 8-core host with 32GiB this is 8 × ~28GiB sliding windows — // enough to exercise every DIMM row within a minute. args := []string{ "--cpu", strconv.Itoa(cores), "--cpu-method", "all", "--vm", strconv.Itoa(cores), "--vm-bytes", "90%", "--timeout", durationSeconds(timeout), "--metrics-brief", "--verify", } d.Info(fmt.Sprintf("CPUStress: stress-ng --cpu %d --vm %d --vm-bytes 90%% --timeout %s", cores, cores, durationSeconds(timeout))) runCtx, cancel := context.WithTimeout(ctx, timeout+30*time.Second) defer cancel() cmd := exec.CommandContext(runCtx, "stress-ng", args...) start := time.Now() out, err := cmd.CombinedOutput() elapsed := time.Since(start).Round(time.Second) extras := map[string]any{ "cores": cores, "elapsed_secs": elapsed.Seconds(), "output_tail": tailLines(string(out), 20), } if err != nil { d.Error("CPUStress: stress-ng failed: " + err.Error()) return Outcome{ Passed: false, Message: "stress-ng returned non-zero: " + err.Error(), Summary: fmt.Sprintf("failed after %s", elapsed), Extras: extras, } } d.Info(fmt.Sprintf("CPUStress: stress-ng completed cleanly in %s", elapsed)) return Outcome{ Passed: true, Summary: fmt.Sprintf("stress-ng PASSED after %s (%d cores + 90%% RAM)", elapsed, cores), Extras: extras, } } func durationSeconds(d time.Duration) string { s := int(d.Seconds()) if s < 1 { s = 1 } return strconv.Itoa(s) + "s" } // tailLines returns the last n non-empty lines of s, for the summary. func tailLines(s string, n int) string { lines := strings.Split(strings.TrimRight(s, "\n"), "\n") if len(lines) > n { lines = lines[len(lines)-n:] } return strings.Join(lines, "\n") }