9bb4b09a04
CI / Lint + build + test (push) Has been cancelled
Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
98 lines
2.8 KiB
Go
98 lines
2.8 KiB
Go
package tests
|
||
|
||
import (
|
||
"context"
|
||
"fmt"
|
||
"os/exec"
|
||
"runtime"
|
||
"strconv"
|
||
"strings"
|
||
"time"
|
||
)
|
||
|
||
// CPUStress runs stress-ng with CPU workers AND memory stressors. The
|
||
// memory stressors take the place of a Memtest86+ pass — per the plan,
|
||
// running under Linux gives us exit-code-based pass/fail and log
|
||
// capture we can't get from Memtest without IPMI serial redirection.
|
||
//
|
||
// Non-zero exit = stress-ng aborted due to a failure (bit flip, OOM
|
||
// kill, etc.) → stage fails. Exit 0 means the kernel returned sane
|
||
// pages for the full duration, which is the Phase 4 health bar.
|
||
func CPUStress(ctx context.Context, d Deps) Outcome {
|
||
if _, err := exec.LookPath("stress-ng"); err != nil {
|
||
d.Warn("CPUStress: stress-ng not found in PATH — skipping stage")
|
||
return Outcome{
|
||
Passed: true,
|
||
Summary: "skipped (stress-ng missing)",
|
||
Extras: map[string]any{"skipped": true, "reason": "stress_ng_missing"},
|
||
}
|
||
}
|
||
|
||
// Timeout: Deps.StageTimeout may be zero in tests; default 2 min.
|
||
timeout := d.StageTimeout
|
||
if timeout <= 0 {
|
||
timeout = 2 * time.Minute
|
||
}
|
||
|
||
cores := runtime.NumCPU()
|
||
// --vm N allocates N worker processes each touching 90% of RAM. On
|
||
// an 8-core host with 32GiB this is 8 × ~28GiB sliding windows —
|
||
// enough to exercise every DIMM row within a minute.
|
||
args := []string{
|
||
"--cpu", strconv.Itoa(cores),
|
||
"--cpu-method", "all",
|
||
"--vm", strconv.Itoa(cores),
|
||
"--vm-bytes", "90%",
|
||
"--timeout", durationSeconds(timeout),
|
||
"--metrics-brief",
|
||
"--verify",
|
||
}
|
||
d.Info(fmt.Sprintf("CPUStress: stress-ng --cpu %d --vm %d --vm-bytes 90%% --timeout %s",
|
||
cores, cores, durationSeconds(timeout)))
|
||
|
||
runCtx, cancel := context.WithTimeout(ctx, timeout+30*time.Second)
|
||
defer cancel()
|
||
cmd := exec.CommandContext(runCtx, "stress-ng", args...)
|
||
start := time.Now()
|
||
out, err := cmd.CombinedOutput()
|
||
elapsed := time.Since(start).Round(time.Second)
|
||
|
||
extras := map[string]any{
|
||
"cores": cores,
|
||
"elapsed_secs": elapsed.Seconds(),
|
||
"output_tail": tailLines(string(out), 20),
|
||
}
|
||
if err != nil {
|
||
d.Error("CPUStress: stress-ng failed: " + err.Error())
|
||
return Outcome{
|
||
Passed: false,
|
||
Message: "stress-ng returned non-zero: " + err.Error(),
|
||
Summary: fmt.Sprintf("failed after %s", elapsed),
|
||
Extras: extras,
|
||
}
|
||
}
|
||
d.Info(fmt.Sprintf("CPUStress: stress-ng completed cleanly in %s", elapsed))
|
||
return Outcome{
|
||
Passed: true,
|
||
Summary: fmt.Sprintf("stress-ng PASSED after %s (%d cores + 90%% RAM)", elapsed, cores),
|
||
Extras: extras,
|
||
}
|
||
}
|
||
|
||
// durationSeconds renders d as a whole number of seconds with an "s"
// suffix (e.g. "120s"), the form passed to stress-ng's --timeout flag.
// Fractions are truncated, and anything shorter than one second —
// including zero and negative durations — is reported as "1s".
func durationSeconds(d time.Duration) string {
	secs := int(d.Seconds())
	if secs >= 1 {
		return strconv.Itoa(secs) + "s"
	}
	return "1s"
}
|
||
|
||
// tailLines returns the last n non-empty lines of s, joined with "\n",
// for the summary. Lines that are empty or contain only whitespace are
// dropped before counting, so blank separators in the output do not use
// up the n-line budget. It returns "" when n <= 0 or when s has no
// non-empty lines.
func tailLines(s string, n int) string {
	// Guard first: a negative n would otherwise slice out of range below.
	if n <= 0 {
		return ""
	}
	var kept []string
	for _, line := range strings.Split(s, "\n") {
		if strings.TrimSpace(line) != "" {
			kept = append(kept, line)
		}
	}
	if len(kept) > n {
		kept = kept[len(kept)-n:]
	}
	return strings.Join(kept, "\n")
}
|