Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
@@ -0,0 +1,97 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// CPUStress runs stress-ng with CPU workers AND memory stressors. The
|
||||
// memory stressors take the place of a Memtest86+ pass — per the plan,
|
||||
// running under Linux gives us exit-code-based pass/fail and log
|
||||
// capture we can't get from Memtest without IPMI serial redirection.
|
||||
//
|
||||
// Non-zero exit = stress-ng aborted due to a failure (bit flip, OOM
|
||||
// kill, etc.) → stage fails. Exit 0 means the kernel returned sane
|
||||
// pages for the full duration, which is the Phase 4 health bar.
|
||||
func CPUStress(ctx context.Context, d Deps) Outcome {
|
||||
if _, err := exec.LookPath("stress-ng"); err != nil {
|
||||
d.Warn("CPUStress: stress-ng not found in PATH — skipping stage")
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: "skipped (stress-ng missing)",
|
||||
Extras: map[string]any{"skipped": true, "reason": "stress_ng_missing"},
|
||||
}
|
||||
}
|
||||
|
||||
// Timeout: Deps.StageTimeout may be zero in tests; default 2 min.
|
||||
timeout := d.StageTimeout
|
||||
if timeout <= 0 {
|
||||
timeout = 2 * time.Minute
|
||||
}
|
||||
|
||||
cores := runtime.NumCPU()
|
||||
// --vm N allocates N worker processes each touching 90% of RAM. On
|
||||
// an 8-core host with 32GiB this is 8 × ~28GiB sliding windows —
|
||||
// enough to exercise every DIMM row within a minute.
|
||||
args := []string{
|
||||
"--cpu", strconv.Itoa(cores),
|
||||
"--cpu-method", "all",
|
||||
"--vm", strconv.Itoa(cores),
|
||||
"--vm-bytes", "90%",
|
||||
"--timeout", durationSeconds(timeout),
|
||||
"--metrics-brief",
|
||||
"--verify",
|
||||
}
|
||||
d.Info(fmt.Sprintf("CPUStress: stress-ng --cpu %d --vm %d --vm-bytes 90%% --timeout %s",
|
||||
cores, cores, durationSeconds(timeout)))
|
||||
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout+30*time.Second)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(runCtx, "stress-ng", args...)
|
||||
start := time.Now()
|
||||
out, err := cmd.CombinedOutput()
|
||||
elapsed := time.Since(start).Round(time.Second)
|
||||
|
||||
extras := map[string]any{
|
||||
"cores": cores,
|
||||
"elapsed_secs": elapsed.Seconds(),
|
||||
"output_tail": tailLines(string(out), 20),
|
||||
}
|
||||
if err != nil {
|
||||
d.Error("CPUStress: stress-ng failed: " + err.Error())
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: "stress-ng returned non-zero: " + err.Error(),
|
||||
Summary: fmt.Sprintf("failed after %s", elapsed),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
d.Info(fmt.Sprintf("CPUStress: stress-ng completed cleanly in %s", elapsed))
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: fmt.Sprintf("stress-ng PASSED after %s (%d cores + 90%% RAM)", elapsed, cores),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
|
||||
// durationSeconds renders d as a whole number of seconds with an "s"
// suffix (e.g. "120s"), the form passed to stress-ng's --timeout flag.
//
// Fractional seconds are truncated, and anything shorter than one second
// (including zero and negative durations) is clamped to "1s".
//
// The conversion uses integer division rather than Duration.Seconds():
// going through float64 can round a long duration with a near-full
// fractional second up to the next whole second, and converting to int
// could overflow on 32-bit platforms.
func durationSeconds(d time.Duration) string {
	secs := int64(d / time.Second)
	if secs < 1 {
		secs = 1
	}
	return strconv.FormatInt(secs, 10) + "s"
}
|
||||
|
||||
// tailLines returns the last n non-empty lines of s, joined with "\n",
// for the summary.
//
// Lines that are empty or contain only whitespace are dropped before the
// tail is taken, so blank separators in the tool output do not eat into
// the n-line budget. Kept lines are returned unmodified. If n <= 0, or s
// has no non-empty lines, the result is "".
func tailLines(s string, n int) string {
	// Guard first: a negative n would otherwise produce an out-of-range
	// slice expression below.
	if n <= 0 {
		return ""
	}

	var kept []string
	for _, line := range strings.Split(s, "\n") {
		if strings.TrimSpace(line) == "" {
			continue
		}
		kept = append(kept, line)
	}

	if len(kept) > n {
		kept = kept[len(kept)-n:]
	}
	return strings.Join(kept, "\n")
}
|
||||
Reference in New Issue
Block a user