Initial commit: full Phases 1-6 implementation
CI / Lint + build + test (push) Has been cancelled

Post-repair hardware validation pipeline for Proxmox cluster hosts.
Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq
PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
+97
View File
@@ -0,0 +1,97 @@
package tests
import (
"context"
"fmt"
"os/exec"
"runtime"
"strconv"
"strings"
"time"
)
// CPUStress runs stress-ng with CPU workers AND memory stressors. The
// memory stressors take the place of a Memtest86+ pass — per the plan,
// running under Linux gives us exit-code-based pass/fail and log
// capture we can't get from Memtest without IPMI serial redirection.
//
// Non-zero exit = stress-ng aborted due to a failure (bit flip, OOM
// kill, etc.) → stage fails. Exit 0 means the kernel returned sane
// pages for the full duration, which is the Phase 4 health bar.
//
// Outcomes:
//   - stress-ng is not in PATH: the stage is skipped and reported as
//     passed, with Extras["skipped"] = true.
//   - stress-ng exits 0: the stage passes.
//   - stress-ng fails to start or exits non-zero: the stage fails and
//     Extras["exit_code"] is set when a process state is available.
//   - ctx is cancelled or the internal deadline (stage timeout + 30s)
//     expires before stress-ng finishes: the stage fails, the message
//     names the context error instead of blaming a non-zero exit, and
//     Extras["aborted"] / Extras["abort_reason"] are set.
//
// Extras always carries "cores", "elapsed_secs" and "output_tail" once
// stress-ng has been launched.
func CPUStress(ctx context.Context, d Deps) Outcome {
	if _, err := exec.LookPath("stress-ng"); err != nil {
		d.Warn("CPUStress: stress-ng not found in PATH — skipping stage")
		return Outcome{
			Passed:  true,
			Summary: "skipped (stress-ng missing)",
			Extras:  map[string]any{"skipped": true, "reason": "stress_ng_missing"},
		}
	}

	// Timeout: Deps.StageTimeout may be zero in tests; default 2 min.
	timeout := d.StageTimeout
	if timeout <= 0 {
		timeout = 2 * time.Minute
	}
	cores := runtime.NumCPU()

	// One CPU worker and one vm worker per logical CPU, with the vm
	// stressors sized via --vm-bytes 90%.
	// NOTE(review): this code originally assumed every vm worker touches
	// 90% of RAM on its own. How stress-ng apportions a percentage
	// --vm-bytes across workers should be confirmed against the man page
	// of the stress-ng version shipped in the image.
	args := []string{
		"--cpu", strconv.Itoa(cores),
		"--cpu-method", "all",
		"--vm", strconv.Itoa(cores),
		"--vm-bytes", "90%",
		"--timeout", durationSeconds(timeout),
		"--metrics-brief",
		"--verify",
	}
	// Log the real argv so the log line cannot drift from what is run.
	d.Info("CPUStress: stress-ng " + strings.Join(args, " "))

	// stress-ng is asked to stop itself via --timeout; the context
	// deadline is 30s later so the hard kill only fires if it overruns.
	runCtx, cancel := context.WithTimeout(ctx, timeout+30*time.Second)
	defer cancel()
	cmd := exec.CommandContext(runCtx, "stress-ng", args...)

	start := time.Now()
	out, err := cmd.CombinedOutput()
	elapsed := time.Since(start).Round(time.Second)

	extras := map[string]any{
		"cores":        cores,
		"elapsed_secs": elapsed.Seconds(),
		"output_tail":  tailLines(string(out), 20),
	}

	if err != nil {
		message := "stress-ng returned non-zero: " + err.Error()
		if ctxErr := runCtx.Err(); ctxErr != nil {
			// The context ended (caller cancellation or our deadline),
			// so the error most likely reflects the kill rather than a
			// fault that stress-ng detected.
			message = "stress-ng did not finish before the context ended (" +
				ctxErr.Error() + "): " + err.Error()
			extras["aborted"] = true
			extras["abort_reason"] = ctxErr.Error()
		}
		// ProcessState is nil when the process never started.
		if cmd.ProcessState != nil {
			extras["exit_code"] = cmd.ProcessState.ExitCode()
		}
		d.Error("CPUStress: " + message)
		return Outcome{
			Passed:  false,
			Message: message,
			Summary: fmt.Sprintf("failed after %s", elapsed),
			Extras:  extras,
		}
	}

	d.Info(fmt.Sprintf("CPUStress: stress-ng completed cleanly in %s", elapsed))
	return Outcome{
		Passed:  true,
		Summary: fmt.Sprintf("stress-ng PASSED after %s (%d cores + 90%% RAM)", elapsed, cores),
		Extras:  extras,
	}
}
// durationSeconds renders d as a whole number of seconds followed by
// "s" (for example "120s"). Fractional seconds are truncated, and any
// duration shorter than one second — including zero and negative
// values — is reported as "1s".
func durationSeconds(d time.Duration) string {
	whole := int(d.Seconds())
	if whole >= 1 {
		return strconv.Itoa(whole) + "s"
	}
	return "1s"
}
// tailLines returns the last n non-blank lines of s joined with "\n",
// for the summary. A line counts as blank when it is empty or contains
// only whitespace; blank lines are skipped wherever they appear, so
// they never use up one of the n slots. Kept lines are returned
// unmodified. The result is "" when n <= 0 or when s has no non-blank
// lines.
func tailLines(s string, n int) string {
	// Guard against n <= 0: a negative n would otherwise produce an
	// out-of-range slice expression below.
	if n <= 0 {
		return ""
	}
	var kept []string
	for _, line := range strings.Split(s, "\n") {
		if strings.TrimSpace(line) == "" {
			continue
		}
		kept = append(kept, line)
	}
	if len(kept) > n {
		kept = kept[len(kept)-n:]
	}
	return strings.Join(kept, "\n")
}