Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
@@ -0,0 +1,92 @@
|
||||
package orchestrator
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// IperfSupervisor runs a single `iperf3 -s` process under the
// orchestrator so the Network stage has a stable server to dial. Each
// run's Network test is sequential (stages are always serial), so one
// server process handles every host under test.
//
// A missing iperf3 binary is logged by Start and the supervisor becomes
// a no-op — the agent's Network stage will then fail to connect and skip
// cleanly via the stage's own error path.
//
// Construct with NewIperfSupervisor; the struct holds a mutex and must
// not be copied.
type IperfSupervisor struct {
	Port int // default 5201

	mu      sync.Mutex    // guards the fields below
	cmd     *exec.Cmd     // most recently started iperf3 process; nil until Start succeeds
	done    chan struct{} // closed by wait once cmd has been reaped
	started bool          // true from a successful Start until wait observes the exit
	fatal   error         // last LookPath / Start failure recorded by Start
}

// NewIperfSupervisor returns a supervisor that will listen on port.
// A non-positive port selects the iperf3 default, 5201.
func NewIperfSupervisor(port int) *IperfSupervisor {
	if port <= 0 {
		port = 5201
	}
	return &IperfSupervisor{Port: port}
}

// Start launches `iperf3 -s -p <Port>` unless a process is already
// running. The process is created with exec.CommandContext, so it is
// killed when ctx is cancelled.
//
// If iperf3 is not on PATH the failure is recorded and logged, and Start
// returns nil so the orchestrator keeps running without a server. An
// error is returned only when the binary exists but cannot be started.
func (s *IperfSupervisor) Start(ctx context.Context) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.started {
		return nil
	}
	if _, err := exec.LookPath("iperf3"); err != nil {
		s.fatal = fmt.Errorf("iperf3 not in PATH: %w", err)
		log.Printf("iperf supervisor: %v (Network stage will fail to connect)", s.fatal)
		return nil
	}
	cmd := exec.CommandContext(ctx, "iperf3", "-s", "-p", strconv.Itoa(s.Port))
	if err := cmd.Start(); err != nil {
		s.fatal = err
		return err
	}
	s.cmd = cmd
	s.done = make(chan struct{})
	s.started = true
	log.Printf("iperf supervisor: iperf3 -s -p %d (pid=%d)", s.Port, cmd.Process.Pid)
	// wait is the only goroutine allowed to call cmd.Wait; everyone else
	// observes the exit through s.done.
	go s.wait()
	return nil
}

// Shutdown politely stops the iperf3 subprocess: it sends an interrupt,
// waits up to timeout for the process to be reaped, and kills it if the
// grace period expires. It returns nil when nothing is running or the
// process exited in time, and an error when it had to kill.
func (s *IperfSupervisor) Shutdown(timeout time.Duration) error {
	s.mu.Lock()
	cmd, done := s.cmd, s.done
	s.mu.Unlock()
	if cmd == nil || cmd.Process == nil || done == nil {
		return nil
	}
	select {
	case <-done:
		// Already exited and reaped; nothing to signal.
		return nil
	default:
	}

	// os.Interrupt is cross-platform; on Linux it maps to SIGINT which
	// iperf3 handles gracefully. On Windows (dev only) it's a no-op and
	// we'll fall through to Kill after the timeout. The error is ignored
	// on purpose: the process may have exited between the check above
	// and this call.
	_ = cmd.Process.Signal(os.Interrupt)

	grace := time.NewTimer(timeout)
	defer grace.Stop()
	select {
	case <-done:
		return nil
	case <-grace.C:
		// Best effort; the wait goroutine reaps the process afterwards.
		_ = cmd.Process.Kill()
		return errors.New("iperf3 did not exit in time; killed")
	}
}

// wait reaps the process started by Start. It is the single caller of
// cmd.Wait (exec.Cmd.Wait must not be called twice or concurrently);
// once the process is gone it clears started and closes done so that
// Shutdown can return and a later Start can launch a fresh process.
func (s *IperfSupervisor) wait() {
	s.mu.Lock()
	cmd, done := s.cmd, s.done
	s.mu.Unlock()
	if cmd == nil {
		return
	}

	err := cmd.Wait()

	s.mu.Lock()
	s.started = false
	s.mu.Unlock()
	if done != nil {
		close(done)
	}
	if err != nil {
		// Surface the exit so a server that dies mid-run is visible in
		// the orchestrator log rather than silently disappearing.
		log.Printf("iperf supervisor: iperf3 exited: %v", err)
	}
}
|
||||
Reference in New Issue
Block a user