Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
@@ -0,0 +1,64 @@
|
||||
// Package bootstate parses kernel cmdline parameters that the
|
||||
// orchestrator baked into the iPXE script. The agent consumes these
|
||||
// on startup to learn which run it belongs to and how to reach back.
|
||||
package bootstate
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Params holds the vetting.* values the orchestrator passes to the agent
// on the kernel command line.
type Params struct {
	OrchestratorURL string // vetting.orchestrator
	RunID           int64  // vetting.run_id; always > 0 after a successful parse
	MAC             string // vetting.mac, lowercased by ParseCmdlineString
	Token           string // vetting.token
	TLSCertFPR      string // vetting.cert_fpr; optional
}

// ParseCmdline reads the kernel command line from path and pulls out the
// vetting.* parameters. An empty path means /proc/cmdline; tests supply
// their own file.
func ParseCmdline(path string) (*Params, error) {
	if path == "" {
		path = "/proc/cmdline"
	}
	b, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("read %s: %w", path, err)
	}
	return ParseCmdlineString(string(b))
}

// ParseCmdlineString extracts the vetting.* parameters from a kernel
// command line given as a string.
//
// Tokens are whitespace-separated key=value pairs; tokens without "=" and
// keys outside the vetting.* set are ignored. When a key is repeated the
// last occurrence wins. vetting.run_id must parse as a positive base-10
// integer. vetting.orchestrator, vetting.run_id, vetting.mac and
// vetting.token are required; the returned error names every one that is
// absent. vetting.cert_fpr is optional.
func ParseCmdlineString(s string) (*Params, error) {
	var p Params
	// strings.Fields already discards leading and trailing whitespace.
	for _, f := range strings.Fields(s) {
		k, v, ok := strings.Cut(f, "=")
		if !ok {
			continue
		}
		switch k {
		case "vetting.orchestrator":
			p.OrchestratorURL = v
		case "vetting.run_id":
			id, err := strconv.ParseInt(v, 10, 64)
			if err != nil {
				return nil, fmt.Errorf("vetting.run_id=%q: %w", v, err)
			}
			// Zero doubles as the "not set" marker below and a negative
			// ID would end up in the request path, so reject both here.
			if id <= 0 {
				return nil, fmt.Errorf("vetting.run_id=%q: must be a positive integer", v)
			}
			p.RunID = id
		case "vetting.mac":
			p.MAC = strings.ToLower(v)
		case "vetting.token":
			p.Token = v
		case "vetting.cert_fpr":
			p.TLSCertFPR = v
		}
	}

	var missing []string
	if p.OrchestratorURL == "" {
		missing = append(missing, "vetting.orchestrator")
	}
	if p.RunID == 0 {
		missing = append(missing, "vetting.run_id")
	}
	if p.MAC == "" {
		missing = append(missing, "vetting.mac")
	}
	if p.Token == "" {
		missing = append(missing, "vetting.token")
	}
	if len(missing) > 0 {
		return nil, errors.New("cmdline missing required parameter(s): " + strings.Join(missing, ", "))
	}
	return &p, nil
}
|
||||
@@ -0,0 +1,35 @@
|
||||
package bootstate
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseCmdlineGoldenPath(t *testing.T) {
|
||||
s := `BOOT_IMAGE=vmlinuz initrd=initrd.img vetting.orchestrator=http://10.0.0.5:8080 vetting.run_id=42 vetting.mac=aa:bb:cc:dd:ee:ff vetting.token=deadbeefcafe vetting.cert_fpr=abc123 console=ttyS0,115200n8 quiet`
|
||||
p, err := ParseCmdlineString(s)
|
||||
if err != nil {
|
||||
t.Fatalf("ParseCmdlineString: %v", err)
|
||||
}
|
||||
if p.OrchestratorURL != "http://10.0.0.5:8080" || p.RunID != 42 || p.MAC != "aa:bb:cc:dd:ee:ff" ||
|
||||
p.Token != "deadbeefcafe" || p.TLSCertFPR != "abc123" {
|
||||
t.Fatalf("parsed wrong: %+v", p)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseCmdlineMissingRequired(t *testing.T) {
|
||||
s := `vetting.orchestrator=http://x vetting.mac=aa:bb:cc:dd:ee:ff vetting.token=t`
|
||||
if _, err := ParseCmdlineString(s); err == nil {
|
||||
t.Fatalf("expected error when vetting.run_id missing")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseCmdlineLowercasesMAC(t *testing.T) {
|
||||
s := `vetting.orchestrator=http://x vetting.run_id=1 vetting.mac=AA:BB:CC:DD:EE:FF vetting.token=t`
|
||||
p, err := ParseCmdlineString(s)
|
||||
if err != nil {
|
||||
t.Fatalf("ParseCmdlineString: %v", err)
|
||||
}
|
||||
if p.MAC != "aa:bb:cc:dd:ee:ff" {
|
||||
t.Fatalf("MAC not lowercased: %q", p.MAC)
|
||||
}
|
||||
}
|
||||
+181
@@ -0,0 +1,181 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Client is the agent-side HTTP client for the orchestrator's per-run
// endpoints under /api/v1/runs/:id/.
type Client struct {
	BaseURL    string       // orchestrator base URL; NewClient strips trailing slashes
	RunID      int64        // run the requests are scoped to
	Token      string       // sent as a Bearer token on every request
	TLSCertFPR string       // optional sha256 hex fingerprint
	HTTP       *http.Client // transport used for all calls
}

// NewClient builds a Client with a 30-second request timeout and a TLS
// configuration requiring TLS 1.2 or newer.
//
// When tlsCertFPR is non-empty the normal chain and hostname verification is
// switched off (InsecureSkipVerify) and replaced by a pin check: the
// handshake succeeds only if the SHA-256 of the DER encoding of at least one
// presented certificate equals the fingerprint. The fingerprint is compared
// case-insensitively with ':' separators removed, which suits a self-signed
// orchestrator certificate inside the LAN.
func NewClient(baseURL string, runID int64, token, tlsCertFPR string) *Client {
	cfg := &tls.Config{MinVersion: tls.VersionTLS12}

	if tlsCertFPR != "" {
		pinned := strings.ToLower(strings.ReplaceAll(tlsCertFPR, ":", ""))
		matchesPin := func(der []byte) bool {
			digest := sha256.Sum256(der)
			return hex.EncodeToString(digest[:]) == pinned
		}
		cfg.InsecureSkipVerify = true
		cfg.VerifyPeerCertificate = func(rawCerts [][]byte, _ [][]*x509.Certificate) error {
			for i := range rawCerts {
				if matchesPin(rawCerts[i]) {
					return nil
				}
			}
			return fmt.Errorf("agent: no presented cert matched pinned fingerprint")
		}
	}

	httpClient := &http.Client{
		Timeout:   30 * time.Second,
		Transport: &http.Transport{TLSClientConfig: cfg},
	}
	return &Client{
		BaseURL:    strings.TrimRight(baseURL, "/"),
		RunID:      runID,
		Token:      token,
		TLSCertFPR: tlsCertFPR,
		HTTP:       httpClient,
	}
}
|
||||
|
||||
func (c *Client) Hello(ctx context.Context) error {
|
||||
return c.postJSON(ctx, "/hello", nil, nil)
|
||||
}
|
||||
|
||||
func (c *Client) Claim(ctx context.Context, agentIP string) (*ClaimResponse, error) {
|
||||
body := map[string]any{"agent_ip": agentIP}
|
||||
var out ClaimResponse
|
||||
if err := c.postJSON(ctx, "/claim", body, &out); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
|
||||
func (c *Client) Heartbeat(ctx context.Context) (*HeartbeatResponse, error) {
|
||||
var out HeartbeatResponse
|
||||
if err := c.postJSON(ctx, "/heartbeat", nil, &out); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
|
||||
func (c *Client) Log(ctx context.Context, lines []LogLine) error {
|
||||
return c.postJSON(ctx, "/log", map[string]any{"lines": lines}, nil)
|
||||
}
|
||||
|
||||
func (c *Client) Result(ctx context.Context, result any) (*ResultResponse, error) {
|
||||
var out ResultResponse
|
||||
if err := c.postJSON(ctx, "/result", result, &out); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
|
||||
func (c *Client) Hold(ctx context.Context, agentIP string) (*HoldResponse, error) {
|
||||
var out HoldResponse
|
||||
if err := c.postJSON(ctx, "/hold", map[string]any{"agent_ip": agentIP}, &out); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
|
||||
// Sensor posts a batch of numeric samples (thermal readings, fio IOPS,
|
||||
// iperf throughput, PSU voltages). Empty batches are allowed.
|
||||
func (c *Client) Sensor(ctx context.Context, samples []SensorSample) error {
|
||||
return c.postJSON(ctx, "/sensor", map[string]any{"samples": samples}, nil)
|
||||
}
|
||||
|
||||
// SensorSample is the JSON shape of one numeric measurement sent to the
// /sensor endpoint; the server persists each one as a row in the
// measurements table. TS and Unit are omitted from the JSON when empty.
type SensorSample struct {
	TS    string  `json:"ts,omitempty"`   // sample timestamp, optional
	Kind  string  `json:"kind"`           // measurement family, e.g. "temp"
	Key   string  `json:"key"`            // identifier within the family
	Value float64 `json:"value"`          // the reading itself
	Unit  string  `json:"unit,omitempty"` // unit label, optional
}
|
||||
|
||||
// ClaimResponse is the JSON body decoded from the /claim endpoint.
type ClaimResponse struct {
	OK            bool                    `json:"ok"`
	RunID         int64                   `json:"run_id"`
	Stages        []string                `json:"stages"`         // stage names advertised by the orchestrator
	ExpectedDisks []ClaimExpectedDiskSpec `json:"expected_disks"` // disks the run expects to find
	IperfPort     int                     `json:"iperf_port"`     // port handed to the Network stage
}

// ClaimExpectedDiskSpec is one entry of ClaimResponse.ExpectedDisks: a disk
// identified by serial number together with its size in GB.
type ClaimExpectedDiskSpec struct {
	Serial string `json:"serial"`
	SizeGB int    `json:"size_gb"`
}
|
||||
|
||||
// HeartbeatResponse is the JSON body decoded from the /heartbeat endpoint.
// Cmd carries an orchestrator directive; the agent code in this package
// reacts to "retry_stage", "abort" and "shutdown".
type HeartbeatResponse struct {
	Cmd   string `json:"cmd"`
	State string `json:"state"`
	Stage string `json:"stage,omitempty"` // stage to retry when Cmd is "retry_stage"

	// OverrideFlags is kept as raw JSON and decoded by the consumer.
	OverrideFlags json.RawMessage `json:"override_flags,omitempty"`
}
|
||||
|
||||
// LogLine is one entry in the batch sent to the /log endpoint. TS and Level
// are omitted from the JSON when empty; Text is always present.
type LogLine struct {
	TS    string `json:"ts,omitempty"`
	Level string `json:"level,omitempty"`
	Text  string `json:"text"`
}
|
||||
|
||||
// ResultResponse is the JSON body decoded from the /result endpoint.
// NextState is the run state the orchestrator moved to after recording the
// submitted stage result.
type ResultResponse struct {
	OK        bool   `json:"ok"`
	NextState string `json:"next_state"`
}
|
||||
|
||||
// HoldResponse is the JSON body decoded from the /hold endpoint.
// AuthorizedKey is the line the agent appends to root's authorized_keys.
type HoldResponse struct {
	AuthorizedKey string `json:"authorized_key"`
	RunID         int64  `json:"run_id"`
}
|
||||
|
||||
func (c *Client) postJSON(ctx context.Context, path string, in, out any) error {
|
||||
var body io.Reader
|
||||
if in != nil {
|
||||
buf, err := json.Marshal(in)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
body = bytes.NewReader(buf)
|
||||
}
|
||||
url := fmt.Sprintf("%s/api/v1/runs/%d%s", c.BaseURL, c.RunID, path)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Set("Authorization", "Bearer "+c.Token)
|
||||
if in != nil {
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
}
|
||||
resp, err := c.HTTP.Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
if resp.StatusCode >= 300 {
|
||||
b, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("%s %s: %d %s", req.Method, path, resp.StatusCode, strings.TrimSpace(string(b)))
|
||||
}
|
||||
if out != nil {
|
||||
return json.NewDecoder(resp.Body).Decode(out)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,264 @@
|
||||
// Package probes collects hardware facts from a booted Linux system.
|
||||
// Phase 3 only needs enough to feed the spec diff: CPU model/cores,
|
||||
// total RAM, per-disk serial+size, per-NIC MAC+speed, per-GPU model.
|
||||
//
|
||||
// Every probe is tolerant of missing files or tools — if /sys isn't
|
||||
// available the field is just left empty. The orchestrator's diff
|
||||
// engine will surface missing expected fields as failures; missing
|
||||
// fields that weren't expected stay silent.
|
||||
package probes
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"vetting/internal/spec"
|
||||
)
|
||||
|
||||
// Collect runs every probe and returns the merged inventory. The only
|
||||
// errors it surfaces are fatal ones that prevent progress — individual
|
||||
// probe failures are logged to the returned Inventory's raw field and
|
||||
// do not fail the whole call.
|
||||
func Collect() (*spec.Inventory, error) {
|
||||
inv := &spec.Inventory{}
|
||||
|
||||
inv.CPU = probeCPU()
|
||||
inv.Memory = probeMemory()
|
||||
inv.Disks = probeDisks()
|
||||
inv.NICs = probeNICs()
|
||||
inv.GPUs = probeGPUs()
|
||||
|
||||
return inv, nil
|
||||
}
|
||||
|
||||
// ----- CPU --------------------------------------------------------------
|
||||
|
||||
func probeCPU() spec.CPUSpec {
|
||||
// model: first "model name" in /proc/cpuinfo.
|
||||
// logical_cores: runtime.NumCPU (Linux respects cpu cgroup; agent
|
||||
// runs on bare metal so it will report every HT thread).
|
||||
c := spec.CPUSpec{LogicalCores: runtime.NumCPU()}
|
||||
f, err := os.Open("/proc/cpuinfo")
|
||||
if err != nil {
|
||||
return c
|
||||
}
|
||||
defer func() { _ = f.Close() }()
|
||||
scan := bufio.NewScanner(f)
|
||||
for scan.Scan() {
|
||||
line := scan.Text()
|
||||
if strings.HasPrefix(line, "model name") {
|
||||
if _, v, ok := strings.Cut(line, ":"); ok {
|
||||
c.Model = strings.TrimSpace(v)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// ----- Memory -----------------------------------------------------------
|
||||
|
||||
func probeMemory() spec.MemorySpec {
|
||||
// /proc/meminfo reports MemTotal in kB. Round down to the nearest
|
||||
// GiB so the diff's ±2 GiB tolerance is meaningful.
|
||||
f, err := os.Open("/proc/meminfo")
|
||||
if err != nil {
|
||||
return spec.MemorySpec{}
|
||||
}
|
||||
defer func() { _ = f.Close() }()
|
||||
scan := bufio.NewScanner(f)
|
||||
for scan.Scan() {
|
||||
fields := strings.Fields(scan.Text())
|
||||
if len(fields) >= 2 && fields[0] == "MemTotal:" {
|
||||
kb, err := strconv.ParseInt(fields[1], 10, 64)
|
||||
if err == nil {
|
||||
return spec.MemorySpec{TotalGiB: int(kb / 1024 / 1024)}
|
||||
}
|
||||
}
|
||||
}
|
||||
return spec.MemorySpec{}
|
||||
}
|
||||
|
||||
// ----- Disks ------------------------------------------------------------
|
||||
|
||||
// probeDisks walks /sys/class/block and picks out real block devices
|
||||
// (no partitions, no loop/ram). For each it reads size (512B sectors)
|
||||
// and serial. Virtio disks in QEMU report a serial only when launched
|
||||
// with `-drive serial=...`; without that the field is empty, which is
|
||||
// fine — the diff skips disks with empty serials anyway.
|
||||
func probeDisks() []spec.DiskSpec {
|
||||
entries, err := os.ReadDir("/sys/class/block")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var out []spec.DiskSpec
|
||||
for _, e := range entries {
|
||||
name := e.Name()
|
||||
if !isRealDisk(name) {
|
||||
continue
|
||||
}
|
||||
base := filepath.Join("/sys/class/block", name)
|
||||
size := diskSizeGB(base)
|
||||
serial := diskSerial(name)
|
||||
// size == 0 means we couldn't read /size; skip rather than
|
||||
// emit garbage.
|
||||
if size == 0 && serial == "" {
|
||||
continue
|
||||
}
|
||||
out = append(out, spec.DiskSpec{Serial: serial, SizeGB: size})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// isRealDisk reports whether the /sys/class/block entry called name should
// be treated as a whole disk. Names starting with loop, ram, zram or dm- are
// rejected outright; anything else is rejected only if it has a "partition"
// attribute in sysfs (nvme0n1p1, sda1, ...).
func isRealDisk(name string) bool {
	for _, virtual := range []string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, virtual) {
			return false
		}
	}
	// Only partitions carry this attribute; a failed stat means "not a
	// partition" as far as this check is concerned.
	_, err := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return err != nil
}
|
||||
|
||||
// diskSizeGB returns the capacity, in decimal gigabytes rounded down, of the
// block device whose sysfs directory is base. It reads base/size, which
// counts 512-byte sectors regardless of the physical sector size. An
// unreadable or unparsable size file yields 0.
func diskSizeGB(base string) int {
	const (
		sectorBytes = 512
		bytesPerGB  = 1_000_000_000
	)
	raw, err := os.ReadFile(filepath.Join(base, "size"))
	if err != nil {
		return 0
	}
	text := strings.TrimSpace(string(raw))
	if sectors, perr := strconv.ParseInt(text, 10, 64); perr == nil {
		return int(sectors * sectorBytes / bytesPerGB)
	}
	return 0
}
|
||||
|
||||
// diskSerial returns the serial number of block device name, or "" when none
// can be found.
//
// It tries, in order: /sys/block/<name>/device/serial,
// /sys/block/<name>/device/vpd_pg80 and /sys/block/<name>/serial, taking the
// first non-empty result. vpd_pg80 is the raw SCSI "Unit Serial Number" VPD
// page, a binary blob with a 4-byte header, so it is decoded with
// vpdPage80Serial rather than read as text. If none of the sysfs files yields
// a serial, `udevadm info` is consulted for ID_SERIAL_SHORT as a best-effort
// fallback.
func diskSerial(name string) string {
	sysDir := filepath.Join("/sys/block", name)
	sources := []struct {
		path string
		vpd  bool // file holds a binary VPD page rather than plain text
	}{
		{filepath.Join(sysDir, "device", "serial"), false},
		{filepath.Join(sysDir, "device", "vpd_pg80"), true},
		{filepath.Join(sysDir, "serial"), false},
	}
	for _, src := range sources {
		raw, err := os.ReadFile(src.path)
		if err != nil {
			continue
		}
		serial := strings.TrimSpace(string(raw))
		if src.vpd {
			serial = vpdPage80Serial(raw)
		}
		if serial != "" {
			return serial
		}
	}

	// Fallback: udevadm often knows the serial. A missing binary or a failed
	// lookup just means "unknown".
	out, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output()
	if err != nil {
		return ""
	}
	for _, line := range strings.Split(string(out), "\n") {
		if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok {
			return strings.TrimSpace(v)
		}
	}
	return ""
}

// vpdPage80Serial extracts the serial number from a raw SCSI VPD page 0x80:
// byte 1 is the page code, bytes 2-3 the big-endian payload length, and the
// payload (an ASCII serial, possibly padded with spaces or NULs) follows.
// It returns "" when the data is too short or is not page 0x80.
func vpdPage80Serial(page []byte) string {
	const headerLen = 4
	if len(page) < headerLen || page[1] != 0x80 {
		return ""
	}
	payload := page[headerLen:]
	// Trust the declared length only as far as the data actually reaches.
	if n := int(page[2])<<8 | int(page[3]); n < len(payload) {
		payload = payload[:n]
	}
	return strings.Trim(string(payload), " \x00\t\r\n")
}
|
||||
|
||||
// ----- NICs -------------------------------------------------------------
|
||||
|
||||
func probeNICs() []spec.NICSpec {
|
||||
root := "/sys/class/net"
|
||||
entries, err := os.ReadDir(root)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var out []spec.NICSpec
|
||||
for _, e := range entries {
|
||||
name := e.Name()
|
||||
if name == "lo" {
|
||||
continue
|
||||
}
|
||||
base := filepath.Join(root, name)
|
||||
mac := readLine(filepath.Join(base, "address"))
|
||||
if mac == "" || mac == "00:00:00:00:00:00" {
|
||||
continue
|
||||
}
|
||||
// /sys/class/net/*/speed reports Mbps or -1 if link down.
|
||||
speed := 0
|
||||
if b, err := os.ReadFile(filepath.Join(base, "speed")); err == nil {
|
||||
if mbps, err := strconv.Atoi(strings.TrimSpace(string(b))); err == nil && mbps > 0 {
|
||||
speed = mbps / 1000
|
||||
}
|
||||
}
|
||||
out = append(out, spec.NICSpec{MAC: strings.ToLower(mac), SpeedGbps: speed})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// ----- GPUs -------------------------------------------------------------
|
||||
|
||||
// probeGPUs leans on lspci; if lspci is missing, returns nothing and
|
||||
// the diff engine just won't match any GPU expectations. Phase 4 will
|
||||
// add nvidia-smi for VRAM and firmware.
|
||||
func probeGPUs() []spec.GPUSpec {
|
||||
cmd := exec.Command("lspci", "-mm", "-nnk")
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var gpus []spec.GPUSpec
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
low := strings.ToLower(line)
|
||||
if !strings.Contains(low, "vga compatible controller") &&
|
||||
!strings.Contains(low, "3d controller") {
|
||||
continue
|
||||
}
|
||||
// `lspci -mm` quotes fields; device name is usually field 3.
|
||||
fields := splitQuoted(line)
|
||||
if len(fields) >= 4 {
|
||||
gpus = append(gpus, spec.GPUSpec{Model: fmt.Sprintf("%s %s", fields[2], fields[3])})
|
||||
}
|
||||
}
|
||||
return gpus
|
||||
}
|
||||
|
||||
// splitQuoted tokenises one line of `lspci -mm` style output into fields.
//
// A field is either a double-quoted string (quotes removed, inner spaces
// kept, and "" yielding an empty field) or a bare run of characters
// delimited by spaces, tabs or the start of a quoted field. For
//
//	00:02.0 "VGA compatible controller [0300]" "Intel [8086]" "HD 530 [1912]" -r06 "Dell [1028]" ""
//
// the result is the slot, class, vendor, device, "-r06", subsystem vendor
// and an empty subsystem device, in that order. Bare tokens are returned as
// fields of their own so that positional lookups line up with the printed
// columns. A quoted field left unterminated at end of line is returned with
// whatever text it had.
func splitQuoted(line string) []string {
	var (
		fields  []string
		cur     strings.Builder
		inQuote bool
	)
	flush := func() {
		fields = append(fields, cur.String())
		cur.Reset()
	}

	for _, r := range line {
		switch {
		case r == '"' && inQuote:
			// Closing quote: emit even when empty so "" keeps its slot.
			flush()
			inQuote = false
		case r == '"':
			// Opening quote: a bare token butting against it ends here.
			if cur.Len() > 0 {
				flush()
			}
			inQuote = true
		case !inQuote && (r == ' ' || r == '\t'):
			if cur.Len() > 0 {
				flush()
			}
		default:
			cur.WriteRune(r)
		}
	}
	// Trailing bare token, or the remains of an unterminated quote.
	if cur.Len() > 0 {
		flush()
	}
	return fields
}
|
||||
|
||||
// ----- shared helpers ---------------------------------------------------
|
||||
|
||||
// readLine returns the contents of the file at path with leading and
// trailing whitespace removed, or "" if the file cannot be read.
func readLine(path string) string {
	if data, err := os.ReadFile(path); err == nil {
		return strings.TrimSpace(string(data))
	}
	return ""
}
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
package probes
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ThermalSample is a single temperature reading taken from
// /sys/class/hwmon by Thermals.
type ThermalSample struct {
	Kind  string  // always "temp" for samples produced by Thermals
	Key   string  // sensor label, or "<chip>/temp<N>" when no label exists
	Value float64 // degrees Celsius
	Unit  string  // "C" for samples produced by Thermals
}
|
||||
|
||||
// Thermals walks /sys/class/hwmon looking for temp*_input files. The
|
||||
// kernel reports millidegrees C; we divide by 1000. Labels come from
|
||||
// temp*_label (preferred) or a chip-relative fallback.
|
||||
//
|
||||
// This is also used by the thermal sidecar; it re-reads on each tick
|
||||
// rather than holding open handles so hot-plugged sensors (e.g. a PCIe
|
||||
// card enumerating late) get picked up.
|
||||
func Thermals() []ThermalSample {
|
||||
root := "/sys/class/hwmon"
|
||||
chips, err := os.ReadDir(root)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var out []ThermalSample
|
||||
for _, c := range chips {
|
||||
base := filepath.Join(root, c.Name())
|
||||
chipName := strings.TrimSpace(readFileStr(filepath.Join(base, "name")))
|
||||
files, err := os.ReadDir(base)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
for _, f := range files {
|
||||
name := f.Name()
|
||||
if !strings.HasPrefix(name, "temp") || !strings.HasSuffix(name, "_input") {
|
||||
continue
|
||||
}
|
||||
idx := strings.TrimSuffix(strings.TrimPrefix(name, "temp"), "_input")
|
||||
label := strings.TrimSpace(readFileStr(filepath.Join(base, "temp"+idx+"_label")))
|
||||
if label == "" {
|
||||
label = chipName + "/temp" + idx
|
||||
}
|
||||
raw := strings.TrimSpace(readFileStr(filepath.Join(base, name)))
|
||||
milli, err := strconv.Atoi(raw)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out = append(out, ThermalSample{Kind: "temp", Key: label, Value: float64(milli) / 1000, Unit: "C"})
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// readFileStr returns the full contents of the file at p as a string,
// untrimmed, or "" if the file cannot be read.
func readFileStr(p string) string {
	if data, err := os.ReadFile(p); err == nil {
		return string(data)
	}
	return ""
}
|
||||
+498
@@ -0,0 +1,498 @@
|
||||
// Package agent implements the in-live-image control loop.
|
||||
//
|
||||
// Phase 4 scope: after /claim, the agent walks through every stage the
|
||||
// orchestrator advertises, dispatching on the stage name to a function
|
||||
// in agent/tests. Each stage posts a /result; the response carries the
|
||||
// orchestrator's next_state, which the loop uses to pick the next
|
||||
// stage. Stages the orchestrator owns (SpecValidate, Reporting) resolve
|
||||
// server-side inside /result so the agent never sees them as "its turn".
|
||||
//
|
||||
// Terminal states:
|
||||
// - FailedHolding → request hold key, install authorized_keys, wait
|
||||
// on heartbeats for a retry_stage directive.
|
||||
// - Completed → heartbeat carries cmd=shutdown; agent runs
|
||||
// `systemctl poweroff` and exits.
|
||||
//
|
||||
// Thermal sidecar runs from the moment the agent claims until ctx
|
||||
// cancel; it posts a handful of /sys/class/hwmon samples every 5s.
|
||||
package agent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"vetting/agent/bootstate"
|
||||
"vetting/agent/probes"
|
||||
"vetting/agent/tests"
|
||||
"vetting/internal/spec"
|
||||
)
|
||||
|
||||
// Run is the long-lived entry point. It blocks until ctx is cancelled
|
||||
// or a fatal error makes progress impossible.
|
||||
func Run(ctx context.Context, p *bootstate.Params) error {
|
||||
c := NewClient(p.OrchestratorURL, p.RunID, p.Token, p.TLSCertFPR)
|
||||
fwd := newLogForwarder(ctx, c)
|
||||
defer fwd.close()
|
||||
|
||||
ip := localIP()
|
||||
fwd.info(fmt.Sprintf("agent starting on %s (run=%d mac=%s)", ip, p.RunID, p.MAC))
|
||||
|
||||
if err := callWithBackoff(ctx, "hello", func(ctx context.Context) error {
|
||||
return c.Hello(ctx)
|
||||
}); err != nil {
|
||||
fwd.warn("hello never succeeded: " + err.Error())
|
||||
}
|
||||
|
||||
var claim *ClaimResponse
|
||||
if err := callWithBackoff(ctx, "claim", func(ctx context.Context) error {
|
||||
r, err := c.Claim(ctx, ip)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
claim = r
|
||||
return nil
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
fwd.info(fmt.Sprintf("claimed run; stages=%v", claim.Stages))
|
||||
|
||||
go thermalSidecar(ctx, c, fwd)
|
||||
|
||||
hbCh := make(chan HeartbeatResponse, 4)
|
||||
go heartbeatLoop(ctx, c, fwd, hbCh)
|
||||
|
||||
// Run every stage the orchestrator advertises. Stages owned by the
|
||||
// orchestrator (SpecValidate, Reporting) resolve inside /result and
|
||||
// flip next_state forward past themselves, so they simply never match
|
||||
// our dispatch table.
|
||||
nextStage := "Inventory"
|
||||
for nextStage != "" {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
default:
|
||||
}
|
||||
fwd.info("stage: starting " + nextStage)
|
||||
outcome := runStage(ctx, nextStage, claim, fwd, c, overrideFlags{})
|
||||
resp, err := postResult(ctx, c, nextStage, outcome)
|
||||
if err != nil {
|
||||
fwd.error("submit result for " + nextStage + ": " + err.Error())
|
||||
return err
|
||||
}
|
||||
fwd.info(fmt.Sprintf("stage %s → next_state=%s", nextStage, resp.NextState))
|
||||
|
||||
if resp.NextState == "FailedHolding" {
|
||||
if err := requestHold(ctx, c, fwd); err != nil {
|
||||
return err
|
||||
}
|
||||
// Park and wait for an override directive.
|
||||
return waitForOverride(ctx, c, fwd, hbCh, claim)
|
||||
}
|
||||
if resp.NextState == "Completed" || resp.NextState == "" {
|
||||
fwd.info("pipeline complete")
|
||||
<-ctx.Done()
|
||||
return ctx.Err()
|
||||
}
|
||||
nextStage = stageForState(resp.NextState)
|
||||
if nextStage == "" {
|
||||
// next_state is something we don't map (e.g. SpecValidate — but
|
||||
// the orchestrator's /result already resolved it and handed us
|
||||
// back a further-along state). Defensive bail so we don't loop.
|
||||
fwd.warn("no stage maps to state " + resp.NextState + "; parking")
|
||||
<-ctx.Done()
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
<-ctx.Done()
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
// runStage dispatches on stage name. The Inventory stage is special —
|
||||
// it runs the inventory probe and passes the result as the /result body
|
||||
// (the orchestrator persists it as an artifact). Every other stage
|
||||
// returns a tests.Outcome which postResult marshals generically.
|
||||
func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, ovr overrideFlags) stageOutcome {
|
||||
deps := newDeps(ctx, c, fwd, ovr, claim)
|
||||
switch stage {
|
||||
case "Inventory":
|
||||
fwd.info("Inventory: probing host hardware")
|
||||
inv, err := probes.Collect()
|
||||
if err != nil {
|
||||
return stageOutcome{Outcome: tests.Outcome{Passed: false, Message: err.Error(), Summary: "probe error"}}
|
||||
}
|
||||
fwd.info("Inventory: " + inventorySummary(inv))
|
||||
return stageOutcome{
|
||||
Outcome: tests.Outcome{
|
||||
Passed: true,
|
||||
Summary: inventorySummary(inv),
|
||||
},
|
||||
Inventory: inv,
|
||||
}
|
||||
case "SMART":
|
||||
return stageOutcome{Outcome: tests.SMART(ctx, deps)}
|
||||
case "CPUStress":
|
||||
return stageOutcome{Outcome: tests.CPUStress(ctx, deps)}
|
||||
case "Storage":
|
||||
return stageOutcome{Outcome: tests.Storage(ctx, deps)}
|
||||
case "Network":
|
||||
return stageOutcome{Outcome: tests.Network(ctx, deps, tests.NetworkConfig{
|
||||
OrchestratorURL: c.BaseURL,
|
||||
IperfPort: claim.IperfPort,
|
||||
Duration: 10 * time.Second,
|
||||
})}
|
||||
case "GPU":
|
||||
return stageOutcome{Outcome: tests.GPU(ctx, deps)}
|
||||
case "PSU":
|
||||
return stageOutcome{Outcome: tests.PSU(ctx, deps)}
|
||||
}
|
||||
return stageOutcome{Outcome: tests.Outcome{
|
||||
Passed: false,
|
||||
Message: "unknown stage " + stage,
|
||||
}}
|
||||
}
|
||||
|
||||
type stageOutcome struct {
|
||||
Outcome tests.Outcome
|
||||
Inventory *spec.Inventory // only for Inventory stage
|
||||
}
|
||||
|
||||
// overrideFlags are the operator overrides decoded from a heartbeat's
// override_flags payload and applied to a retried stage.
type overrideFlags struct {
	Wipe bool `json:"wipe"` // forwarded to the stage as Deps.OverrideWipe
}
|
||||
|
||||
func newDeps(ctx context.Context, c *Client, fwd *logForwarder, ovr overrideFlags, claim *ClaimResponse) tests.Deps {
|
||||
var expected []tests.ExpectedDisk
|
||||
for _, e := range claim.ExpectedDisks {
|
||||
expected = append(expected, tests.ExpectedDisk{Serial: e.Serial, SizeGB: e.SizeGB})
|
||||
}
|
||||
return tests.Deps{
|
||||
Info: fwd.info,
|
||||
Warn: fwd.warn,
|
||||
Error: fwd.error,
|
||||
OverrideWipe: ovr.Wipe,
|
||||
ExpectedDisks: expected,
|
||||
StageTimeout: 2 * time.Minute,
|
||||
Sensor: func(ctx context.Context, samples []tests.Sample) error {
|
||||
out := make([]SensorSample, 0, len(samples))
|
||||
for _, s := range samples {
|
||||
out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit})
|
||||
}
|
||||
return c.Sensor(ctx, out)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// postResult marshals stageOutcome for the /result endpoint. The
|
||||
// Inventory shape is special-cased: it includes the inventory blob so
|
||||
// the orchestrator can persist it and run server-side spec diff.
|
||||
func postResult(ctx context.Context, c *Client, stage string, s stageOutcome) (*ResultResponse, error) {
|
||||
summary, _ := s.Outcome.MarshalSummary()
|
||||
body := map[string]any{
|
||||
"stage": stage,
|
||||
"passed": s.Outcome.Passed,
|
||||
}
|
||||
if len(summary) > 2 {
|
||||
body["summary"] = json.RawMessage(summary)
|
||||
}
|
||||
if s.Outcome.Message != "" {
|
||||
body["message"] = s.Outcome.Message
|
||||
}
|
||||
if s.Inventory != nil {
|
||||
body["inventory"] = s.Inventory
|
||||
}
|
||||
return c.Result(ctx, body)
|
||||
}
|
||||
|
||||
// stageForState translates a run-state name into the name of the stage the
// agent must execute for it. "InventoryCheck" maps to "Inventory"; SMART,
// CPUStress, Storage, Network, GPU and PSU map to themselves. Every other
// state, including the orchestrator-owned SpecValidate and Reporting,
// yields "".
func stageForState(state string) string {
	if state == "InventoryCheck" {
		return "Inventory"
	}
	for _, own := range [...]string{"SMART", "CPUStress", "Storage", "Network", "GPU", "PSU"} {
		if own == state {
			return state
		}
	}
	return ""
}
|
||||
|
||||
// waitForOverride parks the agent in FailedHolding. It listens for a
|
||||
// heartbeat directive that tells it to retry a stage (e.g. Storage
|
||||
// with wipe-override armed) and re-enters runStage from that point.
|
||||
func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-chan HeartbeatResponse, claim *ClaimResponse) error {
|
||||
fwd.info("holding: awaiting operator decision (heartbeat directive or ctx cancel)")
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case cmd, ok := <-hb:
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
if cmd.Cmd != "retry_stage" || cmd.Stage == "" {
|
||||
continue
|
||||
}
|
||||
fwd.info("operator override: retrying stage " + cmd.Stage)
|
||||
var ovr overrideFlags
|
||||
if len(cmd.OverrideFlags) > 0 {
|
||||
_ = json.Unmarshal(cmd.OverrideFlags, &ovr)
|
||||
}
|
||||
outcome := runStage(ctx, cmd.Stage, claim, fwd, c, ovr)
|
||||
resp, err := postResult(ctx, c, cmd.Stage, outcome)
|
||||
if err != nil {
|
||||
fwd.error("override: submit result: " + err.Error())
|
||||
continue
|
||||
}
|
||||
fwd.info(fmt.Sprintf("override stage %s → next_state=%s", cmd.Stage, resp.NextState))
|
||||
if resp.NextState == "FailedHolding" {
|
||||
// Still broken; keep holding.
|
||||
continue
|
||||
}
|
||||
if resp.NextState == "Completed" {
|
||||
return nil
|
||||
}
|
||||
// Successful retry — continue walking the pipeline from the
|
||||
// state the orchestrator advanced us into.
|
||||
if nextStage := stageForState(resp.NextState); nextStage != "" {
|
||||
for nextStage != "" {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
default:
|
||||
}
|
||||
fwd.info("stage: starting " + nextStage)
|
||||
out := runStage(ctx, nextStage, claim, fwd, c, overrideFlags{})
|
||||
rr, err := postResult(ctx, c, nextStage, out)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if rr.NextState == "FailedHolding" || rr.NextState == "Completed" || rr.NextState == "" {
|
||||
return nil
|
||||
}
|
||||
nextStage = stageForState(rr.NextState)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// requestHold fetches the per-run pubkey and installs it into
|
||||
// /root/.ssh/authorized_keys so the operator can SSH in.
|
||||
func requestHold(ctx context.Context, c *Client, fwd *logForwarder) error {
|
||||
fwd.warn("entering FailedHolding; requesting hold key")
|
||||
resp, err := c.Hold(ctx, localIP())
|
||||
if err != nil {
|
||||
fwd.error("hold request failed: " + err.Error())
|
||||
return err
|
||||
}
|
||||
authPath := "/root/.ssh/authorized_keys"
|
||||
if err := os.MkdirAll(filepath.Dir(authPath), 0o700); err != nil {
|
||||
fwd.error("mkdir .ssh: " + err.Error())
|
||||
return err
|
||||
}
|
||||
f, err := os.OpenFile(authPath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o600)
|
||||
if err != nil {
|
||||
fwd.error("open authorized_keys: " + err.Error())
|
||||
return err
|
||||
}
|
||||
defer func() { _ = f.Close() }()
|
||||
if _, err := fmt.Fprintln(f, resp.AuthorizedKey); err != nil {
|
||||
fwd.error("write authorized_keys: " + err.Error())
|
||||
return err
|
||||
}
|
||||
fwd.info("hold key installed; SSH is available to root@" + localIP())
|
||||
return nil
|
||||
}
|
||||
|
||||
func inventorySummary(inv *spec.Inventory) string {
|
||||
return fmt.Sprintf("cpu=%q cores=%d ram=%dGiB disks=%d nics=%d gpus=%d",
|
||||
inv.CPU.Model, inv.CPU.LogicalCores, inv.Memory.TotalGiB,
|
||||
len(inv.Disks), len(inv.NICs), len(inv.GPUs))
|
||||
}
|
||||
|
||||
// thermalSidecar posts a batch of /sys/class/hwmon samples every 5s.
|
||||
// Idempotent: a dead sensor just drops out of the next batch. Errors
|
||||
// are logged but never fatal — we'd rather have a run with partial
|
||||
// thermal data than kill the agent over an I/O hiccup.
|
||||
func thermalSidecar(ctx context.Context, c *Client, fwd *logForwarder) {
|
||||
t := time.NewTicker(5 * time.Second)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-t.C:
|
||||
samples := probes.Thermals()
|
||||
if len(samples) == 0 {
|
||||
continue
|
||||
}
|
||||
out := make([]SensorSample, 0, len(samples))
|
||||
for _, s := range samples {
|
||||
out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit})
|
||||
}
|
||||
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
if err := c.Sensor(sendCtx, out); err != nil {
|
||||
fwd.warn("thermal sidecar: " + err.Error())
|
||||
}
|
||||
cancel()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func heartbeatLoop(ctx context.Context, c *Client, fwd *logForwarder, out chan<- HeartbeatResponse) {
|
||||
t := time.NewTicker(10 * time.Second)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-t.C:
|
||||
hbCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
resp, err := c.Heartbeat(hbCtx)
|
||||
cancel()
|
||||
if err != nil {
|
||||
fwd.warn("heartbeat error: " + err.Error())
|
||||
continue
|
||||
}
|
||||
if resp.Cmd == "abort" {
|
||||
fwd.warn("orchestrator said abort; stopping loop")
|
||||
return
|
||||
}
|
||||
if resp.Cmd == "shutdown" {
|
||||
fwd.info("orchestrator said shutdown; powering off host")
|
||||
// Best effort: systemd then sysvinit fallback. Either way,
|
||||
// return so the agent process stops issuing heartbeats.
|
||||
if err := exec.Command("systemctl", "poweroff").Run(); err != nil {
|
||||
fwd.warn("systemctl poweroff failed: " + err.Error())
|
||||
_ = exec.Command("shutdown", "-h", "now").Run()
|
||||
}
|
||||
return
|
||||
}
|
||||
if resp.Cmd == "retry_stage" {
|
||||
select {
|
||||
case out <- *resp:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// callWithBackoff invokes f until it succeeds, giving each call its own
// 10-second deadline derived from ctx.
//
// After a failure it logs the attempt under label and waits before trying
// again. The wait starts at 2s and doubles while it is still below 30s, so
// the sequence is 2s, 4s, 8s, 16s, 32s, 32s, ... The error from f is
// returned once the 21st attempt has failed. If ctx is cancelled while
// waiting, ctx.Err() is returned instead.
func callWithBackoff(ctx context.Context, label string, f func(context.Context) error) error {
	const (
		perCallTimeout = 10 * time.Second
		retryLimit     = 20
		growUntil      = 30 * time.Second
	)

	wait := 2 * time.Second
	attempt := 1
	for {
		err := func() error {
			callCtx, cancel := context.WithTimeout(ctx, perCallTimeout)
			defer cancel()
			return f(callCtx)
		}()
		switch {
		case err == nil:
			return nil
		case attempt > retryLimit:
			return err
		}

		log.Printf("agent: %s attempt %d failed: %v (retry in %s)", label, attempt, err, wait)
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(wait):
		}

		if wait < growUntil {
			wait *= 2
		}
		attempt++
	}
}
|
||||
|
||||
// localIP returns the first non-loopback IPv4 address reported by
// net.InterfaceAddrs, formatted as a dotted quad. It returns "" when the
// addresses cannot be listed or none of them qualifies.
func localIP() string {
	addrs, err := net.InterfaceAddrs()
	if err != nil {
		return ""
	}
	for _, addr := range addrs {
		ipNet, isIPNet := addr.(*net.IPNet)
		if !isIPNet {
			continue
		}
		if ipNet.IP.IsLoopback() {
			continue
		}
		// To4 yields nil for addresses that are not IPv4.
		if ip4 := ipNet.IP.To4(); ip4 != nil {
			return ip4.String()
		}
	}
	return ""
}
|
||||
|
||||
// ----- log forwarder -----------------------------------------------------
|
||||
|
||||
type logForwarder struct {
|
||||
c *Client
|
||||
mu sync.Mutex
|
||||
buf []LogLine
|
||||
wg sync.WaitGroup
|
||||
cancel context.CancelFunc
|
||||
}
|
||||
|
||||
func newLogForwarder(parent context.Context, c *Client) *logForwarder {
|
||||
ctx, cancel := context.WithCancel(parent)
|
||||
f := &logForwarder{c: c, cancel: cancel}
|
||||
f.wg.Add(1)
|
||||
go f.loop(ctx)
|
||||
return f
|
||||
}
|
||||
|
||||
func (f *logForwarder) loop(ctx context.Context) {
|
||||
defer f.wg.Done()
|
||||
t := time.NewTicker(2 * time.Second)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
f.flush()
|
||||
return
|
||||
case <-t.C:
|
||||
f.flush()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (f *logForwarder) push(level, text string) {
|
||||
stamp := time.Now().UTC().Format(time.RFC3339Nano)
|
||||
log.Printf("[%s] %s", level, text)
|
||||
f.mu.Lock()
|
||||
f.buf = append(f.buf, LogLine{TS: stamp, Level: level, Text: text})
|
||||
f.mu.Unlock()
|
||||
}
|
||||
|
||||
func (f *logForwarder) info(s string) { f.push("info", s) }
|
||||
func (f *logForwarder) warn(s string) { f.push("warn", s) }
|
||||
func (f *logForwarder) error(s string) { f.push("error", s) }
|
||||
|
||||
func (f *logForwarder) flush() {
|
||||
f.mu.Lock()
|
||||
if len(f.buf) == 0 {
|
||||
f.mu.Unlock()
|
||||
return
|
||||
}
|
||||
lines := f.buf
|
||||
f.buf = nil
|
||||
f.mu.Unlock()
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
if err := f.c.Log(ctx, lines); err != nil {
|
||||
log.Printf("log forward failed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (f *logForwarder) close() {
|
||||
f.cancel()
|
||||
f.wg.Wait()
|
||||
}
|
||||
@@ -0,0 +1,97 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// CPUStress runs stress-ng with CPU workers AND memory stressors. The
|
||||
// memory stressors take the place of a Memtest86+ pass — per the plan,
|
||||
// running under Linux gives us exit-code-based pass/fail and log
|
||||
// capture we can't get from Memtest without IPMI serial redirection.
|
||||
//
|
||||
// Non-zero exit = stress-ng aborted due to a failure (bit flip, OOM
|
||||
// kill, etc.) → stage fails. Exit 0 means the kernel returned sane
|
||||
// pages for the full duration, which is the Phase 4 health bar.
|
||||
func CPUStress(ctx context.Context, d Deps) Outcome {
|
||||
if _, err := exec.LookPath("stress-ng"); err != nil {
|
||||
d.Warn("CPUStress: stress-ng not found in PATH — skipping stage")
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: "skipped (stress-ng missing)",
|
||||
Extras: map[string]any{"skipped": true, "reason": "stress_ng_missing"},
|
||||
}
|
||||
}
|
||||
|
||||
// Timeout: Deps.StageTimeout may be zero in tests; default 2 min.
|
||||
timeout := d.StageTimeout
|
||||
if timeout <= 0 {
|
||||
timeout = 2 * time.Minute
|
||||
}
|
||||
|
||||
cores := runtime.NumCPU()
|
||||
// --vm N allocates N worker processes each touching 90% of RAM. On
|
||||
// an 8-core host with 32GiB this is 8 × ~28GiB sliding windows —
|
||||
// enough to exercise every DIMM row within a minute.
|
||||
args := []string{
|
||||
"--cpu", strconv.Itoa(cores),
|
||||
"--cpu-method", "all",
|
||||
"--vm", strconv.Itoa(cores),
|
||||
"--vm-bytes", "90%",
|
||||
"--timeout", durationSeconds(timeout),
|
||||
"--metrics-brief",
|
||||
"--verify",
|
||||
}
|
||||
d.Info(fmt.Sprintf("CPUStress: stress-ng --cpu %d --vm %d --vm-bytes 90%% --timeout %s",
|
||||
cores, cores, durationSeconds(timeout)))
|
||||
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout+30*time.Second)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(runCtx, "stress-ng", args...)
|
||||
start := time.Now()
|
||||
out, err := cmd.CombinedOutput()
|
||||
elapsed := time.Since(start).Round(time.Second)
|
||||
|
||||
extras := map[string]any{
|
||||
"cores": cores,
|
||||
"elapsed_secs": elapsed.Seconds(),
|
||||
"output_tail": tailLines(string(out), 20),
|
||||
}
|
||||
if err != nil {
|
||||
d.Error("CPUStress: stress-ng failed: " + err.Error())
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: "stress-ng returned non-zero: " + err.Error(),
|
||||
Summary: fmt.Sprintf("failed after %s", elapsed),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
d.Info(fmt.Sprintf("CPUStress: stress-ng completed cleanly in %s", elapsed))
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: fmt.Sprintf("stress-ng PASSED after %s (%d cores + 90%% RAM)", elapsed, cores),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
|
||||
// durationSeconds formats d as whole seconds with an "s" suffix, e.g.
// "120s". Fractions are truncated and anything below one second,
// including zero and negative durations, is reported as "1s".
func durationSeconds(d time.Duration) string {
	whole := int(d.Seconds())
	if whole >= 1 {
		return strconv.Itoa(whole) + "s"
	}
	return "1s"
}
|
||||
|
||||
// tailLines returns the last n non-blank lines of s joined with "\n".
//
// Lines containing only whitespace are dropped, so a tail of command
// output is not padded with empty lines. It returns "" when n is zero or
// negative, or when s has no non-blank lines.
func tailLines(s string, n int) string {
	if n <= 0 {
		return ""
	}
	all := strings.Split(s, "\n")

	// Collect from the end so we can stop once n lines are found.
	kept := make([]string, 0, n)
	for i := len(all) - 1; i >= 0 && len(kept) < n; i-- {
		if strings.TrimSpace(all[i]) == "" {
			continue
		}
		kept = append(kept, all[i])
	}
	// Restore the original top-to-bottom order.
	for lo, hi := 0, len(kept)-1; lo < hi; lo, hi = lo+1, hi-1 {
		kept[lo], kept[hi] = kept[hi], kept[lo]
	}
	return strings.Join(kept, "\n")
}
|
||||
@@ -0,0 +1,86 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// GPU enumerates VGA / 3D PCI devices. No devices → skip cleanly (a
|
||||
// CPU-only server passes this stage by virtue of having nothing to
|
||||
// stress). Devices present → try nvidia-smi for NVIDIA cards, else
|
||||
// accept PCI presence.
|
||||
func GPU(ctx context.Context, d Deps) Outcome {
|
||||
devices := listGPUPCI(ctx)
|
||||
if len(devices) == 0 {
|
||||
d.Info("GPU: no VGA/3D PCI devices found — skipping stage")
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: "skipped (no GPU present)",
|
||||
Extras: map[string]any{"skipped": true, "reason": "no_gpu_present"},
|
||||
}
|
||||
}
|
||||
d.Info("GPU: found " + joinDevices(devices))
|
||||
|
||||
nvidia := nvidiaSmiList(ctx)
|
||||
extras := map[string]any{
|
||||
"pci_devices": devices,
|
||||
"skipped": false,
|
||||
}
|
||||
if len(nvidia) > 0 {
|
||||
extras["nvidia"] = nvidia
|
||||
d.Info("GPU: nvidia-smi reports: " + strings.Join(nvidia, ", "))
|
||||
}
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: formatCount(len(devices), "GPU present"),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
|
||||
// listGPUPCI runs `lspci -mm` and returns, whitespace-trimmed, every
// output line that mentions "VGA compatible controller" or
// "3D controller" (case-insensitive). It returns nil when lspci cannot be
// run or exits with an error, which callers treat as "no GPU".
func listGPUPCI(ctx context.Context) []string {
	raw, err := exec.CommandContext(ctx, "lspci", "-mm").Output()
	if err != nil {
		return nil
	}

	var gpus []string
	for _, entry := range strings.Split(string(raw), "\n") {
		lowered := strings.ToLower(entry)
		isVGA := strings.Contains(lowered, "vga compatible controller")
		is3D := strings.Contains(lowered, "3d controller")
		if !isVGA && !is3D {
			continue
		}
		gpus = append(gpus, strings.TrimSpace(entry))
	}
	return gpus
}
|
||||
|
||||
// nvidiaSmiList runs `nvidia-smi -L` and returns its non-blank output
// lines, whitespace-trimmed. It returns nil when nvidia-smi cannot be run
// or exits with an error.
func nvidiaSmiList(ctx context.Context) []string {
	raw, err := exec.CommandContext(ctx, "nvidia-smi", "-L").Output()
	if err != nil {
		return nil
	}

	var cards []string
	for _, entry := range strings.Split(string(raw), "\n") {
		if trimmed := strings.TrimSpace(entry); trimmed != "" {
			cards = append(cards, trimmed)
		}
	}
	return cards
}
|
||||
|
||||
func joinDevices(devs []string) string {
|
||||
if len(devs) == 0 {
|
||||
return ""
|
||||
}
|
||||
if len(devs) == 1 {
|
||||
return devs[0]
|
||||
}
|
||||
return devs[0] + " (+" + strings.TrimSpace(formatCount(len(devs)-1, "more")) + ")"
|
||||
}
|
||||
@@ -0,0 +1,144 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// NetworkConfig carries the settings for the Network stage. The iperf3
// server host is derived from OrchestratorURL.
type NetworkConfig struct {
	// OrchestratorURL is the orchestrator's base URL; only its host part
	// is used.
	OrchestratorURL string
	// IperfPort is the iperf3 server port; 0 selects 5201.
	IperfPort int
	// Duration is how long to run iperf3; zero or negative selects 10s.
	Duration time.Duration
}
|
||||
|
||||
// Network runs iperf3 against the orchestrator's bundled server. Records
|
||||
// bandwidth as a measurement; fails if iperf3 is missing, the server
|
||||
// isn't reachable, or throughput is zero.
|
||||
func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
|
||||
if _, err := exec.LookPath("iperf3"); err != nil {
|
||||
d.Warn("Network: iperf3 not found — skipping stage")
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: "skipped (iperf3 missing)",
|
||||
Extras: map[string]any{"skipped": true, "reason": "iperf3_missing"},
|
||||
}
|
||||
}
|
||||
host, err := deriveHost(cfg.OrchestratorURL)
|
||||
if err != nil || host == "" {
|
||||
d.Warn("Network: can't derive orchestrator host from URL — skipping stage")
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: "skipped (no orchestrator host)",
|
||||
Extras: map[string]any{"skipped": true, "reason": "no_host"},
|
||||
}
|
||||
}
|
||||
port := cfg.IperfPort
|
||||
if port == 0 {
|
||||
port = 5201
|
||||
}
|
||||
duration := cfg.Duration
|
||||
if duration <= 0 {
|
||||
duration = 10 * time.Second
|
||||
}
|
||||
|
||||
args := []string{
|
||||
"-c", host,
|
||||
"-p", strconv.Itoa(port),
|
||||
"-t", strconv.Itoa(int(duration.Seconds())),
|
||||
"-J", // JSON output
|
||||
}
|
||||
d.Info(fmt.Sprintf("Network: iperf3 -c %s -p %d -t %s", host, port, duration))
|
||||
|
||||
runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(runCtx, "iperf3", args...)
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
d.Error("Network: iperf3 client failed: " + err.Error())
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: "iperf3 client error: " + err.Error(),
|
||||
Summary: "iperf3 failed",
|
||||
Extras: map[string]any{"stderr_tail": tailLines(string(out), 20)},
|
||||
}
|
||||
}
|
||||
mbps, parsed, err := parseIperfJSON(out)
|
||||
if err != nil {
|
||||
d.Error("Network: parse iperf3 output: " + err.Error())
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: "parse iperf3 json: " + err.Error(),
|
||||
Summary: "parse error",
|
||||
Extras: map[string]any{"raw": string(out)},
|
||||
}
|
||||
}
|
||||
if d.Sensor != nil {
|
||||
_ = d.Sensor(ctx, []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}})
|
||||
}
|
||||
|
||||
extras := map[string]any{
|
||||
"throughput_mbps": mbps,
|
||||
"iperf_end": parsed,
|
||||
}
|
||||
if mbps <= 0 {
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: "iperf3 reported zero throughput",
|
||||
Summary: "zero throughput",
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps", mbps))
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: fmt.Sprintf("%.1f Mbps to %s", mbps, host),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
|
||||
// deriveHost extracts the host part (without port or IPv6 brackets) from
// a base URL such as https://host:port. An empty input or a URL that does
// not parse is an error. A URL that parses but has no host yields "" with
// a nil error.
func deriveHost(raw string) (string, error) {
	if len(raw) == 0 {
		return "", fmt.Errorf("empty url")
	}
	parsed, err := url.Parse(raw)
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(parsed.Hostname()), nil
}
|
||||
|
||||
// parseIperfJSON reads the throughput out of `iperf3 -J` output. It looks
// in the "end" object for "sum_sent", then "sum_received", then "sum", and
// uses the first one that has a numeric "bits_per_second".
//
// On success it returns the rate in Mbps (bits per second / 1e6) together
// with the decoded "end" object. When "end" is missing the whole decoded
// document is returned alongside the error. When no usable sum is found
// the "end" object is returned alongside the error.
func parseIperfJSON(b []byte) (float64, map[string]any, error) {
	var doc map[string]any
	if err := json.Unmarshal(b, &doc); err != nil {
		return 0, nil, err
	}
	end, hasEnd := doc["end"].(map[string]any)
	if !hasEnd {
		return 0, doc, fmt.Errorf("missing end")
	}
	for _, section := range [...]string{"sum_sent", "sum_received", "sum"} {
		if totals, isObj := end[section].(map[string]any); isObj {
			if rate, isNum := totals["bits_per_second"].(float64); isNum {
				return rate / 1_000_000, end, nil
			}
		}
	}
	return 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
}
|
||||
@@ -0,0 +1,153 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find
|
||||
// PSU rails. In home-lab hosts the kernel surfaces a handful of named
|
||||
// rails (12V, 5V, 3V3). No rails → auto-skip. Any rail outside a ±10%
|
||||
// window of its nominal value → fail.
|
||||
func PSU(ctx context.Context, d Deps) Outcome {
|
||||
rails := scanPSURails()
|
||||
if len(rails) == 0 {
|
||||
d.Info("PSU: no voltage rails found under /sys/class/hwmon — skipping stage")
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: "skipped (no PSU sensors)",
|
||||
Extras: map[string]any{"skipped": true, "reason": "no_hwmon_voltages"},
|
||||
}
|
||||
}
|
||||
|
||||
var samples []Sample
|
||||
problems := []string{}
|
||||
for _, rail := range rails {
|
||||
samples = append(samples, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"})
|
||||
if ok, why := voltageInRange(rail); !ok {
|
||||
problems = append(problems, fmt.Sprintf("%s=%.2fV (%s)", rail.Label, rail.Volts, why))
|
||||
}
|
||||
}
|
||||
if d.Sensor != nil {
|
||||
_ = d.Sensor(ctx, samples)
|
||||
}
|
||||
|
||||
extras := map[string]any{
|
||||
"rails": rails,
|
||||
"problems": problems,
|
||||
}
|
||||
if len(problems) > 0 {
|
||||
d.Error("PSU: out-of-range rails: " + strings.Join(problems, ", "))
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: "PSU rails out of range: " + strings.Join(problems, ", "),
|
||||
Summary: fmt.Sprintf("%d rails, %d failing", len(rails), len(problems)),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal", len(rails)))
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: fmt.Sprintf("%d rails nominal", len(rails)),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
|
||||
// psuRail is one voltage reading taken from hwmon.
type psuRail struct {
	// Label is the sensor's label text, e.g. "12V".
	Label string `json:"label"`
	// Volts is the reading converted from millivolts to volts.
	Volts float64 `json:"volts"`
}
|
||||
|
||||
// scanPSURails walks every hwmon chip looking for in*_input files with
|
||||
// an accompanying in*_label that mentions a known rail name. Unknown
|
||||
// labels are skipped rather than flagged — motherboard VRMs report many
|
||||
// rails that aren't PSU outputs.
|
||||
func scanPSURails() []psuRail {
|
||||
root := "/sys/class/hwmon"
|
||||
chips, err := os.ReadDir(root)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var out []psuRail
|
||||
for _, c := range chips {
|
||||
base := filepath.Join(root, c.Name())
|
||||
files, err := os.ReadDir(base)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
for _, f := range files {
|
||||
name := f.Name()
|
||||
if !strings.HasPrefix(name, "in") || !strings.HasSuffix(name, "_input") {
|
||||
continue
|
||||
}
|
||||
n := strings.TrimSuffix(strings.TrimPrefix(name, "in"), "_input")
|
||||
labelPath := filepath.Join(base, "in"+n+"_label")
|
||||
label := strings.TrimSpace(readFileStr(labelPath))
|
||||
if !isPSULabel(label) {
|
||||
continue
|
||||
}
|
||||
raw := strings.TrimSpace(readFileStr(filepath.Join(base, name)))
|
||||
mv, err := strconv.Atoi(raw)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out = append(out, psuRail{Label: label, Volts: float64(mv) / 1000})
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// isPSULabel reports whether a hwmon label names one of the rails this
// stage tracks: 12V, 5V, 3.3V (also written 3V3), or VCCIN. Matching is
// case-insensitive.
//
// A voltage token only counts when it is not the tail of a different
// number: "+5V" and "5VSB" match, but "+1.5V", "1.05V" and "2.5V" do not,
// even though they contain "5V". Tokens directly preceded by '-' (such as
// "-12V") are also rejected, because the range check only knows positive
// nominals.
func isPSULabel(label string) bool {
	l := strings.ToLower(label)

	// hasRail reports whether tok occurs in l without a digit, '.' or '-'
	// immediately before it.
	hasRail := func(tok string) bool {
		for from := 0; from < len(l); {
			rel := strings.Index(l[from:], tok)
			if rel < 0 {
				return false
			}
			at := from + rel
			if at == 0 || strings.IndexByte("0123456789.-", l[at-1]) < 0 {
				return true
			}
			from = at + 1
		}
		return false
	}

	switch {
	case hasRail("12v"), hasRail("5v"),
		hasRail("3.3v"), hasRail("3v3"),
		strings.Contains(l, "vccin"):
		return true
	}
	return false
}
|
||||
|
||||
// voltageInRange returns (ok, reason). A label like "12V" has a 12.0V
|
||||
// nominal; we accept ±10%. Unknown labels pass.
|
||||
func voltageInRange(r psuRail) (bool, string) {
|
||||
nom := nominalFor(r.Label)
|
||||
if nom == 0 {
|
||||
return true, ""
|
||||
}
|
||||
delta := r.Volts - nom
|
||||
if delta < 0 {
|
||||
delta = -delta
|
||||
}
|
||||
if delta/nom > 0.10 {
|
||||
return false, fmt.Sprintf("expected ~%.1fV", nom)
|
||||
}
|
||||
return true, ""
|
||||
}
|
||||
|
||||
// nominalFor maps a hwmon label to the nominal voltage of the rail it
// names: 12.0 for 12V, 5.0 for 5V, 3.3 for 3.3V / 3V3. It returns 0 when
// the label names none of these. Matching is case-insensitive.
//
// A voltage token only counts when it is not the tail of a different
// number, so "+1.5V", "1.05V" and "2.5V" yield 0 rather than 5.0. Tokens
// directly preceded by '-' (such as "-12V") also yield 0: a negative rail
// has no positive nominal to compare against.
func nominalFor(label string) float64 {
	l := strings.ToLower(label)

	// hasRail reports whether tok occurs in l without a digit, '.' or '-'
	// immediately before it.
	hasRail := func(tok string) bool {
		for from := 0; from < len(l); {
			rel := strings.Index(l[from:], tok)
			if rel < 0 {
				return false
			}
			at := from + rel
			if at == 0 || strings.IndexByte("0123456789.-", l[at-1]) < 0 {
				return true
			}
			from = at + 1
		}
		return false
	}

	switch {
	case hasRail("12v"):
		return 12.0
	case hasRail("5v"):
		return 5.0
	case hasRail("3.3v"), hasRail("3v3"):
		return 3.3
	}
	return 0
}
|
||||
|
||||
// readFileStr returns the full contents of the file at p, or "" when it
// cannot be read.
func readFileStr(p string) string {
	if data, err := os.ReadFile(p); err == nil {
		return string(data)
	}
	return ""
}
|
||||
@@ -0,0 +1,152 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// SMART runs smartctl -a on each block device the kernel exposes. We
|
||||
// pass each device's result through smartctl --json output and key on:
|
||||
//
|
||||
// smart_status.passed -> overall-health PASSED
|
||||
// ata_smart_attributes -> per-attribute raw + threshold (ATA only)
|
||||
// nvme_smart_health_information_log -> NVMe health flags
|
||||
//
|
||||
// Missing smartctl / unsupported device (e.g. QEMU virtio-blk) just
|
||||
// surfaces as a per-disk "skipped" entry; the stage only fails if at
|
||||
// least one disk reports !passed.
|
||||
func SMART(ctx context.Context, d Deps) Outcome {
|
||||
disks, err := listBlockDisks()
|
||||
if err != nil {
|
||||
d.Warn("SMART: failed to enumerate /sys/class/block: " + err.Error())
|
||||
return Outcome{Passed: true, Summary: "skipped (no block devices enumerable)", Extras: map[string]any{"skipped": true}}
|
||||
}
|
||||
if len(disks) == 0 {
|
||||
d.Info("SMART: no physical disks found — skipping stage")
|
||||
return Outcome{Passed: true, Summary: "skipped (no disks)", Extras: map[string]any{"skipped": true}}
|
||||
}
|
||||
|
||||
type diskReport struct {
|
||||
Device string `json:"device"`
|
||||
Passed bool `json:"passed"`
|
||||
Skipped bool `json:"skipped,omitempty"`
|
||||
Reason string `json:"reason,omitempty"`
|
||||
Raw map[string]any `json:"raw,omitempty"`
|
||||
}
|
||||
|
||||
var reports []diskReport
|
||||
failed := 0
|
||||
usable := 0
|
||||
for _, dev := range disks {
|
||||
rep := diskReport{Device: dev}
|
||||
out, err := runSmartctl(ctx, dev)
|
||||
if err != nil {
|
||||
rep.Skipped = true
|
||||
rep.Reason = err.Error()
|
||||
reports = append(reports, rep)
|
||||
d.Info("SMART: " + dev + " skipped (" + err.Error() + ")")
|
||||
continue
|
||||
}
|
||||
usable++
|
||||
rep.Raw = out
|
||||
if passed, ok := smartPassed(out); ok {
|
||||
rep.Passed = passed
|
||||
if !passed {
|
||||
failed++
|
||||
d.Error(fmt.Sprintf("SMART: %s reports FAILED", dev))
|
||||
} else {
|
||||
d.Info(fmt.Sprintf("SMART: %s PASSED", dev))
|
||||
}
|
||||
} else {
|
||||
rep.Skipped = true
|
||||
rep.Reason = "no smart_status in output"
|
||||
}
|
||||
reports = append(reports, rep)
|
||||
}
|
||||
|
||||
extras := map[string]any{
|
||||
"disks": reports,
|
||||
"tested": usable,
|
||||
"failing": failed,
|
||||
}
|
||||
if failed > 0 {
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: fmt.Sprintf("%d disk(s) report SMART FAILED", failed),
|
||||
Summary: fmt.Sprintf("%d/%d failing", failed, usable),
|
||||
Extras: extras,
|
||||
}
|
||||
}
|
||||
summary := fmt.Sprintf("%d disks, %d SMART-reporting, all PASSED", len(disks), usable)
|
||||
if usable == 0 {
|
||||
summary = "skipped (no smartctl data on any disk)"
|
||||
extras["skipped"] = true
|
||||
}
|
||||
return Outcome{Passed: true, Summary: summary, Extras: extras}
|
||||
}
|
||||
|
||||
func listBlockDisks() ([]string, error) {
|
||||
entries, err := os.ReadDir("/sys/class/block")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var out []string
|
||||
for _, e := range entries {
|
||||
name := e.Name()
|
||||
if !isRealBlockDisk(name) {
|
||||
continue
|
||||
}
|
||||
out = append(out, "/dev/"+name)
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// isRealBlockDisk reports whether a /sys/class/block entry should be
// treated as a whole disk. Names starting with "loop", "ram", "zram" or
// "dm-" are rejected, as is any entry that has a "partition" attribute
// under /sys/class/block/<name>.
func isRealBlockDisk(name string) bool {
	for _, virtual := range [...]string{"loop", "ram", "zram", "dm-"} {
		if strings.HasPrefix(name, virtual) {
			return false
		}
	}
	// A stat error means the "partition" attribute could not be found.
	_, err := os.Stat(filepath.Join("/sys/class/block", name, "partition"))
	return err != nil
}
|
||||
|
||||
// runSmartctl runs `smartctl -aj <dev>` and returns its stdout decoded as
// JSON.
//
// The exit status is deliberately not decisive: whenever smartctl printed
// anything, that output is parsed and returned even if the command also
// reported an error. An error is returned only when there was no output
// at all or the output is not valid JSON.
func runSmartctl(ctx context.Context, dev string) (map[string]any, error) {
	raw, runErr := exec.CommandContext(ctx, "smartctl", "-aj", dev).Output()
	switch {
	case len(raw) == 0 && runErr != nil:
		return nil, fmt.Errorf("smartctl: %w", runErr)
	case len(raw) == 0:
		return nil, fmt.Errorf("empty smartctl output")
	}

	var report map[string]any
	if err := json.Unmarshal(raw, &report); err != nil {
		return nil, fmt.Errorf("parse smartctl output: %w", err)
	}
	return report, nil
}
|
||||
|
||||
// smartPassed reads smart_status.passed from a decoded smartctl JSON
// report. The second result says whether a boolean verdict was present,
// so callers can tell "passed=false" apart from "no verdict at all".
func smartPassed(out map[string]any) (bool, bool) {
	if status, isObj := out["smart_status"].(map[string]any); isObj {
		verdict, isBool := status["passed"].(bool)
		return verdict, isBool
	}
	return false, false
}
|
||||
@@ -0,0 +1,67 @@
|
||||
// Package tests contains the per-stage executors the agent runs on the
|
||||
// host under test. Each stage implements Runner, is called with a
|
||||
// Context that carries the client + forwarder + run params, and returns
|
||||
// an Outcome that the caller POSTs to /result.
|
||||
package tests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Outcome is what a stage returns; it maps directly to the /result body.
//   - A skipped stage is reported as Passed=true with "skipped": true in
//     Extras and a Summary such as "skipped (no GPU present)", so the skip
//     stays visible to operators.
//   - Message is only set on failure.
//   - Extras is merged into the posted summary so each stage can add its
//     own fields.
type Outcome struct {
	// Passed is the stage verdict.
	Passed bool
	// Message describes the failure; empty on success.
	Message string
	// Summary is a short human-readable one-liner.
	Summary string
	// Extras holds stage-specific fields merged into the summary JSON.
	Extras map[string]any
}

// MarshalSummary serializes the summary JSON body POSTed to /result: a
// copy of Extras plus, when Summary is non-empty, a "summary" key holding
// it. A "summary" key already present in Extras is overwritten in that
// case. Extras itself is never modified.
func (o Outcome) MarshalSummary() (json.RawMessage, error) {
	merged := make(map[string]any, len(o.Extras)+1)
	for key, val := range o.Extras {
		merged[key] = val
	}
	if o.Summary != "" {
		merged["summary"] = o.Summary
	}
	return json.Marshal(merged)
}
|
||||
|
||||
// Deps bundles what stages need without pulling in the whole agent.
|
||||
// Logger methods print to stdout + forward to the orchestrator; Sensor
|
||||
// drops numeric samples; OverrideFlags carries operator-set bypasses.
|
||||
type Deps struct {
|
||||
Info func(string)
|
||||
Warn func(string)
|
||||
Error func(string)
|
||||
Sensor func(ctx context.Context, samples []Sample) error
|
||||
OverrideWipe bool
|
||||
ExpectedDisks []ExpectedDisk // serials + sizes from host.expected_spec
|
||||
StageTimeout time.Duration
|
||||
}
|
||||
|
||||
// Sample is one numeric measurement produced by a stage. It is declared
// in this package so stage code does not have to import the API types.
type Sample struct {
	// Kind groups related samples, e.g. "fio" or "psu_volt".
	Kind string
	// Key identifies the measurement within its kind.
	Key string
	// Value is the measured number.
	Value float64
	// Unit is the unit Value is expressed in, e.g. "V" or "iops".
	Unit string
}
|
||||
|
||||
// ExpectedDisk describes one disk the host is expected to contain. Storage
// uses the list of these as its device allowlist, keyed on serial.
type ExpectedDisk struct {
	// Serial is the disk's serial number.
	Serial string
	// SizeGB is the expected capacity in gigabytes.
	SizeGB int
}
|
||||
@@ -0,0 +1,298 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Storage is the destructive stage: badblocks (write-mode sample) + fio
|
||||
// random IO, persisting IOPS + latency as measurements. Pre-gates:
|
||||
//
|
||||
// 1. Device allowlist: only act on /dev/<X> where the kernel-reported
|
||||
// serial matches one of Deps.ExpectedDisks. This is the operator's
|
||||
// contract for what can be written to. USB sticks and unexpected
|
||||
// drives are excluded.
|
||||
// 2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
|
||||
// signatures, partition tables, or LVM metadata → fail with
|
||||
// UnexpectedData unless Deps.OverrideWipe is set.
|
||||
//
|
||||
// Only after those pass does the stage run `badblocks -b 4096 -c 64 -w`
|
||||
// and `fio` in write mode. This matches the plan's "destructive disk
|
||||
// tests are always-on, gated by layered safety."
|
||||
func Storage(ctx context.Context, d Deps) Outcome {
|
||||
if len(d.ExpectedDisks) == 0 {
|
||||
d.Info("Storage: no expected disks in spec — skipping stage")
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: "skipped (no expected disks)",
|
||||
Extras: map[string]any{"skipped": true, "reason": "no_expected_disks"},
|
||||
}
|
||||
}
|
||||
|
||||
targets := resolveTargets(d.ExpectedDisks)
|
||||
if len(targets) == 0 {
|
||||
d.Error("Storage: none of the expected disks are present on this host")
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: "device allowlist matched zero disks",
|
||||
Summary: "no allowed disks present",
|
||||
Extras: map[string]any{"expected": d.ExpectedDisks},
|
||||
}
|
||||
}
|
||||
|
||||
// Wipe probe on every target. A single dirty disk halts the stage
|
||||
// unless the operator has set OverrideWipe via the UI.
|
||||
probes := map[string]wipeProbeResult{}
|
||||
dirty := []string{}
|
||||
for _, t := range targets {
|
||||
probe := probeWipe(ctx, t.Device)
|
||||
probes[t.Device] = probe
|
||||
if probe.HasData {
|
||||
dirty = append(dirty, t.Device)
|
||||
}
|
||||
}
|
||||
if len(dirty) > 0 && !d.OverrideWipe {
|
||||
d.Error("Storage: wipe probe found existing data on: " + strings.Join(dirty, ", "))
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)",
|
||||
Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
|
||||
Extras: map[string]any{
|
||||
"wipe_probe": probes,
|
||||
"override_hint": "click 'Override wipe & retry' in the held tile",
|
||||
"dirty_devices": dirty,
|
||||
},
|
||||
}
|
||||
}
|
||||
if d.OverrideWipe && len(dirty) > 0 {
|
||||
d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
|
||||
}
|
||||
|
||||
// Per target: short badblocks write sample + fio random-read/write.
|
||||
var samples []Sample
|
||||
perDisk := map[string]any{}
|
||||
for _, t := range targets {
|
||||
d.Info("Storage: running badblocks write sample on " + t.Device)
|
||||
bb := runBadblocks(ctx, t.Device)
|
||||
d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
|
||||
fr := runFio(ctx, t.Device)
|
||||
perDisk[t.Device] = map[string]any{
|
||||
"badblocks": bb,
|
||||
"fio": fr,
|
||||
}
|
||||
samples = append(samples,
|
||||
Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
|
||||
Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
|
||||
)
|
||||
if !bb.OK {
|
||||
return Outcome{
|
||||
Passed: false,
|
||||
Message: "badblocks found errors on " + t.Device,
|
||||
Summary: "badblocks failed on " + t.Device,
|
||||
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes},
|
||||
}
|
||||
}
|
||||
}
|
||||
if d.Sensor != nil {
|
||||
_ = d.Sensor(ctx, samples)
|
||||
}
|
||||
|
||||
d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets)))
|
||||
return Outcome{
|
||||
Passed: true,
|
||||
Summary: fmt.Sprintf("%d disks passed", len(targets)),
|
||||
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes},
|
||||
}
|
||||
}
|
||||
|
||||
// diskTarget pairs an expected disk with the block device it resolved to on
// this host. Values are produced by resolveTargets.
type diskTarget struct {
	// Serial is the serial exactly as written in the expected-disk entry.
	Serial string
	// Device is the device entry returned by listBlockDisks for that serial
	// (presumably a /dev/<X> path — confirm against listBlockDisks).
	Device string
}
|
||||
|
||||
// resolveTargets maps expected-disk serials to /dev/<X> paths by reading
|
||||
// /sys/block. Uses the same mechanism as probes.inventory to avoid drift.
|
||||
func resolveTargets(expected []ExpectedDisk) []diskTarget {
|
||||
disks, err := listBlockDisks()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
// Build serial → device map from /sys.
|
||||
serialOf := map[string]string{}
|
||||
for _, dev := range disks {
|
||||
name := strings.TrimPrefix(dev, "/dev/")
|
||||
s := diskSerialFromSys(name)
|
||||
if s != "" {
|
||||
serialOf[strings.ToLower(s)] = dev
|
||||
}
|
||||
}
|
||||
var out []diskTarget
|
||||
for _, e := range expected {
|
||||
if e.Serial == "" {
|
||||
continue
|
||||
}
|
||||
if dev, ok := serialOf[strings.ToLower(e.Serial)]; ok {
|
||||
out = append(out, diskTarget{Serial: e.Serial, Device: dev})
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// diskSerialFromSys is a smaller copy of probes.diskSerial; imported
|
||||
// from internal/probes would cause a cycle so we duplicate the short
|
||||
// lookup. If it drifts from the inventory probe, Storage fails because
|
||||
// the serial doesn't match — which is the correct behavior.
|
||||
func diskSerialFromSys(name string) string {
|
||||
for _, rel := range []string{
|
||||
"/sys/block/" + name + "/device/serial",
|
||||
"/sys/block/" + name + "/serial",
|
||||
} {
|
||||
b, err := readFileBytes(rel)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
s := strings.TrimSpace(string(b))
|
||||
if s != "" {
|
||||
return s
|
||||
}
|
||||
}
|
||||
// Fall back to udevadm — ID_SERIAL_SHORT is more reliable on SCSI.
|
||||
out, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok {
|
||||
return strings.TrimSpace(v)
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func readFileBytes(p string) ([]byte, error) {
|
||||
return readFile(p)
|
||||
}
|
||||
|
||||
// ---------- wipe probe ----------
|
||||
|
||||
// wipeProbeResult records what the wipe probe learned about one device.
type wipeProbeResult struct {
	// Device is the device path that was probed.
	Device string `json:"device"`
	// HasData is true when a signature was reported, or when a probe tool
	// could not be run and the device therefore cannot be shown to be clean.
	HasData bool `json:"has_data"`
	// Findings holds one human-readable line per signal, prefixed with the
	// tool that produced it ("blkid: " or "wipefs: ").
	Findings []string `json:"findings,omitempty"`
}

// probeWipe runs blkid + wipefs -n against device. Any non-empty output from
// either is a "has data" signal. This is deliberately conservative: we'd
// rather halt on a bare ext4 signature than hand badblocks a disk with real
// bytes on it.
//
// For the same reason the probe fails closed: if a tool cannot be executed
// (missing binary, cancelled context, unreadable device) the device is
// reported as having data, with the failure recorded in Findings, instead of
// being silently treated as clean. The one expected failure is blkid's exit
// status 2, which is how blkid reports that it found nothing.
func probeWipe(ctx context.Context, device string) wipeProbeResult {
	out := wipeProbeResult{Device: device}

	b, err := exec.CommandContext(ctx, "blkid", "-o", "full", device).Output()
	if err == nil {
		if s := strings.TrimSpace(string(b)); s != "" {
			out.Findings = append(out.Findings, "blkid: "+s)
			out.HasData = true
		}
	} else if exitErr, ok := err.(*exec.ExitError); !ok || exitErr.ExitCode() != 2 {
		// os/exec hands back *exec.ExitError unwrapped, so a plain type
		// assertion is enough. Anything other than "exit status 2" means
		// the probe itself did not work.
		out.Findings = append(out.Findings, "blkid: probe failed: "+err.Error())
		out.HasData = true
	}

	b, err = exec.CommandContext(ctx, "wipefs", "--no-act", device).Output()
	if err != nil {
		out.Findings = append(out.Findings, "wipefs: probe failed: "+err.Error())
		out.HasData = true
		return out
	}
	// wipefs prints a header line even on a clean disk; keep only
	// lines with actual signature data.
	for _, line := range strings.Split(strings.TrimSpace(string(b)), "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "DEVICE") || strings.HasPrefix(line, "offset") {
			continue
		}
		out.Findings = append(out.Findings, "wipefs: "+line)
		out.HasData = true
	}
	return out
}
|
||||
|
||||
// ---------- badblocks ----------
|
||||
|
||||
// badblocksResult is the outcome of one badblocks sample run on a device,
// as filled in by runBadblocks.
type badblocksResult struct {
	// OK is true only when badblocks exited successfully and printed nothing.
	OK bool `json:"ok"`
	// Elapsed is the wall-clock duration of the run, rounded to the second
	// and formatted with time.Duration.String.
	Elapsed string `json:"elapsed"`
	// Error describes why the run did not pass: the command's error text,
	// or "bad blocks found" when badblocks produced output.
	Error string `json:"error,omitempty"`
	// OutputTail is tailLines(output, 10) of the combined stdout/stderr
	// (presumably the last ten lines — confirm against tailLines).
	OutputTail string `json:"output_tail,omitempty"`
}
|
||||
|
||||
func runBadblocks(ctx context.Context, device string) badblocksResult {
|
||||
// -c 64 blocks per check, -w destructive write, -b 4096 block size,
|
||||
// -t pattern. We only sample 256MiB (65536 × 4k) so the stage stays
|
||||
// bounded. A real burn-in would run the whole disk; that belongs in
|
||||
// a separate "deep" stage.
|
||||
args := []string{"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536"}
|
||||
start := time.Now()
|
||||
runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(runCtx, "badblocks", args...)
|
||||
out, err := cmd.CombinedOutput()
|
||||
r := badblocksResult{Elapsed: time.Since(start).Round(time.Second).String(), OutputTail: tailLines(string(out), 10)}
|
||||
if err != nil {
|
||||
r.Error = err.Error()
|
||||
return r
|
||||
}
|
||||
// badblocks prints each bad block to stdout. Empty output = clean.
|
||||
if strings.TrimSpace(string(out)) == "" {
|
||||
r.OK = true
|
||||
} else {
|
||||
r.Error = "bad blocks found"
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// ---------- fio ----------
|
||||
|
||||
// fioResult carries the headline numbers from one fio run, or the reason the
// run produced none.
type fioResult struct {
	// ReadIOPS and WriteIOPS are the "iops" values of the read and write
	// sections of fio's first reported job.
	ReadIOPS  float64 `json:"read_iops"`
	WriteIOPS float64 `json:"write_iops"`
	// ReadBWKBps and WriteBWKBps are the matching "bw" values (presumably
	// KiB/s as reported by fio — confirm against the fio version in the image).
	ReadBWKBps  float64 `json:"read_bw_kbps"`
	WriteBWKBps float64 `json:"write_bw_kbps"`
	// Error is non-empty when fio could not be run or its output was unusable.
	Error string `json:"error,omitempty"`
}

// runFio kicks off a tiny random-rw job: 2 jobs × 64MB × 4k blocks.
// This is a health bar, not a benchmark — we want to know the disk
// services IO, not how fast it is at p99.
//
// The run is capped at five minutes on top of whatever deadline ctx already
// carries. On any failure the returned result has only Error set: the
// command's error text (prefixed with the context error when the run was cut
// short by a timeout or cancellation), or a "parse fio json: ..." message
// when the output is not valid JSON or contains no jobs.
func runFio(ctx context.Context, device string) fioResult {
	runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
	defer cancel()
	args := []string{
		"--name=health", "--filename=" + device, "--rw=randrw",
		"--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0",
		"--group_reporting", "--output-format=json", "--direct=1",
	}
	out, err := exec.CommandContext(runCtx, "fio", args...).Output()
	if err != nil {
		msg := err.Error()
		// A process killed by the context only reports "signal: killed";
		// prepend the context error so a timeout is recognisable.
		if ctxErr := runCtx.Err(); ctxErr != nil {
			msg = ctxErr.Error() + ": " + msg
		}
		return fioResult{Error: msg}
	}

	var top struct {
		Jobs []struct {
			Read struct {
				IOPS float64 `json:"iops"`
				BW   float64 `json:"bw"`
			} `json:"read"`
			Write struct {
				IOPS float64 `json:"iops"`
				BW   float64 `json:"bw"`
			} `json:"write"`
		} `json:"jobs"`
	}
	if err := json.Unmarshal(out, &top); err != nil {
		return fioResult{Error: "parse fio json: " + err.Error()}
	}
	// Valid JSON without any job entry is still unusable; say so explicitly
	// rather than formatting a nil error into the message.
	if len(top.Jobs) == 0 {
		return fioResult{Error: "parse fio json: no jobs in output"}
	}

	// Only the first job entry is read.
	j := top.Jobs[0]
	return fioResult{
		ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS,
		ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW,
	}
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
)
|
||||
|
||||
// readFile returns the full contents of the file at p. Stages use it to peek
// at /sys entries without importing the agent's probes package, which would
// introduce an import cycle.
func readFile(p string) ([]byte, error) {
	data, err := os.ReadFile(p)
	return data, err
}
|
||||
|
||||
// formatCount renders "<n> <label>", appending a plain "s" to label for every
// count except exactly 1: (0, "disk") → "0 disks", (1, "disk") → "1 disk",
// (5, "disk") → "5 disks". Handy for keeping log lines tidy.
func formatCount(n int, label string) string {
	suffix := "s"
	if n == 1 {
		suffix = ""
	}
	return fmt.Sprintf("%d %s%s", n, label, suffix)
}
|
||||
Reference in New Issue
Block a user