deep profile + threshold gating + firmware stage + Burn super-stage
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled

Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
+17 -1
View File
@@ -42,4 +42,20 @@ jobs:
GOOS=linux GOARCH=amd64 go build ./... GOOS=linux GOARCH=amd64 go build ./...
- name: Test - name: Test
run: go test -race -count=1 ./... run: go test -race -count=1 -coverprofile=coverage.out ./...
- name: Coverage summary
run: |
go tool cover -func=coverage.out | tee coverage.txt
go tool cover -html=coverage.out -o coverage.html
- name: Upload coverage artifact
uses: actions/upload-artifact@v4
if: always()
with:
name: coverage
path: |
coverage.out
coverage.txt
coverage.html
retention-days: 14
+50
View File
@@ -124,6 +124,56 @@ type ClaimResponse struct {
// at the right stage instead of silently replaying Inventory and // at the right stage instead of silently replaying Inventory and
// letting the orchestrator advance past the crashed stage. // letting the orchestrator advance past the crashed stage.
CurrentState string `json:"current_state"` CurrentState string `json:"current_state"`
// StageConfig carries per-profile stage knobs (Phase 2): stage-level
// timeouts and probe-level durations/modes. Empty when the agent
// talks to a pre-Phase-2 orchestrator; the agent applies compile-
// time defaults in that case.
StageConfig ClaimStageConfig `json:"stage_config"`
}
// ClaimStageConfig mirrors config.StageConfig server-side — duplicated so
// the agent doesn't need to import internal/config. Durations arrive as
// strings ("2m", "2h") and are parsed by the tests package at the point
// of use. An empty field means "use the agent-side default" so a missing
// knob doesn't silently turn CPUStress / Storage into a no-op.
type ClaimStageConfig struct {
// Profile names the profile this run was started with.
// NOTE(review): presumably one of quick/deep/soak — confirm against the server.
Profile string `json:"profile"`
// StageTimeouts holds duration strings (e.g. "2m"); omitted from the JSON
// when empty. NOTE(review): presumably keyed by stage name — confirm.
StageTimeouts map[string]string `json:"stage_timeouts,omitempty"`
// Per-stage knob groups. Each is always serialized (no omitempty); their
// individual fields may still be empty.
CPUStress ClaimCPUStressKnobs `json:"cpustress"`
Storage ClaimStorageKnobs `json:"storage"`
Network ClaimNetworkKnobs `json:"network"`
Burn ClaimBurnKnobs `json:"burn"`
}
// ClaimCPUStressKnobs carries the CPUStress stage knobs of a claim. Every
// field is a string and is omitted from the JSON when empty.
// NOTE(review): the values look like duration strings ("2m") — confirm
// against the server-side config before relying on that.
type ClaimCPUStressKnobs struct {
CPUPass string `json:"cpu_pass,omitempty"`
MemPass string `json:"mem_pass,omitempty"`
EDACPoll string `json:"edac_poll,omitempty"`
}
// ClaimStorageKnobs carries the Storage stage knobs of a claim. Every field
// is a string and is omitted from the JSON when empty.
// NOTE(review): the Fio* names suggest they map onto fio's size / runtime /
// block-size / rw options — confirm at the point of use.
type ClaimStorageKnobs struct {
Mode string `json:"mode,omitempty"`
FioSize string `json:"fio_size,omitempty"`
FioTime string `json:"fio_time,omitempty"`
FioBS string `json:"fio_bs,omitempty"`
FioRW string `json:"fio_rw,omitempty"`
Verify string `json:"verify,omitempty"`
}
// ClaimNetworkKnobs carries the Network stage knobs of a claim.
type ClaimNetworkKnobs struct {
// Duration is sent as a string and omitted from the JSON when empty.
Duration string `json:"duration,omitempty"`
}
// ClaimBurnKnobs mirrors config.BurnKnobs. Duration/CPUWorkers arrive as
// strings so the agent can treat empty as "use compile-time default".
// MemPct is a percentage (0-100); IperfParallel is the parallel stream
// count fed to iperf3 -P. FioOnSpare gates whether fio runs inside Burn.
type ClaimBurnKnobs struct {
Duration string `json:"duration,omitempty"`
CPUWorkers string `json:"cpu_workers,omitempty"`
MemPct int `json:"mem_pct,omitempty"`
FioOnSpare bool `json:"fio_on_spare,omitempty"`
IperfParallel int `json:"iperf_parallel,omitempty"`
} }
type ClaimExpectedDiskSpec struct { type ClaimExpectedDiskSpec struct {
+70
View File
@@ -0,0 +1,70 @@
package probes
import (
"os"
"path/filepath"
"strconv"
"strings"
)
// EDACSample is one counter reading from /sys/devices/system/edac/mc/.
// Kind is "edac_ce" (correctable ECC errors) or "edac_ue"
// (uncorrectable — always a critical signal). Key identifies the memory
// controller (e.g. "mc0"). Value is the cumulative count since boot;
// the threshold evaluator flags it the moment it exceeds 0.
type EDACSample struct {
// Kind is "edac_ce" or "edac_ue", depending on which counter file was read.
Kind string
// Key is the name of the mc* directory the counter was read from.
Key string
// Value is the integer counter from sysfs, converted to float64.
Value float64
// Unit is set to "count" by the EDAC probe.
Unit string
}
// EDAC walks /sys/devices/system/edac/mc and emits up to two samples for
// every entry whose name starts with "mc": an "edac_ce" sample read from
// ce_count and an "edac_ue" sample read from ue_count. Both carry Unit
// "count" and use the entry name as Key.
//
// The probe is best-effort. An unreadable root directory yields nil, and a
// counter file that is missing or unparsable is skipped without a
// diagnostic. An empty result therefore means "no data" and must not be
// read as "no errors". Values are reported raw, exactly as sysfs exposes
// them; a failed read is simply picked up again on the caller's next poll.
func EDAC() []EDACSample {
	const root = "/sys/devices/system/edac/mc"

	controllers, err := os.ReadDir(root)
	if err != nil {
		return nil
	}

	// Counter file name → sample kind, in the order samples are emitted.
	counters := [...]struct{ file, kind string }{
		{"ce_count", "edac_ce"},
		{"ue_count", "edac_ue"},
	}

	var samples []EDACSample
	for _, ctrl := range controllers {
		mc := ctrl.Name()
		if !strings.HasPrefix(mc, "mc") {
			continue
		}
		for _, c := range counters {
			value, ok := readCount(filepath.Join(root, mc, c.file))
			if !ok {
				continue
			}
			samples = append(samples, EDACSample{Kind: c.kind, Key: mc, Value: value, Unit: "count"})
		}
	}
	return samples
}
// readCount loads the file at path, trims surrounding whitespace and parses
// the remainder as a base-10 signed 64-bit integer. It returns the value as
// a float64 together with true. Any read or parse failure yields (0, false)
// so the caller can drop the sample silently.
func readCount(path string) (float64, bool) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return 0, false
	}
	if count, err := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64); err == nil {
		return float64(count), true
	}
	return 0, false
}
+496
View File
@@ -0,0 +1,496 @@
package probes
import (
"bufio"
"context"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"time"
)
// FirmwareSnapshot is the on-wire shape the agent POSTs alongside the
// Firmware stage result. Mirrors internal/store.FirmwareSnapshot without
// the import — the /result handler converts to the store type and
// persists. One run produces many snapshots (one per BIOS / BMC / NIC
// port / HBA / microcode / NVMe); identifier distinguishes siblings
// (e.g. "eth0" / "eth1"), version is the canonical string to diff.
type FirmwareSnapshot struct {
// Component is the firmware class this snapshot describes.
Component string `json:"component"` // bios|bmc|nic|hba|microcode|nvme_fw
// Identifier tells siblings of the same component apart (the probes in
// this file use an interface name, a controller name, a PCI address, or a
// fixed label such as "system" / "bmc0" / "cpu").
Identifier string `json:"identifier"`
// Version is the string that gets compared against the expected firmware.
Version string `json:"version"`
// Vendor is optional and omitted from the JSON when empty.
Vendor string `json:"vendor,omitempty"`
// Raw keeps the parsed key/value pairs for diagnostics; omitted when empty.
Raw map[string]string `json:"raw,omitempty"`
}
// Firmware runs the firmware sub-probes one after another and aggregates
// their snapshots: BIOS, BMC, NIC, NVMe, HBA, then CPU microcode.
//
// Only the BIOS and BMC probes report warnings. When one of them yields no
// snapshot but a non-empty message, that message is appended to the second
// return value so it can be surfaced to operators. The remaining probes are
// silent and simply contribute zero or more snapshots.
func Firmware(ctx context.Context) ([]FirmwareSnapshot, []string) {
	var (
		snaps    []FirmwareSnapshot
		warnings []string
	)

	// Single-snapshot probes that can explain why they produced nothing.
	single := []func(context.Context) (*FirmwareSnapshot, string){probeBIOS, probeBMC}
	for _, probe := range single {
		snap, warn := probe(ctx)
		switch {
		case snap != nil:
			snaps = append(snaps, *snap)
		case warn != "":
			warnings = append(warnings, warn)
		}
	}

	snaps = append(snaps, probeNICFirmware(ctx)...)
	snaps = append(snaps, probeNVMeFirmware(ctx)...)
	snaps = append(snaps, probeHBAFirmware(ctx)...)

	if mc := probeMicrocode(); mc != nil {
		snaps = append(snaps, *mc)
	}
	return snaps, warnings
}
// runCmd runs name with args under a 5-second deadline derived from ctx and
// returns the command's combined stdout+stderr as a string. The error from
// the run (non-zero exit, missing binary, deadline exceeded, ...) is passed
// through unchanged, and whatever output was captured is returned alongside
// it. The short deadline keeps a wedged tool from stalling the caller.
func runCmd(ctx context.Context, name string, args ...string) (string, error) {
	const probeTimeout = 5 * time.Second

	bounded, release := context.WithTimeout(ctx, probeTimeout)
	defer release()

	captured, err := exec.CommandContext(bounded, name, args...).CombinedOutput()
	return string(captured), err
}
// ----- BIOS --------------------------------------------------------------
// probeBIOS runs `dmidecode -t bios` and converts its output into a BIOS
// snapshot. It never fails hard: when dmidecode is not on PATH, exits with
// an error, or prints nothing usable, the snapshot is nil and the second
// return value carries a "bios: ..." warning. On success the warning is
// empty.
func probeBIOS(ctx context.Context) (*FirmwareSnapshot, string) {
	_, lookErr := exec.LookPath("dmidecode")
	if lookErr != nil {
		return nil, "bios: dmidecode not installed"
	}

	raw, runErr := runCmd(ctx, "dmidecode", "-t", "bios")
	if runErr != nil {
		return nil, fmt.Sprintf("bios: dmidecode failed: %v", trimErr(runErr, raw))
	}

	if parsed := parseDmidecodeBIOS(strings.NewReader(raw)); parsed != nil {
		return parsed, ""
	}
	return nil, "bios: dmidecode produced no usable output"
}
// parseDmidecodeBIOS reads `dmidecode -t bios` text and builds the BIOS
// snapshot from its "BIOS Information" block. Version comes from the
// "Version" key, falling back to "Firmware Revision". Vendor comes from
// "Vendor", and the full key/value map is kept in Raw. It returns nil when
// the block is absent or carries no version. Taking an io.Reader lets unit
// tests feed fixtures.
func parseDmidecodeBIOS(r io.Reader) *FirmwareSnapshot {
	fields := parseDmidecodeSection(r, "BIOS Information")
	if fields == nil {
		return nil
	}

	version := firstNonEmpty(fields["Version"], fields["Firmware Revision"])
	if version == "" {
		return nil
	}

	return &FirmwareSnapshot{
		Component:  "bios",
		Identifier: "system",
		Version:    version,
		Vendor:     fields["Vendor"],
		Raw:        fields,
	}
}
// parseDmidecodeSection extracts the "Key: Value" pairs of the first
// dmidecode block whose title line equals title. Blocks look like:
//
//	Handle 0x0000, ...
//	BIOS Information
//		Vendor: American Megatrends
//		Version: 3.0
//
// Collection starts on the title line and stops at the next line beginning
// with "Handle " (or at end of input). Inside the block:
//   - blank lines and lines without a colon (e.g. the bullets under
//     "Characteristics:") are ignored;
//   - keys with an empty value are ignored;
//   - everything else is stored with key and value trimmed.
//
// The result is nil when no title line was found, and a possibly empty,
// non-nil map otherwise. Scanner errors are not reported; whatever was
// collected up to that point is returned.
func parseDmidecodeSection(r io.Reader, title string) map[string]string {
	scanner := bufio.NewScanner(r)
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	// fields is non-nil exactly while we are inside the wanted block.
	var fields map[string]string
	found := false

	for scanner.Scan() {
		raw := scanner.Text()
		switch {
		case strings.HasPrefix(raw, "Handle "):
			// A new handle closes the current block.
			if found && fields != nil {
				return fields
			}
			fields = nil
		case fields == nil:
			if strings.TrimSpace(raw) == title {
				found = true
				fields = map[string]string{}
			}
		default:
			key, value, hasColon := strings.Cut(strings.TrimSpace(raw), ":")
			if !hasColon {
				continue
			}
			if value = strings.TrimSpace(value); value != "" {
				fields[strings.TrimSpace(key)] = value
			}
		}
	}

	if !found {
		return nil
	}
	return fields
}
// ----- BMC / IPMI --------------------------------------------------------
// probeBMC runs `ipmitool mc info` and turns its output into a BMC
// snapshot. Hosts without a BMC are expected. A missing ipmitool binary, a
// failing command, or unparseable output all yield a nil snapshot plus a
// "bmc: ..." warning rather than an error. On success the warning is empty.
func probeBMC(ctx context.Context) (*FirmwareSnapshot, string) {
	_, lookErr := exec.LookPath("ipmitool")
	if lookErr != nil {
		return nil, "bmc: ipmitool not installed"
	}

	raw, runErr := runCmd(ctx, "ipmitool", "mc", "info")
	if runErr != nil {
		return nil, fmt.Sprintf("bmc: ipmitool mc info failed: %v", trimErr(runErr, raw))
	}

	if parsed := parseIpmitoolMCInfo(strings.NewReader(raw)); parsed != nil {
		return parsed, ""
	}
	return nil, "bmc: ipmitool output not parseable"
}
// parseIpmitoolMCInfo reads the "key : value" lines printed by
// `ipmitool mc info` into a map (key and value trimmed, split at the first
// colon) and builds the BMC snapshot from it. Version is "Firmware
// Revision", or "Aux Firmware Rev Info" when the former is empty. Vendor is
// "Manufacturer Name". It returns nil when neither version key has a value.
func parseIpmitoolMCInfo(r io.Reader) *FirmwareSnapshot {
	fields := make(map[string]string)
	for scanner := bufio.NewScanner(r); scanner.Scan(); {
		key, value, hasColon := strings.Cut(strings.TrimSpace(scanner.Text()), ":")
		if !hasColon {
			continue
		}
		fields[strings.TrimSpace(key)] = strings.TrimSpace(value)
	}

	// Values are already trimmed, so a plain emptiness check is enough to
	// pick the first usable version key.
	version := fields["Firmware Revision"]
	if version == "" {
		version = fields["Aux Firmware Rev Info"]
	}
	if version == "" {
		return nil
	}

	return &FirmwareSnapshot{
		Component:  "bmc",
		Identifier: "bmc0",
		Version:    version,
		Vendor:     fields["Manufacturer Name"],
		Raw:        fields,
	}
}
// ----- NIC firmware ------------------------------------------------------
// probeNICFirmware lists /sys/class/net and runs `ethtool -i <iface>` for
// every entry accepted by isRealNIC. It emits one snapshot per interface,
// so a mismatch on one port does not hide its siblings. The probe is
// silent: a missing ethtool binary or an unreadable /sys/class/net returns
// nil, and an interface whose ethtool call fails or parses to nothing is
// skipped.
func probeNICFirmware(ctx context.Context) []FirmwareSnapshot {
	if _, err := exec.LookPath("ethtool"); err != nil {
		return nil
	}
	entries, err := os.ReadDir("/sys/class/net")
	if err != nil {
		return nil
	}

	var snaps []FirmwareSnapshot
	for i := range entries {
		iface := entries[i].Name()
		if !isRealNIC(iface) {
			continue
		}
		text, runErr := runCmd(ctx, "ethtool", "-i", iface)
		if runErr != nil {
			continue
		}
		if parsed := parseEthtoolI(strings.NewReader(text), iface); parsed != nil {
			snaps = append(snaps, *parsed)
		}
	}
	return snaps
}
// parseEthtoolI reads the "key: value" lines of `ethtool -i` output (split
// at the first colon, both sides trimmed) and builds the NIC snapshot for
// iface. Version is the "firmware-version" value, and Vendor is the
// "driver" value. It returns nil only when both are empty; a driver without
// a firmware version still yields a snapshot with an empty Version.
func parseEthtoolI(r io.Reader, iface string) *FirmwareSnapshot {
	fields := make(map[string]string)
	for scanner := bufio.NewScanner(r); scanner.Scan(); {
		key, value, hasColon := strings.Cut(scanner.Text(), ":")
		if !hasColon {
			continue
		}
		fields[strings.TrimSpace(key)] = strings.TrimSpace(value)
	}

	firmware, driver := fields["firmware-version"], fields["driver"]
	if firmware == "" && driver == "" {
		return nil
	}
	return &FirmwareSnapshot{
		Component:  "nic",
		Identifier: iface,
		Version:    firmware,
		Vendor:     driver,
		Raw:        fields,
	}
}
// isRealNIC reports whether name looks like a physical network interface.
// It rejects the empty name, "lo", and any name starting with one of the
// known virtual prefixes (docker, br-, veth, virbr, tun, tap, bond). It
// then requires /sys/class/net/<name>/device to exist, since devices
// without that link have no hardware behind them.
func isRealNIC(name string) bool {
	switch name {
	case "", "lo":
		return false
	}

	virtualPrefixes := [...]string{"docker", "br-", "veth", "virbr", "tun", "tap", "bond"}
	for i := range virtualPrefixes {
		if strings.HasPrefix(name, virtualPrefixes[i]) {
			return false
		}
	}

	_, statErr := os.Stat(filepath.Join("/sys/class/net", name, "device"))
	return statErr == nil
}
// ----- NVMe --------------------------------------------------------------
// probeNVMeFirmware emits one "nvme_fw" snapshot per entry under
// /sys/class/nvme, using the entry name as Identifier. The firmware
// revision and model are read from the controller's sysfs firmware_rev and
// model files. When firmware_rev is empty and the nvme CLI is installed, it
// falls back to `nvme id-ctrl /dev/<ctrl>` and reads "fr" (and "mn" when
// the model is still unknown) from that output. Controllers for which no
// revision can be determined are skipped. An unreadable /sys/class/nvme
// returns nil.
func probeNVMeFirmware(ctx context.Context) []FirmwareSnapshot {
	const sysRoot = "/sys/class/nvme"

	controllers, err := os.ReadDir(sysRoot)
	if err != nil {
		return nil
	}

	var snaps []FirmwareSnapshot
	for _, c := range controllers {
		name := c.Name()
		attr := func(file string) string {
			return strings.TrimSpace(readFile(filepath.Join(sysRoot, name, file)))
		}

		firmware := attr("firmware_rev")
		model := attr("model")

		if firmware == "" {
			if _, lookErr := exec.LookPath("nvme"); lookErr == nil {
				// The command error is ignored on purpose: whatever was
				// printed is parsed, and an empty result skips the drive.
				idCtrl, _ := runCmd(ctx, "nvme", "id-ctrl", "/dev/"+name)
				firmware = parseNVMeIDCtrl(strings.NewReader(idCtrl), "fr")
				if model == "" {
					model = parseNVMeIDCtrl(strings.NewReader(idCtrl), "mn")
				}
			}
		}
		if firmware == "" {
			continue
		}

		snaps = append(snaps, FirmwareSnapshot{
			Component:  "nvme_fw",
			Identifier: name,
			Version:    firmware,
			Vendor:     model,
			Raw:        map[string]string{"model": model, "firmware_rev": firmware},
		})
	}
	return snaps
}
// parseNVMeIDCtrl returns the value of one field from `nvme id-ctrl`
// output, e.g. "fr        : FW1234" or "mn : Samsung SSD 980 PRO".
//
// Each line is split at its first colon. The text before it, with
// surrounding whitespace removed, must equal key exactly. That makes the
// match independent of how the key is padded (spaces, tabs, or no padding
// at all) and prevents a longer label that merely starts with the key word
// from matching. The value is everything after the first colon, trimmed;
// it may itself contain spaces or colons. The first matching line wins.
// An empty string means the key was not found.
func parseNVMeIDCtrl(r io.Reader, key string) string {
	sc := bufio.NewScanner(r)
	for sc.Scan() {
		name, value, ok := strings.Cut(sc.Text(), ":")
		if !ok || strings.TrimSpace(name) != key {
			continue
		}
		return strings.TrimSpace(value)
	}
	return ""
}
// ----- HBA ---------------------------------------------------------------
// lspciClassHBA matches, case-insensitively, lspci device lines that
// mention a SAS or RAID controller class ("Serial Attached SCSI", "SAS
// controller", "RAID bus controller"). Compiled once at package scope.
var lspciClassHBA = regexp.MustCompile(`(?i)(serial attached scsi|sas controller|raid bus controller)`)
// probeHBAFirmware discovers SAS/RAID HBAs by running `lspci -Dvvnn` and
// handing the output to parseLspciHBA. Relying on lspci alone avoids
// vendor-specific CLIs (storcli, mpt-status) that are not always
// installed. The captured strings are compared later by the spec diff. The
// probe is silent: a missing lspci binary or a failing command returns nil.
func probeHBAFirmware(ctx context.Context) []FirmwareSnapshot {
	_, lookErr := exec.LookPath("lspci")
	if lookErr != nil {
		return nil
	}
	listing, runErr := runCmd(ctx, "lspci", "-Dvvnn")
	if runErr != nil {
		return nil
	}
	return parseLspciHBA(strings.NewReader(listing))
}
// lspciRevRe captures the hexadecimal revision from the "(rev NN)" suffix
// of an lspci device line. It is compiled once instead of on every
// parseLspciHBA call.
var lspciRevRe = regexp.MustCompile(`\(rev\s+([0-9a-fA-F]+)\)`)

// parseLspciHBA walks `lspci -Dvvnn` stanzas and returns one snapshot per
// SAS/RAID controller, as selected by lspciClassHBA. Identifier is the PCI
// address (the text before the first space of the device line), and Vendor
// is the rest of that line.
//
// Version is "rev NN" taken from the device line. lspci leaves the
// "(rev NN)" suffix off when the revision is zero, so in that case Version
// falls back to the stanza's "Kernel modules:" value. Without the fallback
// such a controller would silently disappear from the result. A device
// with neither a revision nor a kernel-modules line is still skipped.
//
// Raw keeps the device line plus, when present, the trimmed
// "kernel_modules" and "kernel_driver" values.
func parseLspciHBA(r io.Reader) []FirmwareSnapshot {
	sc := bufio.NewScanner(r)
	sc.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	var out []FirmwareSnapshot
	var cur *FirmwareSnapshot

	// flush finalises the stanza being collected, if any.
	flush := func() {
		if cur == nil {
			return
		}
		if cur.Version == "" {
			// Documented fallback for device lines without "(rev NN)".
			cur.Version = cur.Raw["kernel_modules"]
		}
		if cur.Version != "" {
			out = append(out, *cur)
		}
		cur = nil
	}

	for sc.Scan() {
		line := sc.Text()

		// Device lines are unindented and contain at least one space;
		// blank separator lines and tab-indented detail lines are not.
		if !strings.HasPrefix(line, "\t") && strings.Contains(line, " ") {
			flush()
			if lspciClassHBA.MatchString(line) {
				addr, rest, _ := strings.Cut(line, " ")
				cur = &FirmwareSnapshot{
					Component:  "hba",
					Identifier: addr,
					Vendor:     strings.TrimSpace(rest),
					Raw:        map[string]string{"device_line": line},
				}
				if m := lspciRevRe.FindStringSubmatch(line); len(m) == 2 {
					cur.Version = "rev " + m[1]
				}
			}
			continue
		}
		if cur == nil {
			continue
		}

		trim := strings.TrimSpace(line)
		switch {
		case strings.HasPrefix(trim, "Kernel modules:"):
			cur.Raw["kernel_modules"] = strings.TrimSpace(strings.TrimPrefix(trim, "Kernel modules:"))
		case strings.HasPrefix(trim, "Kernel driver in use:"):
			cur.Raw["kernel_driver"] = strings.TrimSpace(strings.TrimPrefix(trim, "Kernel driver in use:"))
		}
	}
	flush()
	return out
}
// ----- Microcode ---------------------------------------------------------
// probeMicrocode opens /proc/cpuinfo and returns the microcode snapshot
// parsed from it, or nil when the file cannot be opened or carries no
// microcode line. A single snapshot is produced for the whole machine.
func probeMicrocode() *FirmwareSnapshot {
	cpuinfo, err := os.Open("/proc/cpuinfo")
	if err != nil {
		return nil
	}
	// Read-only handle: a close error carries no useful information.
	defer func() { _ = cpuinfo.Close() }()

	return parseMicrocode(cpuinfo)
}
// parseMicrocode scans /proc/cpuinfo-style "key : value" lines and builds
// the microcode snapshot from the first non-empty "microcode" value
// (Version) and the first non-empty "vendor_id" value (Vendor). Scanning
// stops as soon as both are known. It returns nil when no microcode value
// was found.
func parseMicrocode(r io.Reader) *FirmwareSnapshot {
	var version, vendor string

	sc := bufio.NewScanner(r)
	for (version == "" || vendor == "") && sc.Scan() {
		key, value, hasColon := strings.Cut(sc.Text(), ":")
		if !hasColon {
			continue
		}
		value = strings.TrimSpace(value)

		// First occurrence wins for each field.
		switch strings.TrimSpace(key) {
		case "microcode":
			if version == "" {
				version = value
			}
		case "vendor_id":
			if vendor == "" {
				vendor = value
			}
		}
	}

	if version == "" {
		return nil
	}
	return &FirmwareSnapshot{
		Component:  "microcode",
		Identifier: "cpu",
		Version:    version,
		Vendor:     vendor,
	}
}
// ----- helpers -----------------------------------------------------------
// firstNonEmpty returns the first argument that is not blank, i.e. that
// still has content after whitespace trimming. The argument is returned
// unmodified (not trimmed). With no such argument the result is "".
func firstNonEmpty(ss ...string) string {
	for i := range ss {
		if candidate := ss[i]; len(strings.TrimSpace(candidate)) > 0 {
			return candidate
		}
	}
	return ""
}
// readFile returns the full contents of the file at p as a string, or ""
// when the file cannot be read. Callers cannot tell an empty file from a
// read failure, which is what the best-effort sysfs probes want.
func readFile(p string) string {
	if data, err := os.ReadFile(p); err == nil {
		return string(data)
	}
	return ""
}
// trimErr renders err for a warning message. When out, after trimming,
// has a non-empty first line, the result is "<err> (<first line>)".
// Otherwise it is just err.Error(). Keeping only the first output line
// gives enough context without dumping a screenful of tool noise.
func trimErr(err error, out string) string {
	head, _, _ := strings.Cut(strings.TrimSpace(out), "\n")
	if head != "" {
		return fmt.Sprintf("%v (%s)", err, head)
	}
	return err.Error()
}
+232
View File
@@ -0,0 +1,232 @@
package probes
import (
"strings"
"testing"
)
// Golden dmidecode -t bios output (trimmed, representative). A real
// host will have more lines; parse must tolerate the unknown fields.
const dmidecodeBIOS = `# dmidecode 3.3
Getting SMBIOS data from sysfs.
SMBIOS 3.2.0 present.
Handle 0x0000, DMI type 0, 26 bytes
BIOS Information
Vendor: American Megatrends Inc.
Version: 3.2
Release Date: 07/15/2021
Address: 0xF0000
Runtime Size: 64 kB
ROM Size: 32 MB
Characteristics:
PCI is supported
BIOS is upgradeable
Handle 0x0001, DMI type 1, 27 bytes
System Information
Manufacturer: Supermicro
Product Name: X11SSL-F
`
// TestParseDmidecodeBIOS feeds the golden dmidecode fixture through
// parseDmidecodeBIOS and checks the component label, version, vendor and
// the release date kept in Raw.
func TestParseDmidecodeBIOS(t *testing.T) {
	got := parseDmidecodeBIOS(strings.NewReader(dmidecodeBIOS))
	if got == nil {
		t.Fatal("parseDmidecodeBIOS returned nil")
	}
	if c := got.Component; c != "bios" {
		t.Errorf("component = %q, want bios", c)
	}
	if v := got.Version; v != "3.2" {
		t.Errorf("version = %q, want 3.2", v)
	}
	if v := got.Vendor; v != "American Megatrends Inc." {
		t.Errorf("vendor = %q, want American Megatrends Inc.", v)
	}
	if d := got.Raw["Release Date"]; d != "07/15/2021" {
		t.Errorf("release date = %q, want 07/15/2021", d)
	}
}
// TestParseDmidecodeBIOSMissingBlock checks that output holding only a
// System Information handle yields nil rather than a crash or a bogus
// snapshot.
func TestParseDmidecodeBIOSMissingBlock(t *testing.T) {
	const onlySystemBlock = "Handle 0x0001, DMI type 1, 27 bytes\nSystem Information\n\tManufacturer: Acme\n"

	got := parseDmidecodeBIOS(strings.NewReader(onlySystemBlock))
	if got != nil {
		t.Fatalf("expected nil when BIOS block absent, got %+v", got)
	}
}
const ipmitoolMCInfo = `Device ID : 32
Device Revision : 1
Firmware Revision : 1.74
IPMI Version : 2.0
Manufacturer ID : 10876
Manufacturer Name : Supermicro
Product ID : 2051 (0x0803)
Product Name : Unknown (0x803)
`
// TestParseIpmitoolMCInfo parses the `ipmitool mc info` fixture and checks
// the component label, firmware revision and manufacturer.
func TestParseIpmitoolMCInfo(t *testing.T) {
	got := parseIpmitoolMCInfo(strings.NewReader(ipmitoolMCInfo))
	if got == nil {
		t.Fatal("parseIpmitoolMCInfo returned nil")
	}
	if c := got.Component; c != "bmc" {
		t.Errorf("component = %q, want bmc", c)
	}
	if v := got.Version; v != "1.74" {
		t.Errorf("version = %q, want 1.74", v)
	}
	if v := got.Vendor; v != "Supermicro" {
		t.Errorf("vendor = %q, want Supermicro", v)
	}
}
// TestParseIpmitoolMCInfoEmpty checks that empty input produces no
// snapshot.
func TestParseIpmitoolMCInfoEmpty(t *testing.T) {
	got := parseIpmitoolMCInfo(strings.NewReader(""))
	if got != nil {
		t.Fatalf("expected nil on empty input, got %+v", got)
	}
}
const ethtoolEth0 = `driver: mlx5_core
version: 5.15.0
firmware-version: 16.32.1010 (MT_0000000008)
expansion-rom-version:
bus-info: 0000:5e:00.0
supports-statistics: yes
`
// TestParseEthtoolI parses the `ethtool -i` fixture for eth0 and checks the
// component/identifier pair, the firmware version and the driver.
func TestParseEthtoolI(t *testing.T) {
	got := parseEthtoolI(strings.NewReader(ethtoolEth0), "eth0")
	if got == nil {
		t.Fatal("parseEthtoolI returned nil")
	}
	if c, id := got.Component, got.Identifier; c != "nic" || id != "eth0" {
		t.Errorf("component/id = %q/%q, want nic/eth0", c, id)
	}
	if v := got.Version; v != "16.32.1010 (MT_0000000008)" {
		t.Errorf("version = %q, want 16.32.1010 (MT_0000000008)", v)
	}
	if v := got.Vendor; v != "mlx5_core" {
		t.Errorf("vendor = %q, want mlx5_core", v)
	}
}
// TestParseEthtoolIEmpty checks that text without any "key: value" line
// produces no snapshot.
func TestParseEthtoolIEmpty(t *testing.T) {
	got := parseEthtoolI(strings.NewReader("not a valid output"), "eth0")
	if got != nil {
		t.Fatalf("expected nil on garbage input, got %+v", got)
	}
}
const nvmeIDCtrl = `NVME Identify Controller:
vid : 0x144d
ssvid : 0x144d
sn : S5GYNX0R500123X
mn : Samsung SSD 980 PRO 1TB
fr : 5B2QGXA7
rab : 2
`
// TestParseNVMeIDCtrl looks up the "fr" and "mn" fields in the id-ctrl
// fixture and checks that an unknown key yields the empty string.
func TestParseNVMeIDCtrl(t *testing.T) {
	lookup := func(key string) string {
		return parseNVMeIDCtrl(strings.NewReader(nvmeIDCtrl), key)
	}

	if fr := lookup("fr"); fr != "5B2QGXA7" {
		t.Errorf("fr = %q, want 5B2QGXA7", fr)
	}
	if mn := lookup("mn"); mn != "Samsung SSD 980 PRO 1TB" {
		t.Errorf("mn = %q, want Samsung SSD 980 PRO 1TB", mn)
	}
	if absent := lookup("missing"); absent != "" {
		t.Errorf("missing key should be empty, got %q", absent)
	}
}
const lspciHBA = `0000:01:00.0 Ethernet controller [0200]: Intel Corporation I350 [8086:1521] (rev 01)
Subsystem: Intel Corporation I350 [8086:0001]
Kernel driver in use: igb
Kernel modules: igb
0000:03:00.0 Serial Attached SCSI controller [0107]: Broadcom / LSI SAS3008 PCI-Express Fusion-MPT SAS-3 [1000:0097] (rev 02)
Subsystem: Broadcom / LSI SAS9300-8i [1000:30e0]
Kernel driver in use: mpt3sas
Kernel modules: mpt3sas
0000:04:00.0 RAID bus controller [0104]: LSI MegaRAID SAS-3 3108 [1000:005d] (rev 02)
Subsystem: LSI MegaRAID SAS 9361-8i [1000:9361]
Kernel driver in use: megaraid_sas
Kernel modules: megaraid_sas
`
// TestParseLspciHBA parses the lspci fixture and expects exactly the SAS
// and RAID controllers, in file order, each labelled "hba" with version
// "rev 02". The Ethernet controller must not appear.
func TestParseLspciHBA(t *testing.T) {
	snaps := parseLspciHBA(strings.NewReader(lspciHBA))
	if n := len(snaps); n != 2 {
		t.Fatalf("got %d HBA snapshots, want 2 (SAS + RAID; Ethernet must be skipped)", n)
	}
	for i := range snaps {
		if c := snaps[i].Component; c != "hba" {
			t.Errorf("component = %q, want hba", c)
		}
		if v := snaps[i].Version; v != "rev 02" {
			t.Errorf("version = %q, want 'rev 02'", v)
		}
	}
	if id := snaps[0].Identifier; id != "0000:03:00.0" {
		t.Errorf("first identifier = %q, want 0000:03:00.0", id)
	}
	if id := snaps[1].Identifier; id != "0000:04:00.0" {
		t.Errorf("second identifier = %q, want 0000:04:00.0", id)
	}
}
const cpuinfo = `processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 85
model name : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz
stepping : 7
microcode : 0x5003006
cpu MHz : 2100.000
`
// TestParseMicrocode parses the cpuinfo fixture and checks the microcode
// version, the vendor id and the fixed "cpu" identifier.
func TestParseMicrocode(t *testing.T) {
	got := parseMicrocode(strings.NewReader(cpuinfo))
	if got == nil {
		t.Fatal("parseMicrocode returned nil")
	}
	if v := got.Version; v != "0x5003006" {
		t.Errorf("version = %q, want 0x5003006", v)
	}
	if v := got.Vendor; v != "GenuineIntel" {
		t.Errorf("vendor = %q, want GenuineIntel", v)
	}
	if id := got.Identifier; id != "cpu" {
		t.Errorf("identifier = %q, want cpu", id)
	}
}
// TestParseMicrocodeMissing checks that cpuinfo text without a microcode
// line produces no snapshot.
func TestParseMicrocodeMissing(t *testing.T) {
	const noMicrocode = "processor\t: 0\nvendor_id\t: GenuineIntel\n"

	got := parseMicrocode(strings.NewReader(noMicrocode))
	if got != nil {
		t.Fatalf("expected nil when microcode line absent, got %+v", got)
	}
}
// TestIsRealNIC exercises the name-based rejections of isRealNIC. Every
// case is refused before the /sys/class/net/<name>/device lookup is
// reached, so the test does not depend on the host's interfaces.
func TestIsRealNIC(t *testing.T) {
	type nicCase struct {
		name string
		want bool // true would mean the name passes the name-based filter
	}
	table := []nicCase{
		{name: "lo", want: false},
		{name: "", want: false},
		{name: "docker0", want: false},
		{name: "br-abc", want: false},
		{name: "veth1234", want: false},
		{name: "virbr0", want: false},
		{name: "bond0", want: false},
		{name: "tun0", want: false},
	}
	for i := range table {
		c := table[i]
		got := isRealNIC(c.name)
		if got == c.want {
			continue
		}
		t.Errorf("isRealNIC(%q) = %v, want %v", c.name, got, c.want)
	}
}
+85
View File
@@ -0,0 +1,85 @@
package probes
import (
"bufio"
"io"
"os"
"strconv"
"strings"
)
// NetDevSnapshot is the per-interface counter row from /proc/net/dev at
// a single instant. Used by the Network stage to compute deltas across
// an iperf window — a rising rx_errors or tx_dropped during a loaded
// link is a real NIC problem, not general noise.
type NetDevSnapshot struct {
// Iface is the interface name: the text before the colon of a
// /proc/net/dev row.
Iface string
// Receive-side counters: columns 0 (bytes), 2 (errs) and 3 (drop).
RxBytes uint64
RxErrs uint64
RxDrop uint64
// Transmit-side counters: columns 8 (bytes), 10 (errs) and 11 (drop).
TxBytes uint64
TxErrs uint64
TxDrop uint64
}
// NetDev opens /proc/net/dev and returns one snapshot per non-loopback
// interface, as produced by parseNetDev. The probe is best-effort: when the
// file cannot be opened it returns nil and the caller simply has no data
// for that tick.
func NetDev() []NetDevSnapshot {
	procFile, err := os.Open("/proc/net/dev")
	if err != nil {
		return nil
	}
	// Read-only handle: a close error carries no useful information.
	defer func() { _ = procFile.Close() }()

	return parseNetDev(procFile)
}
// parseNetDev parses /proc/net/dev-formatted text. It is separate from
// NetDev so tests can feed fixtures instead of the real /proc. The first
// two lines are headers and are discarded. Every following row has the form
//
//	iface: <16 whitespace-separated counters>
//
// with the columns
//
//	0 rx_bytes  1 rx_packets  2 rx_errs   3 rx_drop  4 fifo   5 frame  6 compressed  7 multicast
//	8 tx_bytes  9 tx_packets 10 tx_errs  11 tx_drop 12 fifo  13 colls 14 carrier    15 compressed
//
// of which six are kept (bytes, errs and drop per direction). Rows without
// a colon, with an empty interface name, for "lo", or with fewer than 16
// counters are skipped. A counter that fails to parse is stored as 0.
func parseNetDev(r io.Reader) []NetDevSnapshot {
	sc := bufio.NewScanner(r)

	// Discard the two header lines.
	for header := 0; header < 2; header++ {
		if !sc.Scan() {
			break
		}
	}

	var snaps []NetDevSnapshot
	for sc.Scan() {
		name, counters, hasColon := strings.Cut(strings.TrimSpace(sc.Text()), ":")
		if !hasColon {
			continue
		}
		name = strings.TrimSpace(name)
		if name == "" || name == "lo" {
			continue
		}
		cols := strings.Fields(counters)
		if len(cols) < 16 {
			continue
		}
		snaps = append(snaps, NetDevSnapshot{
			Iface:   name,
			RxBytes: parseU64(cols[0]),
			RxErrs:  parseU64(cols[2]),
			RxDrop:  parseU64(cols[3]),
			TxBytes: parseU64(cols[8]),
			TxErrs:  parseU64(cols[10]),
			TxDrop:  parseU64(cols[11]),
		})
	}
	return snaps
}
// parseU64 parses s as a base-10 unsigned 64-bit integer. Any parse failure
// (empty string, sign, non-digits, overflow) yields 0, so a malformed
// counter degrades to zero instead of aborting the row.
func parseU64(s string) uint64 {
	if value, err := strconv.ParseUint(s, 10, 64); err == nil {
		return value
	}
	return 0
}
+84
View File
@@ -0,0 +1,84 @@
package probes
import (
"strings"
"testing"
)
// TestParseNetDev_RealSample runs parseNetDev over a synthetic
// /proc/net/dev fixture with the full 16-column layout. It verifies that
// the headers are skipped, that the loopback row is dropped, and that each
// of the six kept counters of eth0 ends up in the right field.
func TestParseNetDev_RealSample(t *testing.T) {
	// Column positions after "iface:":
	//   0 rx_bytes  1 rx_packets  2 rx_errs   3 rx_drop
	//   4 fifo      5 frame       6 compressed 7 multicast
	//   8 tx_bytes  9 tx_packets 10 tx_errs  11 tx_drop
	//  12 fifo     13 colls      14 carrier   15 compressed
	fixture := `Inter-| Receive | Transmit
face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed
lo: 1000000 10000 0 0 0 0 0 0 1000000 10000 0 0 0 0 0 0
eth0: 50000000 100000 7 12 0 0 0 0 40000000 90000 3 5 0 0 0 0
eth1: 12345 200 0 0 0 0 0 0 54321 180 0 0 0 0 0 0
`
	parsed := parseNetDev(strings.NewReader(fixture))
	if n := len(parsed); n != 2 {
		t.Fatalf("got %d snapshots, want 2 (lo should be dropped)", n)
	}

	index := make(map[string]NetDevSnapshot, len(parsed))
	for i := range parsed {
		index[parsed[i].Iface] = parsed[i]
	}

	nic, present := index["eth0"]
	if !present {
		t.Fatalf("eth0 missing from parsed snapshots")
	}
	if v := nic.RxBytes; v != 50000000 {
		t.Errorf("eth0 RxBytes=%d, want 50000000", v)
	}
	if v := nic.RxErrs; v != 7 {
		t.Errorf("eth0 RxErrs=%d, want 7", v)
	}
	if v := nic.RxDrop; v != 12 {
		t.Errorf("eth0 RxDrop=%d, want 12", v)
	}
	if v := nic.TxBytes; v != 40000000 {
		t.Errorf("eth0 TxBytes=%d, want 40000000", v)
	}
	if v := nic.TxErrs; v != 3 {
		t.Errorf("eth0 TxErrs=%d, want 3", v)
	}
	if v := nic.TxDrop; v != 5 {
		t.Errorf("eth0 TxDrop=%d, want 5", v)
	}

	if _, leaked := index["lo"]; leaked {
		t.Errorf("lo should have been filtered out")
	}
}
// TestParseNetDev_Empty checks that an empty reader yields no snapshots
// and does not crash. Callers treat that as "no data".
func TestParseNetDev_Empty(t *testing.T) {
	if n := len(parseNetDev(strings.NewReader(""))); n != 0 {
		t.Errorf("got %d snapshots from empty reader, want 0", n)
	}
}
// TestParseNetDev_MalformedRow checks that a row with fewer than 16
// counters is skipped without panicking and without hiding the well-formed
// row that follows it.
func TestParseNetDev_MalformedRow(t *testing.T) {
	fixture := `header line 1
header line 2
bad0: 123 456
eth0: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
`
	parsed := parseNetDev(strings.NewReader(fixture))
	if n := len(parsed); n != 1 {
		t.Fatalf("got %d snapshots, want 1 (bad0 should be dropped)", n)
	}
	if iface := parsed[0].Iface; iface != "eth0" {
		t.Errorf("got iface=%q, want eth0", iface)
	}
}
+133 -24
View File
@@ -26,6 +26,7 @@ import (
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"strings"
"sync" "sync"
"sync/atomic" "sync/atomic"
"time" "time"
@@ -71,7 +72,10 @@ func Run(ctx context.Context, p *bootstate.Params) error {
} }
fwd.info(fmt.Sprintf("claimed run; stages=%v current_state=%s", claim.Stages, claim.CurrentState)) fwd.info(fmt.Sprintf("claimed run; stages=%v current_state=%s", claim.Stages, claim.CurrentState))
go thermalSidecar(ctx, c, fwd) mux := NewSensorMux(ctx, c)
defer mux.Close()
go thermalSidecar(ctx, mux, fwd)
hbCh := make(chan HeartbeatResponse, 4) hbCh := make(chan HeartbeatResponse, 4)
go heartbeatLoop(ctx, c, fwd, hbCh) go heartbeatLoop(ctx, c, fwd, hbCh)
@@ -101,7 +105,7 @@ func Run(ctx context.Context, p *bootstate.Params) error {
default: default:
} }
fwd.info("stage: starting " + nextStage) fwd.info("stage: starting " + nextStage)
outcome := runStageCancellable(ctx, nextStage, claim, fwd, c, overrideFlags{}) outcome := runStageCancellable(ctx, nextStage, claim, fwd, c, mux, overrideFlags{})
if outcome.Cancelled { if outcome.Cancelled {
fwd.warn("stage cancelled by operator; posting result and exiting") fwd.warn("stage cancelled by operator; posting result and exiting")
_, _ = postResult(ctx, c, nextStage, outcome) _, _ = postResult(ctx, c, nextStage, outcome)
@@ -119,7 +123,7 @@ func Run(ctx context.Context, p *bootstate.Params) error {
return err return err
} }
// Park and wait for an override directive. // Park and wait for an override directive.
return waitForOverride(ctx, c, fwd, hbCh, claim) return waitForOverride(ctx, c, fwd, mux, hbCh, claim)
} }
if resp.NextState == "Completed" || resp.NextState == "" { if resp.NextState == "Completed" || resp.NextState == "" {
fwd.info("pipeline complete") fwd.info("pipeline complete")
@@ -144,10 +148,10 @@ func Run(ctx context.Context, p *bootstate.Params) error {
// it runs the inventory probe and passes the result as the /result body // it runs the inventory probe and passes the result as the /result body
// (the orchestrator persists it as an artifact). Every other stage // (the orchestrator persists it as an artifact). Every other stage
// returns a tests.Outcome which postResult marshals generically. // returns a tests.Outcome which postResult marshals generically.
func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, ovr overrideFlags) stageOutcome { func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, mux *SensorMux, ovr overrideFlags) stageOutcome {
fwd.SetStage(stage) fwd.SetStage(stage)
defer fwd.ClearStage() defer fwd.ClearStage()
deps := newDeps(ctx, c, fwd, ovr, claim) deps := newDeps(ctx, c, fwd, mux, ovr, claim, stage)
switch stage { switch stage {
case "Inventory": case "Inventory":
fwd.info("Inventory: probing host hardware") fwd.info("Inventory: probing host hardware")
@@ -163,6 +167,25 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF
}, },
Inventory: inv, Inventory: inv,
} }
case "Firmware":
fwd.info("Firmware: probing firmware versions")
snaps, warns := probes.Firmware(ctx)
for _, w := range warns {
fwd.warn(w)
}
summary := firmwareSummary(snaps)
fwd.info("Firmware: " + summary)
return stageOutcome{
Outcome: tests.Outcome{
Passed: true,
Summary: summary,
Extras: map[string]any{
"warnings": warns,
"snapshots": len(snaps),
},
},
Firmware: snaps,
}
case "SMART": case "SMART":
return stageOutcome{Outcome: tests.SMART(ctx, deps)} return stageOutcome{Outcome: tests.SMART(ctx, deps)}
case "CPUStress": case "CPUStress":
@@ -170,10 +193,19 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF
case "Storage": case "Storage":
return stageOutcome{Outcome: tests.Storage(ctx, deps)} return stageOutcome{Outcome: tests.Storage(ctx, deps)}
case "Network": case "Network":
duration := deps.NetworkKnobs.Duration
if duration <= 0 {
duration = 10 * time.Second
}
return stageOutcome{Outcome: tests.Network(ctx, deps, tests.NetworkConfig{ return stageOutcome{Outcome: tests.Network(ctx, deps, tests.NetworkConfig{
OrchestratorURL: c.BaseURL, OrchestratorURL: c.BaseURL,
IperfPort: claim.IperfPort, IperfPort: claim.IperfPort,
Duration: 10 * time.Second, Duration: duration,
})}
case "Burn":
return stageOutcome{Outcome: tests.Burn(ctx, deps, tests.BurnConfig{
OrchestratorURL: c.BaseURL,
IperfPort: claim.IperfPort,
})} })}
case "GPU": case "GPU":
return stageOutcome{Outcome: tests.GPU(ctx, deps)} return stageOutcome{Outcome: tests.GPU(ctx, deps)}
@@ -188,8 +220,9 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF
type stageOutcome struct { type stageOutcome struct {
Outcome tests.Outcome Outcome tests.Outcome
Inventory *spec.Inventory // only for Inventory stage Inventory *spec.Inventory // only for Inventory stage
Cancelled bool // set when the stage was cut short by operator cancel Firmware []probes.FirmwareSnapshot // only for Firmware stage
Cancelled bool // set when the stage was cut short by operator cancel
} }
// runStageCancellable wraps runStage in a per-stage context so the // runStageCancellable wraps runStage in a per-stage context so the
@@ -197,14 +230,14 @@ type stageOutcome struct {
// is currently running. If the derived context was cancelled while the // is currently running. If the derived context was cancelled while the
// stage executed, the outcome is rewritten as a cancellation record so // stage executed, the outcome is rewritten as a cancellation record so
// the orchestrator has something to persist. // the orchestrator has something to persist.
func runStageCancellable(parent context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, ovr overrideFlags) stageOutcome { func runStageCancellable(parent context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, mux *SensorMux, ovr overrideFlags) stageOutcome {
stageCtx, cancel := context.WithCancel(parent) stageCtx, cancel := context.WithCancel(parent)
stageCancel.Store(cancel) stageCancel.Store(cancel)
defer func() { defer func() {
cancel() cancel()
stageCancel.Store(context.CancelFunc(nil)) stageCancel.Store(context.CancelFunc(nil))
}() }()
out := runStage(stageCtx, stage, claim, fwd, c, ovr) out := runStage(stageCtx, stage, claim, fwd, c, mux, ovr)
// If the parent is still live but the stage ctx was cancelled, the // If the parent is still live but the stage ctx was cancelled, the
// operator fired a cancel — mark the outcome so the caller can exit // operator fired a cancel — mark the outcome so the caller can exit
// the pipeline cleanly. Plain ctx-cancel on ctx.Done (e.g. shutdown) // the pipeline cleanly. Plain ctx-cancel on ctx.Done (e.g. shutdown)
@@ -235,7 +268,7 @@ type overrideFlags struct {
Wipe bool `json:"wipe"` Wipe bool `json:"wipe"`
} }
func newDeps(ctx context.Context, c *Client, fwd *logForwarder, ovr overrideFlags, claim *ClaimResponse) tests.Deps { func newDeps(ctx context.Context, c *Client, fwd *logForwarder, mux *SensorMux, ovr overrideFlags, claim *ClaimResponse, stage string) tests.Deps {
var expected []tests.ExpectedDisk var expected []tests.ExpectedDisk
for _, e := range claim.ExpectedDisks { for _, e := range claim.ExpectedDisks {
expected = append(expected, tests.ExpectedDisk{Serial: e.Serial, SizeGB: e.SizeGB}) expected = append(expected, tests.ExpectedDisk{Serial: e.Serial, SizeGB: e.SizeGB})
@@ -247,17 +280,73 @@ func newDeps(ctx context.Context, c *Client, fwd *logForwarder, ovr overrideFlag
OverrideWipe: ovr.Wipe, OverrideWipe: ovr.Wipe,
NonDestructive: claim.NonDestructive, NonDestructive: claim.NonDestructive,
ExpectedDisks: expected, ExpectedDisks: expected,
StageTimeout: 2 * time.Minute, StageTimeout: stageTimeout(claim, stage),
Sensor: func(ctx context.Context, samples []tests.Sample) error { CPUStressKnobs: tests.CPUStressKnobs{
CPUPass: parseDur(claim.StageConfig.CPUStress.CPUPass),
MemPass: parseDur(claim.StageConfig.CPUStress.MemPass),
EDACPoll: parseDur(claim.StageConfig.CPUStress.EDACPoll),
},
StorageKnobs: tests.StorageKnobs{
Mode: claim.StageConfig.Storage.Mode,
FioSize: claim.StageConfig.Storage.FioSize,
FioTime: parseDur(claim.StageConfig.Storage.FioTime),
FioBS: claim.StageConfig.Storage.FioBS,
FioRW: claim.StageConfig.Storage.FioRW,
Verify: claim.StageConfig.Storage.Verify,
},
NetworkKnobs: tests.NetworkKnobs{
Duration: parseDur(claim.StageConfig.Network.Duration),
},
BurnKnobs: tests.BurnKnobs{
Duration: parseDur(claim.StageConfig.Burn.Duration),
CPUWorkers: claim.StageConfig.Burn.CPUWorkers,
MemPct: claim.StageConfig.Burn.MemPct,
FioOnSpare: claim.StageConfig.Burn.FioOnSpare,
IperfParallel: claim.StageConfig.Burn.IperfParallel,
},
Sensor: func(_ context.Context, samples []tests.Sample) error {
out := make([]SensorSample, 0, len(samples)) out := make([]SensorSample, 0, len(samples))
for _, s := range samples { for _, s := range samples {
out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit}) out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit})
} }
return c.Sensor(ctx, out) mux.Send(out)
return nil
}, },
} }
} }
// stageTimeout resolves the timeout for one stage from
// claim.StageConfig.StageTimeouts[stage]. A nil claim, a missing or
// empty entry, an unparseable duration, or a non-positive duration all
// resolve silently to the 2-minute default — a typo in the config makes
// the stage run with the default rather than refusing to run.
func stageTimeout(claim *ClaimResponse, stage string) time.Duration {
	const fallback = 2 * time.Minute
	if claim == nil {
		return fallback
	}
	// Indexing a nil map yields "", and ParseDuration rejects "", so the
	// absent-map, absent-key, and empty-value cases all land on fallback.
	parsed, err := time.ParseDuration(claim.StageConfig.StageTimeouts[stage])
	if err == nil && parsed > 0 {
		return parsed
	}
	return fallback
}
// parseDur leniently converts a knob's duration string. Anything that
// is empty, fails time.ParseDuration, or parses to a negative span
// comes back as 0, which callers read as "fall back to the built-in
// default".
func parseDur(s string) time.Duration {
	// ParseDuration already rejects the empty string, so no separate
	// empty-input guard is needed.
	parsed, err := time.ParseDuration(s)
	switch {
	case err != nil:
		return 0
	case parsed < 0:
		return 0
	default:
		return parsed
	}
}
// postResult marshals stageOutcome for the /result endpoint. The // postResult marshals stageOutcome for the /result endpoint. The
// Inventory shape is special-cased: it includes the inventory blob so // Inventory shape is special-cased: it includes the inventory blob so
// the orchestrator can persist it and run server-side spec diff. // the orchestrator can persist it and run server-side spec diff.
@@ -276,6 +365,9 @@ func postResult(ctx context.Context, c *Client, stage string, s stageOutcome) (*
if s.Inventory != nil { if s.Inventory != nil {
body["inventory"] = s.Inventory body["inventory"] = s.Inventory
} }
if len(s.Firmware) > 0 {
body["firmware"] = s.Firmware
}
if len(s.Outcome.SubSteps) > 0 { if len(s.Outcome.SubSteps) > 0 {
wire := make([]SubStepReport, 0, len(s.Outcome.SubSteps)) wire := make([]SubStepReport, 0, len(s.Outcome.SubSteps))
for _, ss := range s.Outcome.SubSteps { for _, ss := range s.Outcome.SubSteps {
@@ -304,7 +396,7 @@ func stageForState(state string) string {
switch state { switch state {
case "InventoryCheck": case "InventoryCheck":
return "Inventory" return "Inventory"
case "SMART", "CPUStress", "Storage", "Network", "GPU", "PSU": case "Firmware", "SMART", "CPUStress", "Storage", "Network", "Burn", "GPU", "PSU":
return state return state
} }
// SpecValidate and Reporting are orchestrator-owned; we never see // SpecValidate and Reporting are orchestrator-owned; we never see
@@ -315,7 +407,7 @@ func stageForState(state string) string {
// waitForOverride parks the agent in FailedHolding. It listens for a // waitForOverride parks the agent in FailedHolding. It listens for a
// heartbeat directive that tells it to retry a stage (e.g. Storage // heartbeat directive that tells it to retry a stage (e.g. Storage
// with wipe-override armed) and re-enters runStage from that point. // with wipe-override armed) and re-enters runStage from that point.
func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-chan HeartbeatResponse, claim *ClaimResponse) error { func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, mux *SensorMux, hb <-chan HeartbeatResponse, claim *ClaimResponse) error {
fwd.info("holding: awaiting operator decision (heartbeat directive or ctx cancel)") fwd.info("holding: awaiting operator decision (heartbeat directive or ctx cancel)")
for { for {
select { select {
@@ -333,7 +425,7 @@ func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-cha
if len(cmd.OverrideFlags) > 0 { if len(cmd.OverrideFlags) > 0 {
_ = json.Unmarshal(cmd.OverrideFlags, &ovr) _ = json.Unmarshal(cmd.OverrideFlags, &ovr)
} }
outcome := runStageCancellable(ctx, cmd.Stage, claim, fwd, c, ovr) outcome := runStageCancellable(ctx, cmd.Stage, claim, fwd, c, mux, ovr)
if outcome.Cancelled { if outcome.Cancelled {
fwd.warn("stage cancelled by operator; posting result and exiting") fwd.warn("stage cancelled by operator; posting result and exiting")
_, _ = postResult(ctx, c, cmd.Stage, outcome) _, _ = postResult(ctx, c, cmd.Stage, outcome)
@@ -362,7 +454,7 @@ func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-cha
default: default:
} }
fwd.info("stage: starting " + nextStage) fwd.info("stage: starting " + nextStage)
out := runStageCancellable(ctx, nextStage, claim, fwd, c, overrideFlags{}) out := runStageCancellable(ctx, nextStage, claim, fwd, c, mux, overrideFlags{})
if out.Cancelled { if out.Cancelled {
fwd.warn("stage cancelled by operator; posting result and exiting") fwd.warn("stage cancelled by operator; posting result and exiting")
_, _ = postResult(ctx, c, nextStage, out) _, _ = postResult(ctx, c, nextStage, out)
@@ -417,11 +509,32 @@ func inventorySummary(inv *spec.Inventory) string {
len(inv.Disks), len(inv.NICs), len(inv.GPUs)) len(inv.Disks), len(inv.NICs), len(inv.GPUs))
} }
// firmwareSummary builds the short per-component tally for the
// Firmware stage, e.g. "bios=1 nic=2 nvme_fw=1". Components are
// rendered in a fixed order (bios, bmc, nic, hba, nvme_fw, microcode)
// and zero-count entries are omitted. An empty snapshot list yields
// "no firmware readable". Snapshots whose Component is outside the
// fixed list are tallied but not rendered, so a list containing only
// such components produces an empty string.
func firmwareSummary(snaps []probes.FirmwareSnapshot) string {
	tally := make(map[string]int)
	for i := range snaps {
		tally[snaps[i].Component]++
	}
	if len(tally) == 0 {
		return "no firmware readable"
	}

	var fields []string
	for _, component := range [...]string{"bios", "bmc", "nic", "hba", "nvme_fw", "microcode"} {
		count := tally[component]
		if count <= 0 {
			continue
		}
		fields = append(fields, fmt.Sprintf("%s=%d", component, count))
	}
	return strings.Join(fields, " ")
}
// thermalSidecar posts a batch of /sys/class/hwmon samples every 5s. // thermalSidecar posts a batch of /sys/class/hwmon samples every 5s.
// Idempotent: a dead sensor just drops out of the next batch. Errors // Idempotent: a dead sensor just drops out of the next batch. Errors
// are logged but never fatal — we'd rather have a run with partial // are logged but never fatal — we'd rather have a run with partial
// thermal data than kill the agent over an I/O hiccup. // thermal data than kill the agent over an I/O hiccup.
func thermalSidecar(ctx context.Context, c *Client, fwd *logForwarder) { func thermalSidecar(ctx context.Context, mux *SensorMux, fwd *logForwarder) {
t := time.NewTicker(5 * time.Second) t := time.NewTicker(5 * time.Second)
defer t.Stop() defer t.Stop()
for { for {
@@ -437,11 +550,7 @@ func thermalSidecar(ctx context.Context, c *Client, fwd *logForwarder) {
for _, s := range samples { for _, s := range samples {
out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit}) out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit})
} }
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second) mux.Send(out)
if err := c.Sensor(sendCtx, out); err != nil {
fwd.warn("thermal sidecar: " + err.Error())
}
cancel()
} }
} }
} }
+139
View File
@@ -0,0 +1,139 @@
package agent
import (
"context"
"log"
"sync"
"time"
)
// SensorMux coalesces sensor samples from every stage + sidecar into a
// single batched HTTP POST stream. Without it, a Burn run that fans out
// four concurrent workloads + thermal + PSU + EDAC sidecars can push ~50
// samples/sec, each as a separate /sensor request — enough to either
// saturate the orchestrator's request budget or stall a stage on its
// own sensor-forwarding path.
//
// Contract:
// - Send is non-blocking; a full input channel drops a batch on the
// floor and logs a warning. That's preferred over back-pressuring
// a workload goroutine and skewing its timing.
// - Flush happens every flushInterval *or* as soon as the pending
// buffer reaches maxBatch samples. Chunk-at-flush keeps each HTTP
// request bounded regardless of the incoming rate.
// - Close flushes whatever is in the buffer. Callers that need the
// final flush to reach the server should defer Close before other
// deferred shutdown work.
type SensorMux struct {
// c is the client whose Sensor method receives each flushed chunk.
c *Client
// in carries copied batches from Send to the flush loop.
in chan []SensorSample
// flushInterval is the period of the flush ticker.
flushInterval time.Duration
// maxBatch is both the buffer size that triggers an early flush and
// the largest number of samples sent in one request.
maxBatch int
// ctx is derived from the constructor's parent; cancelling it (via
// cancel or the parent) makes the flush loop do its final flush and
// exit.
ctx context.Context
cancel context.CancelFunc
// wg tracks the flush-loop goroutine so Close can wait for it.
wg sync.WaitGroup
}
// NewSensorMux builds a mux that posts through c and launches its
// flush goroutine before returning. The mux is configured with a
// 32-batch input queue, a 2-second flush interval, and a 500-sample
// cap per request. Its internal context is a child of parent, so the
// flush loop stops either when Close is called or when parent is
// cancelled.
func NewSensorMux(parent context.Context, c *Client) *SensorMux {
	mux := new(SensorMux)
	mux.c = c
	mux.in = make(chan []SensorSample, 32)
	mux.flushInterval = 2 * time.Second
	mux.maxBatch = 500
	mux.ctx, mux.cancel = context.WithCancel(parent)

	mux.wg.Add(1)
	go mux.loop()
	return mux
}
// Send hands a batch to the flush loop without ever blocking the
// caller. A nil receiver and a nil/empty batch are both no-ops, so
// call sites never need their own guard. If the input queue is full
// the batch is discarded and a warning is logged.
func (m *SensorMux) Send(samples []SensorSample) {
	if m == nil {
		return
	}
	if len(samples) == 0 {
		return
	}

	// Detach from the caller's backing array: the flush loop reads the
	// batch later, possibly after the caller has reused its slice.
	batch := append([]SensorSample(nil), samples...)

	select {
	case m.in <- batch:
		return
	default:
	}
	log.Printf("sensor mux: input channel full, dropping %d samples", len(batch))
}
// Close cancels the mux's context and blocks until the flush loop has
// finished its final flush and exited. A nil receiver is a no-op.
// Calling Close again is harmless: the context is already cancelled
// and the wait returns immediately.
func (m *SensorMux) Close() {
	if m != nil {
		m.cancel()
		m.wg.Wait()
	}
}
// loop is the single flush goroutine started by NewSensorMux. It
// accumulates incoming batches into one buffer and flushes that buffer
// when the ticker fires, when the buffer reaches maxBatch samples, or
// when the mux context is cancelled.
//
// On cancellation, batches still queued on the input channel are first
// folded into the buffer and then flushed together. Flushing each
// queued batch on its own would issue one POST (with its own timeout)
// per batch, which defeats the coalescing and can stretch Close by one
// timeout per queued batch when the server is unresponsive.
func (m *SensorMux) loop() {
	defer m.wg.Done()

	ticker := time.NewTicker(m.flushInterval)
	defer ticker.Stop()

	buf := make([]SensorSample, 0, m.maxBatch)
	for {
		select {
		case <-m.ctx.Done():
			// Pick up whatever was pushed right before shutdown so
			// those final samples are not lost, preserving arrival
			// order behind the already-buffered samples.
		drain:
			for {
				select {
				case batch := <-m.in:
					buf = append(buf, batch...)
				default:
					break drain
				}
			}
			// flushChunks is a no-op on an empty buffer and splits an
			// oversized one into maxBatch-sized requests.
			m.flushChunks(buf)
			return
		case batch := <-m.in:
			buf = append(buf, batch...)
			if len(buf) >= m.maxBatch {
				m.flushChunks(buf)
				// flushChunks is synchronous, so the backing array
				// can be reused for the next accumulation.
				buf = buf[:0]
			}
		case <-ticker.C:
			if len(buf) > 0 {
				m.flushChunks(buf)
				buf = buf[:0]
			}
		}
	}
}
// flushChunks posts all to the server in consecutive slices of at most
// maxBatch samples, one request per slice, in order. Each request gets
// its own 10-second deadline rooted in context.Background rather than
// the mux context, so the final flush still goes out after the mux has
// been cancelled. A failed request is logged and the remaining slices
// are still attempted.
func (m *SensorMux) flushChunks(all []SensorSample) {
	for remaining := all; len(remaining) > 0; {
		size := m.maxBatch
		if len(remaining) < size {
			size = len(remaining)
		}
		head := remaining[:size]
		remaining = remaining[size:]

		func() {
			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
			defer cancel()
			if err := m.c.Sensor(ctx, head); err != nil {
				log.Printf("sensor mux: flush of %d samples failed: %v", len(head), err)
			}
		}()
	}
}
+144
View File
@@ -0,0 +1,144 @@
package agent
import (
"context"
"encoding/json"
"io"
"net/http"
"net/http/httptest"
"strings"
"sync"
"sync/atomic"
"testing"
"time"
)
// TestSensorMux_CloseFlushesBuffer sends two small batches and then
// closes the mux immediately, well inside the flush interval. By the
// time Close returns, the test server must have received all three
// samples in at least one POST to a path ending in /sensor.
func TestSensorMux_CloseFlushesBuffer(t *testing.T) {
	var posts, received int32

	handler := func(w http.ResponseWriter, r *http.Request) {
		if !strings.HasSuffix(r.URL.Path, "/sensor") {
			t.Errorf("unexpected path %s", r.URL.Path)
		}
		raw, _ := io.ReadAll(r.Body)
		var payload struct {
			Samples []SensorSample `json:"samples"`
		}
		if err := json.Unmarshal(raw, &payload); err != nil {
			t.Errorf("decode: %v", err)
		}
		atomic.AddInt32(&posts, 1)
		atomic.AddInt32(&received, int32(len(payload.Samples)))
		w.WriteHeader(http.StatusOK)
	}
	server := httptest.NewServer(http.HandlerFunc(handler))
	defer server.Close()

	client := &Client{BaseURL: server.URL, RunID: 1, Token: "t", HTTP: server.Client()}
	m := NewSensorMux(context.Background(), client)

	inputs := [][]SensorSample{
		{
			{Kind: "temp", Key: "cpu/0", Value: 72.5, Unit: "C"},
			{Kind: "psu_volt", Key: "+12V", Value: 12.05, Unit: "V"},
		},
		{
			{Kind: "mce", Key: "0", Value: 0, Unit: "count"},
		},
	}
	for _, batch := range inputs {
		m.Send(batch)
	}
	m.Close()

	// Close waits for the flush loop, so the counters are final here.
	flushed := atomic.LoadInt32(&received)
	requests := atomic.LoadInt32(&posts)
	if flushed != 3 {
		t.Errorf("expected 3 samples flushed, got %d across %d batch(es)", flushed, requests)
	}
	if requests == 0 {
		t.Errorf("expected at least one batch HTTP post")
	}
}
// TestSensorMux_ChunksOversizedBatch pushes one 1200-sample batch
// through a mux with the default 500-sample cap and checks that the
// server sees every sample, that no single request carries more than
// 500, and that at least three requests were needed.
func TestSensorMux_ChunksOversizedBatch(t *testing.T) {
	var (
		mu    sync.Mutex
		sizes []int
	)
	handler := func(w http.ResponseWriter, r *http.Request) {
		raw, _ := io.ReadAll(r.Body)
		var payload struct {
			Samples []SensorSample `json:"samples"`
		}
		_ = json.Unmarshal(raw, &payload)

		mu.Lock()
		sizes = append(sizes, len(payload.Samples))
		mu.Unlock()

		w.WriteHeader(http.StatusOK)
	}
	server := httptest.NewServer(http.HandlerFunc(handler))
	defer server.Close()

	client := &Client{BaseURL: server.URL, RunID: 1, Token: "t", HTTP: server.Client()}
	m := NewSensorMux(context.Background(), client)

	// 1200 samples against a cap of 500 should split as 500 + 500 + 200.
	oversized := make([]SensorSample, 0, 1200)
	for i := 0; i < 1200; i++ {
		oversized = append(oversized, SensorSample{
			Kind:  "burn/throughput_mbps",
			Key:   "eth0",
			Value: float64(i),
			Unit:  "Mbps",
		})
	}
	m.Send(oversized)
	m.Close()

	mu.Lock()
	defer mu.Unlock()

	sum := 0
	for _, size := range sizes {
		sum += size
		if size > 500 {
			t.Errorf("batch size %d exceeds maxBatch=500", size)
		}
	}
	if sum != 1200 {
		t.Errorf("sum of batch sizes = %d, want 1200 (sizes=%v)", sum, sizes)
	}
	if len(sizes) < 3 {
		t.Errorf("expected at least 3 chunks for a 1200-sample input, got %d (%v)", len(sizes), sizes)
	}
}
// TestSensorMux_EmptyAndNilSafe exercises the guards that let callers
// invoke the mux unconditionally: Send and Close on a nil *SensorMux
// must not panic, and Send(nil) / Send of an empty slice on a live mux
// must not result in any HTTP request.
func TestSensorMux_EmptyAndNilSafe(t *testing.T) {
	var posts int32
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		atomic.AddInt32(&posts, 1)
		w.WriteHeader(http.StatusOK)
	}))
	defer server.Close()

	// A nil receiver: both calls must simply return.
	var absent *SensorMux
	absent.Send([]SensorSample{{Kind: "x", Key: "y"}})
	absent.Close()

	// A live mux fed only empty input.
	client := &Client{BaseURL: server.URL, RunID: 1, Token: "t", HTTP: server.Client()}
	live := NewSensorMux(context.Background(), client)
	live.Send(nil)
	live.Send([]SensorSample{})
	live.Close()

	// Brief pause so a stray late request, if any, would be counted.
	time.Sleep(50 * time.Millisecond)
	if n := atomic.LoadInt32(&posts); n != 0 {
		t.Errorf("empty/nil Send must not produce HTTP batches, got %d", n)
	}
}
+486
View File
@@ -0,0 +1,486 @@
package tests
import (
"context"
"encoding/json"
"fmt"
"os/exec"
"runtime"
"strconv"
"strings"
"sync"
"time"
"vetting/agent/probes"
)
// BurnConfig is what the agent passes to Burn: the orchestrator's iperf3
// server address and port. Durations + concurrency knobs come from
// Deps.BurnKnobs so they scale with profile.
type BurnConfig struct {
// OrchestratorURL is handed to the iperf workload as its target. When
// it is empty Burn records the iperf workload as skipped.
OrchestratorURL string
IperfPort int // 0 = 5201
}
// Burn is the concurrent soak stage. Unlike CPUStress (serial
// CPU→memory) or Storage (serial per disk) it fans out every workload
// at once: stress-ng hammers CPU + memory, fio drives the allow-listed
// disks, iperf3 pushes sustained NIC traffic, and two sidecars poll
// EDAC + PSU rails for the duration of the window.
//
// This is where PSU rails actually matter: 12V sag under simultaneous
// CPU + disk + NIC load is exactly the failure a thermal/power
// regression produces, and it's invisible to any stage that loads one
// subsystem at a time. The PSU stage that follows Burn in the pipeline
// re-samples rails post-window to confirm they settle back to nominal.
//
// Burn stays inside the stage framework — it doesn't spawn a parallel
// stage runner. The goroutine fan-out is local; the stage converges
// before returning an Outcome so every invariant the orchestrator
// relies on (serial stage order, single in-flight stage per run) still
// holds.
//
// Defaults applied when the corresponding knob is unset or non-positive:
// a 2-minute window and 2 parallel iperf streams. The returned Outcome
// fails if any non-skipped workload reports failure; skipped workloads
// appear as sub-steps but never fail the stage.
func Burn(ctx context.Context, d Deps, cfg BurnConfig) Outcome {
duration := d.BurnKnobs.Duration
if duration <= 0 {
duration = 2 * time.Minute
}
cpuWorkers := resolveCPUWorkers(d.BurnKnobs.CPUWorkers)
memPct := clampMemPct(d.BurnKnobs.MemPct)
iperfParallel := d.BurnKnobs.IperfParallel
if iperfParallel <= 0 {
iperfParallel = 2
}
d.Info(fmt.Sprintf("Burn: window=%s cpu_workers=%d mem_pct=%d iperf_parallel=%d fio_on_spare=%v",
duration, cpuWorkers, memPct, iperfParallel, d.BurnKnobs.FioOnSpare))
// Sidecars run for the lifetime of the window and are cancelled on
// return so the main stage converges cleanly. EDAC catches DIMM
// bit-flips that appear only under concurrent load; PSU catches
// rail sag that only appears when CPU + disk + NIC pull current
// simultaneously.
sideCtx, sideCancel := context.WithCancel(ctx)
defer sideCancel()
var sideWG sync.WaitGroup
sideWG.Add(2)
go runEDACSidecar(sideCtx, &sideWG, d)
go runPSUSidecar(sideCtx, &sideWG, d)
// The workloads get the window plus 30 seconds of slack before their
// shared context expires.
runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
defer cancel()
// Capacity 4 matches the maximum number of results: CPU, memory, fio
// (or its skip record) and iperf (or its skip record). Every send
// therefore completes without a reader, which is what allows the
// results to be collected only after wg.Wait below.
results := make(chan burnSubResult, 4)
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
results <- runBurnCPU(runCtx, d, duration, cpuWorkers)
}()
wg.Add(1)
go func() {
defer wg.Done()
results <- runBurnMemory(runCtx, d, duration, memPct)
}()
// fio runs only when explicitly enabled *and* there are allow-listed
// disks *and* the run wasn't marked non-destructive. Any of those
// missing records a Skipped sub-step so the operator sees why.
if d.BurnKnobs.FioOnSpare && len(d.ExpectedDisks) > 0 && !d.NonDestructive {
wg.Add(1)
go func() {
defer wg.Done()
results <- runBurnFio(runCtx, d, duration)
}()
} else {
reason := burnFioSkipReason(d)
results <- burnSubResult{Name: "Burn fio", Skipped: true, Reason: reason}
}
// iperf requires an orchestrator host. Lab hosts run with the
// bundled iperf3 server; without a base URL we can't derive a
// target so we skip rather than fail the stage.
if cfg.OrchestratorURL != "" {
wg.Add(1)
go func() {
defer wg.Done()
results <- runBurnIperf(runCtx, d, duration, cfg.OrchestratorURL, cfg.IperfPort, iperfParallel)
}()
} else {
results <- burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "no orchestrator host"}
}
// Converge: wait for the workloads, then stop and wait for the
// sidecars, and only then close the channel so the collector's range
// loop terminates.
wg.Wait()
sideCancel()
sideWG.Wait()
close(results)
subs, samples, failures := collectBurnResults(results)
if d.Sensor != nil && len(samples) > 0 {
// Sample forwarding is best-effort; its error does not affect the
// stage verdict and is deliberately discarded.
_ = d.Sensor(ctx, samples)
}
extras := map[string]any{
"duration": duration.String(),
"cpu_workers": cpuWorkers,
"mem_pct": memPct,
"iperf_parallel": iperfParallel,
"fio_on_spare": d.BurnKnobs.FioOnSpare,
}
if len(failures) > 0 {
msg := "Burn workloads failed: " + strings.Join(failures, ", ")
d.Error(msg)
return Outcome{
Passed: false,
Message: msg,
// len(subs) counts skipped workloads as well as executed ones.
Summary: fmt.Sprintf("Burn failed (%d of %d workloads)", len(failures), len(subs)),
Extras: extras,
SubSteps: subs,
}
}
d.Info(fmt.Sprintf("Burn: %s window passed; %d workloads converged", duration, len(subs)))
return Outcome{
Passed: true,
Summary: fmt.Sprintf("Burn %s passed (%d workloads)", duration, len(subs)),
Extras: extras,
SubSteps: subs,
}
}
// burnSubResult is the per-workload return type used by the fan-out
// goroutines. Sample slice is merged into the stage's final /sensor
// batch; SubStep becomes a row on the /result sub-steps list.
type burnSubResult struct {
// Name identifies the workload; it prefixes the failure text and
// names the synthesized row for skipped workloads.
Name string
// Passed is false for both failed and skipped workloads; check
// Skipped first to tell them apart.
Passed bool
Skipped bool
Reason string // why a workload was skipped
Err string // why a workload failed
Samples []Sample
// SubStep is left at its zero value by skipped workloads and by
// workloads that bail out before running (e.g. missing binary).
SubStep SubStepReport
}
// collectBurnResults drains ch until it is closed and folds the
// per-workload results into the three pieces Burn reports:
//
//   - one SubStepReport per workload, in arrival order;
//   - the concatenated samples of every non-skipped workload;
//   - one "<name>: <reason>" string per failed workload.
//
// Skipped workloads get a synthesized Skipped row and never count as
// failures. Workloads that bailed out before running (missing binary,
// unreadable MemAvailable) return without populating SubStep; for
// those a failed row is synthesized from Name and Err so the sub-step
// list never contains a nameless, zero-timestamp entry.
func collectBurnResults(ch <-chan burnSubResult) ([]SubStepReport, []Sample, []string) {
	var (
		subs     []SubStepReport
		samples  []Sample
		failures []string
	)
	for r := range ch {
		// Skipped slots get a synthesized row here so the /result shape
		// stays stable.
		if r.Skipped {
			stamp := time.Now().UTC()
			subs = append(subs, SubStepReport{
				Name:        r.Name,
				Skipped:     true,
				StartedAt:   stamp,
				CompletedAt: stamp,
				SummaryJSON: mustJSON(map[string]any{"skipped": true, "reason": r.Reason}),
			})
			continue
		}

		reason := r.Err
		if !r.Passed && reason == "" {
			reason = "unknown"
		}

		sub := r.SubStep
		if sub.Name == "" {
			// The workload returned before building its own row; report
			// it under the workload name with the failure reason.
			stamp := time.Now().UTC()
			summary := map[string]any{"passed": r.Passed}
			if reason != "" {
				summary["error"] = reason
			}
			sub = SubStepReport{
				Name:        r.Name,
				Passed:      r.Passed,
				StartedAt:   stamp,
				CompletedAt: stamp,
				SummaryJSON: mustJSON(summary),
			}
		}
		subs = append(subs, sub)
		samples = append(samples, r.Samples...)
		if !r.Passed {
			failures = append(failures, r.Name+": "+reason)
		}
	}
	return subs, samples, failures
}
// burnFioSkipReason names the first unmet precondition for the Burn
// fio workload, checked in this order: the fio_on_spare knob, the
// non-destructive flag, then the allow-listed disk list. If all three
// preconditions hold it returns the generic "disabled".
func burnFioSkipReason(d Deps) string {
	switch {
	case !d.BurnKnobs.FioOnSpare:
		return "fio_on_spare knob disabled"
	case d.NonDestructive:
		return "non-destructive run"
	case len(d.ExpectedDisks) == 0:
		return "no allowlisted disks"
	default:
		return "disabled"
	}
}
// runBurnCPU is the CPU leg of the Burn fan-out. It runs stress-ng
// with the requested number of --cpu workers, every CPU method,
// verification on, and a timeout equal to the window, reporting the
// result under the "Burn CPU" label. If stress-ng is not on PATH it
// returns a failed result without running anything.
func runBurnCPU(ctx context.Context, d Deps, duration time.Duration, workers int) burnSubResult {
	const label = "Burn CPU"

	if _, lookErr := exec.LookPath("stress-ng"); lookErr != nil {
		return burnSubResult{Name: label, Err: "stress-ng missing"}
	}

	var argv []string
	argv = append(argv, "--cpu", strconv.Itoa(workers))
	argv = append(argv, "--cpu-method", "all")
	argv = append(argv, "--timeout", durationSeconds(duration))
	argv = append(argv, "--metrics-brief", "--verify")

	d.Info(fmt.Sprintf("Burn: stress-ng %s", strings.Join(argv, " ")))
	res := runStressPass(ctx, d, label, duration, argv)

	out := burnSubResult{Name: label}
	out.Passed = res.Passed
	out.Err = res.Err
	out.SubStep = subStepFromPass(label, res)
	return out
}
// runBurnMemory is the memory leg of the Burn fan-out. It sizes one
// stress-ng --vm worker at memPct percent of the currently available
// memory minus memHeadroomBytes, computed fresh on each call. If that
// size falls below memFloorBytes the workload is reported as skipped
// rather than run. A missing stress-ng binary or an unreadable
// MemAvailable value yields a failed result without running anything.
func runBurnMemory(ctx context.Context, d Deps, duration time.Duration, memPct int) burnSubResult {
	const label = "Burn memory"

	if _, lookErr := exec.LookPath("stress-ng"); lookErr != nil {
		return burnSubResult{Name: label, Err: "stress-ng missing"}
	}
	avail, err := memAvailableBytes()
	if err != nil {
		return burnSubResult{Name: label, Err: "read MemAvailable: " + err.Error()}
	}

	// budget is the raw percentage share; vmBytes is what is left for
	// the worker once the fixed headroom has been taken off.
	budget := int64(float64(avail) * float64(memPct) / 100.0)
	vmBytes := budget - memHeadroomBytes
	if vmBytes < memFloorBytes {
		skipped := burnSubResult{Name: label, Skipped: true}
		skipped.Reason = fmt.Sprintf("budget %s below floor %s after headroom", humanBytes(budget), humanBytes(memFloorBytes))
		return skipped
	}

	argv := []string{"--vm", "1"}
	argv = append(argv, "--vm-bytes", strconv.FormatInt(vmBytes, 10))
	argv = append(argv, "--vm-keep")
	argv = append(argv, "--timeout", durationSeconds(duration))
	argv = append(argv, "--metrics-brief", "--verify")

	d.Info(fmt.Sprintf("Burn: stress-ng memory cap=%s (%d%% of MemAvailable)", humanBytes(vmBytes), memPct))
	res := runStressPass(ctx, d, label, duration, argv)

	// The sub-step row carries the resolved cap in its name; the
	// workload name used for failure reporting stays plain.
	rowName := fmt.Sprintf("Burn memory (cap %s)", humanBytes(vmBytes))
	return burnSubResult{
		Name:    label,
		Passed:  res.Passed,
		Err:     res.Err,
		SubStep: subStepFromPass(rowName, res),
	}
}
// runBurnFio is the storage leg of the Burn window. It runs a
// size-bounded (512MiB) fio_sample random read/write job with md5
// verification, via runFioVerify, against the first target returned by
// resolveTargets(d.ExpectedDisks).
//
// Outcomes:
//   - fio missing: failed result.
//   - no targets resolved: skipped result.
//   - fio reported an error: failed result carrying the sub-step row but
//     no samples.
//   - success: read/write IOPS samples, plus p99 latency samples for each
//     direction whose p99 is greater than zero.
func runBurnFio(ctx context.Context, d Deps, duration time.Duration) burnSubResult {
	const label = "Burn fio"

	if _, lookErr := exec.LookPath("fio"); lookErr != nil {
		return burnSubResult{Name: label, Err: "fio missing"}
	}
	disks := resolveTargets(d.ExpectedDisks)
	if len(disks) == 0 {
		return burnSubResult{Name: label, Skipped: true, Reason: "no allow-listed disks present"}
	}
	dev := disks[0].Device

	job := fioOpts{
		Mode:    "fio_sample",
		Size:    "512MiB",
		Runtime: duration,
		BS:      "4k",
		RW:      "randrw",
		Verify:  "md5",
	}

	startedAt := time.Now()
	d.Info(fmt.Sprintf("Burn: fio %s on %s (%s window)", job.Mode, dev, duration))
	fr := runFioVerify(ctx, dev, job)
	completedAt := time.Now()

	// An empty Error string is the success signal from runFioVerify.
	ok := fr.Error == ""
	result := burnSubResult{
		Name: label,
		SubStep: SubStepReport{
			Name:        "Burn fio " + dev,
			Passed:      ok,
			StartedAt:   startedAt,
			CompletedAt: completedAt,
			SummaryJSON: mustJSON(fr),
		},
		Passed: ok,
		Err:    fr.Error,
	}
	if !ok {
		return result
	}

	result.Samples = append(result.Samples,
		Sample{Kind: "fio", Key: dev + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
		Sample{Kind: "fio", Key: dev + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
	)
	if fr.ReadP99Us > 0 {
		result.Samples = append(result.Samples, Sample{Kind: "fio_p99_us", Key: dev + "/read", Value: fr.ReadP99Us, Unit: "us"})
	}
	if fr.WriteP99Us > 0 {
		result.Samples = append(result.Samples, Sample{Kind: "fio_p99_us", Key: dev + "/write", Value: fr.WriteP99Us, Unit: "us"})
	}
	return result
}
// runBurnIperf is the network leg of the Burn window. It runs
// `iperf3 -c <host> -p <port> -t <secs> -P <parallel> -J` against the
// host derived from orchestratorURL and extracts throughput,
// retransmits and bytes sent with parseIperfJSON. Samples use
// "burn/"-prefixed keys so they can be told apart from samples emitted
// elsewhere.
//
// Parameters:
//   - duration: length of the test. It is truncated to whole seconds and
//     clamped to at least 1, because iperf3 interprets "-t 0" as "no time
//     limit" and a sub-second window would otherwise run until the
//     context timeout killed the client.
//   - port: iperf3 server port; 0 selects 5201.
//   - parallel: number of parallel streams; values below 1 become 1.
//
// Outcomes:
//   - iperf3 missing: failed result.
//   - host cannot be derived: skipped result.
//   - client error or unparseable JSON: failed result with a sub-step row
//     describing the error.
//   - otherwise: passes only when the measured throughput is above zero.
func runBurnIperf(ctx context.Context, d Deps, duration time.Duration, orchestratorURL string, port, parallel int) burnSubResult {
	if _, err := exec.LookPath("iperf3"); err != nil {
		return burnSubResult{Name: "Burn iperf", Err: "iperf3 missing"}
	}
	host, err := deriveHost(orchestratorURL)
	if err != nil || host == "" {
		return burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "can't derive orchestrator host"}
	}
	if port == 0 {
		port = 5201
	}
	if parallel < 1 {
		parallel = 1
	}
	// Never hand iperf3 "-t 0": that asks for an unbounded test.
	secs := int(duration.Seconds())
	if secs < 1 {
		secs = 1
	}
	args := []string{
		"-c", host,
		"-p", strconv.Itoa(port),
		"-t", strconv.Itoa(secs),
		"-P", strconv.Itoa(parallel),
		"-J",
	}
	// Give the client 30s of grace beyond the requested window before
	// the context kills it.
	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
	defer cancel()

	start := time.Now()
	out, err := exec.CommandContext(runCtx, "iperf3", args...).Output()
	end := time.Now()
	if err != nil {
		// NOTE(review): Output() returns the child's stdout, so despite
		// its name "stderr_tail" carries the stdout tail. The key is kept
		// as-is for compatibility with existing consumers of the summary.
		return burnSubResult{
			Name: "Burn iperf",
			Err:  "iperf3 client error: " + err.Error(),
			SubStep: SubStepReport{
				Name:        "Burn iperf",
				StartedAt:   start,
				CompletedAt: end,
				SummaryJSON: mustJSON(map[string]any{"error": err.Error(), "stderr_tail": tailLines(string(out), 20)}),
			},
		}
	}

	mbps, retrans, bytesSent, _, perr := parseIperfJSON(out)
	if perr != nil {
		return burnSubResult{
			Name: "Burn iperf",
			Err:  "parse iperf3 json: " + perr.Error(),
			SubStep: SubStepReport{
				Name:        "Burn iperf",
				StartedAt:   start,
				CompletedAt: end,
				SummaryJSON: mustJSON(map[string]any{"error": perr.Error()}),
			},
		}
	}

	samples := []Sample{{Kind: "iperf", Key: "burn/throughput_mbps", Value: mbps, Unit: "Mbps"}}
	if bytesSent > 0 {
		// Approximate the packet count by dividing bytes sent by a
		// 1460-byte payload, then express retransmits as a rate.
		packets := float64(bytesSent) / 1460.0
		samples = append(samples, Sample{
			Kind: "nic_retrans", Key: "burn/rate",
			Value: float64(retrans) / packets, Unit: "rate",
		})
	}

	passed := mbps > 0
	errMsg := ""
	if !passed {
		errMsg = "zero throughput from iperf3"
	}
	return burnSubResult{
		Name:    "Burn iperf",
		Passed:  passed,
		Err:     errMsg,
		Samples: samples,
		SubStep: SubStepReport{
			Name:        fmt.Sprintf("Burn iperf (P=%d)", parallel),
			Passed:      passed,
			StartedAt:   start,
			CompletedAt: end,
			SummaryJSON: mustJSON(map[string]any{
				"throughput_mbps": mbps,
				"retransmits":     retrans,
				"bytes_sent":      bytesSent,
				"parallel":        parallel,
			}),
		},
	}
}
// runPSUSidecar samples PSU rails every 5 seconds until ctx is
// cancelled, posting each scan to d.Sensor as a batch of psu_volt
// samples (one per rail, keyed by rail label, unit "V"). It is meant to
// run as a goroutine: wg.Done is called on exit.
//
// Behavior notes:
//   - With a nil d.Sensor the function returns immediately.
//   - A scan that finds no rails is skipped; the next tick tries again.
//   - Each post gets its own 5-second timeout derived from ctx. A failed
//     post is logged through d.Warn and does not stop the loop.
//
// Evaluating the samples against voltage thresholds is not done here;
// this function only emits them.
func runPSUSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
	defer wg.Done()
	if d.Sensor == nil {
		return
	}

	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}

		rails := scanPSURails()
		if len(rails) == 0 {
			continue
		}
		batch := make([]Sample, 0, len(rails))
		for _, rail := range rails {
			batch = append(batch, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"})
		}

		postCtx, cancelPost := context.WithTimeout(ctx, 5*time.Second)
		if postErr := d.Sensor(postCtx, batch); postErr != nil {
			d.Warn("Burn: PSU sample post: " + postErr.Error())
		}
		cancelPost()
	}
}
// resolveCPUWorkers turns the raw worker-count knob into a usable
// stressor count. A strictly positive decimal integer is used verbatim.
// Everything else — the empty string, "all" in any letter case, zero,
// negatives, or text that does not parse — resolves to runtime.NumCPU(),
// so the result is never zero or negative.
func resolveCPUWorkers(raw string) int {
	if raw != "" && !strings.EqualFold(raw, "all") {
		if workers, convErr := strconv.Atoi(raw); convErr == nil && workers > 0 {
			return workers
		}
	}
	return runtime.NumCPU()
}
// clampMemPct normalizes the memory-percentage knob:
//   - zero or negative means "not set" and yields the default of 50;
//   - values from 1 to 9 are raised to the lower bound of 10;
//   - values above 90 are lowered to the upper bound of 90;
//   - anything already within [10, 90] is returned unchanged.
func clampMemPct(pct int) int {
	switch {
	case pct <= 0:
		return 50
	case pct < 10:
		return 10
	case pct > 90:
		return 90
	}
	return pct
}
// mustJSON marshals v to JSON and never fails: when v cannot be
// marshaled, it returns a small JSON object of the form
// {"marshal_error": "<message>"} instead.
//
// The fallback object is itself produced by json.Marshal so the error
// text is escaped correctly. Splicing the message into a string literal
// would emit invalid JSON whenever the message contains a double quote
// or backslash.
func mustJSON(v any) json.RawMessage {
	b, err := json.Marshal(v)
	if err == nil {
		return b
	}
	fallback, ferr := json.Marshal(map[string]string{"marshal_error": err.Error()})
	if ferr != nil {
		// Marshaling a map[string]string is not expected to fail; keep a
		// constant, valid document as the last resort.
		return json.RawMessage(`{"marshal_error":"unknown"}`)
	}
	return fallback
}
// Ensure the probes package import stays anchored — the Burn sidecars
// use probes.EDAC + the PSU rail scanner defined in psu.go which
// otherwise wouldn't pull probes in on its own.
var _ = probes.EDAC
+58
View File
@@ -0,0 +1,58 @@
package tests
import (
"runtime"
"testing"
)
// TestResolveCPUWorkers exercises every way resolveCPUWorkers can
// resolve its input: the empty string and "all" (any case) map to
// runtime.NumCPU(), a positive integer is taken as-is, and zero,
// negative or unparseable input falls back to runtime.NumCPU() instead
// of yielding a non-positive worker count.
func TestResolveCPUWorkers(t *testing.T) {
	numCPU := runtime.NumCPU()
	type testCase struct {
		name string
		in   string
		want int
	}
	table := []testCase{
		{name: "empty defaults to NumCPU", in: "", want: numCPU},
		{name: "all defaults to NumCPU", in: "all", want: numCPU},
		{name: "ALL is case-insensitive", in: "ALL", want: numCPU},
		{name: "explicit integer", in: "3", want: 3},
		{name: "negative falls back", in: "-1", want: numCPU},
		{name: "zero falls back", in: "0", want: numCPU},
		{name: "garbage falls back", in: "lots", want: numCPU},
	}
	for _, tc := range table {
		t.Run(tc.name, func(t *testing.T) {
			got := resolveCPUWorkers(tc.in)
			if got == tc.want {
				return
			}
			t.Errorf("resolveCPUWorkers(%q) = %d, want %d", tc.in, got, tc.want)
		})
	}
}
// TestClampMemPct pins clampMemPct's contract: non-positive input
// selects the default of 50, values under 10 are raised to 10, values
// over 90 are lowered to 90, and in-band values pass through untouched.
func TestClampMemPct(t *testing.T) {
	type row struct{ in, want int }
	table := []row{
		{in: 0, want: 50},   // default
		{in: -10, want: 50}, // negative treated as default
		{in: 5, want: 10},   // below lower band → clamp up
		{in: 10, want: 10},
		{in: 50, want: 50},
		{in: 90, want: 90},
		{in: 95, want: 90}, // above upper band → clamp down
		{in: 1000, want: 90},
	}
	for i := range table {
		got := clampMemPct(table[i].in)
		if got == table[i].want {
			continue
		}
		t.Errorf("clampMemPct(%d) = %d, want %d", table[i].in, got, table[i].want)
	}
}
+82 -4
View File
@@ -11,7 +11,10 @@ import (
"runtime" "runtime"
"strconv" "strconv"
"strings" "strings"
"sync"
"time" "time"
"vetting/agent/probes"
) )
// CPUStress runs stress-ng as two serial passes. The previous shape // CPUStress runs stress-ng as two serial passes. The previous shape
@@ -55,11 +58,28 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
extras := map[string]any{"cores": cores} extras := map[string]any{"cores": cores}
var subs []SubStepReport var subs []SubStepReport
// EDAC sidecar runs for the lifetime of the stage; cancelled on
// return. It polls /sys/devices/system/edac/mc/*/{ce,ue}_count and
// posts the current counters so the server-side threshold evaluator
// can gate edac_ue > 0 → fail the run. Zero-valued poll falls back
// to 10s — the same cadence rasdaemon uses by default.
sideCtx, sideCancel := context.WithCancel(ctx)
defer sideCancel()
var sideWG sync.WaitGroup
sideWG.Add(1)
go runEDACSidecar(sideCtx, &sideWG, d)
// Per-profile durations come from Deps; zero values (missing knobs
// or legacy orchestrator) fall back to the package default so the
// stage always has a defined budget.
cpuDur := nonzeroDur(d.CPUStressKnobs.CPUPass, cpuPassDuration)
memDur := nonzeroDur(d.CPUStressKnobs.MemPass, memPassDuration)
// Pass 1: CPU // Pass 1: CPU
cpu := runStressPass(ctx, d, "CPU", cpuPassDuration, []string{ cpu := runStressPass(ctx, d, "CPU", cpuDur, []string{
"--cpu", strconv.Itoa(cores), "--cpu", strconv.Itoa(cores),
"--cpu-method", "all", "--cpu-method", "all",
"--timeout", durationSeconds(cpuPassDuration), "--timeout", durationSeconds(cpuDur),
"--metrics-brief", "--metrics-brief",
"--verify", "--verify",
}) })
@@ -104,11 +124,11 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
SubSteps: subs, SubSteps: subs,
} }
} }
mem := runStressPass(ctx, d, "memory", memPassDuration, []string{ mem := runStressPass(ctx, d, "memory", memDur, []string{
"--vm", "1", "--vm", "1",
"--vm-bytes", strconv.FormatInt(cap, 10), "--vm-bytes", strconv.FormatInt(cap, 10),
"--vm-keep", "--vm-keep",
"--timeout", durationSeconds(memPassDuration), "--timeout", durationSeconds(memDur),
"--metrics-brief", "--metrics-brief",
"--verify", "--verify",
}) })
@@ -133,6 +153,64 @@ func CPUStress(ctx context.Context, d Deps) Outcome {
} }
} }
// runEDACSidecar periodically forwards the readings returned by
// probes.EDAC() to d.Sensor until ctx is cancelled. It is meant to run
// as a goroutine: wg.Done is called on exit.
//
// The polling cadence is d.CPUStressKnobs.EDACPoll, or 10 seconds when
// that knob is zero or negative. On each tick every reading is copied
// into a Sample (kind, key, value and unit are passed through) and the
// whole batch is posted with a 5-second timeout derived from ctx.
//
// Behavior notes:
//   - With a nil d.Sensor the function returns immediately.
//   - A tick on which probes.EDAC() returns nothing is skipped; the next
//     tick tries again.
//   - A failed post is logged through d.Warn and does not stop the loop.
//   - The loop is independent of whether a stress pass is currently
//     running; only ctx cancellation ends it.
//
// Only probes.EDAC() is sampled here; machine-check (MCE) counts are
// not collected by this sidecar.
func runEDACSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
	defer wg.Done()
	if d.Sensor == nil {
		return
	}

	interval := d.CPUStressKnobs.EDACPoll
	if interval <= 0 {
		interval = 10 * time.Second
	}
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}

		readings := probes.EDAC()
		if len(readings) == 0 {
			continue
		}
		batch := make([]Sample, 0, len(readings))
		for _, reading := range readings {
			batch = append(batch, Sample{Kind: reading.Kind, Key: reading.Key, Value: reading.Value, Unit: reading.Unit})
		}

		postCtx, cancelPost := context.WithTimeout(ctx, 5*time.Second)
		if postErr := d.Sensor(postCtx, batch); postErr != nil {
			d.Warn("CPUStress: edac sample post: " + postErr.Error())
		}
		cancelPost()
	}
}
// nonzeroDur returns override when it is strictly positive and fallback
// otherwise. Callers can therefore pass a zero (or negative) duration to
// mean "no override" without needing a separate ok flag.
func nonzeroDur(override, fallback time.Duration) time.Duration {
	if override <= 0 {
		return fallback
	}
	return override
}
// subStepFromPass projects a stressPass into a SubStepReport — shared by // subStepFromPass projects a stressPass into a SubStepReport — shared by
// both passes and by the mid-stage early-return paths so the UI always // both passes and by the mid-stage early-return paths so the UI always
// sees exactly one row per pass, even on failure. // sees exactly one row per pass, even on failure.
+24
View File
@@ -0,0 +1,24 @@
// fake_dmidecode simulates `dmidecode -t bios` for unit tests of the
// firmware probe's BIOS parser. Prints deterministic output modeled on
// a real Supermicro host; exits 0 regardless of flags.
package main
import "fmt"
// main writes one fixed BIOS Information record, in the style of
// `dmidecode -t bios` output, to stdout followed by a newline.
// Command-line arguments are never read, so any flags the caller passes
// have no effect on the output.
func main() {
fmt.Println(`# dmidecode 3.3
Getting SMBIOS data from sysfs.
SMBIOS 3.2.0 present.
Handle 0x0000, DMI type 0, 26 bytes
BIOS Information
Vendor: American Megatrends Inc.
Version: 3.2
Release Date: 07/15/2021
Address: 0xF0000
Runtime Size: 64 kB
ROM Size: 32 MB
Characteristics:
PCI is supported
BIOS is upgradeable`)
}
+22
View File
@@ -0,0 +1,22 @@
// Package fakes is the umbrella for deterministic stand-ins for
// external probe binaries that Vetting's stage code normally shells
// out to (stress-ng, fio, iperf3, dmidecode, ethtool, nvidia-smi,
// mcelog, nvme). Each real binary gets its own subpackage under
// fakes/<name>/ with `package main` and a main() that prints golden
// output — build with `go build -o <tmp>/<name> ./agent/tests/fakes/<name>`
// and point a test's tests.Deps.LookPath at <tmp>/<name>.
//
// The seam in tests is tests.Deps.LookPath: when non-nil the stage
// code uses it instead of os/exec.LookPath. Outside tests, nil
// LookPath means "use the real binary on $PATH" — stages continue to
// work on production hosts without the fakes package around.
//
// How to add a new fake:
// 1. Create agent/tests/fakes/<binaryname>/main.go.
// 2. Write `package main` with a main() that prints exactly the
// bytes the real tool would produce for the input you care to
// simulate. Determinism > completeness — tests want a known
// sample, not a realistic one.
// 3. Reference the fake from the unit test with `go test` compiling
// it via t.TempDir() + `go build -o` before the test body runs.
package fakes
+18
View File
@@ -0,0 +1,18 @@
// fake_stress_ng simulates stress-ng for unit tests. Accepts (and
// ignores) any flag, sleeps briefly so callers that measure wall-clock
// see a non-zero elapsed, and prints the "passed" lines CPUStress
// expects. Exits 0.
package main
import (
"fmt"
"os"
"time"
)
// main echoes the received arguments to stderr, pauses for 50ms so
// callers measuring wall-clock time see a non-zero duration, then prints
// two fixed stress-ng-style info lines to stdout. Arguments are only
// echoed, never interpreted.
func main() {
	fmt.Fprintln(os.Stderr, "fake_stress_ng invoked:", os.Args[1:])
	time.Sleep(50 * time.Millisecond)

	cannedOutput := []string{
		"stress-ng: info: [1] dispatching hogs: 1 cpu",
		"stress-ng: info: [1] successful run completed in 0.05s",
	}
	for _, line := range cannedOutput {
		fmt.Println(line)
	}
}
+130 -16
View File
@@ -9,19 +9,27 @@ import (
"strconv" "strconv"
"strings" "strings"
"time" "time"
"vetting/agent/probes"
) )
// NetworkConfig is what the agent passes to Network: the orchestrator's // NetworkConfig is what the agent passes to Network: the orchestrator's
// iperf3 server address and port. We derive host from OrchestratorURL. // iperf3 server address, port, and the per-profile duration.
type NetworkConfig struct { type NetworkConfig struct {
OrchestratorURL string OrchestratorURL string
IperfPort int // 0 = 5201 IperfPort int // 0 = 5201
Duration time.Duration Duration time.Duration
} }
// Network runs iperf3 against the orchestrator's bundled server. Records // Network runs iperf3 against the orchestrator's bundled server for
// bandwidth as a measurement; fails if iperf3 is missing, the server // the profile-configured duration. Records throughput as a measurement;
// isn't reachable, or throughput is zero. // records per-interface rx/tx error-rate deltas as nic_retrans samples
// so the server-side threshold gate (`nic_retrans rate < 0.001`) fires
// on a flaky PHY or a wire that drops half its packets under load.
//
// Failure cases: iperf3 missing, server unreachable, zero throughput.
// Zero throughput is treated as a hard failure — an iperf that finished
// cleanly but pushed zero bytes is indistinguishable from a bad run.
func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome { func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
if _, err := exec.LookPath("iperf3"); err != nil { if _, err := exec.LookPath("iperf3"); err != nil {
// Live image ships iperf3; absence means packaging regression. // Live image ships iperf3; absence means packaging regression.
@@ -51,6 +59,11 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
duration = 10 * time.Second duration = 10 * time.Second
} }
// Snapshot /proc/net/dev before the test so we can attribute any
// error-count growth to *this stage's* traffic. The same snapshot
// taken after iperf returns is the end of the window.
netStart := indexNetDev(probes.NetDev())
args := []string{ args := []string{
"-c", host, "-c", host,
"-p", strconv.Itoa(port), "-p", strconv.Itoa(port),
@@ -72,7 +85,7 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
Extras: map[string]any{"stderr_tail": tailLines(string(out), 20)}, Extras: map[string]any{"stderr_tail": tailLines(string(out), 20)},
} }
} }
mbps, parsed, err := parseIperfJSON(out) mbps, retrans, bytesSent, parsed, err := parseIperfJSON(out)
if err != nil { if err != nil {
d.Error("Network: parse iperf3 output: " + err.Error()) d.Error("Network: parse iperf3 output: " + err.Error())
return Outcome{ return Outcome{
@@ -82,12 +95,58 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
Extras: map[string]any{"raw": string(out)}, Extras: map[string]any{"raw": string(out)},
} }
} }
netEnd := indexNetDev(probes.NetDev())
netDelta := diffNetDev(netStart, netEnd)
samples := []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}}
// iperf-derived retrans rate: retrans_count / packet_count_estimate.
// TCP typical MTU 1500; payload ~1460. We divide bytes by 1460 to
// approximate packets. This keeps the rate bounded in [0, 1].
if bytesSent > 0 {
packets := float64(bytesSent) / 1460.0
if packets > 0 {
samples = append(samples, Sample{
Kind: "nic_retrans",
Key: "iperf/rate",
Value: float64(retrans) / packets,
Unit: "rate",
})
}
}
// Per-interface error-rate deltas. A flaky cable typically surfaces
// as tx_errs or tx_drop on the originating interface, not inside
// iperf's own tally.
for iface, delta := range netDelta {
if delta.TxBytes > 0 {
packets := float64(delta.TxBytes) / 1460.0
if packets > 0 {
rate := float64(delta.TxErrs+delta.TxDrop) / packets
samples = append(samples, Sample{
Kind: "nic_retrans", Key: iface + "/rate", Value: rate, Unit: "rate",
})
}
}
// Diagnostic raw counts so the report can show which interface
// bled. These don't fire a threshold today but are useful for
// post-mortem.
samples = append(samples,
Sample{Kind: "nic_errs", Key: iface + "/rx", Value: float64(delta.RxErrs + delta.RxDrop), Unit: "count"},
Sample{Kind: "nic_errs", Key: iface + "/tx", Value: float64(delta.TxErrs + delta.TxDrop), Unit: "count"},
)
}
if d.Sensor != nil { if d.Sensor != nil {
_ = d.Sensor(ctx, []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}}) _ = d.Sensor(ctx, samples)
} }
extras := map[string]any{ extras := map[string]any{
"throughput_mbps": mbps, "throughput_mbps": mbps,
"retransmits": retrans,
"bytes_sent": bytesSent,
"net_delta": netDelta,
"iperf_end": parsed, "iperf_end": parsed,
} }
if mbps <= 0 { if mbps <= 0 {
@@ -98,14 +157,55 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome {
Extras: extras, Extras: extras,
} }
} }
d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps", mbps)) d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps (retransmits=%d)", mbps, retrans))
return Outcome{ return Outcome{
Passed: true, Passed: true,
Summary: fmt.Sprintf("%.1f Mbps to %s", mbps, host), Summary: fmt.Sprintf("%.1f Mbps to %s (retransmits=%d)", mbps, host, retrans),
Extras: extras, Extras: extras,
} }
} }
// indexNetDev builds a lookup table from interface name to snapshot so
// start and end snapshots can be paired by name. If the same interface
// name occurs more than once in snaps, the last occurrence wins.
func indexNetDev(snaps []probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
	byIface := make(map[string]probes.NetDevSnapshot)
	for i := range snaps {
		byIface[snaps[i].Iface] = snaps[i]
	}
	return byIface
}
// diffNetDev returns, for every interface present in both snapshots,
// the per-counter difference end minus start. Interfaces found in only
// one of the two maps are left out because no delta can be computed for
// them. Each counter is subtracted with subU64, so a counter that went
// backwards between the snapshots yields 0 instead of wrapping around.
func diffNetDev(start, end map[string]probes.NetDevSnapshot) map[string]probes.NetDevSnapshot {
	deltas := make(map[string]probes.NetDevSnapshot)
	for name, after := range end {
		before, seen := start[name]
		if !seen {
			continue
		}
		var d probes.NetDevSnapshot
		d.Iface = name
		d.RxBytes = subU64(after.RxBytes, before.RxBytes)
		d.RxErrs = subU64(after.RxErrs, before.RxErrs)
		d.RxDrop = subU64(after.RxDrop, before.RxDrop)
		d.TxBytes = subU64(after.TxBytes, before.TxBytes)
		d.TxErrs = subU64(after.TxErrs, before.TxErrs)
		d.TxDrop = subU64(after.TxDrop, before.TxDrop)
		deltas[name] = d
	}
	return deltas
}
// subU64 returns a-b, or 0 when b is larger than a, so the unsigned
// subtraction can never wrap around.
func subU64(a, b uint64) uint64 {
	if a >= b {
		return a - b
	}
	return 0
}
// deriveHost pulls the hostname out of an https://host:port base URL. // deriveHost pulls the hostname out of an https://host:port base URL.
func deriveHost(raw string) (string, error) { func deriveHost(raw string) (string, error) {
if raw == "" { if raw == "" {
@@ -119,18 +219,22 @@ func deriveHost(raw string) (string, error) {
return strings.TrimSpace(h), nil return strings.TrimSpace(h), nil
} }
// parseIperfJSON pulls end.sum_sent.bits_per_second out of iperf3 -J. // parseIperfJSON pulls end.sum_sent.bits_per_second and retransmits out
// Returns (Mbps, full-json-map, err). // of iperf3 -J. Returns (Mbps, retransmits, bytes_sent, full-end-map, err).
func parseIperfJSON(b []byte) (float64, map[string]any, error) { func parseIperfJSON(b []byte) (float64, int64, int64, map[string]any, error) {
var top map[string]any var top map[string]any
if err := json.Unmarshal(b, &top); err != nil { if err := json.Unmarshal(b, &top); err != nil {
return 0, nil, err return 0, 0, 0, nil, err
} }
end, ok := top["end"].(map[string]any) end, ok := top["end"].(map[string]any)
if !ok { if !ok {
return 0, top, fmt.Errorf("missing end") return 0, 0, 0, nil, fmt.Errorf("missing end")
} }
// iperf3 reports either sum_sent (when -R not set) or sum_received. // Pull the first sum that carries bits_per_second; retransmits +
// bytes live there too for TCP.
var mbps float64
var retrans int64
var bytesSent int64
for _, key := range []string{"sum_sent", "sum_received", "sum"} { for _, key := range []string{"sum_sent", "sum_received", "sum"} {
sum, ok := end[key].(map[string]any) sum, ok := end[key].(map[string]any)
if !ok { if !ok {
@@ -140,7 +244,17 @@ func parseIperfJSON(b []byte) (float64, map[string]any, error) {
if !ok { if !ok {
continue continue
} }
return bps / 1_000_000, end, nil mbps = bps / 1_000_000
if r, ok := sum["retransmits"].(float64); ok {
retrans = int64(r)
}
if bs, ok := sum["bytes"].(float64); ok {
bytesSent = int64(bs)
}
break
} }
return 0, end, fmt.Errorf("no bits_per_second in end.sum_*") if mbps == 0 {
return 0, 0, 0, end, fmt.Errorf("no bits_per_second in end.sum_*")
}
return mbps, retrans, bytesSent, end, nil
} }
+192
View File
@@ -0,0 +1,192 @@
package tests
import (
"encoding/json"
"testing"
"vetting/agent/probes"
)
// TestParseIperfJSON_SumSent feeds parseIperfJSON a document whose
// end.sum_sent object carries bits_per_second, retransmits and bytes,
// and checks that all three are extracted (throughput converted to
// Mbps).
func TestParseIperfJSON_SumSent(t *testing.T) {
	const payload = `{
"end": {
"sum_sent": {
"bits_per_second": 950000000,
"retransmits": 42,
"bytes": 1187500000
}
}
}`
	gotMbps, gotRetrans, gotBytes, _, err := parseIperfJSON([]byte(payload))
	if err != nil {
		t.Fatalf("parseIperfJSON: %v", err)
	}
	if gotMbps != 950 {
		t.Errorf("mbps = %v, want 950", gotMbps)
	}
	if gotRetrans != 42 {
		t.Errorf("retransmits = %d, want 42", gotRetrans)
	}
	if gotBytes != 1187500000 {
		t.Errorf("bytesSent = %d, want 1187500000", gotBytes)
	}
}
// TestParseIperfJSON_MissingEnd checks that a document with no "end"
// object is rejected with an error rather than parsed as a result.
func TestParseIperfJSON_MissingEnd(t *testing.T) {
	const payload = `{"start": {}}`
	_, _, _, _, err := parseIperfJSON([]byte(payload))
	if err == nil {
		t.Errorf("expected error on iperf output missing end block")
	}
}
// TestParseIperfJSON_ZeroBps checks that a result reporting zero
// bits_per_second is surfaced as an error, so a run that moved no data
// cannot be mistaken for a pass.
func TestParseIperfJSON_ZeroBps(t *testing.T) {
	const payload = `{"end": {"sum_sent": {"bits_per_second": 0}}}`
	_, _, _, _, err := parseIperfJSON([]byte(payload))
	if err == nil {
		t.Errorf("expected error when bits_per_second is 0")
	}
}
// TestParseIperfJSON_FallsBackToSumReceived checks that when the
// document has no end.sum_sent object, the throughput is taken from
// end.sum_received instead.
func TestParseIperfJSON_FallsBackToSumReceived(t *testing.T) {
	const payload = `{
"end": {
"sum_received": {"bits_per_second": 500000000}
}
}`
	gotMbps, _, _, _, err := parseIperfJSON([]byte(payload))
	if err != nil {
		t.Fatalf("parseIperfJSON: %v", err)
	}
	if gotMbps != 500 {
		t.Errorf("mbps = %v, want 500", gotMbps)
	}
}
// TestDiffNetDev_HappyPath checks that end minus start on an interface
// present in both snapshots yields the expected deltas: eth0 received
// 1000 bytes, sent 10000 bytes and gained 3 tx errors over the window.
func TestDiffNetDev_HappyPath(t *testing.T) {
	before := map[string]probes.NetDevSnapshot{
		"eth0": {Iface: "eth0", RxBytes: 1000, RxErrs: 0, TxBytes: 5000, TxErrs: 1},
	}
	after := map[string]probes.NetDevSnapshot{
		"eth0": {Iface: "eth0", RxBytes: 2000, RxErrs: 0, TxBytes: 15000, TxErrs: 4},
	}

	eth0, found := diffNetDev(before, after)["eth0"]
	if !found {
		t.Fatalf("eth0 missing from diff output")
	}
	if eth0.RxBytes != 1000 {
		t.Errorf("RxBytes delta=%d, want 1000", eth0.RxBytes)
	}
	if eth0.TxBytes != 10000 {
		t.Errorf("TxBytes delta=%d, want 10000", eth0.TxBytes)
	}
	if eth0.TxErrs != 3 {
		t.Errorf("TxErrs delta=%d, want 3", eth0.TxErrs)
	}
}
// TestDiffNetDev_InterfaceVanished checks that an interface seen in the
// start snapshot but absent from the end snapshot is omitted from the
// result, while interfaces present in both are still diffed.
func TestDiffNetDev_InterfaceVanished(t *testing.T) {
	before := map[string]probes.NetDevSnapshot{
		"eth0": {Iface: "eth0", TxBytes: 1000},
		"eth1": {Iface: "eth1", TxBytes: 500},
	}
	after := map[string]probes.NetDevSnapshot{
		"eth0": {Iface: "eth0", TxBytes: 2000},
	}

	deltas := diffNetDev(before, after)
	if _, present := deltas["eth1"]; present {
		t.Errorf("eth1 should have been dropped (gone at end)")
	}
	if deltas["eth0"].TxBytes != 1000 {
		t.Errorf("eth0 TxBytes delta=%d, want 1000", deltas["eth0"].TxBytes)
	}
}
// TestDiffNetDev_CounterReset checks that counters whose end value is
// lower than their start value produce a delta of 0 rather than an
// underflowed unsigned number.
func TestDiffNetDev_CounterReset(t *testing.T) {
	before := map[string]probes.NetDevSnapshot{
		"eth0": {Iface: "eth0", TxBytes: 9999, TxErrs: 5},
	}
	after := map[string]probes.NetDevSnapshot{
		"eth0": {Iface: "eth0", TxBytes: 100, TxErrs: 0},
	}

	deltas := diffNetDev(before, after)
	if deltas["eth0"].TxBytes != 0 {
		t.Errorf("reset TxBytes delta=%d, want 0 (clamped)", deltas["eth0"].TxBytes)
	}
	if deltas["eth0"].TxErrs != 0 {
		t.Errorf("reset TxErrs delta=%d, want 0 (clamped)", deltas["eth0"].TxErrs)
	}
}
// TestDeriveHost checks that deriveHost extracts the bare host from a
// base URL, both with and without an explicit port, for hostnames and
// IPv4 literals alike.
func TestDeriveHost(t *testing.T) {
	table := []struct {
		raw  string
		want string
	}{
		{raw: "https://orch.local", want: "orch.local"},
		{raw: "https://orch.local:8443", want: "orch.local"},
		{raw: "http://10.0.0.5:8080", want: "10.0.0.5"},
	}
	for _, tc := range table {
		host, err := deriveHost(tc.raw)
		switch {
		case err != nil:
			t.Errorf("deriveHost(%q) error: %v", tc.raw, err)
		case host != tc.want:
			t.Errorf("deriveHost(%q) = %q, want %q", tc.raw, host, tc.want)
		}
	}
}
// TestDeriveHost_Empty checks that an empty URL is rejected with an
// error instead of yielding an empty host.
func TestDeriveHost_Empty(t *testing.T) {
	_, err := deriveHost("")
	if err == nil {
		t.Errorf("deriveHost(\"\") should error")
	}
}
// TestParseIperfJSON_ParsesEndMap confirms that parseIperfJSON hands
// back the whole "end" object, not only the three values it extracts by
// hand. The returned map is round-tripped through encoding/json and
// both top-level keys of the input's end object must survive.
func TestParseIperfJSON_ParsesEndMap(t *testing.T) {
	raw := `{
"end": {
"sum_sent": {"bits_per_second": 1000000, "retransmits": 0, "bytes": 125000},
"cpu_utilization_percent": {"host_total": 12.3}
}
}`
	_, _, _, endMap, err := parseIperfJSON([]byte(raw))
	if err != nil {
		t.Fatalf("parseIperfJSON: %v", err)
	}
	if endMap == nil {
		t.Fatalf("endMap is nil")
	}

	// Round-trip through JSON: the map must be serializable, and every
	// key iperf produced must still be there afterwards.
	b, err := json.Marshal(endMap)
	if err != nil {
		t.Fatalf("marshal endMap: %v", err)
	}
	var back map[string]any
	if err := json.Unmarshal(b, &back); err != nil {
		t.Fatalf("unmarshal endMap: %v", err)
	}
	for _, key := range []string{"sum_sent", "cpu_utilization_percent"} {
		if _, ok := back[key]; !ok {
			t.Errorf("endMap is missing key %q after JSON round-trip", key)
		}
	}
}
+137 -18
View File
@@ -7,12 +7,20 @@ import (
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings" "strings"
"time"
) )
// PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find // PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find
// PSU rails. In home-lab hosts the kernel surfaces a handful of named // PSU rails, then samples each rail every psuSampleInterval for a
// rails (12V, 5V, 3V3). No rails → auto-skip. Any rail outside a ±10% // window sized by the stage timeout. During Burn a separate sidecar
// window of its nominal value → fail. // (see burn.go) runs the same probe concurrently with workload — the
// PSU stage itself catches slow post-load sag that only surfaces once
// the 12V rail starts recovering from a brownout under concurrent CPU
// + fio + iperf load.
//
// Any rail outside ±10% of its nominal value at any tick fires the
// critical threshold (server-side) and fails the stage. A host with no
// PSU rails wired to hwmon auto-skips.
func PSU(ctx context.Context, d Deps) Outcome { func PSU(ctx context.Context, d Deps) Outcome {
rails := scanPSURails() rails := scanPSURails()
if len(rails) == 0 { if len(rails) == 0 {
@@ -24,39 +32,150 @@ func PSU(ctx context.Context, d Deps) Outcome {
} }
} }
var samples []Sample window := resolvePSUWindow(d.StageTimeout)
problems := []string{} deadline := time.Now().Add(window)
for _, rail := range rails { interval := psuSampleInterval
samples = append(samples, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"}) if window < interval*2 {
if ok, why := voltageInRange(rail); !ok { // Tiny window (tests, pathological stage_timeout) — at least two
problems = append(problems, fmt.Sprintf("%s=%.2fV (%s)", rail.Label, rail.Volts, why)) // ticks so aggregate stats are meaningful.
interval = window / 2
if interval < time.Second {
interval = time.Second
} }
} }
if d.Sensor != nil {
_ = d.Sensor(ctx, samples) // Per-label tracking: min/max across the window, count of out-of-range
// hits, last-observed value (shown in the summary).
type railStats struct {
label string
minV float64
maxV float64
lastV float64
ticks int
breaches int
reason string
}
stats := map[string]*railStats{}
tick := time.NewTicker(interval)
defer tick.Stop()
// Start with an immediate sample so a sub-45s window still produces
// at least one reading.
sampleOnce := func() {
cur := scanPSURails()
if len(cur) == 0 {
return
}
batch := make([]Sample, 0, len(cur))
for _, r := range cur {
s, ok := stats[r.Label]
if !ok {
s = &railStats{label: r.Label, minV: r.Volts, maxV: r.Volts}
stats[r.Label] = s
}
s.ticks++
s.lastV = r.Volts
if r.Volts < s.minV {
s.minV = r.Volts
}
if r.Volts > s.maxV {
s.maxV = r.Volts
}
if ok, why := voltageInRange(r); !ok {
s.breaches++
if s.reason == "" {
s.reason = why
}
}
batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
}
if d.Sensor != nil && len(batch) > 0 {
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
_ = d.Sensor(sendCtx, batch)
cancel()
}
}
sampleOnce()
sampling:
for time.Now().Before(deadline) {
select {
case <-ctx.Done():
break sampling
case <-tick.C:
sampleOnce()
}
}
// Build the outcome. Extras carry per-rail rollup so the report can
// show "12V min=11.1 max=12.05 (3/120 ticks out of range)".
type railRollup struct {
Label string `json:"label"`
MinV float64 `json:"min_v"`
MaxV float64 `json:"max_v"`
LastV float64 `json:"last_v"`
Ticks int `json:"ticks"`
Breaches int `json:"breaches"`
Reason string `json:"reason,omitempty"`
}
rollups := make([]railRollup, 0, len(stats))
problems := []string{}
for _, s := range stats {
rollups = append(rollups, railRollup{
Label: s.label, MinV: s.minV, MaxV: s.maxV, LastV: s.lastV,
Ticks: s.ticks, Breaches: s.breaches, Reason: s.reason,
})
if s.breaches > 0 {
problems = append(problems, fmt.Sprintf("%s min=%.2fV max=%.2fV (%s)", s.label, s.minV, s.maxV, s.reason))
}
} }
extras := map[string]any{ extras := map[string]any{
"rails": rails, "rails": rollups,
"problems": problems, "problems": problems,
"window": window.String(),
"interval": interval.String(),
} }
if len(problems) > 0 { if len(problems) > 0 {
d.Error("PSU: out-of-range rails: " + strings.Join(problems, ", ")) d.Error("PSU: out-of-range rails: " + strings.Join(problems, "; "))
return Outcome{ return Outcome{
Passed: false, Passed: false,
Message: "PSU rails out of range: " + strings.Join(problems, ", "), Message: "PSU rails out of range: " + strings.Join(problems, "; "),
Summary: fmt.Sprintf("%d rails, %d failing", len(rails), len(problems)), Summary: fmt.Sprintf("%d rails, %d failing", len(rollups), len(problems)),
Extras: extras, Extras: extras,
} }
} }
d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal", len(rails))) d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal across %s window", len(rollups), window))
return Outcome{ return Outcome{
Passed: true, Passed: true,
Summary: fmt.Sprintf("%d rails nominal", len(rails)), Summary: fmt.Sprintf("%d rails nominal (%s)", len(rollups), window),
Extras: extras, Extras: extras,
} }
} }
// psuSampleInterval is the default tick for post-Burn rail sampling.
// Five seconds is slow enough to stay under the HTTP budget and fast
// enough to catch rail recovery transients. PSU shortens the tick (to
// half the window, never below one second) when the sampling window is
// smaller than two default ticks.
const psuSampleInterval = 5 * time.Second
// resolvePSUWindow converts a stage timeout into the PSU sampling window.
//
// A missing (zero or negative) timeout — tests, or an orchestrator that
// predates per-stage timeouts — yields the 30 s snapshot-like floor.
// Otherwise the window is the timeout minus 5 s of headroom for the
// sensor flush and result post, clamped to [30 s, 10 min] so that even a
// 24 h soak does not spend all day in PSU.
func resolvePSUWindow(stageTimeout time.Duration) time.Duration {
	const (
		floor    = 30 * time.Second
		ceiling  = 10 * time.Minute
		headroom = 5 * time.Second
	)
	if stageTimeout <= 0 {
		return floor
	}
	switch budget := stageTimeout - headroom; {
	case budget < floor:
		return floor
	case budget > ceiling:
		return ceiling
	default:
		return budget
	}
}
type psuRail struct { type psuRail struct {
Label string `json:"label"` Label string `json:"label"`
Volts float64 `json:"volts"` Volts float64 `json:"volts"`
+112
View File
@@ -0,0 +1,112 @@
package tests
import (
"testing"
"time"
)
// TestIsPSULabel pins the PSU-rail allowlist. The 12V / 5V / 3.3V
// spellings and VCCIN (in either case) must be accepted, while CPU core
// rails (Vcore, CPU VCORE), AVCC, and the empty label must be rejected
// so they are never judged against PSU tolerances.
func TestIsPSULabel(t *testing.T) {
	type expectation struct {
		label string
		want  bool
	}
	table := []expectation{
		{label: "+12V", want: true},
		{label: "12V", want: true},
		{label: "+5V", want: true},
		{label: "5V", want: true},
		{label: "+3.3V", want: true},
		{label: "3V3", want: true},
		{label: "VCCIN", want: true},
		{label: "vccin", want: true},
		{label: "Vcore", want: false},
		{label: "CPU VCORE", want: false},
		{label: "AVCC", want: false},
		{label: "", want: false},
	}
	for i := range table {
		exp := table[i]
		got := isPSULabel(exp.label)
		if got == exp.want {
			continue
		}
		t.Errorf("isPSULabel(%q) = %v, want %v", exp.label, got, exp.want)
	}
}
// TestNominalFor checks the label → nominal-voltage lookup. Labels
// without a known nominal (VCCIN, anything unrecognized) must come back
// as 0 so that range checking has nothing to compare against; an
// invented nominal would manufacture out-of-range failures.
func TestNominalFor(t *testing.T) {
	type expectation struct {
		label   string
		nominal float64
	}
	table := []expectation{
		{label: "+12V", nominal: 12.0},
		{label: "12V", nominal: 12.0},
		{label: "+5V", nominal: 5.0},
		{label: "+3.3V", nominal: 3.3},
		{label: "3V3", nominal: 3.3},
		{label: "VCCIN", nominal: 0},
		{label: "unknown", nominal: 0},
	}
	for i := range table {
		exp := table[i]
		got := nominalFor(exp.label)
		if got == exp.nominal {
			continue
		}
		t.Errorf("nominalFor(%q) = %v, want %v", exp.label, got, exp.nominal)
	}
}
// TestVoltageInRange exercises the ±10% tolerance band. A 12V rail is
// accepted anywhere in [10.8, 13.2] (bounds inclusive) and rejected
// outside it; 5V behaves the same around its own nominal. Rails with no
// known nominal (VCCIN here) always pass.
func TestVoltageInRange(t *testing.T) {
	type expectation struct {
		rail   psuRail
		inBand bool
	}
	table := []expectation{
		{rail: psuRail{Label: "+12V", Volts: 12.0}, inBand: true},
		{rail: psuRail{Label: "+12V", Volts: 10.8}, inBand: true},  // lower edge of the band
		{rail: psuRail{Label: "+12V", Volts: 13.2}, inBand: true},  // upper edge of the band
		{rail: psuRail{Label: "+12V", Volts: 10.7}, inBand: false}, // just below
		{rail: psuRail{Label: "+12V", Volts: 13.3}, inBand: false}, // just above
		{rail: psuRail{Label: "+12V", Volts: 10.5}, inBand: false}, // real sag
		{rail: psuRail{Label: "+5V", Volts: 4.6}, inBand: true},    // 8% low, still in band
		{rail: psuRail{Label: "+5V", Volts: 4.4}, inBand: false},   // 12% low, out of band
		{rail: psuRail{Label: "+5V", Volts: 5.0}, inBand: true},
		{rail: psuRail{Label: "VCCIN", Volts: 1.8}, inBand: true}, // unknown nominal → pass
	}
	for i := range table {
		exp := table[i]
		// The textual reason is not under test here, only the verdict.
		verdict, _ := voltageInRange(exp.rail)
		if verdict == exp.inBand {
			continue
		}
		t.Errorf("voltageInRange(%+v) = %v, want %v", exp.rail, verdict, exp.inBand)
	}
}
// TestResolvePSUWindow pins the stage-timeout → sampling-window mapping:
// a missing or non-positive timeout falls back to 30s; anything that
// would land under 30s after the 5s headroom is raised to 30s; a 1m
// timeout yields 55s and a 10m timeout yields 9m55s; longer timeouts
// are capped at 10m.
func TestResolvePSUWindow(t *testing.T) {
	type scenario struct {
		name string
		in   time.Duration
		want time.Duration
	}
	scenarios := []scenario{
		{name: "zero → snapshot fallback", in: 0, want: 30 * time.Second},
		{name: "negative → snapshot fallback", in: -1 * time.Second, want: 30 * time.Second},
		{name: "tiny timeout clamps up to 30s floor", in: 10 * time.Second, want: 30 * time.Second},
		{name: "35s - 5s = 30s", in: 35 * time.Second, want: 30 * time.Second},
		{name: "1m quick → 55s", in: time.Minute, want: 55 * time.Second},
		{name: "10m deep → 9m55s", in: 10 * time.Minute, want: 9*time.Minute + 55*time.Second},
		{name: "15m soak → capped at 10m", in: 15 * time.Minute, want: 10 * time.Minute},
		{name: "1h → capped at 10m", in: time.Hour, want: 10 * time.Minute},
	}
	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			got := resolvePSUWindow(sc.in)
			if got == sc.want {
				return
			}
			t.Errorf("resolvePSUWindow(%s) = %s, want %s", sc.in, got, sc.want)
		})
	}
}
+57
View File
@@ -59,6 +59,11 @@ func (o Outcome) MarshalSummary() (json.RawMessage, error) {
// Deps bundles what stages need without pulling in the whole agent. // Deps bundles what stages need without pulling in the whole agent.
// Logger methods print to stdout + forward to the orchestrator; Sensor // Logger methods print to stdout + forward to the orchestrator; Sensor
// drops numeric samples; OverrideFlags carries operator-set bypasses. // drops numeric samples; OverrideFlags carries operator-set bypasses.
//
// CPUStressKnobs / StorageKnobs / NetworkKnobs are Phase-2 profile
// knobs. Zero-valued fields mean "fall back to the compile-time
// default" — that keeps the stages runnable even when the runner can't
// materialize a profile (tests, legacy orchestrator, etc).
type Deps struct { type Deps struct {
Info func(string) Info func(string)
Warn func(string) Warn func(string)
@@ -68,6 +73,58 @@ type Deps struct {
NonDestructive bool // skip wipe-probe + writes in Storage NonDestructive bool // skip wipe-probe + writes in Storage
ExpectedDisks []ExpectedDisk // serials + sizes from host.expected_spec ExpectedDisks []ExpectedDisk // serials + sizes from host.expected_spec
StageTimeout time.Duration StageTimeout time.Duration
CPUStressKnobs CPUStressKnobs
StorageKnobs StorageKnobs
NetworkKnobs NetworkKnobs
BurnKnobs BurnKnobs
// LookPath is the unit-test seam for swapping a real external
// binary (stress-ng, fio, iperf3, dmidecode, …) for a fake. When
// nil the stage falls back to os/exec.LookPath — production and
// existing tests keep working unchanged. Tests under
// agent/tests/fakes/ populate this to redirect lookups to a built
// fake binary in a tempdir.
LookPath func(name string) (string, error)
}
// CPUStressKnobs carries the per-profile durations for the CPUStress
// stage. A zero duration means "use the package's compile-time default"
// (cpuPassDuration and friends), so a zero-valued struct is always
// runnable.
type CPUStressKnobs struct {
	CPUPass, MemPass, EDACPoll time.Duration
}
// StorageKnobs carries the per-profile settings for the Storage stage.
// Empty strings (and a zero FioTime) fall back to the stage's safe
// defaults.
type StorageKnobs struct {
	// Mode is "fio_sample" (bounded tempfile inside the device, quick
	// profile) or "full_disk" (whole-device write verify, deep/soak).
	// FioSize is the fio size argument.
	Mode, FioSize string
	// FioTime bounds the fio runtime.
	FioTime time.Duration
	// FioBS, FioRW and Verify are the fio block size, IO pattern and
	// verify method respectively.
	FioBS, FioRW, Verify string
}
// NetworkKnobs parameterizes the Network stage.
type NetworkKnobs struct {
	// Duration is the per-profile time knob for the stage.
	// NOTE(review): presumably the sustained iperf run length, with zero
	// meaning "use the compile-time default" like the other knob
	// structs — confirm against the Network stage.
	Duration time.Duration
}
// BurnKnobs parameterizes the Burn super-stage, whose sub-workloads all
// run concurrently inside one shared window.
type BurnKnobs struct {
	// Duration is the total Burn window.
	Duration time.Duration
	// CPUWorkers is "all" (runtime.NumCPU) or a numeric string.
	CPUWorkers string
	// MemPct is the percentage of MemAvailable to allocate for the
	// memory burner (clamped 0-90 by the stage).
	MemPct int
	// FioOnSpare gates the storage sub-workload: true = fio runs against
	// the allow-listed disks for the same window; false = skip fio.
	FioOnSpare bool
	// IperfParallel feeds iperf3 -P to generate sustained NIC load.
	IperfParallel int
}
// Sample mirrors the server's SensorSample but lives in the tests // Sample mirrors the server's SensorSample but lives in the tests
+318 -105
View File
@@ -5,24 +5,36 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"os/exec" "os/exec"
"strconv"
"strings" "strings"
"time" "time"
) )
// Storage is the destructive stage: badblocks (write-mode sample) + fio // Storage is the destructive stage. Phase 2 replaced the old
// random IO, persisting IOPS + latency as measurements. Pre-gates: // badblocks + 128 MiB fio combo with a single fio run per disk that
// writes, verifies md5 of what it wrote, and reports p99 latency.
// Modes:
//
// - fio_sample (quick): bounded 1 GiB write per disk, ~3 min runtime.
// - full_disk (deep/soak): writes the whole device, time-bounded by
// the fio_time knob (2 h deep, 6 h soak).
//
// Pre-gates kept from Phase 1:
// //
// 1. Device allowlist: only act on /dev/<X> where the kernel-reported // 1. Device allowlist: only act on /dev/<X> where the kernel-reported
// serial matches one of Deps.ExpectedDisks. This is the operator's // serial matches one of Deps.ExpectedDisks. USB sticks and unexpected
// contract for what can be written to. USB sticks and unexpected
// drives are excluded. // drives are excluded.
// 2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem // 2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
// signatures, partition tables, or LVM metadata → fail with // signature, partition table, or LVM metadata → fail with
// UnexpectedData unless Deps.OverrideWipe is set. // UnexpectedData unless Deps.OverrideWipe is set.
// //
// Only after those pass does the stage run `badblocks -b 4096 -c 64 -w` // After fio, the stage captures a SMART diff (start snapshot taken
// and `fio` in write mode. This matches the plan's "destructive disk // before any writes; end snapshot after all writes finish) and posts
// tests are always-on, gated by layered safety." // deltas on attributes like Reallocated_Sector_Ct and Current_Pending_Sector.
// The threshold evaluator isn't seeded to gate smart_delta out of the
// box — those samples are diagnostic for the report. Fio's p99 latency
// posts as fio_p99_us so the per-stage Storage warning threshold can
// fire on a latency cliff.
func Storage(ctx context.Context, d Deps) Outcome { func Storage(ctx context.Context, d Deps) Outcome {
if len(d.ExpectedDisks) == 0 { if len(d.ExpectedDisks) == 0 {
d.Info("Storage: no expected disks in spec — skipping stage") d.Info("Storage: no expected disks in spec — skipping stage")
@@ -44,10 +56,10 @@ func Storage(ctx context.Context, d Deps) Outcome {
} }
} }
// Non-destructive runs skip wipe-probe (nothing to refuse), badblocks // Non-destructive runs skip wipe-probe (nothing to refuse), fio
// -w, and write-mode fio. Every expected disk is still asserted // writes, and SMART delta (nothing changed so no delta to report).
// present + readable by listing /sys/block and reading SMART-accessible // Every expected disk is still asserted present so a vanished drive
// identity; the per-disk map flags the shortcut so the report is clear. // still fails the stage.
if d.NonDestructive { if d.NonDestructive {
perDisk := map[string]any{} perDisk := map[string]any{}
for _, t := range targets { for _, t := range targets {
@@ -79,9 +91,9 @@ func Storage(ctx context.Context, d Deps) Outcome {
Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)", Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)",
Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)), Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
Extras: map[string]any{ Extras: map[string]any{
"wipe_probe": probes, "wipe_probe": probes,
"override_hint": "click 'Override wipe & retry' in the held tile", "override_hint": "click 'Override wipe & retry' in the held tile",
"dirty_devices": dirty, "dirty_devices": dirty,
}, },
} }
} }
@@ -89,64 +101,80 @@ func Storage(ctx context.Context, d Deps) Outcome {
d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", ")) d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
} }
// Per target: short badblocks write sample + fio random-read/write. // Capture start-of-stage SMART attributes before we write anything
// so the delta is attributable to *this* stage's writes and not the
// host's prior history. Per-disk failures are tolerated (e.g. the
// device doesn't expose SMART); we just can't emit a delta for it.
startSMART := captureSMARTAttrs(ctx, targets)
fioOpts := resolveFioOpts(d.StorageKnobs)
d.Info(fmt.Sprintf("Storage: fio mode=%s size=%s runtime=%s bs=%s rw=%s verify=%s",
fioOpts.Mode, fioOpts.Size, fioOpts.Runtime, fioOpts.BS, fioOpts.RW, fioOpts.Verify))
var samples []Sample var samples []Sample
var subs []SubStepReport var subs []SubStepReport
perDisk := map[string]any{} perDisk := map[string]any{}
failed := ""
for _, t := range targets { for _, t := range targets {
d.Info("Storage: running badblocks write sample on " + t.Device) d.Info(fmt.Sprintf("Storage: running fio %s on %s", fioOpts.Mode, t.Device))
bbStart := time.Now()
bb := runBadblocks(ctx, t.Device)
bbEnd := time.Now()
bbSummary, _ := json.Marshal(bb)
subs = append(subs, SubStepReport{
Name: fmt.Sprintf("badblocks %s", t.Device),
Passed: bb.OK,
StartedAt: bbStart,
CompletedAt: bbEnd,
SummaryJSON: bbSummary,
})
d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device))
fioStart := time.Now() fioStart := time.Now()
fr := runFio(ctx, t.Device) fr := runFioVerify(ctx, t.Device, fioOpts)
fioEnd := time.Now() fioEnd := time.Now()
fioSummary, _ := json.Marshal(fr) fioSummary, _ := json.Marshal(fr)
subs = append(subs, SubStepReport{ subs = append(subs, SubStepReport{
Name: fmt.Sprintf("fio %s", t.Device), Name: fmt.Sprintf("fio %s %s", fioOpts.Mode, t.Device),
Passed: fr.Error == "", Passed: fr.Error == "",
StartedAt: fioStart, StartedAt: fioStart,
CompletedAt: fioEnd, CompletedAt: fioEnd,
SummaryJSON: fioSummary, SummaryJSON: fioSummary,
}) })
perDisk[t.Device] = map[string]any{"fio": fr}
perDisk[t.Device] = map[string]any{ if fr.Error == "" {
"badblocks": bb, samples = append(samples,
"fio": fr, Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
} Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
samples = append(samples, )
Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"}, if fr.ReadP99Us > 0 {
Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"}, samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
)
if !bb.OK {
return Outcome{
Passed: false,
Message: "badblocks found errors on " + t.Device,
Summary: "badblocks failed on " + t.Device,
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes},
SubSteps: subs,
} }
if fr.WriteP99Us > 0 {
samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
}
} else if failed == "" {
failed = t.Device
} }
} }
if d.Sensor != nil {
// End-of-stage SMART snapshot + diff. We capture whether or not fio
// succeeded — a mid-run failure still produces attributable deltas,
// which is often more interesting than the stage outcome itself.
endSMART := captureSMARTAttrs(ctx, targets)
deltas := diffSMARTAttrs(startSMART, endSMART)
for dev, attrs := range deltas {
for attr, delta := range attrs {
samples = append(samples, Sample{Kind: "smart_delta", Key: dev + "/" + attr, Value: delta, Unit: "count"})
}
}
if d.Sensor != nil && len(samples) > 0 {
_ = d.Sensor(ctx, samples) _ = d.Sensor(ctx, samples)
} }
d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets))) if failed != "" {
return Outcome{
Passed: false,
Message: "fio verify failed on " + failed,
Summary: "fio failed on " + failed,
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
SubSteps: subs,
}
}
d.Info(fmt.Sprintf("Storage: %d disk(s) passed fio --verify", len(targets)))
return Outcome{ return Outcome{
Passed: true, Passed: true,
Summary: fmt.Sprintf("%d disks passed", len(targets)), Summary: fmt.Sprintf("%d disks passed (%s)", len(targets), fioOpts.Mode),
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes}, Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
SubSteps: subs, SubSteps: subs,
} }
} }
@@ -229,8 +257,8 @@ type wipeProbeResult struct {
// probeWipe runs blkid + wipefs -n. Any non-empty output from either is // probeWipe runs blkid + wipefs -n. Any non-empty output from either is
// a "has data" signal. This is deliberately conservative: we'd rather // a "has data" signal. This is deliberately conservative: we'd rather
// halt on a bare ext4 signature than hand badblocks a disk with real // halt on a bare ext4 signature than hand fio a disk with real bytes on
// bytes on it. // it.
func probeWipe(ctx context.Context, device string) wipeProbeResult { func probeWipe(ctx context.Context, device string) wipeProbeResult {
out := wipeProbeResult{Device: device} out := wipeProbeResult{Device: device}
@@ -257,84 +285,269 @@ func probeWipe(ctx context.Context, device string) wipeProbeResult {
return out return out
} }
// ---------- badblocks ---------- // ---------- fio ----------
type badblocksResult struct { // fioOpts resolves the probe knobs into the concrete flag values fio
OK bool `json:"ok"` // needs. Defaults match the quick profile's fio_sample shape so callers
Elapsed string `json:"elapsed"` // with zero knobs still run something bounded.
Error string `json:"error,omitempty"` type fioOpts struct {
OutputTail string `json:"output_tail,omitempty"` Mode string `json:"mode"` // "fio_sample" | "full_disk"
Size string `json:"size"` // "1GiB"; only used for fio_sample
Runtime time.Duration `json:"runtime"` // bounding time
BS string `json:"bs"` // "4k"
RW string `json:"rw"` // "randrw"
Verify string `json:"verify"` // "md5" | ""
} }
func runBadblocks(ctx context.Context, device string) badblocksResult { // resolveFioOpts normalizes the knobs into a runnable config. Zero-
// -c 64 blocks per check, -w destructive write, -b 4096 block size, // valued fields fall back to the quick defaults so a stage that's
// -t pattern. We only sample 256MiB (65536 × 4k) so the stage stays // missing its knobs still has coherent behavior (safer than refusing).
// bounded. A real burn-in would run the whole disk; that belongs in func resolveFioOpts(k StorageKnobs) fioOpts {
// a separate "deep" stage. o := fioOpts{
args := []string{"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536"} Mode: firstNonEmpty(k.Mode, "fio_sample"),
start := time.Now() Size: firstNonEmpty(k.FioSize, "1GiB"),
runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) Runtime: k.FioTime,
BS: firstNonEmpty(k.FioBS, "4k"),
RW: firstNonEmpty(k.FioRW, "randrw"),
Verify: firstNonEmpty(k.Verify, "md5"),
}
if o.Runtime <= 0 {
o.Runtime = 3 * time.Minute
}
return o
}
// firstNonEmpty returns the leftmost argument that is not the empty
// string, or "" when every argument is empty (or none are given). It is
// used to layer a knob value over its default.
func firstNonEmpty(vs ...string) string {
	for i := range vs {
		if candidate := vs[i]; len(candidate) > 0 {
			return candidate
		}
	}
	return ""
}
// fioResult is the per-disk outcome of one fio run, serialized into the
// sub-step summary and the stage's per_disk extras.
type fioResult struct {
	// Mode echoes the fio mode the run was launched with.
	Mode string `json:"mode"`
	// IOPS and bandwidth are copied from the first job in fio's JSON
	// output (its "iops" and "bw" fields).
	ReadIOPS    float64 `json:"read_iops"`
	WriteIOPS   float64 `json:"write_iops"`
	ReadBWKBps  float64 `json:"read_bw_kbps"`
	WriteBWKBps float64 `json:"write_bw_kbps"`
	// p99 completion latency in microseconds (fio reports nanoseconds;
	// the parser divides by 1000). Zero means fio gave no p99 for that
	// direction, and the field is then omitted from JSON.
	ReadP99Us  float64 `json:"read_p99_us,omitempty"`
	WriteP99Us float64 `json:"write_p99_us,omitempty"`
	// Error is non-empty when fio itself failed or its JSON could not be
	// parsed; callers treat an empty Error as success.
	Error string `json:"error,omitempty"`
	// OutputTail keeps the last lines of fio's stdout for diagnostics.
	OutputTail string `json:"output_tail,omitempty"`
}
// runFioVerify invokes fio with md5-verify semantics. fio_sample mode
// caps the IO at opts.Size; full_disk drives the whole device bounded
// by runtime. Both use direct IO to bypass the page cache — we want
// real disk latency, not Linux' cheerful buffer.
func runFioVerify(ctx context.Context, device string, opts fioOpts) fioResult {
// 30s grace over runtime so fio has time to flush + close cleanly.
runCtx, cancel := context.WithTimeout(ctx, opts.Runtime+30*time.Second)
defer cancel() defer cancel()
cmd := exec.CommandContext(runCtx, "badblocks", args...)
out, err := cmd.CombinedOutput() args := []string{
r := badblocksResult{Elapsed: time.Since(start).Round(time.Second).String(), OutputTail: tailLines(string(out), 10)} "--name=verify-" + strings.TrimPrefix(device, "/dev/"),
"--filename=" + device,
"--rw=" + opts.RW,
"--bs=" + opts.BS,
"--numjobs=1",
"--direct=1",
"--group_reporting",
"--output-format=json",
"--runtime=" + strconv.Itoa(int(opts.Runtime.Seconds())),
}
if opts.Verify != "" {
args = append(args,
"--verify="+opts.Verify,
"--verify_pattern=random",
"--do_verify=1",
)
}
switch opts.Mode {
case "full_disk":
// Time-bounded across the full device — fio uses the device's
// full size when --size is omitted on a block device.
args = append(args, "--time_based=1")
default:
// fio_sample: bounded write. Setting --size= limits the IO
// volume regardless of runtime.
args = append(args, "--size="+opts.Size, "--time_based=0")
}
cmd := exec.CommandContext(runCtx, "fio", args...)
out, err := cmd.Output()
r := fioResult{Mode: opts.Mode, OutputTail: tailLines(string(out), 20)}
if err != nil { if err != nil {
r.Error = err.Error() r.Error = err.Error()
return r return r
} }
// badblocks prints each bad block to stdout. Empty output = clean. parsed, perr := parseFioJSON(out)
if strings.TrimSpace(string(out)) == "" { if perr != nil {
r.OK = true r.Error = "parse fio json: " + perr.Error()
} else { return r
r.Error = "bad blocks found"
} }
r.ReadIOPS = parsed.ReadIOPS
r.WriteIOPS = parsed.WriteIOPS
r.ReadBWKBps = parsed.ReadBWKBps
r.WriteBWKBps = parsed.WriteBWKBps
r.ReadP99Us = parsed.ReadP99Us
r.WriteP99Us = parsed.WriteP99Us
return r return r
} }
// ---------- fio ---------- // parseFioJSON extracts the bits we care about from fio's --output-format=json.
// Latency percentiles live at .jobs[0].read.clat_ns.percentile["99.000000"];
type fioResult struct { // we convert nanoseconds to microseconds for the fio_p99_us sample.
ReadIOPS float64 `json:"read_iops"` func parseFioJSON(out []byte) (fioResult, error) {
WriteIOPS float64 `json:"write_iops"`
ReadBWKBps float64 `json:"read_bw_kbps"`
WriteBWKBps float64 `json:"write_bw_kbps"`
Error string `json:"error,omitempty"`
}
// runFio kicks off a tiny random-rw job: 2 jobs × 64MB × 4k blocks.
// This is a health bar, not a benchmark — we want to know the disk
// services IO, not how fast it is at p99.
func runFio(ctx context.Context, device string) fioResult {
runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
defer cancel()
args := []string{
"--name=health", "--filename=" + device, "--rw=randrw",
"--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0",
"--group_reporting", "--output-format=json", "--direct=1",
}
cmd := exec.CommandContext(runCtx, "fio", args...)
out, err := cmd.Output()
if err != nil {
return fioResult{Error: err.Error()}
}
var top struct { var top struct {
Jobs []struct { Jobs []struct {
Read struct { Read struct {
IOPS float64 `json:"iops"` IOPS float64 `json:"iops"`
BW float64 `json:"bw"` BW float64 `json:"bw"`
CLat struct {
Percentile map[string]float64 `json:"percentile"`
} `json:"clat_ns"`
} `json:"read"` } `json:"read"`
Write struct { Write struct {
IOPS float64 `json:"iops"` IOPS float64 `json:"iops"`
BW float64 `json:"bw"` BW float64 `json:"bw"`
CLat struct {
Percentile map[string]float64 `json:"percentile"`
} `json:"clat_ns"`
} `json:"write"` } `json:"write"`
} `json:"jobs"` } `json:"jobs"`
} }
if err := json.Unmarshal(out, &top); err != nil || len(top.Jobs) == 0 { if err := json.Unmarshal(out, &top); err != nil {
return fioResult{Error: "parse fio json: " + fmt.Sprint(err)} return fioResult{}, err
}
if len(top.Jobs) == 0 {
return fioResult{}, fmt.Errorf("no jobs in fio output")
} }
j := top.Jobs[0] j := top.Jobs[0]
return fioResult{ r := fioResult{
ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS, ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS,
ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW, ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW,
} }
if p := j.Read.CLat.Percentile["99.000000"]; p > 0 {
r.ReadP99Us = p / 1000.0
}
if p := j.Write.CLat.Percentile["99.000000"]; p > 0 {
r.WriteP99Us = p / 1000.0
}
return r, nil
}
// ---------- SMART delta ----------
// smartAttrMap: device → attribute → raw counter value. ATA drives
// populate named attributes (Reallocated_Sector_Ct etc); NVMe drives
// populate a flatter nvme-specific map. We track a curated whitelist
// of wear indicators — anything else is diagnostic and drops to the raw
// report output. A device with no whitelisted attributes has no entry
// at all rather than an empty inner map.
type smartAttrMap map[string]map[string]float64
// captureSMARTAttrs takes a SMART snapshot of every target: it queries
// each device through runSmartctl and keeps only the whitelisted
// attributes. The result always is a non-nil map; devices whose query
// fails (virtio, permission issues) or that expose none of the tracked
// attributes are simply absent, so the later delta step shows no data
// for them.
func captureSMARTAttrs(ctx context.Context, targets []diskTarget) smartAttrMap {
	snapshot := make(smartAttrMap)
	for i := range targets {
		dev := targets[i].Device
		report, err := runSmartctl(ctx, dev)
		if err != nil {
			// Best-effort by design: a device without readable SMART
			// data must not fail the stage.
			continue
		}
		if picked := extractSMARTAttrs(report); len(picked) != 0 {
			snapshot[dev] = picked
		}
	}
	return snapshot
}
// smartAttributeWhitelist is the set of attributes we diff across a
// stage. They're the ones that reflect *this stage's* IO damage, not
// cumulative drive history. Adding attributes is cheap — ones a drive
// does not report are simply left out of the snapshot, so they produce
// no delta.
var smartAttributeWhitelist = map[string]bool{
	// ATA SMART attribute names (smartctl normalizes to these)
	"Reallocated_Sector_Ct":  true,
	"Current_Pending_Sector": true,
	"Offline_Uncorrectable":  true,
	"UDMA_CRC_Error_Count":   true,
	"Reported_Uncorrect":     true,
	"Raw_Read_Error_Rate":    true,
	// NVMe log fields (flat keys at top of nvme_smart_health_information_log)
	"media_errors":        true,
	"num_err_log_entries": true,
	"percentage_used":     true,
}
// extractSMARTAttrs pulls the whitelisted counters out of a decoded
// smartctl JSON document and returns them keyed by attribute name.
//
// Two layouts are understood:
//   - ATA: ata_smart_attributes.table[] rows shaped like
//     {"name": "Reallocated_Sector_Ct", "raw": {"value": N}}
//   - NVMe: flat numeric fields under nvme_smart_health_information_log
//
// Anything missing, of an unexpected JSON type, or not in
// smartAttributeWhitelist is skipped. The returned map is never nil.
func extractSMARTAttrs(raw map[string]any) map[string]float64 {
	picked := make(map[string]float64)

	// Failed assertions leave nil maps/slices behind; indexing and
	// ranging over those is a no-op, which keeps the walk flat.
	ataSection, _ := raw["ata_smart_attributes"].(map[string]any)
	rows, _ := ataSection["table"].([]any)
	for _, entry := range rows {
		fields, isObject := entry.(map[string]any)
		if !isObject {
			continue
		}
		attrName, _ := fields["name"].(string)
		if !smartAttributeWhitelist[attrName] {
			continue
		}
		rawField, _ := fields["raw"].(map[string]any)
		if count, isNumber := rawField["value"].(float64); isNumber {
			picked[attrName] = count
		}
	}

	nvmeLog, _ := raw["nvme_smart_health_information_log"].(map[string]any)
	for field, val := range nvmeLog {
		if !smartAttributeWhitelist[field] {
			continue
		}
		if count, isNumber := val.(float64); isNumber {
			picked[field] = count
		}
	}
	return picked
}
// diffSMARTAttrs subtracts start from end per (device, attribute).
// Only attributes present in both ends produce a delta; missing
// attributes drop out (can't attribute a zero-to-present delta safely).
// Negative deltas are kept so a drive that resets a counter is visible.
func diffSMARTAttrs(start, end smartAttrMap) map[string]map[string]float64 {
out := map[string]map[string]float64{}
for dev, endAttrs := range end {
startAttrs, ok := start[dev]
if !ok {
continue
}
devOut := map[string]float64{}
for attr, endV := range endAttrs {
startV, ok := startAttrs[attr]
if !ok {
continue
}
devOut[attr] = endV - startV
}
if len(devOut) > 0 {
out[dev] = devOut
}
}
return out
} }
+218
View File
@@ -0,0 +1,218 @@
package tests
import (
"encoding/json"
"testing"
"time"
)
// TestParseFioJSON_ATAReadWrite feeds parseFioJSON a mixed read/write
// job and checks the read/write IOPS, the read bandwidth, and the p99
// latency of both directions. P99 comes from clat_ns and must arrive
// converted from nanoseconds to microseconds, the unit emitted to the
// threshold evaluator.
func TestParseFioJSON_ATAReadWrite(t *testing.T) {
	const fixture = `{
"jobs": [{
"read": {"iops": 1234.5, "bw": 5000, "clat_ns": {"percentile": {"99.000000": 250000}}},
"write": {"iops": 432.1, "bw": 2000, "clat_ns": {"percentile": {"99.000000": 500000}}}
}]
}`
	res, err := parseFioJSON([]byte(fixture))
	if err != nil {
		t.Fatalf("parseFioJSON: %v", err)
	}
	if got := res.ReadIOPS; got != 1234.5 {
		t.Errorf("ReadIOPS = %v, want 1234.5", got)
	}
	if got := res.WriteIOPS; got != 432.1 {
		t.Errorf("WriteIOPS = %v, want 432.1", got)
	}
	if got := res.ReadBWKBps; got != 5000 {
		t.Errorf("ReadBWKBps = %v, want 5000", got)
	}
	// 250000 ns is 250 us.
	if got := res.ReadP99Us; got != 250 {
		t.Errorf("ReadP99Us = %v, want 250", got)
	}
	// 500000 ns is 500 us.
	if got := res.WriteP99Us; got != 500 {
		t.Errorf("WriteP99Us = %v, want 500", got)
	}
}
// TestParseFioJSON_ReadOnlyJob mirrors a randread job: only the read
// side carries a p99 percentile. The write-side p99 must stay zero (so
// no sample is emitted for it) while the read side is still converted.
func TestParseFioJSON_ReadOnlyJob(t *testing.T) {
	const fixture = `{
"jobs": [{
"read": {"iops": 1000, "bw": 4000, "clat_ns": {"percentile": {"99.000000": 100000}}},
"write": {"iops": 0, "bw": 0}
}]
}`
	res, err := parseFioJSON([]byte(fixture))
	if err != nil {
		t.Fatalf("parseFioJSON: %v", err)
	}
	if got := res.WriteP99Us; got != 0 {
		t.Errorf("WriteP99Us = %v on read-only job, want 0", got)
	}
	if got := res.ReadP99Us; got != 100 {
		t.Errorf("ReadP99Us = %v, want 100", got)
	}
}
// TestParseFioJSON_NoJobs requires an error for an empty jobs array:
// that means fio ran nothing, and reporting all-zero metrics would hide
// it.
func TestParseFioJSON_NoJobs(t *testing.T) {
	const fixture = `{"jobs": []}`
	_, err := parseFioJSON([]byte(fixture))
	if err != nil {
		return
	}
	t.Errorf("expected error on empty jobs array")
}
// TestExtractSMARTAttrs_ATA decodes an ATA-shaped smartctl document and
// checks that whitelisted rows of ata_smart_attributes.table come back
// with their raw values, while a non-whitelisted row (Spin_Retry_Count)
// is dropped silently.
func TestExtractSMARTAttrs_ATA(t *testing.T) {
	const fixture = `{
"ata_smart_attributes": {
"table": [
{"name": "Reallocated_Sector_Ct", "raw": {"value": 7}},
{"name": "Current_Pending_Sector", "raw": {"value": 3}},
{"name": "Spin_Retry_Count", "raw": {"value": 99}}
]
}
}`
	doc := map[string]any{}
	if err := json.Unmarshal([]byte(fixture), &doc); err != nil {
		t.Fatalf("unmarshal fixture: %v", err)
	}
	attrs := extractSMARTAttrs(doc)
	if got := attrs["Reallocated_Sector_Ct"]; got != 7 {
		t.Errorf("Reallocated_Sector_Ct = %v, want 7", got)
	}
	if got := attrs["Current_Pending_Sector"]; got != 3 {
		t.Errorf("Current_Pending_Sector = %v, want 3", got)
	}
	if _, present := attrs["Spin_Retry_Count"]; present {
		t.Errorf("Spin_Retry_Count should not appear (not in whitelist)")
	}
}
// TestExtractSMARTAttrs_NVMe decodes a fixture with a flat
// nvme_smart_health_information_log object and checks that media_errors,
// num_err_log_entries and percentage_used are extracted while temperature
// is left out of the result.
func TestExtractSMARTAttrs_NVMe(t *testing.T) {
	const fixture = `{
"nvme_smart_health_information_log": {
"media_errors": 2,
"num_err_log_entries": 15,
"percentage_used": 7,
"temperature": 42
}
}`
	doc := make(map[string]any)
	if err := json.Unmarshal([]byte(fixture), &doc); err != nil {
		t.Fatalf("unmarshal fixture: %v", err)
	}

	attrs := extractSMARTAttrs(doc)
	if got := attrs["media_errors"]; got != 2 {
		t.Errorf("media_errors = %v, want 2", got)
	}
	if got := attrs["num_err_log_entries"]; got != 15 {
		t.Errorf("num_err_log_entries = %v, want 15", got)
	}
	if got := attrs["percentage_used"]; got != 7 {
		t.Errorf("percentage_used = %v, want 7", got)
	}
	if _, present := attrs["temperature"]; present {
		t.Errorf("temperature should not appear (not in whitelist)")
	}
}
// TestDiffSMARTAttrs checks that diffSMARTAttrs reports (end - start) per
// (device, attribute) pair. An attribute present only in the end snapshot
// must be omitted from the diff: there is no start value to subtract.
func TestDiffSMARTAttrs(t *testing.T) {
	before := smartAttrMap{
		"/dev/sda": {"Reallocated_Sector_Ct": 5, "Current_Pending_Sector": 0},
	}
	after := smartAttrMap{
		"/dev/sda": {"Reallocated_Sector_Ct": 8, "Current_Pending_Sector": 2, "UDMA_CRC_Error_Count": 1},
	}

	sda := diffSMARTAttrs(before, after)["/dev/sda"]
	if got := sda["Reallocated_Sector_Ct"]; got != 3 {
		t.Errorf("Reallocated_Sector_Ct delta = %v, want 3", got)
	}
	if got := sda["Current_Pending_Sector"]; got != 2 {
		t.Errorf("Current_Pending_Sector delta = %v, want 2", got)
	}
	if _, present := sda["UDMA_CRC_Error_Count"]; present {
		t.Errorf("UDMA_CRC_Error_Count should not appear (missing at start)")
	}
}
// TestDiffSMARTAttrs_DeviceNewAtEnd checks that a device which appears only
// in the end snapshot is left out of the diff entirely, since it has no
// start baseline.
func TestDiffSMARTAttrs_DeviceNewAtEnd(t *testing.T) {
	before := smartAttrMap{}
	after := smartAttrMap{
		"/dev/sda": {"Reallocated_Sector_Ct": 10},
	}

	delta := diffSMARTAttrs(before, after)
	if _, present := delta["/dev/sda"]; present {
		t.Errorf("/dev/sda should drop from diff when absent at start")
	}
}
// TestResolveFioOpts_Defaults checks that a zero-valued StorageKnobs
// resolves to bounded defaults: fio_sample mode, 1GiB size, 3 minute
// runtime, 4k blocks, randrw and md5 verification. Callers that supply no
// per-profile knobs therefore still get a finite workload.
func TestResolveFioOpts_Defaults(t *testing.T) {
	opts := resolveFioOpts(StorageKnobs{})

	if got := opts.Mode; got != "fio_sample" {
		t.Errorf("Mode = %q, want fio_sample", got)
	}
	if got := opts.Size; got != "1GiB" {
		t.Errorf("Size = %q, want 1GiB", got)
	}
	if got := opts.Runtime; got != 3*time.Minute {
		t.Errorf("Runtime = %v, want 3m", got)
	}
	if got := opts.BS; got != "4k" {
		t.Errorf("BS = %q, want 4k", got)
	}
	if got := opts.RW; got != "randrw" {
		t.Errorf("RW = %q, want randrw", got)
	}
	if got := opts.Verify; got != "md5" {
		t.Errorf("Verify = %q, want md5", got)
	}
}
// TestResolveFioOpts_FullDiskOverride confirms the deep/soak shape
// round-trips. FioTime as 2h overrides the 3-minute default.
func TestResolveFioOpts_FullDiskOverride(t *testing.T) {
k := StorageKnobs{
Mode: "full_disk",
FioTime: 2 * time.Hour,
FioBS: "64k",
FioRW: "write",
}
o := resolveFioOpts(k)
if o.Mode != "full_disk" {
t.Errorf("Mode = %q, want full_disk", o.Mode)
}
if o.Runtime != 2*time.Hour {
t.Errorf("Runtime = %v, want 2h", o.Runtime)
}
if o.BS != "64k" {
t.Errorf("BS = %q, want 64k", o.BS)
}
if o.RW != "write" {
t.Errorf("RW = %q, want write", o.RW)
}
// Verify should fall back to md5 default since knob was empty.
if o.Verify != "md5" {
t.Errorf("Verify = %q, want md5 (default)", o.Verify)
}
}
+18 -11
View File
@@ -60,6 +60,8 @@ func main() {
artifactStore := &store.Artifacts{DB: conn} artifactStore := &store.Artifacts{DB: conn}
specDiffStore := &store.SpecDiffs{DB: conn} specDiffStore := &store.SpecDiffs{DB: conn}
measurementStore := &store.Measurements{DB: conn} measurementStore := &store.Measurements{DB: conn}
thresholdStore := &store.Thresholds{DB: conn}
firmwareStore := &store.Firmware{DB: conn}
hub := events.NewHub() hub := events.NewHub()
@@ -99,17 +101,19 @@ func main() {
} }
ui := &api.UI{ ui := &api.UI{
Hosts: hostStore, Hosts: hostStore,
Runs: runStore, Runs: runStore,
Stages: stageStore, Stages: stageStore,
SubSteps: subStepStore, SubSteps: subStepStore,
SpecDiffs: specDiffStore, SpecDiffs: specDiffStore,
Artifacts: artifactStore, Artifacts: artifactStore,
EventHub: hub, Thresholds: thresholdStore,
Logs: logHub, Profiles: cfg.Profiles,
Runner: runner, EventHub: hub,
Tiles: tiles, Logs: logHub,
PublicURL: cfg.Server.PublicURL, Runner: runner,
Tiles: tiles,
PublicURL: cfg.Server.PublicURL,
} }
// Inject the host-page + run-page fragment renderers. Each reuses // Inject the host-page + run-page fragment renderers. Each reuses
@@ -157,6 +161,9 @@ func main() {
Artifacts: artifactStore, Artifacts: artifactStore,
SpecDiffs: specDiffStore, SpecDiffs: specDiffStore,
Measurements: measurementStore, Measurements: measurementStore,
Thresholds: thresholdStore,
Firmware: firmwareStore,
Profiles: cfg.Profiles,
Runner: runner, Runner: runner,
EventHub: hub, EventHub: hub,
Logs: logHub, Logs: logHub,
+51
View File
@@ -85,3 +85,54 @@ agent:
notifiers: [] notifiers: []
routes: [] routes: []
# Vetting pipeline shared defaults. Every profile (quick/deep/soak)
# walks the same stage list; only per-stage durations differ.
# Thresholds here apply to every profile — a 92°C CPU fails a
# 2-minute quick run and a 12-hour soak run alike.
vetting:
stages: [Inventory, SpecValidate, SMART, CPUStress, Storage, Network, GPU, PSU, Reporting]
thresholds:
- { stage: "*", kind: temp, key: "cpu/*", op: lt, value: 92, unit: C, severity: critical }
- { stage: PSU, kind: psu_volt, key: "+12V", op: within_pct, value: 5, nominal: 12.0, severity: critical }
- { stage: PSU, kind: psu_volt, key: "+5V", op: within_pct, value: 5, nominal: 5.0, severity: critical }
- { stage: PSU, kind: psu_volt, key: "+3.3V", op: within_pct, value: 5, nominal: 3.3, severity: critical }
- { stage: Storage, kind: fio_p99_us, key: "*", op: lt, value: 50000, severity: warning }
- { stage: Network, kind: iperf, key: throughput_mbps, op: gte, value: 900, severity: critical }
- { stage: Network, kind: nic_retrans, key: "*/rate", op: lt, value: 0.001, severity: warning }
- { stage: CPUStress, kind: edac_ue, key: "*", op: lte, value: 0, severity: critical }
- { stage: CPUStress, kind: mce, key: "*", op: lte, value: 0, severity: critical }
# Per-profile durations + probe knobs. Only the *durations* scale across
# profiles — every profile exercises every probe and gate. Quick is a
# ~10-minute same-day sanity check; deep is the 8–12 h overnight soak;
# soak is the opt-in 36–40 h extreme run.
profiles:
quick:
stage_timeouts:
CPUStress: 5m
Storage: 5m
Network: 2m
defaults:
cpustress: { cpu_pass: 2m, mem_pass: 2m, edac_poll: 10s }
storage: { mode: fio_sample, fio_size: 1GiB, fio_time: 3m, fio_bs: 4k, fio_rw: randrw, verify: md5 }
network: { duration: 60s }
deep:
stage_timeouts:
CPUStress: 2h
Storage: 4h
Network: 35m
defaults:
cpustress: { cpu_pass: 60m, mem_pass: 60m, edac_poll: 10s }
storage: { mode: full_disk, fio_time: 2h, fio_bs: 4k, fio_rw: randrw, verify: md5 }
network: { duration: 30m }
soak:
inherit: deep
stage_timeouts:
CPUStress: 14h
Storage: 8h
Network: 2h30m
defaults:
cpustress: { cpu_pass: 12h }
storage: { mode: full_disk, fio_time: 6h }
network: { duration: 2h }
+38
View File
@@ -75,3 +75,41 @@ agent:
notifiers: [] notifiers: []
routes: [] routes: []
# Vetting pipeline shared defaults. Every profile (quick/deep/soak)
# walks the same stage list; only per-stage durations differ.
# Thresholds apply to every profile — critical breaches fail a run
# regardless of which profile the operator picked.
vetting:
stages: [Inventory, SpecValidate, SMART, CPUStress, Storage, Network, GPU, PSU, Reporting]
thresholds:
- { stage: "*", kind: temp, key: "cpu/*", op: lt, value: 92, unit: C, severity: critical }
- { stage: PSU, kind: psu_volt, key: "+12V", op: within_pct, value: 5, nominal: 12.0, severity: critical }
- { stage: PSU, kind: psu_volt, key: "+5V", op: within_pct, value: 5, nominal: 5.0, severity: critical }
- { stage: PSU, kind: psu_volt, key: "+3.3V", op: within_pct, value: 5, nominal: 3.3, severity: critical }
- { stage: Storage, kind: fio_p99_us, key: "*", op: lt, value: 50000, severity: warning }
- { stage: Network, kind: iperf, key: throughput_mbps, op: gte, value: 900, severity: critical }
- { stage: Network, kind: nic_retrans, key: "*/rate", op: lt, value: 0.001, severity: warning }
- { stage: CPUStress, kind: edac_ue, key: "*", op: lte, value: 0, severity: critical }
- { stage: CPUStress, kind: mce, key: "*", op: lte, value: 0, severity: critical }
profiles:
quick:
stage_timeouts: { CPUStress: 5m, Storage: 5m, Network: 2m }
defaults:
cpustress: { cpu_pass: 2m, mem_pass: 2m, edac_poll: 10s }
storage: { mode: fio_sample, fio_size: 1GiB, fio_time: 3m, fio_bs: 4k, fio_rw: randrw, verify: md5 }
network: { duration: 60s }
deep:
stage_timeouts: { CPUStress: 2h, Storage: 4h, Network: 35m }
defaults:
cpustress: { cpu_pass: 60m, mem_pass: 60m, edac_poll: 10s }
storage: { mode: full_disk, fio_time: 2h, fio_bs: 4k, fio_rw: randrw, verify: md5 }
network: { duration: 30m }
soak:
inherit: deep
stage_timeouts: { CPUStress: 14h, Storage: 8h, Network: 2h30m }
defaults:
cpustress: { cpu_pass: 12h }
storage: { mode: full_disk, fio_time: 6h }
network: { duration: 2h }
+259 -3
View File
@@ -19,6 +19,7 @@ import (
"github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5"
"vetting/internal/config"
"vetting/internal/events" "vetting/internal/events"
"vetting/internal/hold" "vetting/internal/hold"
"vetting/internal/logs" "vetting/internal/logs"
@@ -41,6 +42,9 @@ type Agent struct {
Artifacts *store.Artifacts Artifacts *store.Artifacts
SpecDiffs *store.SpecDiffs SpecDiffs *store.SpecDiffs
Measurements *store.Measurements Measurements *store.Measurements
Thresholds *store.Thresholds // Phase 1: seeded per run; consulted on each /sensor batch
Firmware *store.Firmware // Phase 4: firmware snapshots (unused before then)
Profiles *config.ProfileRegistry // Phase 2: /claim resolves the run's profile → stage knobs
Runner *orchestrator.Runner Runner *orchestrator.Runner
EventHub *events.Hub EventHub *events.Hub
Logs *logs.Hub Logs *logs.Hub
@@ -216,6 +220,21 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
if iperfPort == 0 { if iperfPort == 0 {
iperfPort = 5201 iperfPort = 5201
} }
// Resolve the run's profile → agent-visible stage knobs. The agent
// reads these to size CPUStress / Storage / Network work. An empty
// profile (legacy runs seeded before Phase 1) falls back to "quick".
profileName := run.Profile
if profileName == "" {
profileName = config.ProfileQuick
}
var stageCfg config.StageConfig
if a.Profiles != nil {
stageCfg = a.Profiles.ResolveStageConfig(profileName)
} else {
stageCfg = config.StageConfig{Profile: profileName}
}
writeJSON(w, http.StatusOK, map[string]any{ writeJSON(w, http.StatusOK, map[string]any{
"ok": true, "ok": true,
"run_id": runID, "run_id": runID,
@@ -224,6 +243,7 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) {
"iperf_port": iperfPort, "iperf_port": iperfPort,
"non_destructive": run.NonDestructive, "non_destructive": run.NonDestructive,
"current_state": string(currentState), "current_state": string(currentState),
"stage_config": stageCfg,
}) })
} }
@@ -398,10 +418,24 @@ type StageResult struct {
Passed bool `json:"passed"` Passed bool `json:"passed"`
Summary json.RawMessage `json:"summary,omitempty"` Summary json.RawMessage `json:"summary,omitempty"`
Inventory *spec.Inventory `json:"inventory,omitempty"` Inventory *spec.Inventory `json:"inventory,omitempty"`
Firmware []FirmwareLine `json:"firmware,omitempty"`
Message string `json:"message,omitempty"` Message string `json:"message,omitempty"`
SubSteps []SubStepResultLine `json:"sub_steps,omitempty"` SubSteps []SubStepResultLine `json:"sub_steps,omitempty"`
} }
// FirmwareLine is a single firmware snapshot POSTed alongside the
// Firmware stage's /result body. Mirrors agent/probes.FirmwareSnapshot.
// The server converts each line to a store.FirmwareSnapshot and persists
// it under the run — SpecValidate reads these back to diff against the
// host's expected_firmware.
//
// The JSON tags are the field names the server decodes from the request
// body; Vendor and Raw are optional on the wire (omitempty).
type FirmwareLine struct {
// Component is persisted as-is and later passed to spec.DiffFirmware.
// Presumably the component class (BIOS, BMC, NIC, ...) — confirm
// against the agent's firmware probes.
Component string `json:"component"`
// Identifier is persisted as-is and later passed to spec.DiffFirmware.
// Presumably distinguishes instances of one component — TODO confirm.
Identifier string `json:"identifier"`
// Version is the observed version string handed to spec.DiffFirmware.
Version string `json:"version"`
// Vendor is persisted and shown in the report; it is not part of the
// values handed to spec.DiffFirmware.
Vendor string `json:"vendor,omitempty"`
// Raw holds free-form key/value detail. persistFirmware stores it as a
// JSON object string, using "{}" when it is empty.
Raw map[string]string `json:"raw,omitempty"`
}
// SubStepResultLine is one entry in StageResult.SubSteps. Ordinal is // SubStepResultLine is one entry in StageResult.SubSteps. Ordinal is
// assigned from slice index server-side; the agent doesn't set it. // assigned from slice index server-side; the agent doesn't set it.
type SubStepResultLine struct { type SubStepResultLine struct {
@@ -476,6 +510,20 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
return return
} }
// Aggregate threshold gate: flip Passed=false server-side when any
// critical breach landed for this stage. The agent's verdict is
// advisory — a stage-executor can miss a runaway sample that the
// sidecar caught. We check this *before* writing the stage state
// so the DB reflects the server-side decision.
thresholdDetail := ""
if body.Passed {
if breached, detail := a.stageHadCriticalBreach(r.Context(), runID, body.Stage); breached {
body.Passed = false
thresholdDetail = detail
a.appendLog(runID, "error", fmt.Sprintf("%s reported passed but %s — flipping to failed", body.Stage, detail))
}
}
stageState := model.StagePassed stageState := model.StagePassed
if !body.Passed { if !body.Passed {
stageState = model.StageFailed stageState = model.StageFailed
@@ -488,6 +536,9 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
http.Error(w, "complete stage: "+err.Error(), http.StatusInternalServerError) http.Error(w, "complete stage: "+err.Error(), http.StatusInternalServerError)
return return
} }
if thresholdDetail != "" && body.Message == "" {
body.Message = thresholdDetail
}
// Agent-authored sub-steps: persist in slice order (ordinal = index) // Agent-authored sub-steps: persist in slice order (ordinal = index)
// and fan out a per-row SSE event each so the detail pane shows them // and fan out a per-row SSE event each so the detail pane shows them
@@ -502,6 +553,14 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) {
} }
} }
// Firmware-specific: persist each snapshot into firmware_snapshots.
// SpecValidate reads them back to diff against expected_firmware.
if body.Stage == "Firmware" && len(body.Firmware) > 0 {
if err := a.persistFirmware(r.Context(), runID, body.Firmware); err != nil {
log.Printf("persist firmware run %d: %v", runID, err)
}
}
if !body.Passed { if !body.Passed {
if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil { if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil {
log.Printf("set failed stage: %v", err) log.Printf("set failed stage: %v", err)
@@ -615,6 +674,34 @@ func parseResultTime(s string) *time.Time {
return nil return nil
} }
// persistFirmware converts the reported firmware lines into
// store.FirmwareSnapshot rows for runID and writes them in one batch.
//
// It returns nil without touching the store when no Firmware store is
// wired up or when lines is empty. Each line's Raw map is stored as a JSON
// object string; an empty map, or one that fails to marshal, is stored as
// "{}". The only error returned is the one from the batch write.
func (a *Agent) persistFirmware(ctx context.Context, runID int64, lines []FirmwareLine) error {
	if len(lines) == 0 || a.Firmware == nil {
		return nil
	}

	snapshots := make([]store.FirmwareSnapshot, len(lines))
	for i := range lines {
		line := &lines[i]

		// Fall back to an empty JSON object when there is nothing to encode
		// or encoding fails.
		encoded := "{}"
		if len(line.Raw) > 0 {
			b, err := json.Marshal(line.Raw)
			if err == nil {
				encoded = string(b)
			}
		}

		snapshots[i] = store.FirmwareSnapshot{
			RunID:      runID,
			Component:  line.Component,
			Identifier: line.Identifier,
			Version:    line.Version,
			Vendor:     line.Vendor,
			RawJSON:    encoded,
		}
	}
	return a.Firmware.CreateBatch(ctx, snapshots)
}
func (a *Agent) persistInventory(r *http.Request, run *model.Run, inv *spec.Inventory) error { func (a *Agent) persistInventory(r *http.Request, run *model.Run, inv *spec.Inventory) error {
dir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", run.ID)) dir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", run.ID))
if err := os.MkdirAll(dir, 0o755); err != nil { if err := os.MkdirAll(dir, 0o755); err != nil {
@@ -667,6 +754,22 @@ func (a *Agent) resolveSpecValidate(r *http.Request, runID int64) {
return return
} }
diffs := spec.Diff(expected, inv) diffs := spec.Diff(expected, inv)
if a.Firmware != nil && len(expected.Firmware) > 0 {
snaps, err := a.Firmware.ListForRun(r.Context(), runID)
if err != nil {
log.Printf("specvalidate: list firmware: %v", err)
} else {
observed := make([]spec.FirmwareObserved, 0, len(snaps))
for _, s := range snaps {
observed = append(observed, spec.FirmwareObserved{
Component: s.Component,
Identifier: s.Identifier,
Version: s.Version,
})
}
diffs = append(diffs, spec.DiffFirmware(expected.Firmware, observed)...)
}
}
if err := a.SpecDiffs.ReplaceForRun(r.Context(), runID, diffs); err != nil { if err := a.SpecDiffs.ReplaceForRun(r.Context(), runID, diffs); err != nil {
log.Printf("specvalidate: write diffs: %v", err) log.Printf("specvalidate: write diffs: %v", err)
} }
@@ -884,13 +987,17 @@ type SensorSample struct {
} }
// Sensor persists a batch of numeric samples. The thermal sidecar hits // Sensor persists a batch of numeric samples. The thermal sidecar hits
// this on a tick; stage executors (iperf, fio) also drop here. // this on a tick; stage executors (iperf, fio) also drop here. Each
// sample is evaluated against the run's seeded thresholds — critical
// breaches fail the run immediately (thermal runaway, EDAC UE, voltage
// sag); warning breaches are recorded for the report only.
func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) { func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
runID, ok := runIDFromURL(w, r) runID, ok := runIDFromURL(w, r)
if !ok { if !ok {
return return
} }
if _, ok := a.authenticate(w, r, runID); !ok { run, ok := a.authenticate(w, r, runID)
if !ok {
return return
} }
if a.Measurements == nil { if a.Measurements == nil {
@@ -903,8 +1010,12 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
return return
} }
rows := make([]model.Measurement, 0, len(body.Samples)) rows := make([]model.Measurement, 0, len(body.Samples))
sampleStages := make([]string, 0, len(body.Samples))
for _, s := range body.Samples { for _, s := range body.Samples {
ts, _ := time.Parse(time.RFC3339Nano, s.TS) ts, _ := time.Parse(time.RFC3339Nano, s.TS)
if ts.IsZero() {
ts = time.Now().UTC()
}
rows = append(rows, model.Measurement{ rows = append(rows, model.Measurement{
RunID: runID, RunID: runID,
TS: ts, TS: ts,
@@ -913,12 +1024,139 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) {
Value: s.Value, Value: s.Value,
Unit: s.Unit, Unit: s.Unit,
}) })
// Stage the sample belongs to drives threshold selector
// matching. We use the run's current state — the agent does
// not tag samples with a stage.
sampleStages = append(sampleStages, orchestrator.StageNameForState(run.State))
} }
if err := a.Measurements.CreateBatch(r.Context(), rows); err != nil { if err := a.Measurements.CreateBatch(r.Context(), rows); err != nil {
http.Error(w, "write samples: "+err.Error(), http.StatusInternalServerError) http.Error(w, "write samples: "+err.Error(), http.StatusInternalServerError)
return return
} }
writeJSON(w, http.StatusOK, map[string]any{"ok": true, "written": len(rows)}) critical := a.evaluateSensorBatch(r.Context(), runID, rows, sampleStages)
writeJSON(w, http.StatusOK, map[string]any{
"ok": true,
"written": len(rows),
"breach": critical != "",
"breach_kind": critical,
})
if critical != "" {
a.failRunOnCriticalBreach(r, run, critical)
}
}
// evaluateSensorBatch checks every measurement in rows against the
// thresholds stored for runID and records one evaluation per rule result
// returned by orchestrator.Evaluate.
//
// sampleStages[i] is the stage name attributed to rows[i]; the two slices
// are expected to be the same length.
//
// The return value is a short human-readable description of the first
// critical breach encountered, or "" when there was none. It is also ""
// when no Thresholds store is configured, rows is empty, the run has no
// thresholds, or the thresholds cannot be listed. A failure to record the
// evaluations is logged and does not change the return value.
func (a *Agent) evaluateSensorBatch(ctx context.Context, runID int64, rows []model.Measurement, sampleStages []string) string {
	if len(rows) == 0 || a.Thresholds == nil {
		return ""
	}

	stored, err := a.Thresholds.ListForRun(ctx, runID)
	switch {
	case err != nil:
		log.Printf("sensor: list thresholds run %d: %v", runID, err)
		return ""
	case len(stored) == 0:
		return ""
	}

	// Translate the stored rows into the evaluator's rule type.
	ruleSet := make([]orchestrator.Threshold, 0, len(stored))
	for _, rule := range stored {
		converted := orchestrator.Threshold{
			ID:       rule.ID,
			Stage:    rule.Stage,
			Kind:     rule.Kind,
			Key:      rule.Key,
			Op:       orchestrator.ThresholdOp(rule.Op),
			Value:    rule.Threshold,
			Nominal:  rule.Nominal,
			Severity: orchestrator.ThresholdSeverity(rule.Severity),
		}
		ruleSet = append(ruleSet, converted)
	}

	var firstCritical string
	recorded := make([]store.ThresholdEvaluation, 0, len(rows))
	for idx, meas := range rows {
		probe := orchestrator.Sample{
			Stage: sampleStages[idx],
			Kind:  meas.Kind,
			Key:   meas.Key,
			Value: meas.Value,
		}
		for _, outcome := range orchestrator.Evaluate(probe, ruleSet) {
			recorded = append(recorded, store.ThresholdEvaluation{
				RunID:       runID,
				ThresholdID: outcome.Threshold.ID,
				Stage:       probe.Stage,
				Kind:        probe.Kind,
				Key:         probe.Key,
				TS:          meas.TS,
				Observed:    outcome.Observed,
				Passed:      outcome.Passed,
			})
			// Only the first critical breach is reported to the caller.
			if firstCritical != "" || !outcome.CriticalBreach() {
				continue
			}
			firstCritical = fmt.Sprintf("%s %s=%g breached %s %g",
				outcome.Threshold.Kind, probe.Key, outcome.Observed, outcome.Threshold.Op, outcome.Threshold.Value)
		}
	}

	if err := a.Thresholds.RecordBatch(ctx, recorded); err != nil {
		log.Printf("sensor: record evals run %d: %v", runID, err)
	}
	return firstCritical
}
// stageHadCriticalBreach reports whether any critical breach recorded for
// runID counts against stage. A breach counts when its Stage equals stage,
// is empty, or is "*". On a match the second return value describes that
// breach (kind, key and observed value).
//
// It returns (false, "") when no Thresholds store is configured or when the
// breaches cannot be listed; the listing error is logged, not returned.
func (a *Agent) stageHadCriticalBreach(ctx context.Context, runID int64, stage string) (bool, string) {
	if a.Thresholds == nil {
		return false, ""
	}

	breaches, err := a.Thresholds.CriticalBreaches(ctx, runID)
	if err != nil {
		log.Printf("result: list breaches run %d: %v", runID, err)
		return false, ""
	}

	for _, breach := range breaches {
		switch breach.Stage {
		case stage, "", "*":
			return true, fmt.Sprintf("critical threshold breach: %s %s=%g", breach.Kind, breach.Key, breach.Observed)
		}
	}
	return false, ""
}
// failRunOnCriticalBreach flips the run to FailedHolding in response
// to a live threshold breach (thermal runaway, EDAC UE, rail sag).
// The agent's pending /result for the current stage may still arrive —
// the silent-skip guard handles that by refusing to double-transition.
func (a *Agent) failRunOnCriticalBreach(r *http.Request, run *model.Run, detail string) {
stage := orchestrator.StageNameForState(run.State)
if stage == "" {
stage = "threshold"
}
if err := a.Runs.SetFailedStage(r.Context(), run.ID, stage+" (threshold)"); err != nil {
log.Printf("sensor: set failed stage run %d: %v", run.ID, err)
}
if _, err := a.Runner.Transition(r.Context(), run.ID, orchestrator.TriggerStageFailed); err != nil {
// If we're already in FailedHolding the transition errors —
// that's fine, the first breach wins.
log.Printf("sensor: fail-transition run %d: %v", run.ID, err)
return
}
hostName := a.hostNameFor(r.Context(), run.HostID)
a.dispatchEvent(notify.Event{
Kind: notify.KindStageFailed,
Severity: notify.SeverityCritical,
RunID: run.ID,
HostName: hostName,
Title: fmt.Sprintf("[vetting] %s FAILED: %s (threshold)", hostName, stage),
Body: fmt.Sprintf("Run %d on %s tripped a critical threshold during %s: %s", run.ID, hostName, stage, detail),
URL: a.runLinkURL(run.ID),
})
a.appendLog(run.ID, "error", fmt.Sprintf("threshold breach during %s: %s — run parked in FailedHolding", stage, detail))
} }
// resolveReporting runs when the pipeline advances into StateReporting. // resolveReporting runs when the pipeline advances into StateReporting.
@@ -956,12 +1194,20 @@ func (a *Agent) resolveReporting(r *http.Request, runID int64) {
log.Printf("reporting: list measurements: %v", err) log.Printf("reporting: list measurements: %v", err)
} }
} }
var firmware []store.FirmwareSnapshot
if a.Firmware != nil {
firmware, err = a.Firmware.ListForRun(ctx, runID)
if err != nil {
log.Printf("reporting: list firmware: %v", err)
}
}
bundle := map[string]any{ bundle := map[string]any{
"run": run, "run": run,
"host": host, "host": host,
"stages": stages, "stages": stages,
"spec_diffs": diffs, "spec_diffs": diffs,
"measurements": measurements, "measurements": measurements,
"firmware": firmware,
"generated_at": time.Now().UTC().Format(time.RFC3339), "generated_at": time.Now().UTC().Format(time.RFC3339),
} }
buf, err := json.MarshalIndent(bundle, "", " ") buf, err := json.MarshalIndent(bundle, "", " ")
@@ -993,6 +1239,15 @@ func (a *Agent) resolveReporting(r *http.Request, runID int64) {
// Also render the operator-facing HTML summary alongside the JSON. // Also render the operator-facing HTML summary alongside the JSON.
// Failures here are non-fatal — the JSON is the source of truth. // Failures here are non-fatal — the JSON is the source of truth.
if host != nil { if host != nil {
fwRows := make([]report.FirmwareSnapshot, 0, len(firmware))
for _, f := range firmware {
fwRows = append(fwRows, report.FirmwareSnapshot{
Component: f.Component,
Identifier: f.Identifier,
Version: f.Version,
Vendor: f.Vendor,
})
}
htmlData := report.Data{ htmlData := report.Data{
GeneratedAt: time.Now().UTC(), GeneratedAt: time.Now().UTC(),
Run: *run, Run: *run,
@@ -1000,6 +1255,7 @@ func (a *Agent) resolveReporting(r *http.Request, runID int64) {
Stages: stages, Stages: stages,
SpecDiffs: diffs, SpecDiffs: diffs,
Aggregates: report.AggregateMeasurements(measurements), Aggregates: report.AggregateMeasurements(measurements),
Firmware: fwRows,
} }
if htmlBuf, err := report.RenderHTML(htmlData); err != nil { if htmlBuf, err := report.RenderHTML(htmlData); err != nil {
log.Printf("reporting: render html: %v", err) log.Printf("reporting: render html: %v", err)
+2 -2
View File
@@ -108,7 +108,7 @@ func TestRunPage_DefaultStep_Running(t *testing.T) {
}) })
runID, _ := runs.Create(ctx, id, "rr", false) runID, _ := runs.Create(ctx, id, "rr", false)
_ = ui.Stages.Seed(ctx, runID) _ = ui.Stages.Seed(ctx, runID)
for _, name := range []string{"Inventory", "SpecValidate"} { for _, name := range []string{"Inventory", "Firmware", "SpecValidate"} {
_ = ui.Stages.StartByName(ctx, runID, name) _ = ui.Stages.StartByName(ctx, runID, name)
_ = ui.Stages.CompleteByName(ctx, runID, name, model.StagePassed, "") _ = ui.Stages.CompleteByName(ctx, runID, name, model.StagePassed, "")
} }
@@ -135,7 +135,7 @@ func TestRunPage_DefaultStep_Failed(t *testing.T) {
}) })
runID, _ := runs.Create(ctx, id, "rf", false) runID, _ := runs.Create(ctx, id, "rf", false)
_ = ui.Stages.Seed(ctx, runID) _ = ui.Stages.Seed(ctx, runID)
for _, name := range []string{"Inventory", "SpecValidate", "SMART"} { for _, name := range []string{"Inventory", "Firmware", "SpecValidate", "SMART"} {
_ = ui.Stages.StartByName(ctx, runID, name) _ = ui.Stages.StartByName(ctx, runID, name)
_ = ui.Stages.CompleteByName(ctx, runID, name, model.StagePassed, "") _ = ui.Stages.CompleteByName(ctx, runID, name, model.StagePassed, "")
} }
+169
View File
@@ -0,0 +1,169 @@
package api_test
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"path/filepath"
"strconv"
"testing"
"vetting/internal/api"
"vetting/internal/db"
"vetting/internal/events"
"vetting/internal/model"
"vetting/internal/orchestrator"
"vetting/internal/store"
)
// setupAgentWithThresholds returns an Agent backed by a fresh database in
// a temp dir, together with the ID of a seeded run and the plaintext
// bearer token for it. The Agent has the thresholds store and a Runner
// wired in so the /sensor handler can drive the state machine.
//
// The run is moved to the CPUStress state and seeded with a single
// critical rule: temp on cpu/* must be below 92 C.
func setupAgentWithThresholds(t *testing.T) (*api.Agent, int64, string) {
	t.Helper()
	ctx := context.Background()

	conn, err := db.Open(filepath.Join(t.TempDir(), "vetting.db"))
	if err != nil {
		t.Fatalf("open db: %v", err)
	}
	t.Cleanup(func() { _ = conn.Close() })

	var (
		hostStore      = &store.Hosts{DB: conn}
		runStore       = &store.Runs{DB: conn}
		stageStore     = &store.Stages{DB: conn}
		measStore      = &store.Measurements{DB: conn}
		thresholdStore = &store.Thresholds{DB: conn}
	)
	runner := &orchestrator.Runner{
		Runs:     runStore,
		Hosts:    hostStore,
		Stages:   stageStore,
		EventHub: events.NewHub(),
	}

	hostID, err := hostStore.Create(ctx, model.Host{
		Name:             "thresh-host",
		MAC:              "aa:bb:cc:dd:ee:aa",
		WoLBroadcastIP:   "10.0.0.255",
		WoLPort:          9,
		ExpectedSpecYAML: "memory:\n total_gib: 16\n",
	})
	if err != nil {
		t.Fatalf("create host: %v", err)
	}

	plainToken, tokenHash, err := orchestrator.IssueRunToken()
	if err != nil {
		t.Fatalf("issue token: %v", err)
	}
	runID, err := runStore.Create(ctx, hostID, tokenHash, false)
	if err != nil {
		t.Fatalf("create run: %v", err)
	}
	if err := stageStore.Seed(ctx, runID); err != nil {
		t.Fatalf("seed stages: %v", err)
	}

	// Put the run in the state a thermal sidecar would be posting during.
	if err := runStore.SetState(ctx, runID, model.StateCPUStress); err != nil {
		t.Fatalf("set state: %v", err)
	}

	// One critical thermal rule is enough for the /sensor tests.
	rules := []store.ThresholdSpec{
		{Stage: "*", Kind: "temp", Key: "cpu/*", Op: "lt", Value: 92, Unit: "C", Severity: "critical", Source: "profile"},
	}
	if _, err := thresholdStore.SeedForRun(ctx, runID, rules); err != nil {
		t.Fatalf("seed thresholds: %v", err)
	}

	agent := &api.Agent{
		Hosts:        hostStore,
		Runs:         runStore,
		Stages:       stageStore,
		Measurements: measStore,
		Thresholds:   thresholdStore,
		Runner:       runner,
	}
	return agent, runID, plainToken
}
// TestSensor_ThermalRunawayFailsRun posts one 95.3 C cpu/0 sample against
// the seeded "<92 C, critical" rule and checks the full failure path:
// the response reports breach=true, the run ends in FailedHolding with a
// non-empty failed_stage, and exactly one evaluation is recorded with
// Passed=false. Without the evaluator the sample would only land in
// measurements and the run would carry on.
func TestSensor_ThermalRunawayFailsRun(t *testing.T) {
	a, runID, token := setupAgentWithThresholds(t)

	batch := api.SensorBatch{Samples: []api.SensorSample{
		{Kind: "temp", Key: "cpu/0", Value: 95.3, Unit: "C"},
	}}
	// Fail loudly on a bad fixture rather than posting an empty body.
	buf, err := json.Marshal(batch)
	if err != nil {
		t.Fatalf("marshal batch: %v", err)
	}
	req := routedRequest(runID, http.MethodPost,
		"/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/sensor", buf)
	req.Header.Set("Authorization", "Bearer "+token)
	req.Header.Set("Content-Type", "application/json")

	rr := httptest.NewRecorder()
	a.Sensor(rr, req)
	if rr.Code != http.StatusOK {
		t.Fatalf("status = %d, body = %q", rr.Code, rr.Body.String())
	}

	var resp struct {
		OK     bool   `json:"ok"`
		Breach bool   `json:"breach"`
		Kind   string `json:"breach_kind"`
	}
	if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
		t.Fatalf("decode: %v", err)
	}
	if !resp.Breach {
		t.Fatalf("expected breach=true, got %+v", resp)
	}

	run, err := a.Runs.Get(context.Background(), runID)
	if err != nil {
		t.Fatalf("get run: %v", err)
	}
	if run.State != model.StateFailedHolding {
		t.Fatalf("state = %s, want FailedHolding", run.State)
	}
	if run.FailedStage == "" {
		t.Fatalf("failed_stage empty; want stage-named breach")
	}

	evals, err := a.Thresholds.ListEvaluations(context.Background(), runID)
	if err != nil {
		t.Fatalf("list evaluations: %v", err)
	}
	if len(evals) != 1 {
		t.Fatalf("want 1 evaluation recorded, got %d", len(evals))
	}
	if evals[0].Passed {
		t.Fatalf("evaluation recorded as passed for 95.3C sample against <92C rule")
	}
}
// TestSensor_WithinThresholdPasses posts one CPU temperature sample
// (55.0 C) that sits comfortably inside the threshold and checks that the
// evaluator records it without disturbing the run: the handler answers
// 200, the run stays in CPUStress, and exactly one evaluation row is
// written with passed set.
func TestSensor_WithinThresholdPasses(t *testing.T) {
	a, runID, token := setupAgentWithThresholds(t)

	batch := api.SensorBatch{Samples: []api.SensorSample{
		{Kind: "temp", Key: "cpu/0", Value: 55.0, Unit: "C"},
	}}
	// Fail loudly on an encoding problem instead of posting an empty body
	// and chasing a confusing handler error.
	buf, err := json.Marshal(batch)
	if err != nil {
		t.Fatalf("marshal batch: %v", err)
	}

	req := routedRequest(runID, http.MethodPost,
		"/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/sensor", buf)
	req.Header.Set("Authorization", "Bearer "+token)
	req.Header.Set("Content-Type", "application/json")
	rr := httptest.NewRecorder()
	a.Sensor(rr, req)
	if rr.Code != http.StatusOK {
		t.Fatalf("status = %d, body = %q", rr.Code, rr.Body.String())
	}

	// A passing sample must leave the run state untouched.
	run, err := a.Runs.Get(context.Background(), runID)
	if err != nil {
		t.Fatalf("get run: %v", err)
	}
	if run.State != model.StateCPUStress {
		t.Fatalf("state = %s, want CPUStress unchanged", run.State)
	}

	evals, err := a.Thresholds.ListEvaluations(context.Background(), runID)
	if err != nil {
		t.Fatalf("list evaluations: %v", err)
	}
	if len(evals) != 1 || !evals[0].Passed {
		t.Fatalf("want 1 passed evaluation, got %+v", evals)
	}
}
+96 -8
View File
@@ -75,6 +75,12 @@ func newCaptureRegistry(c *captureNotifier) *notify.Registry {
// (agent, runID, plainTokenForBearer). Caller is responsible for // (agent, runID, plainTokenForBearer). Caller is responsible for
// transitioning the run out of Queued. // transitioning the run out of Queued.
func fullAgent(t *testing.T) (*api.Agent, int64, string) { func fullAgent(t *testing.T) (*api.Agent, int64, string) {
return fullAgentWithSpec(t, "")
}
// fullAgentWithSpec is the same as fullAgent but seeds the host with
// an ExpectedSpecYAML so SpecValidate can pick up diffs in the test.
func fullAgentWithSpec(t *testing.T, expectedSpecYAML string) (*api.Agent, int64, string) {
t.Helper() t.Helper()
tmp := t.TempDir() tmp := t.TempDir()
conn, err := db.Open(filepath.Join(tmp, "vetting.db")) conn, err := db.Open(filepath.Join(tmp, "vetting.db"))
@@ -89,6 +95,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
artifactStore := &store.Artifacts{DB: conn} artifactStore := &store.Artifacts{DB: conn}
specDiffStore := &store.SpecDiffs{DB: conn} specDiffStore := &store.SpecDiffs{DB: conn}
measurementStore := &store.Measurements{DB: conn} measurementStore := &store.Measurements{DB: conn}
firmwareStore := &store.Firmware{DB: conn}
hub := events.NewHub() hub := events.NewHub()
logHub, err := logs.NewHub(filepath.Join(tmp, "logs"), hub) logHub, err := logs.NewHub(filepath.Join(tmp, "logs"), hub)
@@ -109,7 +116,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
MAC: "aa:bb:cc:dd:ee:10", MAC: "aa:bb:cc:dd:ee:10",
WoLBroadcastIP: "10.0.0.255", WoLBroadcastIP: "10.0.0.255",
WoLPort: 9, WoLPort: 9,
ExpectedSpecYAML: "", // empty spec → no diffs ExpectedSpecYAML: expectedSpecYAML,
}) })
if err != nil { if err != nil {
t.Fatalf("create host: %v", err) t.Fatalf("create host: %v", err)
@@ -132,6 +139,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) {
Artifacts: artifactStore, Artifacts: artifactStore,
SpecDiffs: specDiffStore, SpecDiffs: specDiffStore,
Measurements: measurementStore, Measurements: measurementStore,
Firmware: firmwareStore,
Runner: runner, Runner: runner,
EventHub: hub, EventHub: hub,
Logs: logHub, Logs: logHub,
@@ -195,20 +203,24 @@ func TestFullPipelineToCompleted(t *testing.T) {
Memory: spec.MemorySpec{TotalGiB: 16}, Memory: spec.MemorySpec{TotalGiB: 16},
} }
next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}) next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv})
// After Inventory → SpecValidate resolves inline → SMART // After Inventory → Firmware
if next != "SMART" { if next != "Firmware" {
t.Fatalf("after Inventory, next_state = %q, want SMART", next) t.Fatalf("after Inventory, next_state = %q, want Firmware", next)
} }
// The remaining stages advance one-for-one in order. // The remaining stages advance one-for-one in order. After Firmware
// the inline SpecValidate resolver advances through SpecValidate to
// SMART without a dedicated /result POST for SpecValidate.
walkPlan := []struct { walkPlan := []struct {
stage string stage string
expected string expected string
}{ }{
{"Firmware", "SMART"},
{"SMART", "CPUStress"}, {"SMART", "CPUStress"},
{"CPUStress", "Storage"}, {"CPUStress", "Storage"},
{"Storage", "Network"}, {"Storage", "Network"},
{"Network", "GPU"}, {"Network", "Burn"},
{"Burn", "GPU"},
{"GPU", "PSU"}, {"GPU", "PSU"},
{"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed {"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed
} }
@@ -287,8 +299,11 @@ func TestFaultInjectionSMART(t *testing.T) {
} }
inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}} inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "SMART" { if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "Firmware" {
t.Fatalf("after Inventory, next = %q want SMART", next) t.Fatalf("after Inventory, next = %q want Firmware", next)
}
if next := walkStage(t, a, runID, token, "Firmware", true, nil); next != "SMART" {
t.Fatalf("after Firmware, next = %q want SMART (inline SpecValidate)", next)
} }
// Fake SMART failure → expect FailedHolding. // Fake SMART failure → expect FailedHolding.
@@ -316,3 +331,76 @@ func TestFaultInjectionSMART(t *testing.T) {
t.Errorf("StageFailed severity = %q, want critical", ev.Severity) t.Errorf("StageFailed severity = %q, want critical", ev.Severity)
} }
} }
// TestFirmwarePersistAndSpecMismatch drives the Phase 4 firmware path:
// the agent POSTs its Firmware snapshots, the server persists them, and
// the SpecValidate step that follows inline diffs them against the host's
// expected spec. A version mismatch must park the run in FailedHolding
// with FailedStage=SpecValidate and leave a critical firmware[...] row in
// the spec diffs.
func TestFirmwarePersistAndSpecMismatch(t *testing.T) {
	ctx := context.Background()

	// The host pins BIOS 3.3; the agent reports 3.2 further down, so one
	// critical firmware diff is expected.
	const expectedSpec = "firmware:\n - component: bios\n version: \"3.3\"\n"
	a, runID, token := fullAgentWithSpec(t, expectedSpec)
	a.Notify = newCaptureRegistry(&captureNotifier{name: "capture"})

	if err := a.Runs.SetState(ctx, runID, model.StateInventoryCheck); err != nil {
		t.Fatalf("set state: %v", err)
	}

	inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
	afterInventory := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv})
	if afterInventory != "Firmware" {
		t.Fatalf("after Inventory, next = %q want Firmware", afterInventory)
	}

	// Firmware stage: the agent reports the actual BIOS version (3.2),
	// which should persist as a single row.
	reported := []map[string]any{
		{"component": "bios", "identifier": "system", "version": "3.2", "vendor": "AMI"},
	}
	// SpecValidate is resolved inline, so the mismatch must surface on the
	// Firmware result itself — the agent never posts SpecValidate.
	afterFirmware := walkStage(t, a, runID, token, "Firmware", true, map[string]any{"firmware": reported})
	if afterFirmware != "FailedHolding" {
		t.Fatalf("after Firmware mismatch, next = %q want FailedHolding", afterFirmware)
	}

	run, err := a.Runs.Get(ctx, runID)
	if err != nil {
		t.Fatalf("get run: %v", err)
	}
	if run.State != model.StateFailedHolding {
		t.Fatalf("run.State = %q, want FailedHolding", run.State)
	}
	if run.FailedStage != "SpecValidate" {
		t.Fatalf("run.FailedStage = %q, want SpecValidate", run.FailedStage)
	}

	// Persistence: the reported snapshot landed in firmware_snapshots.
	snaps, err := a.Firmware.ListForRun(ctx, runID)
	if err != nil {
		t.Fatalf("ListForRun firmware: %v", err)
	}
	if len(snaps) != 1 {
		t.Fatalf("firmware rows = %d, want 1: %+v", len(snaps), snaps)
	}
	if snaps[0].Component != "bios" || snaps[0].Version != "3.2" {
		t.Errorf("persisted snapshot = %+v", snaps[0])
	}

	// Diff rows: there must be a firmware-specific entry (not only
	// CPU/memory/disk rows), and every such entry must be critical.
	diffs, err := a.SpecDiffs.ListForRun(ctx, runID)
	if err != nil {
		t.Fatalf("ListForRun specdiffs: %v", err)
	}
	sawFirmwareDiff := false
	for i := range diffs {
		d := diffs[i]
		if !strings.HasPrefix(d.Field, "firmware[") {
			continue
		}
		sawFirmwareDiff = true
		if d.Severity != "critical" {
			t.Errorf("firmware diff severity = %q, want critical", d.Severity)
		}
	}
	if !sawFirmwareDiff {
		t.Fatalf("no firmware[...] entry in spec diffs: %+v", diffs)
	}
}
+64 -13
View File
@@ -16,6 +16,7 @@ import (
"github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5"
"gopkg.in/yaml.v3" "gopkg.in/yaml.v3"
"vetting/internal/config"
"vetting/internal/events" "vetting/internal/events"
"vetting/internal/logs" "vetting/internal/logs"
"vetting/internal/model" "vetting/internal/model"
@@ -26,17 +27,19 @@ import (
) )
type UI struct { type UI struct {
Hosts *store.Hosts Hosts *store.Hosts
Runs *store.Runs Runs *store.Runs
Stages *store.Stages Stages *store.Stages
SubSteps *store.SubSteps SubSteps *store.SubSteps
SpecDiffs *store.SpecDiffs SpecDiffs *store.SpecDiffs
Artifacts *store.Artifacts Artifacts *store.Artifacts
EventHub *events.Hub Thresholds *store.Thresholds // Phase 1: seeded at StartRun from Profiles
Logs *logs.Hub Profiles *config.ProfileRegistry
Runner *orchestrator.Runner EventHub *events.Hub
Tiles *TileEnricher Logs *logs.Hub
PublicURL string // user-visible base URL baked into the quick-register one-liner Runner *orchestrator.Runner
Tiles *TileEnricher
PublicURL string // user-visible base URL baked into the quick-register one-liner
// PXE, when non-nil, gets Reload()ed after host create/delete so // PXE, when non-nil, gets Reload()ed after host create/delete so
// dnsmasq's dhcp-host= allowlist reflects the current registry. // dnsmasq's dhcp-host= allowlist reflects the current registry.
// Without this, a newly-registered host PXE-boots and gets // Without this, a newly-registered host PXE-boots and gets
@@ -316,23 +319,71 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
} }
nonDestructive := r.PostFormValue("non_destructive") == "1" nonDestructive := r.PostFormValue("non_destructive") == "1"
profile := strings.TrimSpace(r.PostFormValue("profile"))
if profile == "" {
profile = config.ProfileQuick
}
if !config.IsValidProfile(profile) {
http.Error(w, "unknown profile: "+profile, http.StatusBadRequest)
return
}
_, hash, err := orchestrator.IssueRunToken() _, hash, err := orchestrator.IssueRunToken()
if err != nil { if err != nil {
http.Error(w, "token: "+err.Error(), http.StatusInternalServerError) http.Error(w, "token: "+err.Error(), http.StatusInternalServerError)
return return
} }
runID, err := u.Runs.Create(r.Context(), hostID, hash, nonDestructive) runID, err := u.Runs.CreateWithProfile(r.Context(), hostID, hash, nonDestructive, profile)
if err != nil { if err != nil {
http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError) http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError)
return return
} }
log.Printf("ui: created run %d for host %d (state=Queued)", runID, hostID) if err := u.seedThresholds(r.Context(), runID, host, profile); err != nil {
// A threshold-seed failure shouldn't orphan a run row — log
// and continue. Samples will just accumulate without a gate
// until the operator retries, same as before Phase 1.
log.Printf("ui: seed thresholds run %d: %v", runID, err)
}
log.Printf("ui: created run %d for host %d profile=%s (state=Queued)", runID, hostID, profile)
// Send the operator straight to the new run — the button they clicked // Send the operator straight to the new run — the button they clicked
// was "Start vetting", the thing they want next is to watch it. // was "Start vetting", the thing they want next is to watch it.
http.Redirect(w, r, fmt.Sprintf("/runs/%d", runID), http.StatusSeeOther) http.Redirect(w, r, fmt.Sprintf("/runs/%d", runID), http.StatusSeeOther)
} }
// seedThresholds writes the per-run threshold table for runID from the
// ProfileRegistry. Today only the shared vetting.thresholds block is
// used, and every seeded row is tagged Source "profile"; per-profile
// overrides and per-host overrides (via ExpectedSpecYAML) are planned
// layers, which is why host and profile are already part of the
// signature.
//
// It is a silent no-op (nil error) when Thresholds or Profiles is not
// wired — tests do not always build them — or when the registry declares
// no thresholds. Otherwise it returns the error from SeedForRun.
func (u *UI) seedThresholds(ctx context.Context, runID int64, host *model.Host, profile string) error {
	// Reserved for the per-host and per-profile override layers.
	_, _ = host, profile

	if u.Thresholds == nil || u.Profiles == nil {
		return nil
	}
	shared := u.Profiles.Vetting.Thresholds
	if len(shared) == 0 {
		return nil
	}

	specs := make([]store.ThresholdSpec, len(shared))
	for i, d := range shared {
		specs[i] = store.ThresholdSpec{
			Source:   "profile",
			Stage:    d.Stage,
			Kind:     d.Kind,
			Key:      d.Key,
			Op:       d.Op,
			Value:    d.Value,
			Nominal:  d.Nominal,
			Unit:     d.Unit,
			Severity: d.Severity,
		}
	}

	_, err := u.Thresholds.SeedForRun(ctx, runID, specs)
	return err
}
func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) { func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) {
_ = templates.Registration(templates.RegistrationForm{ _ = templates.Registration(templates.RegistrationForm{
QuickRegisterURL: u.baseURL(r), QuickRegisterURL: u.baseURL(r),
+21
View File
@@ -20,6 +20,13 @@ type Config struct {
Agent Agent `yaml:"agent"` Agent Agent `yaml:"agent"`
Notifiers []Notifier `yaml:"notifiers"` Notifiers []Notifier `yaml:"notifiers"`
Routes []Route `yaml:"routes"` Routes []Route `yaml:"routes"`
// Profiles holds the Phase-1 quick/deep/soak registry (stage order,
// threshold defaults, per-profile stage timeouts + probe knobs).
// Populated from the `vetting:` and `profiles:` top-level blocks
// during Load. Nil is never returned — Load installs a default
// registry when those blocks are absent.
Profiles *ProfileRegistry `yaml:"-"`
} }
type Server struct { type Server struct {
@@ -111,6 +118,20 @@ func Load(path string) (*Config, error) {
if err := yaml.Unmarshal(b, &c); err != nil { if err := yaml.Unmarshal(b, &c); err != nil {
return nil, fmt.Errorf("parse config: %w", err) return nil, fmt.Errorf("parse config: %w", err)
} }
// The `vetting:` + `profiles:` blocks live alongside the existing
// fields but we decode them into the raw shape because YAML
// durations arrive as strings. Reusing the same byte buffer is
// safe: yaml.Unmarshal is happy to ignore keys the target doesn't
// know about.
var rawProfiles rawProfilesBlock
if err := yaml.Unmarshal(b, &rawProfiles); err != nil {
return nil, fmt.Errorf("parse profiles: %w", err)
}
reg, err := buildProfileRegistry(rawProfiles)
if err != nil {
return nil, fmt.Errorf("profiles: %w", err)
}
c.Profiles = reg
if c.Server.Bind == "" { if c.Server.Bind == "" {
c.Server.Bind = "127.0.0.1:8080" c.Server.Bind = "127.0.0.1:8080"
} }
+441
View File
@@ -0,0 +1,441 @@
package config
import (
	"fmt"
	"sort"
	"strings"
	"time"
)
// Profile names: the legal values for a Run's profile column. They are
// plain untyped string constants (there is no dedicated ProfileName
// type), exposed so callers (UI handler, tests, agent) don't sprinkle
// literal strings.
const (
// ProfileQuick is the default: Lookup falls back to it for an empty name.
ProfileQuick = "quick"
ProfileDeep = "deep"
ProfileSoak = "soak"
)
// AllProfiles is the canonical ordering shown in the picker. Leftmost
// is the default; rightmost is the longest-running. IsValidProfile
// checks membership against this slice and Names uses it to order its
// result, so treat it as read-only.
var AllProfiles = []string{ProfileQuick, ProfileDeep, ProfileSoak}
// IsValidProfile reports whether name exactly matches one of the entries
// in AllProfiles. The comparison is case-sensitive and the empty string
// is not valid. Used at the UI boundary to reject malformed POSTs and in
// store code as a fallback guard.
func IsValidProfile(name string) bool {
	for i := 0; i < len(AllProfiles); i++ {
		if AllProfiles[i] == name {
			return true
		}
	}
	return false
}
// Vetting holds the stage order + threshold defaults that are shared
// across all profiles. Only the per-stage durations/concurrency differ
// between quick/deep/soak; gates like "CPU > 92C fails the run" apply
// to a 2-minute quick run and a 12-hour soak alike. It is decoded from
// the top-level `vetting:` block of the config file.
type Vetting struct {
// Stages is the ordered stage list; buildProfileRegistry substitutes
// DefaultStages() when the config leaves it empty.
Stages []string `yaml:"stages"`
// Thresholds are the shared gating rules declared under vetting.thresholds.
Thresholds []ThresholdDefaults `yaml:"thresholds"`
}
// ThresholdDefaults is the YAML shape of a threshold declaration. One
// stanza can declare a per-stage rule (`stage: Network`) or a global
// rule (`stage: "*"` — the asterisk must be quoted, since a bare `*`
// starts a YAML alias) — the threshold evaluator applies both to samples
// with matching (stage, kind, key).
type ThresholdDefaults struct {
Stage string `yaml:"stage"` // stage name, or "*" for any stage
Kind string `yaml:"kind"` // sample kind, e.g. "temp"
Key string `yaml:"key"` // sample key or glob-ish pattern, e.g. "cpu/*"
Op string `yaml:"op"` // lt|lte|gt|gte|within_pct
Value float64 `yaml:"value"`
Nominal float64 `yaml:"nominal"` // only used by within_pct (e.g. 12.0 for +12V rail)
Unit string `yaml:"unit"`
Severity string `yaml:"severity"` // critical|warning
}
// ProfileRegistry is the in-memory view of the `vetting:` and
// `profiles:` blocks in vetting.yaml, as produced by
// buildProfileRegistry. It is consulted at run creation time to seed
// thresholds, and ResolveStageConfig flattens a profile into the knobs
// shipped to the agent.
type ProfileRegistry struct {
// Shared stage ordering + threshold defaults. Every profile walks
// the same list; only durations/concurrency differ.
Vetting Vetting
// Profiles is keyed by name ("quick"/"deep"/"soak"). Inherit is
// already resolved at load time — a caller sees a flattened view.
Profiles map[string]Profile
}
// Profile is a loaded profile with its inherit chain already merged in
// (child keys win over the parent's). StageTimeouts is keyed by stage
// name. Defaults carries the free-form knobs each probe reads, keyed by
// group ("cpustress", "storage", "network", "burn") and then knob name.
type Profile struct {
Name string
// Inherit is the parent name as declared in the config; it is kept for
// reference only, since the parent's values are already folded in.
Inherit string
StageTimeouts map[string]time.Duration
Defaults map[string]map[string]any
}
// StageConfig is the flat view of a profile's knobs, shipped on the
// claim response so the agent can size CPUStress/Storage/Network/Burn
// work without parsing YAML. Empty values mean "fall back to the
// agent's compile-time default" — an older orchestrator that doesn't
// set these fields keeps working unchanged.
type StageConfig struct {
Profile string `json:"profile"`
// StageTimeouts maps stage name to a time.Duration rendered with
// Duration.String() (see ResolveStageConfig); omitted when empty.
StageTimeouts map[string]string `json:"stage_timeouts,omitempty"`
CPUStress CPUStressKnobs `json:"cpustress"`
Storage StorageKnobs `json:"storage"`
Network NetworkKnobs `json:"network"`
Burn BurnKnobs `json:"burn"`
}
// CPUStressKnobs parallels the `cpustress:` block under `profiles.<name>.defaults`.
// Durations are YAML duration strings ("2m", "60m", "12h"). Every field
// is omitted from the JSON when empty.
type CPUStressKnobs struct {
CPUPass string `json:"cpu_pass,omitempty"` // from the cpu_pass key
MemPass string `json:"mem_pass,omitempty"` // from the mem_pass key
EDACPoll string `json:"edac_poll,omitempty"` // from the edac_poll key, e.g. "10s"
}
// StorageKnobs parallels `storage:` defaults. Mode is "fio_sample" (quick)
// or "full_disk" (deep/soak). Verify names the integrity mode ("md5" or "").
// Every field is omitted from the JSON when empty.
type StorageKnobs struct {
Mode string `json:"mode,omitempty"`
FioSize string `json:"fio_size,omitempty"` // e.g. "1GiB"; the built-in defaults set it only for quick
FioTime string `json:"fio_time,omitempty"` // duration string, e.g. "3m"
FioBS string `json:"fio_bs,omitempty"` // e.g. "4k"
FioRW string `json:"fio_rw,omitempty"` // e.g. "randrw"
Verify string `json:"verify,omitempty"`
}
// NetworkKnobs parallels `network:` defaults. Duration is a YAML
// duration string (e.g. "60s", "30m") and is omitted from the JSON when
// empty.
type NetworkKnobs struct {
Duration string `json:"duration,omitempty"`
}
// BurnKnobs parallels `burn:` defaults. Duration is the total Burn window.
// CPUWorkers is "all" (agent picks runtime.NumCPU) or a numeric string.
// MemPct is a percentage of MemAvailable to stress. FioOnSpare gates
// whether fio runs inside Burn (set false if operator lacks a spare
// partition). IperfParallel is the parallel stream count fed to iperf3 -P.
//
// NOTE(review): MemPct, FioOnSpare and IperfParallel carry omitempty, so
// an explicit 0 / false is dropped from the JSON and is indistinguishable
// from "unset" on the wire. If the agent's own default for fio_on_spare
// is true, an operator's `false` would not take effect — confirm against
// the agent.
type BurnKnobs struct {
Duration string `json:"duration,omitempty"`
CPUWorkers string `json:"cpu_workers,omitempty"`
MemPct int `json:"mem_pct,omitempty"`
FioOnSpare bool `json:"fio_on_spare,omitempty"`
IperfParallel int `json:"iperf_parallel,omitempty"`
}
// ResolveStageConfig flattens the named profile into the wire shape the
// claim handler ships to the agent.
//
// A nil registry or an unknown profile name yields a StageConfig that
// carries only the requested name, so the agent runs on its own
// defaults. Knobs the profile does not declare come back as zero values
// (empty string, 0, false) for the same reason. Stage timeouts are
// rendered with time.Duration's String form and left nil when the
// profile has none.
func (pr *ProfileRegistry) ResolveStageConfig(name string) StageConfig {
	bare := StageConfig{Profile: name}
	if pr == nil {
		return bare
	}
	prof, err := pr.Lookup(name)
	if err != nil {
		return bare
	}

	var timeouts map[string]string
	if n := len(prof.StageTimeouts); n > 0 {
		timeouts = make(map[string]string, n)
		for stage, d := range prof.StageTimeouts {
			timeouts[stage] = d.String()
		}
	}

	// Indexing a missing group yields a nil map, which the yaml* helpers
	// read as "every key absent".
	cpuKnobs := prof.Defaults["cpustress"]
	storageKnobs := prof.Defaults["storage"]
	networkKnobs := prof.Defaults["network"]
	burnKnobs := prof.Defaults["burn"]

	return StageConfig{
		Profile:       prof.Name,
		StageTimeouts: timeouts,
		CPUStress: CPUStressKnobs{
			CPUPass:  yamlString(cpuKnobs, "cpu_pass"),
			MemPass:  yamlString(cpuKnobs, "mem_pass"),
			EDACPoll: yamlString(cpuKnobs, "edac_poll"),
		},
		Storage: StorageKnobs{
			Mode:    yamlString(storageKnobs, "mode"),
			FioSize: yamlString(storageKnobs, "fio_size"),
			FioTime: yamlString(storageKnobs, "fio_time"),
			FioBS:   yamlString(storageKnobs, "fio_bs"),
			FioRW:   yamlString(storageKnobs, "fio_rw"),
			Verify:  yamlString(storageKnobs, "verify"),
		},
		Network: NetworkKnobs{
			Duration: yamlString(networkKnobs, "duration"),
		},
		Burn: BurnKnobs{
			Duration:      yamlString(burnKnobs, "duration"),
			CPUWorkers:    yamlString(burnKnobs, "cpu_workers"),
			MemPct:        yamlInt(burnKnobs, "mem_pct"),
			FioOnSpare:    yamlBool(burnKnobs, "fio_on_spare"),
			IperfParallel: yamlInt(burnKnobs, "iperf_parallel"),
		},
	}
}
// yamlInt reads m[key] as an int. Native int and int64 are returned
// as-is, float64 is truncated toward zero, and a string is scanned with
// fmt.Sscanf("%d"), which accepts a leading decimal integer. A missing
// key, a nil value, an unparsable string or any other type yields 0 so
// the agent falls back to its default. A nil map is safe.
func yamlInt(m map[string]any, key string) int {
	raw, present := m[key]
	if !present || raw == nil {
		return 0
	}
	if n, ok := raw.(int); ok {
		return n
	}
	if n, ok := raw.(int64); ok {
		return int(n)
	}
	if f, ok := raw.(float64); ok {
		return int(f)
	}
	if s, ok := raw.(string); ok {
		// Best-effort string → int; a scan failure means "not a number".
		var parsed int
		if _, err := fmt.Sscanf(s, "%d", &parsed); err != nil {
			return 0
		}
		return parsed
	}
	return 0
}
// yamlBool reads m[key] as a bool. A native bool is returned as-is and a
// string counts as true only when it equals "true" case-insensitively.
// Everything else — missing key, nil, numbers, typos — is false, which
// is the safer default for a destructive knob like fio_on_spare. A nil
// map is safe.
func yamlBool(m map[string]any, key string) bool {
	raw, present := m[key]
	if !present || raw == nil {
		return false
	}
	if b, ok := raw.(bool); ok {
		return b
	}
	s, ok := raw.(string)
	return ok && strings.EqualFold(s, "true")
}
// yamlString reads m[key] as a string. YAML durations like "2m" decode as
// strings and are returned unchanged; other scalars (a bare 5 decodes as
// int, for instance) are rendered with fmt.Sprint so the agent can still
// interpret them. A missing key or a nil value yields "". A nil map is
// safe.
func yamlString(m map[string]any, key string) string {
	// Indexing covers both "absent" and "present but nil": each yields a
	// nil interface value.
	switch v := m[key].(type) {
	case nil:
		return ""
	case string:
		return v
	default:
		return fmt.Sprint(v)
	}
}
// Lookup fetches the flattened profile registered under name. An empty
// name selects the default profile (quick). A name that is not
// registered — including "quick" itself when the config omits it —
// returns a zero Profile and an error so the caller can surface it.
func (pr *ProfileRegistry) Lookup(name string) (Profile, error) {
	key := name
	if key == "" {
		key = ProfileQuick
	}
	if p, ok := pr.Profiles[key]; ok {
		return p, nil
	}
	return Profile{}, fmt.Errorf("unknown profile %q", key)
}
// Names returns the registry's profile names in the canonical picker
// order (quick/deep/soak, as listed in AllProfiles). Profiles present in
// the config but unknown to AllProfiles are appended after,
// alphabetically, so the result is deterministic from call to call.
func (pr *ProfileRegistry) Names() []string {
	out := make([]string, 0, len(pr.Profiles))
	seen := make(map[string]bool, len(AllProfiles))
	for _, n := range AllProfiles {
		if _, ok := pr.Profiles[n]; ok {
			out = append(out, n)
			seen[n] = true
		}
	}

	// Everything appended from here on is a non-canonical profile.
	canonical := len(out)
	for n := range pr.Profiles {
		if !seen[n] {
			out = append(out, n)
		}
	}
	// Map iteration order is randomized, so sort the tail explicitly to
	// honor the documented alphabetical order.
	sort.Strings(out[canonical:])
	return out
}
// Stages returns a copy of the shared stage order, so callers may modify
// the result without touching the registry. When the registry carries no
// stages it falls back to DefaultStages(), which keeps tests that don't
// build a full ProfileRegistry from tripping over a nil slice.
func (pr *ProfileRegistry) Stages() []string {
	configured := pr.Vetting.Stages
	if len(configured) == 0 {
		return DefaultStages()
	}
	return append(make([]string, 0, len(configured)), configured...)
}
// DefaultStages returns the canonical stage list the orchestrator walks
// when no config is loaded. Each call hands back a fresh slice, so
// callers are free to modify it. The vetting.yaml shipped with the repo
// mirrors this list; keep the two in sync when editing either.
func DefaultStages() []string {
	order := [...]string{
		"Inventory",
		"Firmware",
		"SpecValidate",
		"SMART",
		"CPUStress",
		"Storage",
		"Network",
		"Burn",
		"GPU",
		"PSU",
		"Reporting",
	}
	return order[:]
}
// rawProfile is the YAML shape before inherit resolution. Durations
// arrive as strings (e.g. "2h") so we can parse them with
// time.ParseDuration instead of rolling our own. resolveProfile turns
// one of these into a flattened Profile.
type rawProfile struct {
// Inherit names the parent profile; empty means no parent.
Inherit string `yaml:"inherit"`
// StageTimeouts maps stage name to a duration string.
StageTimeouts map[string]string `yaml:"stage_timeouts"`
// Defaults maps knob group (e.g. "burn") to its key/value knobs.
Defaults map[string]map[string]any `yaml:"defaults"`
}
// rawProfilesBlock is the decode target for the two top-level config
// keys this file cares about: `vetting:` (shared stage order and
// thresholds) and `profiles:` (per-profile declarations, not yet
// inherit-resolved). buildProfileRegistry converts it into a
// ProfileRegistry.
type rawProfilesBlock struct {
Vetting Vetting `yaml:"vetting"`
Profiles map[string]rawProfile `yaml:"profiles"`
}
// buildProfileRegistry turns the decoded YAML blocks into a
// ProfileRegistry. When the config declares no profiles at all, the
// built-in set from defaultRawProfiles is used; when it declares no
// stages, DefaultStages fills in. Each profile is flattened through
// resolveProfile, so `inherit:` chains are merged (child keys win) and
// stage_timeouts strings become time.Durations. The first resolution
// error — an inherit cycle, an unknown parent or a bad duration — aborts
// the build.
func buildProfileRegistry(raw rawProfilesBlock) (*ProfileRegistry, error) {
	declared := raw.Profiles
	if len(declared) == 0 {
		declared = defaultRawProfiles()
	}

	vetting := raw.Vetting
	if len(vetting.Stages) == 0 {
		vetting.Stages = DefaultStages()
	}

	flattened := make(map[string]Profile, len(declared))
	for name := range declared {
		p, err := resolveProfile(declared, name, nil)
		if err != nil {
			return nil, err
		}
		flattened[name] = p
	}

	return &ProfileRegistry{Vetting: vetting, Profiles: flattened}, nil
}
// resolveProfile flattens the profile called name by walking its inherit
// chain depth-first: the parent is resolved first, then this profile's
// own stage timeouts and defaults are layered on top so child keys win.
//
// visited lists the profiles already on the current chain and acts as
// the cycle guard; callers start with nil. An error is returned for an
// inherit cycle, a name that is not in all, or a stage_timeouts value
// that time.ParseDuration rejects.
func resolveProfile(all map[string]rawProfile, name string, visited []string) (Profile, error) {
	for i := range visited {
		if visited[i] == name {
			return Profile{}, fmt.Errorf("profile inherit cycle: %s -> %s", strings.Join(visited, " -> "), name)
		}
	}
	src, known := all[name]
	if !known {
		return Profile{}, fmt.Errorf("unknown profile %q", name)
	}

	timeouts := map[string]time.Duration{}
	defaults := map[string]map[string]any{}

	if src.Inherit != "" {
		parent, err := resolveProfile(all, src.Inherit, append(visited, name))
		if err != nil {
			return Profile{}, err
		}
		for stage, d := range parent.StageTimeouts {
			timeouts[stage] = d
		}
		// Clone each knob group so the child's overrides below never
		// write into the parent's maps.
		for group, knobs := range parent.Defaults {
			cloned := make(map[string]any, len(knobs))
			for k, v := range knobs {
				cloned[k] = v
			}
			defaults[group] = cloned
		}
	}

	for stage, text := range src.StageTimeouts {
		d, err := time.ParseDuration(text)
		if err != nil {
			return Profile{}, fmt.Errorf("profile %s stage_timeouts[%s]: %w", name, stage, err)
		}
		timeouts[stage] = d
	}

	for group, knobs := range src.Defaults {
		if defaults[group] == nil {
			defaults[group] = map[string]any{}
		}
		for k, v := range knobs {
			defaults[group][k] = v
		}
	}

	return Profile{
		Name:          name,
		Inherit:       src.Inherit,
		StageTimeouts: timeouts,
		Defaults:      defaults,
	}, nil
}
// defaultRawProfiles returns sane per-profile durations + probe knobs
// used when vetting.yaml omits the `profiles:` block entirely. Matches
// the plan's per-stage budget table so the agent still gets coherent
// CPUStress/Storage/Network knobs without any operator-visible config.
//
// quick and deep are self-contained. soak inherits deep and lists only
// the keys it overrides, so knobs it omits (mem_pass, edac_poll, fio_bs,
// fio_rw, verify, cpu_workers, mem_pct, fio_on_spare) come from deep
// once the inherit chain is resolved.
func defaultRawProfiles() map[string]rawProfile {
return map[string]rawProfile{
ProfileQuick: {
StageTimeouts: map[string]string{
"CPUStress": "5m",
"Storage": "5m",
"Network": "2m",
"Burn": "3m",
"PSU": "1m",
},
Defaults: map[string]map[string]any{
"cpustress": {"cpu_pass": "2m", "mem_pass": "2m", "edac_poll": "10s"},
"storage": {"mode": "fio_sample", "fio_size": "1GiB", "fio_time": "3m", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"},
"network": {"duration": "60s"},
"burn": {"duration": "2m", "cpu_workers": "all", "mem_pct": 50, "fio_on_spare": true, "iperf_parallel": 2},
},
},
ProfileDeep: {
StageTimeouts: map[string]string{
"CPUStress": "2h",
"Storage": "4h",
"Network": "35m",
"Burn": "3h",
"PSU": "10m",
},
Defaults: map[string]map[string]any{
"cpustress": {"cpu_pass": "60m", "mem_pass": "60m", "edac_poll": "10s"},
"storage": {"mode": "full_disk", "fio_time": "2h", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"},
"network": {"duration": "30m"},
"burn": {"duration": "2h", "cpu_workers": "all", "mem_pct": 70, "fio_on_spare": true, "iperf_parallel": 4},
},
},
ProfileSoak: {
// Only the overrides live here; the rest is inherited from deep.
Inherit: ProfileDeep,
StageTimeouts: map[string]string{
"CPUStress": "14h",
"Storage": "8h",
"Network": "2h30m",
"Burn": "20h",
"PSU": "15m",
},
Defaults: map[string]map[string]any{
"cpustress": {"cpu_pass": "12h"},
"storage": {"mode": "full_disk", "fio_time": "6h"},
"network": {"duration": "2h"},
"burn": {"duration": "18h", "iperf_parallel": 8},
},
},
}
}
@@ -0,0 +1,57 @@
-- Phase-1 groundwork for profile-aware, threshold-gated vetting.
--
-- Adds:
-- * runs.profile — which profile the run is executing
-- (quick|deep|soak; defaults to quick for
-- backfill of older rows + tests).
-- * thresholds — seeded per run at creation from the
-- ProfileRegistry + per-host overrides;
-- immutable for that run so a late config
-- edit can't retroactively pass/fail it.
-- * threshold_evaluations — one row per observed sample vs threshold;
-- drives the report + pipeline badges.
-- * firmware_snapshots — per-run BIOS/BMC/NIC/HBA/microcode/NVMe
-- version captures used by SpecValidate
-- diffing in Phase 4.
-- Existing rows are backfilled with 'quick' through the column default.
-- NOTE(review): unlike the CREATE ... IF NOT EXISTS statements below,
-- this ALTER has no existence guard and will fail if run twice;
-- presumably the migration runner applies each file exactly once — confirm.
ALTER TABLE runs ADD COLUMN profile TEXT NOT NULL DEFAULT 'quick';
-- thresholds: one row per gating rule, seeded per run at creation time.
-- Rows are removed together with their run (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS thresholds (
id INTEGER PRIMARY KEY AUTOINCREMENT,
run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
stage_name TEXT NOT NULL, -- "*" matches any stage
kind TEXT NOT NULL, -- temp|psu_volt|iperf|fio_p99_us|nic_retrans|edac_ce|edac_ue|mce|...
key TEXT NOT NULL, -- "*" or glob-ish match (prefix* / *suffix / exact)
op TEXT NOT NULL, -- lt|lte|gt|gte|within_pct
threshold REAL NOT NULL, -- the value the observed sample is compared against with op
nominal REAL NOT NULL DEFAULT 0, -- used by within_pct; 0 elsewhere
unit TEXT NOT NULL DEFAULT '',
severity TEXT NOT NULL, -- critical|warning
source TEXT NOT NULL -- profile|host_override
);
-- Lookups by run, and by (run, stage, kind).
CREATE INDEX IF NOT EXISTS idx_thresholds_run ON thresholds(run_id);
CREATE INDEX IF NOT EXISTS idx_thresholds_kind ON thresholds(run_id, stage_name, kind);
-- threshold_evaluations: one row per observed sample checked against a
-- threshold. Rows are removed when either the run or the referenced
-- threshold row is deleted (ON DELETE CASCADE on both foreign keys).
CREATE TABLE IF NOT EXISTS threshold_evaluations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
threshold_id INTEGER NOT NULL REFERENCES thresholds(id) ON DELETE CASCADE,
stage_name TEXT NOT NULL,
kind TEXT NOT NULL,
key TEXT NOT NULL,
ts TIMESTAMP NOT NULL,
observed REAL NOT NULL,
passed INTEGER NOT NULL -- 1 = sample within threshold, 0 = breach
);
-- Indexed on (run_id, passed) so a run's breaches can be found without
-- scanning its passing rows.
CREATE INDEX IF NOT EXISTS idx_threshold_evals_run ON threshold_evaluations(run_id, passed);
-- firmware_snapshots: one row per firmware component reported for a run.
-- Rows are removed together with their run (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS firmware_snapshots (
id INTEGER PRIMARY KEY AUTOINCREMENT,
run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE,
component TEXT NOT NULL, -- bios|bmc|nic|hba|microcode|nvme_fw
identifier TEXT NOT NULL, -- slot/serial/device path that distinguishes this component
version TEXT NOT NULL,
vendor TEXT NOT NULL DEFAULT '',
raw_json TEXT NOT NULL DEFAULT '{}'
);
-- Lookups by run, optionally narrowed to one component type.
CREATE INDEX IF NOT EXISTS idx_firmware_run ON firmware_snapshots(run_id, component);
+3
View File
@@ -26,11 +26,13 @@ const (
StateWaitingReboot RunState = "WaitingReboot" StateWaitingReboot RunState = "WaitingReboot"
StateBooting RunState = "Booting" StateBooting RunState = "Booting"
StateInventoryCheck RunState = "InventoryCheck" StateInventoryCheck RunState = "InventoryCheck"
StateFirmware RunState = "Firmware"
StateSpecValidate RunState = "SpecValidate" StateSpecValidate RunState = "SpecValidate"
StateSMART RunState = "SMART" StateSMART RunState = "SMART"
StateCPUStress RunState = "CPUStress" StateCPUStress RunState = "CPUStress"
StateStorage RunState = "Storage" StateStorage RunState = "Storage"
StateNetwork RunState = "Network" StateNetwork RunState = "Network"
StateBurn RunState = "Burn"
StateGPU RunState = "GPU" StateGPU RunState = "GPU"
StatePSU RunState = "PSU" StatePSU RunState = "PSU"
StateReporting RunState = "Reporting" StateReporting RunState = "Reporting"
@@ -63,6 +65,7 @@ type Run struct {
HoldIP string HoldIP string
OverrideFlagsJSON string OverrideFlagsJSON string
NonDestructive bool NonDestructive bool
Profile string // quick|deep|soak; empty is treated as "quick"
} }
type StageState string type StageState string
+2 -2
View File
@@ -119,9 +119,9 @@ func (d *Dispatcher) pickNext(ctx context.Context) {
queued = &runs[i] queued = &runs[i]
} }
case model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting, case model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting,
model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART, model.StateInventoryCheck, model.StateFirmware, model.StateSpecValidate, model.StateSMART,
model.StateCPUStress, model.StateStorage, model.StateNetwork, model.StateCPUStress, model.StateStorage, model.StateNetwork,
model.StateGPU, model.StatePSU, model.StateReporting: model.StateBurn, model.StateGPU, model.StatePSU, model.StateReporting:
inFlight++ inFlight++
} }
} }
+6 -2
View File
@@ -30,11 +30,13 @@ const (
// "InventoryCheck". Later stages share a name with their state. // "InventoryCheck". Later stages share a name with their state.
var stageStates = map[string]model.RunState{ var stageStates = map[string]model.RunState{
"Inventory": model.StateInventoryCheck, "Inventory": model.StateInventoryCheck,
"Firmware": model.StateFirmware,
"SpecValidate": model.StateSpecValidate, "SpecValidate": model.StateSpecValidate,
"SMART": model.StateSMART, "SMART": model.StateSMART,
"CPUStress": model.StateCPUStress, "CPUStress": model.StateCPUStress,
"Storage": model.StateStorage, "Storage": model.StateStorage,
"Network": model.StateNetwork, "Network": model.StateNetwork,
"Burn": model.StateBurn,
"GPU": model.StateGPU, "GPU": model.StateGPU,
"PSU": model.StatePSU, "PSU": model.StatePSU,
"Reporting": model.StateReporting, "Reporting": model.StateReporting,
@@ -44,11 +46,13 @@ var stageStates = map[string]model.RunState{
// first stage to Completed. Kept in sync with store.DefaultStageOrder. // first stage to Completed. Kept in sync with store.DefaultStageOrder.
var stageOrder = []model.RunState{ var stageOrder = []model.RunState{
model.StateInventoryCheck, model.StateInventoryCheck,
model.StateFirmware,
model.StateSpecValidate, model.StateSpecValidate,
model.StateSMART, model.StateSMART,
model.StateCPUStress, model.StateCPUStress,
model.StateStorage, model.StateStorage,
model.StateNetwork, model.StateNetwork,
model.StateBurn,
model.StateGPU, model.StateGPU,
model.StatePSU, model.StatePSU,
model.StateReporting, model.StateReporting,
@@ -143,9 +147,9 @@ func nextStageState(current model.RunState) (model.RunState, error) {
func allActiveStates() []model.RunState { func allActiveStates() []model.RunState {
return []model.RunState{ return []model.RunState{
model.StateQueued, model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting, model.StateQueued, model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting,
model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART, model.StateInventoryCheck, model.StateFirmware, model.StateSpecValidate, model.StateSMART,
model.StateCPUStress, model.StateStorage, model.StateNetwork, model.StateCPUStress, model.StateStorage, model.StateNetwork,
model.StateGPU, model.StatePSU, model.StateReporting, model.StateBurn, model.StateGPU, model.StatePSU, model.StateReporting,
} }
} }
@@ -80,11 +80,13 @@ func TestTriggerAgentClaimedFromWaitingReboot(t *testing.T) {
func TestTriggerStageMismatch(t *testing.T) { func TestTriggerStageMismatch(t *testing.T) {
stageStates := []model.RunState{ stageStates := []model.RunState{
model.StateInventoryCheck, model.StateInventoryCheck,
model.StateFirmware,
model.StateSpecValidate, model.StateSpecValidate,
model.StateSMART, model.StateSMART,
model.StateCPUStress, model.StateCPUStress,
model.StateStorage, model.StateStorage,
model.StateNetwork, model.StateNetwork,
model.StateBurn,
model.StateGPU, model.StateGPU,
model.StatePSU, model.StatePSU,
model.StateReporting, model.StateReporting,
@@ -114,11 +116,13 @@ func TestTriggerStageMismatch(t *testing.T) {
func TestStageNameForState(t *testing.T) { func TestStageNameForState(t *testing.T) {
pairs := map[string]model.RunState{ pairs := map[string]model.RunState{
"Inventory": model.StateInventoryCheck, "Inventory": model.StateInventoryCheck,
"Firmware": model.StateFirmware,
"SpecValidate": model.StateSpecValidate, "SpecValidate": model.StateSpecValidate,
"SMART": model.StateSMART, "SMART": model.StateSMART,
"CPUStress": model.StateCPUStress, "CPUStress": model.StateCPUStress,
"Storage": model.StateStorage, "Storage": model.StateStorage,
"Network": model.StateNetwork, "Network": model.StateNetwork,
"Burn": model.StateBurn,
"GPU": model.StateGPU, "GPU": model.StateGPU,
"PSU": model.StatePSU, "PSU": model.StatePSU,
"Reporting": model.StateReporting, "Reporting": model.StateReporting,
@@ -143,11 +147,13 @@ func TestNextStageWalk(t *testing.T) {
// one in the canonical order, and from Reporting onto Completed. // one in the canonical order, and from Reporting onto Completed.
chain := []model.RunState{ chain := []model.RunState{
model.StateInventoryCheck, model.StateInventoryCheck,
model.StateFirmware,
model.StateSpecValidate, model.StateSpecValidate,
model.StateSMART, model.StateSMART,
model.StateCPUStress, model.StateCPUStress,
model.StateStorage, model.StateStorage,
model.StateNetwork, model.StateNetwork,
model.StateBurn,
model.StateGPU, model.StateGPU,
model.StatePSU, model.StatePSU,
model.StateReporting, model.StateReporting,
+182
View File
@@ -0,0 +1,182 @@
package orchestrator
import (
"fmt"
"strings"
)
// ThresholdOp is one of the comparison operators a threshold supports.
// The operator describes the condition under which a sample PASSES;
// a sample that does not satisfy it is a breach.
// within_pct is the only one that cares about a "nominal" value for
// the key — used for PSU rails ("+12V within 5% of 12.0").
type ThresholdOp string
const (
OpLT ThresholdOp = "lt" // passes when observed < Value
OpLTE ThresholdOp = "lte" // passes when observed <= Value
OpGT ThresholdOp = "gt" // passes when observed > Value
OpGTE ThresholdOp = "gte" // passes when observed >= Value
OpWithinPct ThresholdOp = "within_pct" // passes when observed is within Value percent of Nominal
)
// ThresholdSeverity routes a breach to either "fail the run" or "just
// surface a warning in the report". The evaluator returns it (inside
// EvalResult.Threshold) alongside the Passed flag so the caller can
// decide whether to transition the run.
type ThresholdSeverity string
const (
SeverityCritical ThresholdSeverity = "critical" // a breach makes EvalResult.CriticalBreach report true
SeverityWarning ThresholdSeverity = "warning" // a breach is reported by Breached but not by CriticalBreach
)
// Threshold is the evaluator's view of a stored threshold row. It's a
// flat, already-parsed value-object — the evaluator doesn't look at
// the DB and the store doesn't look at the evaluator.
type Threshold struct {
ID int64
Stage string // "*" or "" matches any stage; otherwise compared literally
Kind string // compared literally against Sample.Kind
Key string // glob-ish: "" / "*" / "prefix*" / "*suffix" / "*infix*" / exact
Op ThresholdOp
Value float64 // comparison bound; for within_pct, the allowed deviation in percent
Nominal float64 // for within_pct (nominal voltage/frequency)
Severity ThresholdSeverity
}
// Sample is a single observation the evaluator tests against matching
// thresholds. Stage may be empty when the agent doesn't know which
// stage posted it (e.g. the thermal sidecar running across stages) —
// empty-stage samples only match thresholds whose Stage selector is a
// wildcard ("*" or empty).
type Sample struct {
Stage string
Kind string
Key string
Value float64 // the observed reading compared against Threshold.Value
}
// EvalResult is the per-sample outcome of a threshold evaluation:
// which threshold was consulted, whether the sample passed, and (via
// Threshold.Severity) the severity so the caller can fast-fail on
// critical breaches.
type EvalResult struct {
Threshold Threshold // copy of the rule that matched the sample
Passed bool // true when the sample satisfied the rule's operator
Observed float64 // the sample value that was evaluated
}
// Breached reports whether the sample failed the threshold it was
// evaluated against. It is the exact inverse of Passed.
func (r EvalResult) Breached() bool {
	return !r.Passed
}
// CriticalBreach reports whether this result is a breach of a
// critical-severity threshold — the "fail the run right now" case.
// Passing results and warning-severity breaches both yield false.
func (r EvalResult) CriticalBreach() bool {
	if r.Passed {
		return false
	}
	return r.Threshold.Severity == SeverityCritical
}
// Evaluate tests one sample against every threshold that applies to
// it and returns one EvalResult per applicable threshold, in the order
// the thresholds were supplied. Several rules may apply to the same
// sample (for example a generic "*" rule plus a stage-specific
// override); each gets its own entry so both can be persisted.
//
// The returned slice is never nil; it is empty when nothing matched.
func Evaluate(sample Sample, thresholds []Threshold) []EvalResult {
	results := make([]EvalResult, 0, 1)
	for i := range thresholds {
		rule := thresholds[i]
		if !thresholdMatchesSample(rule, sample) {
			continue
		}
		ok, err := evaluateOp(rule.Op, sample.Value, rule.Value, rule.Nominal)
		if err == nil {
			results = append(results, EvalResult{
				Threshold: rule,
				Passed:    ok,
				Observed:  sample.Value,
			})
		}
		// A non-nil err means the rule carries an unknown operator.
		// Such rules are dropped silently: validation belongs at
		// insert time, and surfacing an error here would force every
		// Sensor write to 500.
	}
	return results
}
// thresholdMatchesSample reports whether threshold t applies to sample
// s. All three filters must agree: Kind is compared literally (there
// is no "any kind" selector), while Stage and Key go through the
// glob-ish matchers stageMatches and keyMatches.
func thresholdMatchesSample(t Threshold, s Sample) bool {
	return t.Kind == s.Kind &&
		stageMatches(t.Stage, s.Stage) &&
		keyMatches(t.Key, s.Key)
}
// stageMatches reports whether a threshold's stage selector covers the
// stage a sample was stamped with. The selectors "*" and "" are both
// wildcards (an omitted stage must not make a threshold inert); any
// other selector has to equal the sample's stage exactly. As a result
// a sample with no stage is only covered by wildcard selectors.
func stageMatches(selector, sampleStage string) bool {
	switch selector {
	case "", "*":
		return true
	default:
		return selector == sampleStage
	}
}
// keyMatches reports whether key satisfies the glob-ish pattern.
// Supported forms:
//   - "" or "*"   — matches every key
//   - "prefix*"   — key must start with prefix
//   - "*suffix"   — key must end with suffix
//   - "*infix*"   — key must contain infix
//   - anything else is compared for exact equality
//
// Only a leading and/or trailing "*" is special; a "*" in the middle
// of the pattern is treated literally. filepath.Match is deliberately
// not used so Windows `\`-vs-`/` rules don't leak into the sample
// namespace (key "eth0/rx_errors" is not a path).
func keyMatches(pattern, key string) bool {
	if pattern == "" || pattern == "*" {
		return true
	}
	// pattern is non-empty here, and if it both starts and ends with
	// "*" it is at least two bytes long, so the slices below are safe.
	last := len(pattern) - 1
	leadingStar := pattern[0] == '*'
	trailingStar := pattern[last] == '*'
	if leadingStar && trailingStar {
		return strings.Contains(key, pattern[1:last])
	}
	if trailingStar {
		return strings.HasPrefix(key, pattern[:last])
	}
	if leadingStar {
		return strings.HasSuffix(key, pattern[1:])
	}
	return pattern == key
}
// evaluateOp performs the numeric comparison for one operator and
// reports whether the observed value passes.
//
// For lt/lte/gt/gte the observed value is compared directly against
// threshold and nominal is ignored. within_pct instead checks
// |observed - nominal| <= |(threshold / 100) * nominal|, i.e. threshold
// is a percentage of nominal. A within_pct rule with a zero nominal is
// meaningless and is treated as a pass so a misconfigured rule doesn't
// spuriously fail a run.
//
// An unrecognised operator yields (false, error) so the caller can
// drop the rule.
func evaluateOp(op ThresholdOp, observed, threshold, nominal float64) (bool, error) {
	if op == OpWithinPct {
		if nominal == 0 {
			return true, nil
		}
		tolerance := (threshold / 100.0) * nominal
		if tolerance < 0 {
			tolerance = -tolerance
		}
		deviation := observed - nominal
		if deviation < 0 {
			deviation = -deviation
		}
		return deviation <= tolerance, nil
	}

	var ok bool
	switch op {
	case OpLT:
		ok = observed < threshold
	case OpLTE:
		ok = observed <= threshold
	case OpGT:
		ok = observed > threshold
	case OpGTE:
		ok = observed >= threshold
	default:
		return false, fmt.Errorf("unknown op %q", op)
	}
	return ok, nil
}
+152
View File
@@ -0,0 +1,152 @@
package orchestrator
import "testing"
// TestEvaluate_Ops covers every operator at the boundary case
// (observed exactly on the limit) plus at least one clearly-inside and
// one clearly-outside value. Table-driven because the logic is regular.
//
// The within_pct boundary rows use 50% of 12 so the allowed deviation
// (6) and the observed deviations are exactly representable as floats;
// the negative-nominal rows exercise the absolute-value handling of
// the tolerance.
func TestEvaluate_Ops(t *testing.T) {
	cases := []struct {
		name     string
		op       ThresholdOp
		value    float64
		nominal  float64
		observed float64
		want     bool
	}{
		{"lt strict below", OpLT, 10, 0, 5, true},
		{"lt equal fails", OpLT, 10, 0, 10, false},
		{"lt above fails", OpLT, 10, 0, 15, false},
		{"lte below", OpLTE, 10, 0, 5, true},
		{"lte equal passes", OpLTE, 10, 0, 10, true},
		{"lte above fails", OpLTE, 10, 0, 11, false},
		{"gt below fails", OpGT, 900, 0, 800, false},
		{"gt equal fails", OpGT, 900, 0, 900, false},
		{"gt above passes", OpGT, 900, 0, 950, true},
		{"gte equal passes", OpGTE, 900, 0, 900, true},
		{"gte below fails", OpGTE, 900, 0, 800, false},
		{"gte above passes", OpGTE, 900, 0, 950, true},
		{"within_pct exact", OpWithinPct, 5, 12.0, 12.0, true},
		{"within_pct inside", OpWithinPct, 5, 12.0, 11.7, true},
		{"within_pct outside low", OpWithinPct, 5, 12.0, 11.0, false},
		{"within_pct outside high", OpWithinPct, 5, 12.0, 12.7, false},
		{"within_pct boundary high passes", OpWithinPct, 50, 12.0, 18.0, true},
		{"within_pct boundary low passes", OpWithinPct, 50, 12.0, 6.0, true},
		{"within_pct negative nominal inside", OpWithinPct, 50, -12.0, -15.0, true},
		{"within_pct negative nominal outside", OpWithinPct, 50, -12.0, -20.0, false},
		{"within_pct zero nominal passes", OpWithinPct, 5, 0, 99, true},
	}
	for _, tc := range cases {
		tc := tc // per-iteration copy; harmless on Go >= 1.22, required before
		t.Run(tc.name, func(t *testing.T) {
			rules := []Threshold{{
				Stage: "*", Kind: "k", Key: "k", Op: tc.op,
				Value: tc.value, Nominal: tc.nominal, Severity: SeverityCritical,
			}}
			res := Evaluate(Sample{Stage: "Any", Kind: "k", Key: "k", Value: tc.observed}, rules)
			if len(res) != 1 {
				t.Fatalf("expected 1 match, got %d", len(res))
			}
			if res[0].Passed != tc.want {
				t.Fatalf("op=%s observed=%v want passed=%v got %v", tc.op, tc.observed, tc.want, res[0].Passed)
			}
		})
	}
}
// TestEvaluate_StageMatching: a Burn-scoped rule ignores samples
// stamped with other stages (or with no stage at all), while the
// global "*" rule catches everything.
func TestEvaluate_StageMatching(t *testing.T) {
	rules := []Threshold{
		{Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 92, Severity: SeverityCritical},
		{Stage: "Burn", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 88, Severity: SeverityCritical},
	}

	// Sample from CPUStress — only the global rule applies.
	res := Evaluate(Sample{Stage: "CPUStress", Kind: "temp", Key: "cpu/0", Value: 89}, rules)
	if len(res) != 1 {
		t.Fatalf("cpustress sample: expected 1 match, got %d", len(res))
	}
	if res[0].Threshold.Value != 92 {
		t.Fatalf("cpustress sample matched wrong rule: %+v", res[0].Threshold)
	}

	// Sample with no stage — stage-scoped rules must not be guessed
	// at, so again only the global rule applies.
	res = Evaluate(Sample{Kind: "temp", Key: "cpu/0", Value: 89}, rules)
	if len(res) != 1 {
		t.Fatalf("stageless sample: expected 1 match, got %d", len(res))
	}
	if res[0].Threshold.Value != 92 {
		t.Fatalf("stageless sample matched wrong rule: %+v", res[0].Threshold)
	}

	// Sample from Burn — both rules match. The stricter one breaches.
	res = Evaluate(Sample{Stage: "Burn", Kind: "temp", Key: "cpu/0", Value: 89}, rules)
	if len(res) != 2 {
		t.Fatalf("burn sample: expected 2 matches, got %d", len(res))
	}
	var globalPassed, burnPassed bool
	for _, r := range res {
		switch r.Threshold.Value {
		case 92:
			globalPassed = r.Passed
		case 88:
			burnPassed = r.Passed
		}
	}
	if !globalPassed {
		t.Fatalf("global 92C rule should pass at 89C")
	}
	if burnPassed {
		t.Fatalf("burn 88C rule should breach at 89C")
	}
}
// TestEvaluate_KeyWildcards covers every pattern form keyMatches
// understands: "" / "*" / "prefix*" / "*suffix" / "*infix*" / exact.
func TestEvaluate_KeyWildcards(t *testing.T) {
	cases := []struct {
		pattern string
		key     string
		match   bool
	}{
		{"*", "anything", true},
		{"", "anything", true},
		{"cpu/*", "cpu/0", true},
		{"cpu/*", "cpu/", true},
		{"cpu/*", "gpu/0", false},
		{"*/rate", "eth0/rate", true},
		{"*/rate", "eth0/count", false},
		{"*err*", "eth0/rx_errors", true},
		{"*err*", "eth0/rx_bytes", false},
		{"exact", "exact", true},
		{"exact", "exactly", false},
	}
	for _, tc := range cases {
		tc := tc // per-iteration copy; harmless on Go >= 1.22, required before
		t.Run(tc.pattern+"_vs_"+tc.key, func(t *testing.T) {
			got := keyMatches(tc.pattern, tc.key)
			if got != tc.match {
				t.Fatalf("keyMatches(%q, %q) = %v, want %v", tc.pattern, tc.key, got, tc.match)
			}
		})
	}
}
// TestEvaluate_SeverityDispatch verifies how severity feeds the two
// breach predicates: a critical-severity breach sets CriticalBreach,
// whereas a warning-severity breach is still reported by Breached but
// never by CriticalBreach.
func TestEvaluate_SeverityDispatch(t *testing.T) {
	rules := []Threshold{
		{Stage: "*", Kind: "temp", Key: "cpu", Op: OpLT, Value: 92, Severity: SeverityCritical},
		{Stage: "*", Kind: "fio", Key: "p99", Op: OpLT, Value: 50000, Severity: SeverityWarning},
	}

	hot := Sample{Stage: "CPU", Kind: "temp", Key: "cpu", Value: 95}
	critical := Evaluate(hot, rules)
	if len(critical) != 1 || !critical[0].CriticalBreach() {
		t.Fatalf("critical breach not detected: %+v", critical)
	}

	slow := Sample{Stage: "Storage", Kind: "fio", Key: "p99", Value: 80000}
	advisory := Evaluate(slow, rules)
	if n := len(advisory); n != 1 {
		t.Fatalf("expected 1 match, got %d", n)
	}
	outcome := advisory[0]
	if outcome.CriticalBreach() {
		t.Fatalf("warning-severity breach should not be critical")
	}
	if !outcome.Breached() {
		t.Fatalf("warning-severity rule should still show breach=true")
	}
}
// TestEvaluate_NoMatchingThreshold checks that a sample whose kind and
// key hit none of the configured rules comes back with an empty result
// slice — callers treat that as "advisory".
func TestEvaluate_NoMatchingThreshold(t *testing.T) {
	onlyTempRule := []Threshold{
		{Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 92, Severity: SeverityCritical},
	}
	unrelated := Sample{Stage: "Network", Kind: "iperf", Key: "throughput", Value: 950}

	if matches := Evaluate(unrelated, onlyTempRule); len(matches) != 0 {
		t.Fatalf("unmatched sample should yield 0 results, got %d", len(matches))
	}
}
+32 -1
View File
@@ -28,7 +28,17 @@ type Data struct {
Host model.Host Host model.Host
Stages []model.Stage Stages []model.Stage
SpecDiffs []model.SpecDiff SpecDiffs []model.SpecDiff
Aggregates []Aggregate // flattened measurement summary; see Aggregate Aggregates []Aggregate // flattened measurement summary; see Aggregate
Firmware []FirmwareSnapshot // captured firmware versions, empty if none
}
// FirmwareSnapshot is the report-facing view of one firmware row.
// Package-local so the HTML template stays decoupled from store types.
type FirmwareSnapshot struct {
Component string
Identifier string
Version string
Vendor string
} }
// Aggregate is a per (kind, key) summary of a run's measurements. Min/ // Aggregate is a per (kind, key) summary of a run's measurements. Min/
@@ -196,6 +206,27 @@ const htmlTemplate = `<!doctype html>
</table> </table>
</section> </section>
<section>
<h2>Firmware ({{len .Firmware}})</h2>
{{if .Firmware}}
<table>
<thead><tr><th>Component</th><th>Identifier</th><th>Version</th><th>Vendor</th></tr></thead>
<tbody>
{{range .Firmware}}
<tr>
<td>{{.Component}}</td>
<td><code>{{.Identifier}}</code></td>
<td><code>{{.Version}}</code></td>
<td>{{.Vendor}}</td>
</tr>
{{end}}
</tbody>
</table>
{{else}}
<p>No firmware snapshots captured.</p>
{{end}}
</section>
<section> <section>
<h2>Spec diffs ({{len .SpecDiffs}})</h2> <h2>Spec diffs ({{len .SpecDiffs}})</h2>
{{if .SpecDiffs}} {{if .SpecDiffs}}
+97 -5
View File
@@ -21,11 +21,36 @@ import (
) )
type Spec struct { type Spec struct {
CPU *CPUSpec `yaml:"cpu,omitempty"` CPU *CPUSpec `yaml:"cpu,omitempty"`
Memory *MemorySpec `yaml:"memory,omitempty"` Memory *MemorySpec `yaml:"memory,omitempty"`
Disks []DiskSpec `yaml:"disks,omitempty"` Disks []DiskSpec `yaml:"disks,omitempty"`
NICs []NICSpec `yaml:"nics,omitempty"` NICs []NICSpec `yaml:"nics,omitempty"`
GPUs []GPUSpec `yaml:"gpus,omitempty"` GPUs []GPUSpec `yaml:"gpus,omitempty"`
Firmware []FirmwareSpec `yaml:"firmware,omitempty"`
}
// FirmwareSpec is one row in the expected-spec YAML's `firmware:` block.
// Component is one of bios|bmc|nic|hba|microcode|nvme_fw (matches the
// on-wire value from agent/probes.FirmwareSnapshot.Component). Identifier
// is optional — when empty the rule applies to every observed snapshot
// of that component (use for single-instance things like BIOS/microcode);
// when set it pins the check to a specific NIC port / NVMe controller /
// PCI address. Version is the string expected; DiffFirmware compares it
// against the observed version case-insensitively after trimming
// whitespace on both sides. Rules with an empty Component or Version
// are skipped by DiffFirmware.
type FirmwareSpec struct {
Component string `yaml:"component"`
Identifier string `yaml:"identifier,omitempty"`
Version string `yaml:"version"`
}
// FirmwareObserved is what the agent reported, in a spec-package-local
// shape so callers don't need to thread store types through the diff.
// The server converts store.FirmwareSnapshot → FirmwareObserved before
// calling DiffFirmware.
type FirmwareObserved struct {
Component string
Identifier string
Version string
} }
type CPUSpec struct { type CPUSpec struct {
@@ -175,6 +200,73 @@ func diffNICs(expected, actual []NICSpec) []model.SpecDiff {
return out return out
} }
// DiffFirmware returns a SpecDiff for every firmware expectation that
// the observed snapshots fail to satisfy. It returns nil when there are
// no expectations or no mismatches. Matching rules:
//   - An expected rule with Identifier set is pinned to one
//     (component, identifier) pair; a missing observed snapshot yields
//     a "present=false" diff.
//   - An expected rule with Identifier empty applies to every observed
//     snapshot of that component — useful for "all NICs must run fw
//     8.30" without listing each port. Zero observed snapshots of the
//     component yields a single "present=false" diff, not N; otherwise
//     one diff is emitted per mismatching snapshot.
//   - Component and identifier are matched case-insensitively in both
//     modes.
//   - Versions are compared case-insensitively after trimming
//     whitespace, so "FW1234" satisfies an expected "fw1234". The diff
//     itself reports the expected and observed strings unmodified.
//   - Rules with an empty Component or Version are skipped.
func DiffFirmware(expected []FirmwareSpec, actual []FirmwareObserved) []model.SpecDiff {
	if len(expected) == 0 {
		return nil
	}

	// Index the observations twice: by (component, identifier) for
	// pinned rules and by component alone for fan-out rules. Both
	// indexes use lowercased keys so the two rule kinds agree on what
	// "the same component" means. When two snapshots share a
	// (component, identifier) pair the later one wins for pinned rules.
	byCompIdent := make(map[string]FirmwareObserved, len(actual))
	byComp := make(map[string][]FirmwareObserved)
	for _, o := range actual {
		byCompIdent[fwKey(o.Component, o.Identifier)] = o
		compKey := strings.ToLower(o.Component)
		byComp[compKey] = append(byComp[compKey], o)
	}

	var out []model.SpecDiff
	for _, exp := range expected {
		comp := strings.TrimSpace(exp.Component)
		wantVersion := strings.TrimSpace(exp.Version)
		if comp == "" || wantVersion == "" {
			continue
		}

		if exp.Identifier != "" {
			label := "firmware[" + comp + "/" + exp.Identifier + "]"
			got, ok := byCompIdent[fwKey(comp, exp.Identifier)]
			switch {
			case !ok:
				out = append(out, diff(label+".present", "true", "false"))
			case !strings.EqualFold(strings.TrimSpace(got.Version), wantVersion):
				out = append(out, diff(label+".version", exp.Version, got.Version))
			}
			continue
		}

		// No identifier: fan out across every observed snapshot of this
		// component. Missing is one diff; a mismatching port/controller
		// emits one diff per mismatch.
		observed := byComp[strings.ToLower(comp)]
		if len(observed) == 0 {
			out = append(out, diff("firmware["+comp+"].present", "true", "false"))
			continue
		}
		for _, got := range observed {
			if strings.EqualFold(strings.TrimSpace(got.Version), wantVersion) {
				continue
			}
			slot := got.Identifier
			if slot == "" {
				slot = "*" // keep the label well-formed for identifier-less snapshots
			}
			out = append(out, diff("firmware["+comp+"/"+slot+"].version", exp.Version, got.Version))
		}
	}
	return out
}
// fwKey builds the lookup key for a (component, identifier) pair. Both
// parts are lowercased so lookups are case-insensitive, and they are
// joined with "|".
func fwKey(component, identifier string) string {
	lowerComponent := strings.ToLower(component)
	lowerIdentifier := strings.ToLower(identifier)
	return lowerComponent + "|" + lowerIdentifier
}
func diffGPUs(expected, actual []GPUSpec) []model.SpecDiff { func diffGPUs(expected, actual []GPUSpec) []model.SpecDiff {
if len(expected) == 0 { if len(expected) == 0 {
return nil return nil
+93
View File
@@ -119,3 +119,96 @@ func TestDiffSeverityAlwaysCritical(t *testing.T) {
} }
} }
} }
// TestDiffFirmwareIdentifierMatch: a bios rule with no identifier is
// satisfied by an observed bios snapshot carrying the same version, so
// no diff is produced.
func TestDiffFirmwareIdentifierMatch(t *testing.T) {
	want := []FirmwareSpec{{Component: "bios", Version: "3.2"}}
	seen := []FirmwareObserved{{Component: "bios", Identifier: "system", Version: "3.2"}}

	diffs := DiffFirmware(want, seen)
	if len(diffs) != 0 {
		t.Fatalf("matching bios version should produce no diff, got %+v", diffs)
	}
}
func TestDiffFirmwareVersionMismatch(t *testing.T) {
exp := []FirmwareSpec{{Component: "bios", Version: "3.3"}}
obs := []FirmwareObserved{{Component: "bios", Identifier: "system", Version: "3.2"}}
d := DiffFirmware(exp, obs)
if len(d) != 1 {
t.Fatalf("want 1 diff, got %d: %+v", len(d), d)
}
if d[0].Expected != "3.3" || d[0].Actual != "3.2" {
t.Fatalf("diff expected/actual = %q/%q, want 3.3/3.2", d[0].Expected, d[0].Actual)
}
if d[0].Severity != "critical" {
t.Errorf("severity = %q, want critical", d[0].Severity)
}
}
// TestDiffFirmwareMissingComponentPresent: an identifier-less rule
// with zero observed snapshots of its component collapses into a
// single "present=false" diff, not N.
func TestDiffFirmwareMissingComponentPresent(t *testing.T) {
	rules := []FirmwareSpec{{Component: "bmc", Version: "1.74"}}

	diffs := DiffFirmware(rules, nil)
	if n := len(diffs); n != 1 {
		t.Fatalf("want 1 diff for missing BMC, got %d: %+v", n, diffs)
	}

	only := diffs[0]
	asExpected := only.Field == "firmware[bmc].present" &&
		only.Expected == "true" &&
		only.Actual == "false"
	if !asExpected {
		t.Fatalf("missing-BMC diff = %+v", only)
	}
}
// TestDiffFirmwareWildcardFanOut: a rule with an empty identifier is
// checked against every observed snapshot of the component. With one
// matching port and one mismatching port, only the mismatch is
// reported, labelled with that port's identifier.
func TestDiffFirmwareWildcardFanOut(t *testing.T) {
	rules := []FirmwareSpec{{Component: "nic", Version: "16.32.1010"}}
	ports := []FirmwareObserved{
		{Component: "nic", Identifier: "eth0", Version: "16.32.1010"},
		{Component: "nic", Identifier: "eth1", Version: "14.28.0000"},
	}

	diffs := DiffFirmware(rules, ports)
	if n := len(diffs); n != 1 {
		t.Fatalf("want 1 diff (mismatched eth1 only), got %d: %+v", n, diffs)
	}
	if field := diffs[0].Field; field != "firmware[nic/eth1].version" {
		t.Errorf("field = %q, want firmware[nic/eth1].version", field)
	}
}
// TestDiffFirmwareIdentifierPin: a rule with an identifier only looks
// at that one port; a sibling port running different firmware is
// outside the rule's scope and must not produce a diff.
func TestDiffFirmwareIdentifierPin(t *testing.T) {
	pinned := []FirmwareSpec{{Component: "nic", Identifier: "eth0", Version: "1.0"}}
	ports := []FirmwareObserved{
		{Component: "nic", Identifier: "eth0", Version: "1.0"},
		{Component: "nic", Identifier: "eth1", Version: "9.9"},
	}

	diffs := DiffFirmware(pinned, ports)
	if len(diffs) != 0 {
		t.Fatalf("pinned rule should ignore other ports, got %+v", diffs)
	}
}
// TestDiffFirmwareIdentifierPinMissing: a pinned rule whose target was
// never observed yields one "present" diff labelled with the pinned
// component/identifier.
func TestDiffFirmwareIdentifierPinMissing(t *testing.T) {
	pinned := []FirmwareSpec{{Component: "nic", Identifier: "eth0", Version: "1.0"}}

	diffs := DiffFirmware(pinned, nil)
	if len(diffs) == 1 && diffs[0].Field == "firmware[nic/eth0].present" {
		return
	}
	t.Fatalf("want present=false for pinned rule, got %+v", diffs)
}
// TestDiffFirmwareEmptyRuleSkipped: rules lacking a component or a
// version are silently ignored rather than panicking or producing
// diffs.
func TestDiffFirmwareEmptyRuleSkipped(t *testing.T) {
	incomplete := []FirmwareSpec{
		{Component: "", Version: "x"},
		{Component: "bios", Version: ""},
	}
	seen := []FirmwareObserved{{Component: "bios", Identifier: "system", Version: "3.2"}}

	diffs := DiffFirmware(incomplete, seen)
	if len(diffs) != 0 {
		t.Fatalf("empty rules should skip, got %+v", diffs)
	}
}
// TestDiffFirmwareCaseInsensitive: versions are compared without
// regard to letter case (after trimming), so an observed "FW1234"
// satisfies an expected "fw1234" and no spurious diff appears.
func TestDiffFirmwareCaseInsensitive(t *testing.T) {
	rules := []FirmwareSpec{{Component: "nvme_fw", Identifier: "nvme0", Version: "fw1234"}}
	seen := []FirmwareObserved{{Component: "nvme_fw", Identifier: "nvme0", Version: "FW1234"}}

	diffs := DiffFirmware(rules, seen)
	if len(diffs) != 0 {
		t.Fatalf("case-insensitive match expected, got %+v", diffs)
	}
}
+97
View File
@@ -0,0 +1,97 @@
package store
import (
"context"
"database/sql"
"fmt"
)
// FirmwareSnapshot is one row in firmware_snapshots. A run captures
// many (one per BIOS/BMC/NIC/HBA/microcode/NVMe) so SpecValidate can
// diff them against the host's expected spec in Phase 4.
type FirmwareSnapshot struct {
ID int64 // row id; not supplied by Create/CreateBatch (absent from their INSERT column list)
RunID int64 // run this snapshot belongs to
Component string // bios|bmc|nic|hba|microcode|nvme_fw
Identifier string // slot/serial/device path
Version string
Vendor string
RawJSON string // stored as "{}" by Create/CreateBatch when left empty
}
// Firmware is the CRUD seam. The agent's Phase-4 probe POSTs captured
// rows; the orchestrator stores them. SpecValidate reads them back.
type Firmware struct {
DB *sql.DB // handle used for every firmware_snapshots query; must be non-nil before any method is called
}
// Create stores one firmware snapshot and returns the id of the new
// row. It is meant to be called once per (run, component, identifier);
// no dedup happens here — the agent probe owns dedup/formatting. An
// empty RawJSON is persisted as "{}".
func (f *Firmware) Create(ctx context.Context, s FirmwareSnapshot) (int64, error) {
	// s is a by-value copy, so defaulting the field here does not
	// touch the caller's struct.
	if s.RawJSON == "" {
		s.RawJSON = "{}"
	}
	result, err := f.DB.ExecContext(ctx, `
INSERT INTO firmware_snapshots(run_id, component, identifier, version, vendor, raw_json)
VALUES(?,?,?,?,?,?)
`, s.RunID, s.Component, s.Identifier, s.Version, s.Vendor, s.RawJSON)
	if err != nil {
		return 0, fmt.Errorf("insert firmware: %w", err)
	}
	return result.LastInsertId()
}
// CreateBatch persists a slice of snapshots under one transaction:
// either every row is committed or none is. An empty slice is a no-op.
// Rows with an empty RawJSON are stored with "{}".
//
// Every failure is wrapped with the step that failed (begin, prepare,
// per-row insert, commit); the underlying error stays reachable
// through errors.Is / errors.As.
func (f *Firmware) CreateBatch(ctx context.Context, rows []FirmwareSnapshot) error {
	if len(rows) == 0 {
		return nil
	}
	tx, err := f.DB.BeginTx(ctx, nil)
	if err != nil {
		return fmt.Errorf("begin firmware batch: %w", err)
	}
	// Rollback after a successful Commit is a no-op that returns
	// sql.ErrTxDone, so the error is deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	stmt, err := tx.PrepareContext(ctx, `
INSERT INTO firmware_snapshots(run_id, component, identifier, version, vendor, raw_json)
VALUES(?,?,?,?,?,?)
`)
	if err != nil {
		return fmt.Errorf("prepare firmware insert: %w", err)
	}
	defer func() { _ = stmt.Close() }()

	for _, s := range rows {
		raw := s.RawJSON
		if raw == "" {
			raw = "{}"
		}
		if _, err := stmt.ExecContext(ctx, s.RunID, s.Component, s.Identifier, s.Version, s.Vendor, raw); err != nil {
			return fmt.Errorf("insert firmware %s/%s: %w", s.Component, s.Identifier, err)
		}
	}
	if err := tx.Commit(); err != nil {
		return fmt.Errorf("commit firmware batch: %w", err)
	}
	return nil
}
// ListForRun loads all firmware snapshots recorded for runID, ordered
// by row id so repeated calls return them in the same order. The
// result is nil when the run has no snapshots. Report page +
// SpecValidate both read this.
func (f *Firmware) ListForRun(ctx context.Context, runID int64) ([]FirmwareSnapshot, error) {
	cursor, err := f.DB.QueryContext(ctx, `
SELECT id, run_id, component, identifier, version, vendor, raw_json
FROM firmware_snapshots WHERE run_id = ? ORDER BY id
`, runID)
	if err != nil {
		return nil, err
	}
	defer func() { _ = cursor.Close() }()

	var snapshots []FirmwareSnapshot
	for cursor.Next() {
		var snap FirmwareSnapshot
		scanErr := cursor.Scan(
			&snap.ID, &snap.RunID, &snap.Component, &snap.Identifier,
			&snap.Version, &snap.Vendor, &snap.RawJSON,
		)
		if scanErr != nil {
			return nil, scanErr
		}
		snapshots = append(snapshots, snap)
	}
	// cursor.Err surfaces any error that ended iteration early.
	return snapshots, cursor.Err()
}
+30 -12
View File
@@ -14,16 +14,30 @@ type Runs struct {
DB *sql.DB DB *sql.DB
} }
// Create inserts a new run using the default "quick" profile. Older
// call sites (and most tests) target this form — the profile column's
// DEFAULT 'quick' on runs takes care of the backfill.
func (r *Runs) Create(ctx context.Context, hostID int64, tokenHash string, nonDestructive bool) (int64, error) { func (r *Runs) Create(ctx context.Context, hostID int64, tokenHash string, nonDestructive bool) (int64, error) {
return r.CreateWithProfile(ctx, hostID, tokenHash, nonDestructive, "quick")
}
// CreateWithProfile inserts a new run with an explicit profile
// ("quick"|"deep"|"soak"). The UI handler is the authoritative caller;
// empty profile falls back to "quick" so a misconfigured form doesn't
// leave a row with a blank profile column.
func (r *Runs) CreateWithProfile(ctx context.Context, hostID int64, tokenHash string, nonDestructive bool, profile string) (int64, error) {
if profile == "" {
profile = "quick"
}
now := time.Now().UTC() now := time.Now().UTC()
nd := 0 nd := 0
if nonDestructive { if nonDestructive {
nd = 1 nd = 1
} }
res, err := r.DB.ExecContext(ctx, ` res, err := r.DB.ExecContext(ctx, `
INSERT INTO runs(host_id, state, agent_token_hash, next_boot_target, started_at, non_destructive) INSERT INTO runs(host_id, state, agent_token_hash, next_boot_target, started_at, non_destructive, profile)
VALUES(?,?,?,?,?,?) VALUES(?,?,?,?,?,?,?)
`, hostID, string(model.StateQueued), tokenHash, "linux", now, nd) `, hostID, string(model.StateQueued), tokenHash, "linux", now, nd, profile)
if err != nil { if err != nil {
return 0, fmt.Errorf("insert run: %w", err) return 0, fmt.Errorf("insert run: %w", err)
} }
@@ -107,14 +121,15 @@ func (r *Runs) Get(ctx context.Context, id int64) (*model.Run, error) {
SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''), SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
COALESCE(next_boot_target,''), agent_token_hash, started_at, COALESCE(next_boot_target,''), agent_token_hash, started_at,
completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''), completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
COALESCE(override_flags_json,''), COALESCE(non_destructive,0) COALESCE(override_flags_json,''), COALESCE(non_destructive,0),
COALESCE(profile,'quick')
FROM runs WHERE id = ? FROM runs WHERE id = ?
`, id) `, id)
var run model.Run var run model.Run
var completedAt sql.NullTime var completedAt sql.NullTime
err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive) &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile)
if errors.Is(err, sql.ErrNoRows) { if errors.Is(err, sql.ErrNoRows) {
return nil, ErrNotFound return nil, ErrNotFound
} }
@@ -133,7 +148,8 @@ func (r *Runs) LatestForHost(ctx context.Context, hostID int64) (*model.Run, err
SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''), SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
COALESCE(next_boot_target,''), agent_token_hash, started_at, COALESCE(next_boot_target,''), agent_token_hash, started_at,
completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''), completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
COALESCE(override_flags_json,''), COALESCE(non_destructive,0) COALESCE(override_flags_json,''), COALESCE(non_destructive,0),
COALESCE(profile,'quick')
FROM runs WHERE host_id = ? FROM runs WHERE host_id = ?
ORDER BY id DESC LIMIT 1 ORDER BY id DESC LIMIT 1
`, hostID) `, hostID)
@@ -141,7 +157,7 @@ func (r *Runs) LatestForHost(ctx context.Context, hostID int64) (*model.Run, err
var completedAt sql.NullTime var completedAt sql.NullTime
err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive) &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile)
if errors.Is(err, sql.ErrNoRows) { if errors.Is(err, sql.ErrNoRows) {
return nil, nil return nil, nil
} }
@@ -165,7 +181,8 @@ func (r *Runs) ListForHost(ctx context.Context, hostID int64, limit int) ([]mode
SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''), SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
COALESCE(next_boot_target,''), agent_token_hash, started_at, COALESCE(next_boot_target,''), agent_token_hash, started_at,
completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''), completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
COALESCE(override_flags_json,''), COALESCE(non_destructive,0) COALESCE(override_flags_json,''), COALESCE(non_destructive,0),
COALESCE(profile,'quick')
FROM runs FROM runs
WHERE host_id = ? WHERE host_id = ?
ORDER BY id DESC ORDER BY id DESC
@@ -181,7 +198,7 @@ func (r *Runs) ListForHost(ctx context.Context, hostID int64, limit int) ([]mode
var completedAt sql.NullTime var completedAt sql.NullTime
if err := rows.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, if err := rows.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive); err != nil { &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile); err != nil {
return nil, err return nil, err
} }
if completedAt.Valid { if completedAt.Valid {
@@ -206,7 +223,8 @@ func (r *Runs) Active(ctx context.Context) ([]model.Run, error) {
SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''), SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
COALESCE(next_boot_target,''), agent_token_hash, started_at, COALESCE(next_boot_target,''), agent_token_hash, started_at,
completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''), completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
COALESCE(override_flags_json,''), COALESCE(non_destructive,0) COALESCE(override_flags_json,''), COALESCE(non_destructive,0),
COALESCE(profile,'quick')
FROM runs FROM runs
WHERE state NOT IN ('Completed','Released','Cancelled') WHERE state NOT IN ('Completed','Released','Cancelled')
ORDER BY id ORDER BY id
@@ -221,7 +239,7 @@ func (r *Runs) Active(ctx context.Context) ([]model.Run, error) {
var completedAt sql.NullTime var completedAt sql.NullTime
if err := rows.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, if err := rows.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive); err != nil { &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile); err != nil {
return nil, err return nil, err
} }
if completedAt.Valid { if completedAt.Valid {
@@ -275,7 +293,7 @@ func (r *Runs) FindActiveByMAC(ctx context.Context, mac string) (*model.Run, err
var completedAt sql.NullTime var completedAt sql.NullTime
err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive) &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile)
if errors.Is(err, sql.ErrNoRows) { if errors.Is(err, sql.ErrNoRows) {
return nil, nil return nil, nil
} }
+2
View File
@@ -17,11 +17,13 @@ type Stages struct {
// reaches Inventory; later phases add more executors but the list is fixed. // reaches Inventory; later phases add more executors but the list is fixed.
var DefaultStageOrder = []string{ var DefaultStageOrder = []string{
"Inventory", "Inventory",
"Firmware",
"SpecValidate", "SpecValidate",
"SMART", "SMART",
"CPUStress", "CPUStress",
"Storage", "Storage",
"Network", "Network",
"Burn",
"GPU", "GPU",
"PSU", "PSU",
"Reporting", "Reporting",
+280
View File
@@ -0,0 +1,280 @@
package store
import (
"context"
"database/sql"
"fmt"
"time"
)
// Threshold is the DB view of a per-run threshold row. Mirrors the
// orchestrator.Threshold value-object but keeps Severity/Op as strings
// so callers higher up don't force this package to import orchestrator.
//
// Each field is written to and read from the like-named column of the
// thresholds table; Stage is the one exception and lives in stage_name.
type Threshold struct {
// ID is the row ID. CreateBatch fills it in from LastInsertId; it is
// not part of the INSERT column list.
ID int64
// RunID is the run this threshold was seeded for (run_id column).
RunID int64
// Stage is persisted as stage_name.
Stage string
// Kind and Key are stored verbatim; presumably they select which
// samples the rule applies to — confirm against the evaluator.
Kind string
Key string
// Op is the comparison operator, kept as a string (see type comment).
Op string
// Threshold is the value stored in the threshold column.
Threshold float64
// Nominal and Unit are stored alongside the bound.
Nominal float64
Unit string
// Severity is kept as a string; CriticalBreaches matches on the
// literal 'critical'.
Severity string
Source string // profile|host_override
}
// ThresholdEvaluation is one recorded comparison — the evaluator calls
// this for every sample that matched a threshold, whether it passed
// or breached. The report page aggregates these to show the operator
// why a run failed (or was flagged as warning-only).
//
// Rows live in the threshold_evaluations table; Stage is persisted as
// stage_name and Passed as the integer 0 or 1.
type ThresholdEvaluation struct {
// ID is the evaluation's row ID as read back from the id column.
ID int64
// RunID is the run the evaluation belongs to (run_id column).
RunID int64
// ThresholdID is the thresholds.id this evaluation is joined on in
// CriticalBreaches.
ThresholdID int64
// Stage, Kind and Key are stored verbatim next to the evaluation.
Stage string
Kind string
Key string
// TS is the evaluation timestamp. RecordEvaluation and RecordBatch
// replace a zero TS with time.Now().UTC() before inserting.
TS time.Time
// Observed is the value stored in the observed column.
Observed float64
// Passed is written as 1 when true and 0 when false; on read it is
// true only when the stored integer equals 1.
Passed bool
}
// Thresholds is the CRUD seam. Kept intentionally narrow: seed at run
// creation, list for evaluation on each sensor batch, record eval
// results, aggregate for the report.
type Thresholds struct {
// DB is the handle every method on Thresholds executes its
// statements and transactions against. It must be non-nil before
// any method is called; no method checks for nil.
DB *sql.DB
}
// ThresholdSpec is the caller-supplied shape for seeding — a flat
// value-object that carries the threshold rule plus its source so
// the ProfileRegistry-driven seed and per-host overrides converge
// on one insert path. Kept here (not in config) so the store layer
// doesn't have to import config.
//
// SeedForRun copies every field onto the like-named Threshold field,
// except Value, which becomes Threshold.Threshold.
type ThresholdSpec struct {
Stage string
Kind string
Key string
Op string
// Value is the bound; SeedForRun stores it as Threshold.Threshold.
Value float64
Nominal float64
Unit string
Severity string
// Source is copied through unchanged to Threshold.Source.
Source string
}
// SeedForRun builds one Threshold per supplied spec, stamps each with
// runID, and hands the whole set to CreateBatch for insertion. The
// result is exactly what CreateBatch returns: the rows with their IDs
// populated, or (nil, nil) when specs is empty.
func (t *Thresholds) SeedForRun(ctx context.Context, runID int64, specs []ThresholdSpec) ([]Threshold, error) {
	seeded := make([]Threshold, len(specs))
	for i := range specs {
		spec := specs[i]
		seeded[i] = Threshold{
			RunID:     runID,
			Stage:     spec.Stage,
			Kind:      spec.Kind,
			Key:       spec.Key,
			Op:        spec.Op,
			Threshold: spec.Value, // the spec's Value is the row's bound
			Nominal:   spec.Nominal,
			Unit:      spec.Unit,
			Severity:  spec.Severity,
			Source:    spec.Source,
		}
	}
	return t.CreateBatch(ctx, seeded)
}
// Create writes a single threshold row outside any explicit
// transaction and returns the ID reported by the driver for the new
// row. th.ID is ignored; the database assigns it. An exec failure is
// wrapped with "insert threshold"; a LastInsertId failure is returned
// as-is.
func (t *Thresholds) Create(ctx context.Context, th Threshold) (int64, error) {
	const insertThreshold = `
INSERT INTO thresholds(run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source)
VALUES(?,?,?,?,?,?,?,?,?,?)
`
	result, execErr := t.DB.ExecContext(ctx, insertThreshold,
		th.RunID, th.Stage, th.Kind, th.Key, th.Op,
		th.Threshold, th.Nominal, th.Unit, th.Severity, th.Source)
	if execErr != nil {
		return 0, fmt.Errorf("insert threshold: %w", execErr)
	}
	return result.LastInsertId()
}
// CreateBatch inserts every row inside one transaction using a single
// prepared statement, and returns copies of the rows with ID set from
// the driver's LastInsertId. An empty input yields (nil, nil) without
// touching the database. Any failure aborts the whole batch: the
// deferred Rollback undoes prior inserts and nil is returned alongside
// the error.
func (t *Thresholds) CreateBatch(ctx context.Context, rows []Threshold) ([]Threshold, error) {
	if len(rows) == 0 {
		return nil, nil
	}

	const insertThreshold = `
INSERT INTO thresholds(run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source)
VALUES(?,?,?,?,?,?,?,?,?,?)
`
	tx, err := t.DB.BeginTx(ctx, nil)
	if err != nil {
		return nil, err
	}
	// Always attempt a rollback on the way out; its error is
	// deliberately discarded, including on the success path where the
	// transaction has already been committed.
	defer func() { _ = tx.Rollback() }()

	insert, err := tx.PrepareContext(ctx, insertThreshold)
	if err != nil {
		return nil, fmt.Errorf("prepare threshold insert: %w", err)
	}
	defer func() { _ = insert.Close() }()

	inserted := make([]Threshold, len(rows))
	for i, row := range rows {
		res, execErr := insert.ExecContext(ctx, row.RunID, row.Stage, row.Kind, row.Key, row.Op,
			row.Threshold, row.Nominal, row.Unit, row.Severity, row.Source)
		if execErr != nil {
			return nil, fmt.Errorf("insert threshold %s/%s: %w", row.Stage, row.Key, execErr)
		}
		newID, idErr := res.LastInsertId()
		if idErr != nil {
			return nil, idErr
		}
		// row is the loop's copy, so the caller's slice is left untouched.
		row.ID = newID
		inserted[i] = row
	}

	if commitErr := tx.Commit(); commitErr != nil {
		return nil, commitErr
	}
	return inserted, nil
}
// ListForRun loads all threshold rows whose run_id equals runID,
// ordered by ascending id. A run with no thresholds yields a nil slice
// and a nil error. A scan failure returns (nil, err); an iteration
// error reported by rows.Err is returned together with whatever rows
// were read before it.
func (t *Thresholds) ListForRun(ctx context.Context, runID int64) ([]Threshold, error) {
	const selectForRun = `
SELECT id, run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source
FROM thresholds WHERE run_id = ? ORDER BY id
`
	cursor, queryErr := t.DB.QueryContext(ctx, selectForRun, runID)
	if queryErr != nil {
		return nil, queryErr
	}
	defer cursor.Close()

	var found []Threshold
	for cursor.Next() {
		var row Threshold
		scanErr := cursor.Scan(
			&row.ID, &row.RunID, &row.Stage, &row.Kind, &row.Key,
			&row.Op, &row.Threshold, &row.Nominal, &row.Unit, &row.Severity, &row.Source,
		)
		if scanErr != nil {
			return nil, scanErr
		}
		found = append(found, row)
	}
	return found, cursor.Err()
}
// RecordEvaluation inserts one evaluation row into
// threshold_evaluations. A zero ev.TS is replaced with the current UTC
// time, and ev.Passed is stored as the integer 1 (true) or 0 (false).
// Any exec failure is wrapped with "record evaluation".
func (t *Thresholds) RecordEvaluation(ctx context.Context, ev ThresholdEvaluation) error {
	const insertEval = `
INSERT INTO threshold_evaluations(run_id, threshold_id, stage_name, kind, key, ts, observed, passed)
VALUES(?,?,?,?,?,?,?,?)
`
	var passedFlag int
	if ev.Passed {
		passedFlag = 1
	}

	stamp := ev.TS
	if stamp.IsZero() {
		stamp = time.Now().UTC()
	}

	if _, err := t.DB.ExecContext(ctx, insertEval,
		ev.RunID, ev.ThresholdID, ev.Stage, ev.Kind, ev.Key, stamp, ev.Observed, passedFlag); err != nil {
		return fmt.Errorf("record evaluation: %w", err)
	}
	return nil
}
// RecordBatch writes all evals to threshold_evaluations inside a
// single transaction through one prepared statement. An empty slice is
// a no-op that returns nil. Each evaluation with a zero TS is stamped
// with the current UTC time at the moment it is inserted, and Passed
// is stored as 1 or 0. The first failing insert aborts the batch; the
// deferred Rollback discards any rows already written.
func (t *Thresholds) RecordBatch(ctx context.Context, evals []ThresholdEvaluation) error {
	if len(evals) == 0 {
		return nil
	}

	const insertEval = `
INSERT INTO threshold_evaluations(run_id, threshold_id, stage_name, kind, key, ts, observed, passed)
VALUES(?,?,?,?,?,?,?,?)
`
	tx, err := t.DB.BeginTx(ctx, nil)
	if err != nil {
		return err
	}
	// The rollback error is intentionally dropped, including after a
	// successful Commit.
	defer func() { _ = tx.Rollback() }()

	insert, err := tx.PrepareContext(ctx, insertEval)
	if err != nil {
		return fmt.Errorf("prepare eval insert: %w", err)
	}
	defer func() { _ = insert.Close() }()

	for _, ev := range evals {
		var passedFlag int
		if ev.Passed {
			passedFlag = 1
		}
		stamp := ev.TS
		if stamp.IsZero() {
			stamp = time.Now().UTC()
		}
		_, execErr := insert.ExecContext(ctx,
			ev.RunID, ev.ThresholdID, ev.Stage, ev.Kind, ev.Key, stamp, ev.Observed, passedFlag)
		if execErr != nil {
			return fmt.Errorf("insert eval: %w", execErr)
		}
	}
	return tx.Commit()
}
// ListEvaluations reads back the evaluations recorded for runID in
// ascending id order, so the most recently inserted row comes last.
// The query is capped at 5000 rows: when a run has more than that, the
// 5000 lowest-id rows are returned and later ones are omitted. No
// evaluations yields a nil slice. Passed is true only when the stored
// integer is exactly 1.
func (t *Thresholds) ListEvaluations(ctx context.Context, runID int64) ([]ThresholdEvaluation, error) {
	const selectEvals = `
SELECT id, run_id, threshold_id, stage_name, kind, key, ts, observed, passed
FROM threshold_evaluations WHERE run_id = ?
ORDER BY id LIMIT 5000
`
	cursor, queryErr := t.DB.QueryContext(ctx, selectEvals, runID)
	if queryErr != nil {
		return nil, queryErr
	}
	defer cursor.Close()

	var history []ThresholdEvaluation
	for cursor.Next() {
		var (
			rec        ThresholdEvaluation
			passedFlag int
		)
		scanErr := cursor.Scan(
			&rec.ID, &rec.RunID, &rec.ThresholdID, &rec.Stage, &rec.Kind,
			&rec.Key, &rec.TS, &rec.Observed, &passedFlag,
		)
		if scanErr != nil {
			return nil, scanErr
		}
		rec.Passed = (passedFlag == 1)
		history = append(history, rec)
	}
	return history, cursor.Err()
}
// CriticalBreaches lists, in ascending evaluation id order, the failed
// evaluations (passed = 0) for runID whose joined threshold row has
// severity 'critical'. Evaluations whose threshold_id matches no row
// in thresholds are dropped by the inner join. A run with no such
// breaches yields a nil slice and a nil error.
func (t *Thresholds) CriticalBreaches(ctx context.Context, runID int64) ([]ThresholdEvaluation, error) {
	const selectBreaches = `
SELECT e.id, e.run_id, e.threshold_id, e.stage_name, e.kind, e.key, e.ts, e.observed, e.passed
FROM threshold_evaluations e
JOIN thresholds t ON t.id = e.threshold_id
WHERE e.run_id = ? AND e.passed = 0 AND t.severity = 'critical'
ORDER BY e.id
`
	cursor, queryErr := t.DB.QueryContext(ctx, selectBreaches, runID)
	if queryErr != nil {
		return nil, queryErr
	}
	defer cursor.Close()

	var breaches []ThresholdEvaluation
	for cursor.Next() {
		var (
			rec        ThresholdEvaluation
			passedFlag int
		)
		scanErr := cursor.Scan(
			&rec.ID, &rec.RunID, &rec.ThresholdID, &rec.Stage, &rec.Kind,
			&rec.Key, &rec.TS, &rec.Observed, &passedFlag,
		)
		if scanErr != nil {
			return nil, scanErr
		}
		// The WHERE clause already restricts to passed = 0; the flag is
		// still decoded the same way as in ListEvaluations.
		rec.Passed = (passedFlag == 1)
		breaches = append(breaches, rec)
	}
	return breaches, cursor.Err()
}
+26
View File
@@ -636,6 +636,21 @@ body.bare main { max-width: none; }
.run-failed-stage { color: var(--danger); } .run-failed-stage { color: var(--danger); }
.run-failed-stage strong { font-family: var(--mono); } .run-failed-stage strong { font-family: var(--mono); }
.run-diffs { color: var(--danger); } .run-diffs { color: var(--danger); }
/* Run profile chip: a small uppercase pill (fully rounded border) set
   in the --mono font. The base rule supplies neutral colors; the
   .run-profile-{quick,deep,soak} modifiers below each override the
   text color, border color and background tint for one profile. */
.run-profile-chip {
display: inline-block;
font-family: var(--mono);
font-size: 11px;
text-transform: uppercase;
letter-spacing: .04em;
padding: 2px 8px;
border-radius: 999px;
border: 1px solid rgba(255,255,255,.15);
background: rgba(255,255,255,.05);
color: var(--text-dim);
}
/* Per-profile color variants; border and background reuse the text hue at lower alpha. */
.run-profile-quick { color: var(--accent); border-color: rgba(60,130,246,.45); background: rgba(60,130,246,.08); }
.run-profile-deep { color: #e5b94f; border-color: rgba(229,185,79,.45); background: rgba(229,185,79,.08); }
.run-profile-soak { color: #d97a57; border-color: rgba(217,122,87,.45); background: rgba(217,122,87,.08); }
.hold-banner { .hold-banner {
background: rgba(229,100,102,.1); background: rgba(229,100,102,.1);
@@ -890,6 +905,17 @@ body.bare main { max-width: none; }
.host-actions { padding: 0; } .host-actions { padding: 0; }
.host-actions-row { display: flex; gap: 10px; flex-wrap: wrap; align-items: center; } .host-actions-row { display: flex; gap: 10px; flex-wrap: wrap; align-items: center; }
.host-nd-toggle { display: inline-flex; gap: 6px; align-items: center; color: var(--text-dim); font-size: 13px; } .host-nd-toggle { display: inline-flex; gap: 6px; align-items: center; color: var(--text-dim); font-size: 13px; }
/* Profile picker fieldset on the host start form: a bordered box laid
   out as an inline flex row so the radio labels sit side by side, with
   a right margin separating it from the controls that follow. */
.host-profile-picker {
border: 1px solid var(--border);
border-radius: var(--radius);
padding: 6px 10px;
display: inline-flex;
gap: 12px;
align-items: center;
margin: 0 8px 0 0;
}
/* Legend: small, dimmed, uppercase caption. */
.host-profile-picker legend { font-size: 11px; color: var(--text-dim); text-transform: uppercase; letter-spacing: .05em; padding: 0 4px; }
/* Each radio + caption pair: inline flex row in the --mono font, pointer cursor over the whole label. */
.host-profile-picker label { display: inline-flex; gap: 4px; align-items: center; font-family: var(--mono); font-size: 13px; cursor: pointer; }
.in-flight-banner-wrap { display: contents; } .in-flight-banner-wrap { display: contents; }
.in-flight-banner { .in-flight-banner {
+9 -9
View File
@@ -65,7 +65,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
var templ_7745c5c3_Var3 string var templ_7745c5c3_Var3 string
templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String()) templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String())
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 1, Col: 0} return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 1, Col: 0}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -88,7 +88,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
var templ_7745c5c3_Var4 string var templ_7745c5c3_Var4 string
templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(d.Stage.Name) templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(d.Stage.Name)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 28, Col: 102} return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 28, Col: 102}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -110,7 +110,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
var templ_7745c5c3_Var6 string var templ_7745c5c3_Var6 string
templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var5).String()) templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var5).String())
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 1, Col: 0} return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 1, Col: 0}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -123,7 +123,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
var templ_7745c5c3_Var7 string var templ_7745c5c3_Var7 string
templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(stageMarker(string(d.Stage.State))) templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(stageMarker(string(d.Stage.State)))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 30, Col: 105} return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 30, Col: 105}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -136,7 +136,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
var templ_7745c5c3_Var8 string var templ_7745c5c3_Var8 string
templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(d.Stage.Name) templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(d.Stage.Name)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 31, Col: 41} return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 31, Col: 41}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -149,7 +149,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
var templ_7745c5c3_Var9 string var templ_7745c5c3_Var9 string
templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(stageDurationFromStage(d.Stage)) templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(stageDurationFromStage(d.Stage))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 32, Col: 64} return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 32, Col: 64}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -182,7 +182,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
var templ_7745c5c3_Var10 string var templ_7745c5c3_Var10 string
templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(d.Stage.Name) templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(d.Stage.Name)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 43, Col: 99} return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 43, Col: 99}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -195,7 +195,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
var templ_7745c5c3_Var11 string var templ_7745c5c3_Var11 string
templ_7745c5c3_Var11, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("log-%d-%s", d.RunID, d.Stage.Name)) templ_7745c5c3_Var11, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("log-%d-%s", d.RunID, d.Stage.Name))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 47, Col: 56} return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 47, Col: 56}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var11)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var11))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -208,7 +208,7 @@ func ActiveStep(d ActiveStepData) templ.Component {
var templ_7745c5c3_Var12 string var templ_7745c5c3_Var12 string
templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("log-%d-%s", d.RunID, d.Stage.Name)) templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("log-%d-%s", d.RunID, d.Stage.Name))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 48, Col: 62} return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 48, Col: 62}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
+25
View File
@@ -102,6 +102,21 @@ templ HostActions(d HostPageData) {
<div class="host-actions-row"> <div class="host-actions-row">
if hostCanStart(d) { if hostCanStart(d) {
<form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/start", d.Host.ID)) } class="inline host-start-form"> <form method="post" action={ templ.SafeURL(fmt.Sprintf("/hosts/%d/start", d.Host.ID)) } class="inline host-start-form">
<fieldset class="host-profile-picker">
<legend>Profile</legend>
<label title="~10 min — post-repair sanity: all probes + gates, short budgets">
<input type="radio" name="profile" value="quick" checked/>
quick
</label>
<label title="~812 h — overnight soak: long CPU/RAM, full-disk fio verify, 30 min network">
<input type="radio" name="profile" value="deep"/>
deep
</label>
<label title="≥24 h — week-long burn-in; opt-in when you suspect intermittent faults">
<input type="radio" name="profile" value="soak"/>
soak
</label>
</fieldset>
<label class="host-nd-toggle"> <label class="host-nd-toggle">
<input type="checkbox" name="non_destructive" value="1"/> <input type="checkbox" name="non_destructive" value="1"/>
Non-destructive (skip wipe-probe + disk writes) Non-destructive (skip wipe-probe + disk writes)
@@ -258,6 +273,16 @@ func hostCanStartIfOnline(d HostPageData) bool {
return d.ActiveRun == nil return d.ActiveRun == nil
} }
// profileChipValue returns the profile label to show on the run page
// chip. A non-empty profile is passed through unchanged; an empty one
// (runs stored before the profile column existed) is displayed as
// "quick", the implicit default those runs used.
func profileChipValue(p string) string {
	if p != "" {
		return p
	}
	return "quick"
}
// runDuration formats the elapsed time for a run using the same buckets // runDuration formats the elapsed time for a run using the same buckets
// as stageDuration. In-flight runs clock from StartedAt to now so the // as stageDuration. In-flight runs clock from StartedAt to now so the
// run-page header + runs-table row keep ticking on each SSE push. // run-page header + runs-table row keep ticking on each SSE push.
+27 -17
View File
@@ -361,7 +361,7 @@ func HostActions(d HostPageData) templ.Component {
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 21, "\" class=\"inline host-start-form\"><label class=\"host-nd-toggle\"><input type=\"checkbox\" name=\"non_destructive\" value=\"1\"> Non-destructive (skip wipe-probe + disk writes)</label> <button type=\"submit\" class=\"btn-primary\">Start vetting</button></form>") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 21, "\" class=\"inline host-start-form\"><fieldset class=\"host-profile-picker\"><legend>Profile</legend> <label title=\"~10 min — post-repair sanity: all probes + gates, short budgets\"><input type=\"radio\" name=\"profile\" value=\"quick\" checked> quick</label> <label title=\"~812 h — overnight soak: long CPU/RAM, full-disk fio verify, 30 min network\"><input type=\"radio\" name=\"profile\" value=\"deep\"> deep</label> <label title=\"≥24 h — week-long burn-in; opt-in when you suspect intermittent faults\"><input type=\"radio\" name=\"profile\" value=\"soak\"> soak</label></fieldset><label class=\"host-nd-toggle\"><input type=\"checkbox\" name=\"non_destructive\" value=\"1\"> Non-destructive (skip wipe-probe + disk writes)</label> <button type=\"submit\" class=\"btn-primary\">Start vetting</button></form>")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
@@ -383,7 +383,7 @@ func HostActions(d HostPageData) templ.Component {
var templ_7745c5c3_Var19 templ.SafeURL var templ_7745c5c3_Var19 templ.SafeURL
templ_7745c5c3_Var19, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/delete", d.Host.ID))) templ_7745c5c3_Var19, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/delete", d.Host.ID)))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 116, Col: 89} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 131, Col: 89}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var19)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var19))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -428,7 +428,7 @@ func InFlightBanner(d HostPageData) templ.Component {
var templ_7745c5c3_Var21 string var templ_7745c5c3_Var21 string
templ_7745c5c3_Var21, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-inflight-%d", d.Host.ID)) templ_7745c5c3_Var21, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-inflight-%d", d.Host.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 128, Col: 51} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 143, Col: 51}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var21)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var21))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -441,7 +441,7 @@ func InFlightBanner(d HostPageData) templ.Component {
var templ_7745c5c3_Var22 string var templ_7745c5c3_Var22 string
templ_7745c5c3_Var22, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-inflight-%d", d.Host.ID)) templ_7745c5c3_Var22, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-inflight-%d", d.Host.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 130, Col: 57} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 145, Col: 57}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var22)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var22))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -459,7 +459,7 @@ func InFlightBanner(d HostPageData) templ.Component {
var templ_7745c5c3_Var23 templ.SafeURL var templ_7745c5c3_Var23 templ.SafeURL
templ_7745c5c3_Var23, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/runs/%d", d.ActiveRun.ID))) templ_7745c5c3_Var23, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/runs/%d", d.ActiveRun.ID)))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 134, Col: 92} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 149, Col: 92}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var23)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var23))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -472,7 +472,7 @@ func InFlightBanner(d HostPageData) templ.Component {
var templ_7745c5c3_Var24 string var templ_7745c5c3_Var24 string
templ_7745c5c3_Var24, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", d.ActiveRun.ID)) templ_7745c5c3_Var24, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", d.ActiveRun.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 135, Col: 74} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 150, Col: 74}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var24)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var24))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -485,7 +485,7 @@ func InFlightBanner(d HostPageData) templ.Component {
var templ_7745c5c3_Var25 string var templ_7745c5c3_Var25 string
templ_7745c5c3_Var25, templ_7745c5c3_Err = templ.JoinStringErrs(tileStatus(d.ActiveRun)) templ_7745c5c3_Var25, templ_7745c5c3_Err = templ.JoinStringErrs(tileStatus(d.ActiveRun))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 136, Col: 59} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 151, Col: 59}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var25)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var25))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -541,7 +541,7 @@ func HostEmptyState(d HostPageData) templ.Component {
var templ_7745c5c3_Var27 templ.SafeURL var templ_7745c5c3_Var27 templ.SafeURL
templ_7745c5c3_Var27, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/start", d.Host.ID))) templ_7745c5c3_Var27, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/start", d.Host.ID)))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 152, Col: 88} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 167, Col: 88}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var27)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var27))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -655,7 +655,7 @@ func RunRow(d RunRowData) templ.Component {
var templ_7745c5c3_Var31 string var templ_7745c5c3_Var31 string
templ_7745c5c3_Var31, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("runrow-%d", d.Run.ID)) templ_7745c5c3_Var31, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("runrow-%d", d.Run.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 204, Col: 41} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 219, Col: 41}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var31)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var31))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -681,7 +681,7 @@ func RunRow(d RunRowData) templ.Component {
var templ_7745c5c3_Var33 string var templ_7745c5c3_Var33 string
templ_7745c5c3_Var33, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("runrow-%d", d.Run.ID)) templ_7745c5c3_Var33, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("runrow-%d", d.Run.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 206, Col: 47} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 221, Col: 47}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var33)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var33))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -694,7 +694,7 @@ func RunRow(d RunRowData) templ.Component {
var templ_7745c5c3_Var34 templ.SafeURL var templ_7745c5c3_Var34 templ.SafeURL
templ_7745c5c3_Var34, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/runs/%d", d.Run.ID))) templ_7745c5c3_Var34, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/runs/%d", d.Run.ID)))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 210, Col: 61} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 225, Col: 61}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var34)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var34))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -707,7 +707,7 @@ func RunRow(d RunRowData) templ.Component {
var templ_7745c5c3_Var35 string var templ_7745c5c3_Var35 string
templ_7745c5c3_Var35, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("#%d", d.Run.ID)) templ_7745c5c3_Var35, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("#%d", d.Run.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 210, Col: 94} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 225, Col: 94}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var35)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var35))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -742,7 +742,7 @@ func RunRow(d RunRowData) templ.Component {
var templ_7745c5c3_Var38 string var templ_7745c5c3_Var38 string
templ_7745c5c3_Var38, templ_7745c5c3_Err = templ.JoinStringErrs(tileStatus(&d.Run)) templ_7745c5c3_Var38, templ_7745c5c3_Err = templ.JoinStringErrs(tileStatus(&d.Run))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 213, Col: 92} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 228, Col: 92}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var38)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var38))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -755,7 +755,7 @@ func RunRow(d RunRowData) templ.Component {
var templ_7745c5c3_Var39 string var templ_7745c5c3_Var39 string
templ_7745c5c3_Var39, templ_7745c5c3_Err = templ.JoinStringErrs(relativeTime(d.Run.StartedAt)) templ_7745c5c3_Var39, templ_7745c5c3_Err = templ.JoinStringErrs(relativeTime(d.Run.StartedAt))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 215, Col: 62} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 230, Col: 62}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var39)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var39))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -768,7 +768,7 @@ func RunRow(d RunRowData) templ.Component {
var templ_7745c5c3_Var40 string var templ_7745c5c3_Var40 string
templ_7745c5c3_Var40, templ_7745c5c3_Err = templ.JoinStringErrs(runDuration(&d.Run)) templ_7745c5c3_Var40, templ_7745c5c3_Err = templ.JoinStringErrs(runDuration(&d.Run))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 216, Col: 53} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 231, Col: 53}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var40)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var40))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -805,7 +805,7 @@ func RunRow(d RunRowData) templ.Component {
var templ_7745c5c3_Var43 string var templ_7745c5c3_Var43 string
templ_7745c5c3_Var43, templ_7745c5c3_Err = templ.JoinStringErrs(name) templ_7745c5c3_Var43, templ_7745c5c3_Err = templ.JoinStringErrs(name)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 221, Col: 94} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 236, Col: 94}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var43)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var43))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -823,7 +823,7 @@ func RunRow(d RunRowData) templ.Component {
var templ_7745c5c3_Var44 templ.SafeURL var templ_7745c5c3_Var44 templ.SafeURL
templ_7745c5c3_Var44, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/runs/%d", d.Run.ID))) templ_7745c5c3_Var44, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/runs/%d", d.Run.ID)))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 226, Col: 84} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_page.templ`, Line: 241, Col: 84}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var44)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var44))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -867,6 +867,16 @@ func hostCanStartIfOnline(d HostPageData) bool {
return d.ActiveRun == nil return d.ActiveRun == nil
} }
// profileChipValue returns the label shown on the run-page profile chip
// for the given Run.Profile value. A non-empty profile is passed through
// unchanged. An empty profile is rendered as "quick": runs stored before
// Phase 1 have no profile recorded, and "quick" was the implicit default
// at that time.
func profileChipValue(p string) string {
	switch p {
	case "":
		// Legacy row without a recorded profile.
		return "quick"
	default:
		return p
	}
}
// runDuration formats the elapsed time for a run using the same buckets // runDuration formats the elapsed time for a run using the same buckets
// as stageDuration. In-flight runs clock from StartedAt to now so the // as stageDuration. In-flight runs clock from StartedAt to now so the
// run-page header + runs-table row keep ticking on each SSE push. // run-page header + runs-table row keep ticking on each SSE push.
+12 -12
View File
@@ -55,7 +55,7 @@ func HostTile(t TileData) templ.Component {
var templ_7745c5c3_Var3 string var templ_7745c5c3_Var3 string
templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("host-%d", t.Host.ID)) templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("host-%d", t.Host.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 19, Col: 40} return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 19, Col: 40}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -68,7 +68,7 @@ func HostTile(t TileData) templ.Component {
var templ_7745c5c3_Var4 string var templ_7745c5c3_Var4 string
templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String()) templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String())
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 1, Col: 0} return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 1, Col: 0}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -81,7 +81,7 @@ func HostTile(t TileData) templ.Component {
var templ_7745c5c3_Var5 string var templ_7745c5c3_Var5 string
templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("tile-%d", t.Host.ID)) templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("tile-%d", t.Host.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 21, Col: 46} return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 21, Col: 46}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -94,7 +94,7 @@ func HostTile(t TileData) templ.Component {
var templ_7745c5c3_Var6 templ.SafeURL var templ_7745c5c3_Var6 templ.SafeURL
templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d", t.Host.ID))) templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d", t.Host.ID)))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 24, Col: 80} return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 24, Col: 80}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -107,7 +107,7 @@ func HostTile(t TileData) templ.Component {
var templ_7745c5c3_Var7 string var templ_7745c5c3_Var7 string
templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs("Open " + t.Host.Name) templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs("Open " + t.Host.Name)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 24, Col: 117} return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 24, Col: 117}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -120,7 +120,7 @@ func HostTile(t TileData) templ.Component {
var templ_7745c5c3_Var8 string var templ_7745c5c3_Var8 string
templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(t.Host.Name) templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(t.Host.Name)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 26, Col: 39} return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 26, Col: 39}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -142,7 +142,7 @@ func HostTile(t TileData) templ.Component {
var templ_7745c5c3_Var10 string var templ_7745c5c3_Var10 string
templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var9).String()) templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var9).String())
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 1, Col: 0} return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 1, Col: 0}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -155,7 +155,7 @@ func HostTile(t TileData) templ.Component {
var templ_7745c5c3_Var11 string var templ_7745c5c3_Var11 string
templ_7745c5c3_Var11, templ_7745c5c3_Err = templ.JoinStringErrs(lastSeenLabel(t.LastSeenAt)) templ_7745c5c3_Var11, templ_7745c5c3_Err = templ.JoinStringErrs(lastSeenLabel(t.LastSeenAt))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 28, Col: 95} return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 28, Col: 95}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var11)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var11))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -168,7 +168,7 @@ func HostTile(t TileData) templ.Component {
var templ_7745c5c3_Var12 string var templ_7745c5c3_Var12 string
templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(tileStatus(t.Latest)) templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(tileStatus(t.Latest))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 29, Col: 51} return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 29, Col: 51}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -186,7 +186,7 @@ func HostTile(t TileData) templ.Component {
var templ_7745c5c3_Var13 templ.SafeURL var templ_7745c5c3_Var13 templ.SafeURL
templ_7745c5c3_Var13, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/start", t.Host.ID))) templ_7745c5c3_Var13, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/start", t.Host.ID)))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 34, Col: 89} return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 34, Col: 89}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var13)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var13))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -209,7 +209,7 @@ func HostTile(t TileData) templ.Component {
var templ_7745c5c3_Var14 templ.SafeURL var templ_7745c5c3_Var14 templ.SafeURL
templ_7745c5c3_Var14, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/cancel", t.Host.ID))) templ_7745c5c3_Var14, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/cancel", t.Host.ID)))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 44, Col: 90} return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 44, Col: 90}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var14)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var14))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -227,7 +227,7 @@ func HostTile(t TileData) templ.Component {
var templ_7745c5c3_Var15 templ.SafeURL var templ_7745c5c3_Var15 templ.SafeURL
templ_7745c5c3_Var15, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/reports/%d", t.Latest.ID))) templ_7745c5c3_Var15, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/reports/%d", t.Latest.ID)))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/host_tile.templ`, Line: 48, Col: 88} return templ.Error{Err: templ_7745c5c3_Err, FileName: `host_tile.templ`, Line: 48, Col: 88}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var15)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var15))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
+2 -2
View File
@@ -36,7 +36,7 @@ func Layout(title string) templ.Component {
var templ_7745c5c3_Var2 string var templ_7745c5c3_Var2 string
templ_7745c5c3_Var2, templ_7745c5c3_Err = templ.JoinStringErrs(title) templ_7745c5c3_Var2, templ_7745c5c3_Err = templ.JoinStringErrs(title)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/layout.templ`, Line: 9, Col: 17} return templ.Error{Err: templ_7745c5c3_Err, FileName: `layout.templ`, Line: 9, Col: 17}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var2)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var2))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -86,7 +86,7 @@ func BareLayout(title string) templ.Component {
var templ_7745c5c3_Var4 string var templ_7745c5c3_Var4 string
templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(title) templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(title)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/layout.templ`, Line: 39, Col: 17} return templ.Error{Err: templ_7745c5c3_Err, FileName: `layout.templ`, Line: 39, Col: 17}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
+4
View File
@@ -40,11 +40,13 @@ func runStateRank(s model.RunState) int {
model.StateWaitingReboot, model.StateWaitingReboot,
model.StateBooting, model.StateBooting,
model.StateInventoryCheck, model.StateInventoryCheck,
model.StateFirmware,
model.StateSpecValidate, model.StateSpecValidate,
model.StateSMART, model.StateSMART,
model.StateCPUStress, model.StateCPUStress,
model.StateStorage, model.StateStorage,
model.StateNetwork, model.StateNetwork,
model.StateBurn,
model.StateGPU, model.StateGPU,
model.StatePSU, model.StatePSU,
model.StateReporting, model.StateReporting,
@@ -205,11 +207,13 @@ func firstStageState(run *model.Run) model.RunState {
func stageStateByName(name string) (model.RunState, bool) { func stageStateByName(name string) (model.RunState, bool) {
m := map[string]model.RunState{ m := map[string]model.RunState{
"Inventory": model.StateInventoryCheck, "Inventory": model.StateInventoryCheck,
"Firmware": model.StateFirmware,
"SpecValidate": model.StateSpecValidate, "SpecValidate": model.StateSpecValidate,
"SMART": model.StateSMART, "SMART": model.StateSMART,
"CPUStress": model.StateCPUStress, "CPUStress": model.StateCPUStress,
"Storage": model.StateStorage, "Storage": model.StateStorage,
"Network": model.StateNetwork, "Network": model.StateNetwork,
"Burn": model.StateBurn,
"GPU": model.StateGPU, "GPU": model.StateGPU,
"PSU": model.StatePSU, "PSU": model.StatePSU,
"Reporting": model.StateReporting, "Reporting": model.StateReporting,
+12 -8
View File
@@ -48,11 +48,13 @@ func runStateRank(s model.RunState) int {
model.StateWaitingReboot, model.StateWaitingReboot,
model.StateBooting, model.StateBooting,
model.StateInventoryCheck, model.StateInventoryCheck,
model.StateFirmware,
model.StateSpecValidate, model.StateSpecValidate,
model.StateSMART, model.StateSMART,
model.StateCPUStress, model.StateCPUStress,
model.StateStorage, model.StateStorage,
model.StateNetwork, model.StateNetwork,
model.StateBurn,
model.StateGPU, model.StateGPU,
model.StatePSU, model.StatePSU,
model.StateReporting, model.StateReporting,
@@ -213,11 +215,13 @@ func firstStageState(run *model.Run) model.RunState {
func stageStateByName(name string) (model.RunState, bool) { func stageStateByName(name string) (model.RunState, bool) {
m := map[string]model.RunState{ m := map[string]model.RunState{
"Inventory": model.StateInventoryCheck, "Inventory": model.StateInventoryCheck,
"Firmware": model.StateFirmware,
"SpecValidate": model.StateSpecValidate, "SpecValidate": model.StateSpecValidate,
"SMART": model.StateSMART, "SMART": model.StateSMART,
"CPUStress": model.StateCPUStress, "CPUStress": model.StateCPUStress,
"Storage": model.StateStorage, "Storage": model.StateStorage,
"Network": model.StateNetwork, "Network": model.StateNetwork,
"Burn": model.StateBurn,
"GPU": model.StateGPU, "GPU": model.StateGPU,
"PSU": model.StatePSU, "PSU": model.StatePSU,
"Reporting": model.StateReporting, "Reporting": model.StateReporting,
@@ -312,7 +316,7 @@ func Pipeline(nodes []PipelineNode) templ.Component {
var templ_7745c5c3_Var3 string var templ_7745c5c3_Var3 string
templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String()) templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String())
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 1, Col: 0} return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 1, Col: 0}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -339,7 +343,7 @@ func Pipeline(nodes []PipelineNode) templ.Component {
var templ_7745c5c3_Var5 string var templ_7745c5c3_Var5 string
templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var4).String()) templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var4).String())
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 1, Col: 0} return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 1, Col: 0}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -361,7 +365,7 @@ func Pipeline(nodes []PipelineNode) templ.Component {
var templ_7745c5c3_Var7 string var templ_7745c5c3_Var7 string
templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var6).String()) templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var6).String())
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 1, Col: 0} return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 1, Col: 0}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -374,7 +378,7 @@ func Pipeline(nodes []PipelineNode) templ.Component {
var templ_7745c5c3_Var8 string var templ_7745c5c3_Var8 string
templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(stageMarker(n.State)) templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(stageMarker(n.State))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 275, Col: 77} return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 279, Col: 77}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -387,7 +391,7 @@ func Pipeline(nodes []PipelineNode) templ.Component {
var templ_7745c5c3_Var9 string var templ_7745c5c3_Var9 string
templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(n.Name) templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(n.Name)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 276, Col: 36} return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 280, Col: 36}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -400,7 +404,7 @@ func Pipeline(nodes []PipelineNode) templ.Component {
var templ_7745c5c3_Var10 string var templ_7745c5c3_Var10 string
templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(stageDuration(n)) templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(stageDuration(n))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 277, Col: 50} return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 281, Col: 50}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -454,7 +458,7 @@ func PipelineSection(run *model.Run, nodes []PipelineNode) templ.Component {
var templ_7745c5c3_Var12 string var templ_7745c5c3_Var12 string
templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("pipeline-%d", run.ID)) templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("pipeline-%d", run.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 292, Col: 41} return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 296, Col: 41}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -467,7 +471,7 @@ func PipelineSection(run *model.Run, nodes []PipelineNode) templ.Component {
var templ_7745c5c3_Var13 string var templ_7745c5c3_Var13 string
templ_7745c5c3_Var13, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("pipeline-%d", run.ID)) templ_7745c5c3_Var13, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("pipeline-%d", run.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/pipeline.templ`, Line: 294, Col: 47} return templ.Error{Err: templ_7745c5c3_Err, FileName: `pipeline.templ`, Line: 298, Col: 47}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var13)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var13))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
+34 -30
View File
@@ -8,26 +8,28 @@ import (
) )
// node indexes for the default pipeline layout: pre-stages (3) + stage // node indexes for the default pipeline layout: pre-stages (3) + stage
// rows (9) + terminal Completed (1) = 13 nodes. // rows (11) + terminal Completed (1) = 15 nodes.
const ( const (
idxQueued = 0 idxQueued = 0
idxWaitingReboot = 1 idxWaitingReboot = 1
idxBooting = 2 idxBooting = 2
idxInventory = 3 idxInventory = 3
idxSpecValidate = 4 idxFirmware = 4
idxSMART = 5 idxSpecValidate = 5
idxCPUStress = 6 idxSMART = 6
idxStorage = 7 idxCPUStress = 7
idxNetwork = 8 idxStorage = 8
idxGPU = 9 idxNetwork = 9
idxPSU = 10 idxBurn = 10
idxReporting = 11 idxGPU = 11
idxCompleted = 12 idxPSU = 12
idxReporting = 13
idxCompleted = 14
) )
// seedStages returns a fresh all-pending stage slice in the canonical order. // seedStages returns a fresh all-pending stage slice in the canonical order.
func seedStages() []model.Stage { func seedStages() []model.Stage {
names := []string{"Inventory", "SpecValidate", "SMART", "CPUStress", "Storage", "Network", "GPU", "PSU", "Reporting"} names := []string{"Inventory", "Firmware", "SpecValidate", "SMART", "CPUStress", "Storage", "Network", "Burn", "GPU", "PSU", "Reporting"}
out := make([]model.Stage, len(names)) out := make([]model.Stage, len(names))
for i, n := range names { for i, n := range names {
out[i] = model.Stage{Name: n, Ordinal: i, State: model.StagePending} out[i] = model.Stage{Name: n, Ordinal: i, State: model.StagePending}
@@ -37,10 +39,10 @@ func seedStages() []model.Stage {
func TestBuildPipeline_NoRun(t *testing.T) { func TestBuildPipeline_NoRun(t *testing.T) {
nodes := BuildPipeline(nil, nil) nodes := BuildPipeline(nil, nil)
	// Ghost pipeline: 3 pre-stages + 9 stage ghosts + 1 terminal = 13 // Ghost pipeline: 3 pre-stages + 11 stage ghosts + 1 terminal = 15
// nodes, all pending. // nodes, all pending.
if len(nodes) != 13 { if len(nodes) != 15 {
t.Fatalf("len = %d, want 13", len(nodes)) t.Fatalf("len = %d, want 15", len(nodes))
} }
for i, n := range nodes { for i, n := range nodes {
if n.State != "pending" { if n.State != "pending" {
@@ -56,8 +58,8 @@ func TestBuildPipeline_NoRun(t *testing.T) {
func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) { func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
run := &model.Run{State: model.StateWaitingReboot} run := &model.Run{State: model.StateWaitingReboot}
nodes := BuildPipeline(run, nil) nodes := BuildPipeline(run, nil)
if len(nodes) != 13 { if len(nodes) != 15 {
t.Fatalf("len = %d, want 13", len(nodes)) t.Fatalf("len = %d, want 15", len(nodes))
} }
if nodes[idxQueued].State != "passed" { if nodes[idxQueued].State != "passed" {
t.Errorf("Queued = %q, want passed", nodes[idxQueued].State) t.Errorf("Queued = %q, want passed", nodes[idxQueued].State)
@@ -65,7 +67,7 @@ func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
if nodes[idxWaitingReboot].State != "running" { if nodes[idxWaitingReboot].State != "running" {
t.Errorf("WaitingReboot = %q, want running", nodes[idxWaitingReboot].State) t.Errorf("WaitingReboot = %q, want running", nodes[idxWaitingReboot].State)
} }
// All 9 stage ghosts must be pending — nothing has started yet. // All 11 stage ghosts must be pending — nothing has started yet.
for i := idxInventory; i <= idxReporting; i++ { for i := idxInventory; i <= idxReporting; i++ {
if nodes[i].State != "pending" { if nodes[i].State != "pending" {
t.Errorf("%s (ghost) = %q, want pending", nodes[i].Name, nodes[i].State) t.Errorf("%s (ghost) = %q, want pending", nodes[i].Name, nodes[i].State)
@@ -81,19 +83,20 @@ func TestBuildPipeline_GhostStagesBeforeClaim(t *testing.T) {
// pending ghosts rather than silently disappearing. // pending ghosts rather than silently disappearing.
func TestBuildPipeline_GhostStagesDuringStage(t *testing.T) { func TestBuildPipeline_GhostStagesDuringStage(t *testing.T) {
run := &model.Run{State: model.StateSMART} run := &model.Run{State: model.StateSMART}
// Only Inventory + SpecValidate seeded; SMART onwards are ghosts. // Only Inventory + Firmware + SpecValidate seeded; SMART onwards are ghosts.
stages := []model.Stage{ stages := []model.Stage{
{Name: "Inventory", Ordinal: 0, State: model.StagePassed}, {Name: "Inventory", Ordinal: 0, State: model.StagePassed},
{Name: "SpecValidate", Ordinal: 1, State: model.StagePassed}, {Name: "Firmware", Ordinal: 1, State: model.StagePassed},
{Name: "SpecValidate", Ordinal: 2, State: model.StagePassed},
} }
nodes := BuildPipeline(run, stages) nodes := BuildPipeline(run, stages)
if len(nodes) != 13 { if len(nodes) != 15 {
t.Fatalf("len = %d, want 13", len(nodes)) t.Fatalf("len = %d, want 15", len(nodes))
} }
if nodes[idxSMART].State != "running" { if nodes[idxSMART].State != "running" {
t.Errorf("SMART (ghost) = %q, want running", nodes[idxSMART].State) t.Errorf("SMART (ghost) = %q, want running", nodes[idxSMART].State)
} }
for _, i := range []int{idxCPUStress, idxStorage, idxNetwork, idxGPU, idxPSU, idxReporting} { for _, i := range []int{idxCPUStress, idxStorage, idxNetwork, idxBurn, idxGPU, idxPSU, idxReporting} {
if nodes[i].State != "pending" { if nodes[i].State != "pending" {
t.Errorf("%s (ghost) = %q, want pending", nodes[i].Name, nodes[i].State) t.Errorf("%s (ghost) = %q, want pending", nodes[i].Name, nodes[i].State)
} }
@@ -103,12 +106,13 @@ func TestBuildPipeline_GhostStagesDuringStage(t *testing.T) {
func TestBuildPipeline_Running(t *testing.T) { func TestBuildPipeline_Running(t *testing.T) {
run := &model.Run{State: model.StateSMART} run := &model.Run{State: model.StateSMART}
stages := seedStages() stages := seedStages()
stages[0].State = model.StagePassed stages[0].State = model.StagePassed // Inventory
stages[1].State = model.StagePassed stages[1].State = model.StagePassed // Firmware
stages[2].State = model.StageRunning stages[2].State = model.StagePassed // SpecValidate
stages[3].State = model.StageRunning // SMART
nodes := BuildPipeline(run, stages) nodes := BuildPipeline(run, stages)
if len(nodes) != 13 { if len(nodes) != 15 {
t.Fatalf("len = %d, want 13", len(nodes)) t.Fatalf("len = %d, want 15", len(nodes))
} }
// Pre-stages are all past for a run that has reached SMART. // Pre-stages are all past for a run that has reached SMART.
for i := idxQueued; i <= idxBooting; i++ { for i := idxQueued; i <= idxBooting; i++ {
@@ -136,10 +140,10 @@ func TestBuildPipeline_Running(t *testing.T) {
func TestBuildPipeline_Failed(t *testing.T) { func TestBuildPipeline_Failed(t *testing.T) {
run := &model.Run{State: model.StateFailedHolding, FailedStage: "Storage"} run := &model.Run{State: model.StateFailedHolding, FailedStage: "Storage"}
stages := seedStages() stages := seedStages()
for i := 0; i <= 3; i++ { for i := 0; i <= 4; i++ {
stages[i].State = model.StagePassed stages[i].State = model.StagePassed
} }
stages[4].State = model.StageFailed // Storage stages[5].State = model.StageFailed // Storage
nodes := BuildPipeline(run, stages) nodes := BuildPipeline(run, stages)
// Pre-stages are past a run that reached Storage. // Pre-stages are past a run that reached Storage.
for i := idxQueued; i <= idxBooting; i++ { for i := idxQueued; i <= idxBooting; i++ {
@@ -150,7 +154,7 @@ func TestBuildPipeline_Failed(t *testing.T) {
if nodes[idxStorage].State != "failed" { if nodes[idxStorage].State != "failed" {
t.Errorf("Storage = %q, want failed", nodes[idxStorage].State) t.Errorf("Storage = %q, want failed", nodes[idxStorage].State)
} }
for _, i := range []int{idxNetwork, idxGPU, idxPSU, idxReporting} { for _, i := range []int{idxNetwork, idxBurn, idxGPU, idxPSU, idxReporting} {
if nodes[i].State != "skipped" { if nodes[i].State != "skipped" {
t.Errorf("%s = %q, want skipped", nodes[i].Name, nodes[i].State) t.Errorf("%s = %q, want skipped", nodes[i].Name, nodes[i].State)
} }
+8 -8
View File
@@ -64,7 +64,7 @@ func Registration(form RegistrationForm) templ.Component {
var templ_7745c5c3_Var3 string var templ_7745c5c3_Var3 string
templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(form.Error) templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(form.Error)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 22, Col: 35} return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 22, Col: 35}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -83,7 +83,7 @@ func Registration(form RegistrationForm) templ.Component {
var templ_7745c5c3_Var4 string var templ_7745c5c3_Var4 string
templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs("curl -fsSL " + form.QuickRegisterURL + "/register/quick.sh | sudo bash") templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs("curl -fsSL " + form.QuickRegisterURL + "/register/quick.sh | sudo bash")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 28, Col: 108} return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 28, Col: 108}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -101,7 +101,7 @@ func Registration(form RegistrationForm) templ.Component {
var templ_7745c5c3_Var5 string var templ_7745c5c3_Var5 string
templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(form.Name) templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(form.Name)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 38, Col: 55} return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 38, Col: 55}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -114,7 +114,7 @@ func Registration(form RegistrationForm) templ.Component {
var templ_7745c5c3_Var6 string var templ_7745c5c3_Var6 string
templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(form.MAC) templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(form.MAC)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 42, Col: 53} return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 42, Col: 53}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -127,7 +127,7 @@ func Registration(form RegistrationForm) templ.Component {
var templ_7745c5c3_Var7 string var templ_7745c5c3_Var7 string
templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(form.WoLBroadcastIP) templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(form.WoLBroadcastIP)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 47, Col: 78} return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 47, Col: 78}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -140,7 +140,7 @@ func Registration(form RegistrationForm) templ.Component {
var templ_7745c5c3_Var8 string var templ_7745c5c3_Var8 string
templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(defaultPort(form.WoLPort)) templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(defaultPort(form.WoLPort))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 51, Col: 78} return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 51, Col: 78}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -153,7 +153,7 @@ func Registration(form RegistrationForm) templ.Component {
var templ_7745c5c3_Var9 string var templ_7745c5c3_Var9 string
templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(form.ExpectedSpecYAML) templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(form.ExpectedSpecYAML)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 56, Col: 127} return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 56, Col: 127}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -166,7 +166,7 @@ func Registration(form RegistrationForm) templ.Component {
var templ_7745c5c3_Var10 string var templ_7745c5c3_Var10 string
templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(form.Notes) templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(form.Notes)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/registration.templ`, Line: 60, Col: 51} return templ.Error{Err: templ_7745c5c3_Err, FileName: `registration.templ`, Line: 60, Col: 51}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
+1
View File
@@ -83,6 +83,7 @@ templ RunHeader(d RunPageData) {
<div class="run-header-left"> <div class="run-header-left">
<h1 class="run-header-name">{ fmt.Sprintf("Run #%d", d.Run.ID) }</h1> <h1 class="run-header-name">{ fmt.Sprintf("Run #%d", d.Run.ID) }</h1>
<span class={ "run-status-badge", "run-status-" + tileMood(&d.Run) }>{ tileStatus(&d.Run) }</span> <span class={ "run-status-badge", "run-status-" + tileMood(&d.Run) }>{ tileStatus(&d.Run) }</span>
<span class={ "run-profile-chip", "run-profile-" + profileChipValue(d.Run.Profile) }>{ profileChipValue(d.Run.Profile) }</span>
<span class="run-duration">{ runDuration(&d.Run) }</span> <span class="run-duration">{ runDuration(&d.Run) }</span>
if d.Run.FailedStage != "" { if d.Run.FailedStage != "" {
<span class="run-failed-stage">failed at <strong>{ d.Run.FailedStage }</strong></span> <span class="run-failed-stage">failed at <strong>{ d.Run.FailedStage }</strong></span>
+242 -207
View File
@@ -286,142 +286,177 @@ func RunHeader(d RunPageData) templ.Component {
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 15, "</span> <span class=\"run-duration\">") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 15, "</span> ")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
var templ_7745c5c3_Var15 string var templ_7745c5c3_Var15 = []any{"run-profile-chip", "run-profile-" + profileChipValue(d.Run.Profile)}
templ_7745c5c3_Var15, templ_7745c5c3_Err = templ.JoinStringErrs(runDuration(&d.Run)) templ_7745c5c3_Err = templ.RenderCSSItems(ctx, templ_7745c5c3_Buffer, templ_7745c5c3_Var15...)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 86, Col: 51}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var15))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 16, "</span> ") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 16, "<span class=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var16 string
templ_7745c5c3_Var16, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var15).String())
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 1, Col: 0}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var16))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 17, "\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var17 string
templ_7745c5c3_Var17, templ_7745c5c3_Err = templ.JoinStringErrs(profileChipValue(d.Run.Profile))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 86, Col: 121}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var17))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 18, "</span> <span class=\"run-duration\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var18 string
templ_7745c5c3_Var18, templ_7745c5c3_Err = templ.JoinStringErrs(runDuration(&d.Run))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 87, Col: 51}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var18))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 19, "</span> ")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
if d.Run.FailedStage != "" { if d.Run.FailedStage != "" {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 17, "<span class=\"run-failed-stage\">failed at <strong>") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 20, "<span class=\"run-failed-stage\">failed at <strong>")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
var templ_7745c5c3_Var16 string var templ_7745c5c3_Var19 string
templ_7745c5c3_Var16, templ_7745c5c3_Err = templ.JoinStringErrs(d.Run.FailedStage) templ_7745c5c3_Var19, templ_7745c5c3_Err = templ.JoinStringErrs(d.Run.FailedStage)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 88, Col: 72} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 89, Col: 72}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var16))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 18, "</strong></span> ")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
if d.SpecDiffCritical > 0 {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 19, "<span class=\"run-diffs bad\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var17 string
templ_7745c5c3_Var17, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d critical diff", d.SpecDiffCritical))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 91, Col: 85}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var17))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 20, "</span>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 21, "</div><div class=\"run-header-right\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
if canCancel(&d.Run) {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 22, "<form method=\"post\" action=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var18 templ.SafeURL
templ_7745c5c3_Var18, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/cancel", d.Host.ID)))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 96, Col: 90}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var18))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 23, "\" class=\"inline\" onsubmit=\"return confirm('Cancel run? Destructive stages may leave the host in an intermediate state requiring manual cleanup.');\"><button type=\"submit\" class=\"btn-danger\">Cancel run</button></form>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
if canOverrideWipe(&d.Run) {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 24, "<form method=\"post\" action=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var19 templ.SafeURL
templ_7745c5c3_Var19, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/override-wipe", d.Host.ID)))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 101, Col: 97}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var19)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var19))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 25, "\" class=\"inline\"><button type=\"submit\" class=\"btn-danger\">Override wipe-probe</button></form>") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 21, "</strong></span> ")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
} }
if hasReport(&d.Run) { if d.SpecDiffCritical > 0 {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 26, "<a class=\"button-like\" href=\"") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 22, "<span class=\"run-diffs bad\">")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
var templ_7745c5c3_Var20 templ.SafeURL var templ_7745c5c3_Var20 string
templ_7745c5c3_Var20, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/reports/%d", d.Run.ID))) templ_7745c5c3_Var20, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d critical diff", d.SpecDiffCritical))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 106, Col: 85} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 92, Col: 85}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var20)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var20))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 27, "\" target=\"_blank\" rel=\"noopener\">View report</a> ") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 23, "</span>")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
} }
if d.Run.State.IsTerminal() { templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 24, "</div><div class=\"run-header-right\">")
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 28, "<form method=\"post\" action=\"") if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
if canCancel(&d.Run) {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 25, "<form method=\"post\" action=\"")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
var templ_7745c5c3_Var21 templ.SafeURL var templ_7745c5c3_Var21 templ.SafeURL
templ_7745c5c3_Var21, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/start", d.Host.ID))) templ_7745c5c3_Var21, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/cancel", d.Host.ID)))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 109, Col: 89} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 97, Col: 90}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var21)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var21))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 29, "\" class=\"inline\"><button type=\"submit\" class=\"btn-primary\">Start new run</button></form>") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 26, "\" class=\"inline\" onsubmit=\"return confirm('Cancel run? Destructive stages may leave the host in an intermediate state requiring manual cleanup.');\"><button type=\"submit\" class=\"btn-danger\">Cancel run</button></form>")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 30, "</div></header>") if canOverrideWipe(&d.Run) {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 27, "<form method=\"post\" action=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var22 templ.SafeURL
templ_7745c5c3_Var22, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/override-wipe", d.Host.ID)))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 102, Col: 97}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var22))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 28, "\" class=\"inline\"><button type=\"submit\" class=\"btn-danger\">Override wipe-probe</button></form>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
if hasReport(&d.Run) {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 29, "<a class=\"button-like\" href=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var23 templ.SafeURL
templ_7745c5c3_Var23, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/reports/%d", d.Run.ID)))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 107, Col: 85}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var23))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 30, "\" target=\"_blank\" rel=\"noopener\">View report</a> ")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
if d.Run.State.IsTerminal() {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 31, "<form method=\"post\" action=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var24 templ.SafeURL
templ_7745c5c3_Var24, templ_7745c5c3_Err = templ.JoinURLErrs(templ.SafeURL(fmt.Sprintf("/hosts/%d/start", d.Host.ID)))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 110, Col: 89}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var24))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 32, "\" class=\"inline\"><button type=\"submit\" class=\"btn-primary\">Start new run</button></form>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 33, "</div></header>")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
@@ -449,83 +484,83 @@ func HoldBanner(d RunPageData) templ.Component {
}() }()
} }
ctx = templ.InitializeContext(ctx) ctx = templ.InitializeContext(ctx)
templ_7745c5c3_Var22 := templ.GetChildren(ctx) templ_7745c5c3_Var25 := templ.GetChildren(ctx)
if templ_7745c5c3_Var22 == nil { if templ_7745c5c3_Var25 == nil {
templ_7745c5c3_Var22 = templ.NopComponent templ_7745c5c3_Var25 = templ.NopComponent
} }
ctx = templ.ClearChildren(ctx) ctx = templ.ClearChildren(ctx)
if d.Run.State == model.StateFailedHolding && d.Run.HoldIP != "" { if d.Run.State == model.StateFailedHolding && d.Run.HoldIP != "" {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 31, "<section id=\"") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 34, "<section id=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var23 string
templ_7745c5c3_Var23, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 124, Col: 47}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var23))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 32, "\" class=\"hold-banner\" sse-swap=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var24 string
templ_7745c5c3_Var24, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 126, Col: 53}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var24))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 33, "\" hx-swap=\"outerHTML\"><span class=\"hold-banner-label\">Host is holding — SSH available:</span> <code class=\"hold-ssh\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var25 string
templ_7745c5c3_Var25, templ_7745c5c3_Err = templ.JoinStringErrs(sshInvocation(d.HoldKeyPath, d.Run.HoldIP))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 130, Col: 70}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var25))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 34, "</code></section>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
} else {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 35, "<section id=\"")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
var templ_7745c5c3_Var26 string var templ_7745c5c3_Var26 string
templ_7745c5c3_Var26, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID)) templ_7745c5c3_Var26, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 134, Col: 47} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 125, Col: 47}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var26)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var26))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 36, "\" class=\"detail-hold-placeholder\" sse-swap=\"") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 35, "\" class=\"hold-banner\" sse-swap=\"")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
var templ_7745c5c3_Var27 string var templ_7745c5c3_Var27 string
templ_7745c5c3_Var27, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID)) templ_7745c5c3_Var27, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 136, Col: 53} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 127, Col: 53}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var27)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var27))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 37, "\" hx-swap=\"outerHTML\"></section>") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 36, "\" hx-swap=\"outerHTML\"><span class=\"hold-banner-label\">Host is holding — SSH available:</span> <code class=\"hold-ssh\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var28 string
templ_7745c5c3_Var28, templ_7745c5c3_Err = templ.JoinStringErrs(sshInvocation(d.HoldKeyPath, d.Run.HoldIP))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 131, Col: 70}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var28))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 37, "</code></section>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
} else {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 38, "<section id=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var29 string
templ_7745c5c3_Var29, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 135, Col: 47}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var29))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 39, "\" class=\"detail-hold-placeholder\" sse-swap=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var30 string
templ_7745c5c3_Var30, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-hold-%d", d.Run.ID))
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 137, Col: 53}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var30))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 40, "\" hx-swap=\"outerHTML\"></section>")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
@@ -553,138 +588,138 @@ func RunSpecDiffs(d RunPageData) templ.Component {
}() }()
} }
ctx = templ.InitializeContext(ctx) ctx = templ.InitializeContext(ctx)
templ_7745c5c3_Var28 := templ.GetChildren(ctx) templ_7745c5c3_Var31 := templ.GetChildren(ctx)
if templ_7745c5c3_Var28 == nil { if templ_7745c5c3_Var31 == nil {
templ_7745c5c3_Var28 = templ.NopComponent templ_7745c5c3_Var31 = templ.NopComponent
} }
ctx = templ.ClearChildren(ctx) ctx = templ.ClearChildren(ctx)
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 38, "<section id=\"") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 41, "<section id=\"")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
var templ_7745c5c3_Var29 string var templ_7745c5c3_Var32 string
templ_7745c5c3_Var29, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-specdiffs-%d", d.Run.ID)) templ_7745c5c3_Var32, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-specdiffs-%d", d.Run.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 147, Col: 51} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 148, Col: 51}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var29)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var32))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 39, "\" class=\"detail-section detail-diffs\" sse-swap=\"") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 42, "\" class=\"detail-section detail-diffs\" sse-swap=\"")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
var templ_7745c5c3_Var30 string var templ_7745c5c3_Var33 string
templ_7745c5c3_Var30, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-specdiffs-%d", d.Run.ID)) templ_7745c5c3_Var33, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("detail-specdiffs-%d", d.Run.ID))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 149, Col: 57} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 150, Col: 57}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var30)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var33))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 40, "\" hx-swap=\"outerHTML\">") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 43, "\" hx-swap=\"outerHTML\">")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
if len(d.SpecDiffs) > 0 { if len(d.SpecDiffs) > 0 {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 41, "<details") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 44, "<details")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
if hasCriticalDiff(d.SpecDiffs) { if hasCriticalDiff(d.SpecDiffs) {
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 42, " open") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 45, " open")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 43, "><summary><h2>Spec diffs (") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 46, "><summary><h2>Spec diffs (")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
var templ_7745c5c3_Var31 string var templ_7745c5c3_Var34 string
templ_7745c5c3_Var31, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", len(d.SpecDiffs))) templ_7745c5c3_Var34, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", len(d.SpecDiffs)))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 154, Col: 66} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 155, Col: 66}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var31)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var34))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 44, ")</h2></summary><ul class=\"diff-list\">") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, ")</h2></summary><ul class=\"diff-list\">")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
for _, diff := range d.SpecDiffs { for _, diff := range d.SpecDiffs {
var templ_7745c5c3_Var32 = []any{"diff-row", "diff-" + diff.Severity} var templ_7745c5c3_Var35 = []any{"diff-row", "diff-" + diff.Severity}
templ_7745c5c3_Err = templ.RenderCSSItems(ctx, templ_7745c5c3_Buffer, templ_7745c5c3_Var32...) templ_7745c5c3_Err = templ.RenderCSSItems(ctx, templ_7745c5c3_Buffer, templ_7745c5c3_Var35...)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 45, "<li class=\"") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 48, "<li class=\"")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var33 string
templ_7745c5c3_Var33, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var32).String())
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 1, Col: 0}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var33))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 46, "\"><div class=\"diff-field\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var34 string
templ_7745c5c3_Var34, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Field)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 158, Col: 43}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var34))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, "</div><div class=\"diff-expected\">expected: <code>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var35 string
templ_7745c5c3_Var35, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Expected)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 159, Col: 65}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var35))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 48, "</code></div><div class=\"diff-actual\">actual: <code>")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
var templ_7745c5c3_Var36 string var templ_7745c5c3_Var36 string
templ_7745c5c3_Var36, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Actual) templ_7745c5c3_Var36, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var35).String())
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 160, Col: 59} return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 1, Col: 0}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var36)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var36))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 49, "</code></div></li>") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 49, "\"><div class=\"diff-field\">")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var37 string
templ_7745c5c3_Var37, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Field)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 159, Col: 43}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var37))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 50, "</div><div class=\"diff-expected\">expected: <code>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var38 string
templ_7745c5c3_Var38, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Expected)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 160, Col: 65}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var38))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 51, "</code></div><div class=\"diff-actual\">actual: <code>")
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
var templ_7745c5c3_Var39 string
templ_7745c5c3_Var39, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Actual)
if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 161, Col: 59}
}
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var39))
if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err
}
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 52, "</code></div></li>")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 50, "</ul></details>") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 53, "</ul></details>")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
} }
templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 51, "</section>") templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 54, "</section>")
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ_7745c5c3_Err return templ_7745c5c3_Err
} }
+7 -7
View File
@@ -99,7 +99,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
var templ_7745c5c3_Var3 string var templ_7745c5c3_Var3 string
templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("substep-%d-%s-%d", ss.RunID, ss.StageName, ss.Ordinal)) templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("substep-%d-%s-%d", ss.RunID, ss.StageName, ss.Ordinal))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 63, Col: 74} return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 63, Col: 74}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -112,7 +112,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
var templ_7745c5c3_Var4 string var templ_7745c5c3_Var4 string
templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String()) templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String())
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 1, Col: 0} return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 1, Col: 0}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -125,7 +125,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
var templ_7745c5c3_Var5 string var templ_7745c5c3_Var5 string
templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("substep-%d-%s-%d", ss.RunID, ss.StageName, ss.Ordinal)) templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("substep-%d-%s-%d", ss.RunID, ss.StageName, ss.Ordinal))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 65, Col: 80} return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 65, Col: 80}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -147,7 +147,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
var templ_7745c5c3_Var7 string var templ_7745c5c3_Var7 string
templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var6).String()) templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var6).String())
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 1, Col: 0} return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 1, Col: 0}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -160,7 +160,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
var templ_7745c5c3_Var8 string var templ_7745c5c3_Var8 string
templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(subStepMarker(ss.State)) templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(subStepMarker(ss.State))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 68, Col: 96} return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 68, Col: 96}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -173,7 +173,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
var templ_7745c5c3_Var9 string var templ_7745c5c3_Var9 string
templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(ss.Name) templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(ss.Name)
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 69, Col: 38} return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 69, Col: 38}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
@@ -186,7 +186,7 @@ func SubStepRow(ss model.SubStep) templ.Component {
var templ_7745c5c3_Var10 string var templ_7745c5c3_Var10 string
templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(subStepDuration(ss)) templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(subStepDuration(ss))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {
return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 70, Col: 54} return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 70, Col: 54}
} }
_, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10)) _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10))
if templ_7745c5c3_Err != nil { if templ_7745c5c3_Err != nil {