diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index abf2ac7..adda700 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -42,4 +42,20 @@ jobs: GOOS=linux GOARCH=amd64 go build ./... - name: Test - run: go test -race -count=1 ./... + run: go test -race -count=1 -coverprofile=coverage.out ./... + + - name: Coverage summary + run: | + go tool cover -func=coverage.out | tee coverage.txt + go tool cover -html=coverage.out -o coverage.html + + - name: Upload coverage artifact + uses: actions/upload-artifact@v4 + if: always() + with: + name: coverage + path: | + coverage.out + coverage.txt + coverage.html + retention-days: 14 diff --git a/agent/client.go b/agent/client.go index 56dc64c..223996e 100644 --- a/agent/client.go +++ b/agent/client.go @@ -124,6 +124,56 @@ type ClaimResponse struct { // at the right stage instead of silently replaying Inventory and // letting the orchestrator advance past the crashed stage. CurrentState string `json:"current_state"` + // StageConfig carries per-profile stage knobs (Phase 2): stage-level + // timeouts and probe-level durations/modes. Empty when the agent + // talks to a pre-Phase-2 orchestrator; the agent applies compile- + // time defaults in that case. + StageConfig ClaimStageConfig `json:"stage_config"` +} + +// ClaimStageConfig mirrors config.StageConfig server-side — duplicated so +// the agent doesn't need to import internal/config. Durations arrive as +// strings ("2m", "2h") and are parsed by the tests package at the point +// of use. An empty field means "use the agent-side default" so a missing +// knob doesn't silently turn CPUStress / Storage into a no-op. 
+type ClaimStageConfig struct { + Profile string `json:"profile"` + StageTimeouts map[string]string `json:"stage_timeouts,omitempty"` + CPUStress ClaimCPUStressKnobs `json:"cpustress"` + Storage ClaimStorageKnobs `json:"storage"` + Network ClaimNetworkKnobs `json:"network"` + Burn ClaimBurnKnobs `json:"burn"` +} + +type ClaimCPUStressKnobs struct { + CPUPass string `json:"cpu_pass,omitempty"` + MemPass string `json:"mem_pass,omitempty"` + EDACPoll string `json:"edac_poll,omitempty"` +} + +type ClaimStorageKnobs struct { + Mode string `json:"mode,omitempty"` + FioSize string `json:"fio_size,omitempty"` + FioTime string `json:"fio_time,omitempty"` + FioBS string `json:"fio_bs,omitempty"` + FioRW string `json:"fio_rw,omitempty"` + Verify string `json:"verify,omitempty"` +} + +type ClaimNetworkKnobs struct { + Duration string `json:"duration,omitempty"` +} + +// ClaimBurnKnobs mirrors config.BurnKnobs. Duration/CPUWorkers arrive as +// strings so the agent can treat empty as "use compile-time default". +// MemPct is a percentage (0-100); IperfParallel is the parallel stream +// count fed to iperf3 -P. FioOnSpare gates whether fio runs inside Burn. +type ClaimBurnKnobs struct { + Duration string `json:"duration,omitempty"` + CPUWorkers string `json:"cpu_workers,omitempty"` + MemPct int `json:"mem_pct,omitempty"` + FioOnSpare bool `json:"fio_on_spare,omitempty"` + IperfParallel int `json:"iperf_parallel,omitempty"` } type ClaimExpectedDiskSpec struct { diff --git a/agent/probes/edac.go b/agent/probes/edac.go new file mode 100644 index 0000000..45f98c0 --- /dev/null +++ b/agent/probes/edac.go @@ -0,0 +1,70 @@ +package probes + +import ( + "os" + "path/filepath" + "strconv" + "strings" +) + +// EDACSample is one counter reading from /sys/devices/system/edac/mc/. +// Kind is "edac_ce" (correctable ECC errors) or "edac_ue" +// (uncorrectable — always a critical signal). Key identifies the memory +// controller (e.g. "mc0"). 
Value is the cumulative count since boot; +// the threshold evaluator flags it the moment it exceeds 0. +type EDACSample struct { + Kind string + Key string + Value float64 + Unit string +} + +// EDAC returns one EDACSample per (memory-controller × {ce,ue}) pair +// that /sys exposes. Returns an empty slice when EDAC isn't available +// (virtualized host, missing kernel driver, mdadm-style boards without +// a controller node) — callers treat an empty return as "no data", +// not "passed". Errors are swallowed for the same reason: a hot- +// swapped DIMM that makes /sys blink briefly shouldn't fail the stage +// before the real counter can be read. +// +// This is intentionally small — the sidecar polls periodically, so one +// bad read is recovered on the next tick. The counters are monotonic, +// so emitting the current raw value is correct. +func EDAC() []EDACSample { + root := "/sys/devices/system/edac/mc" + entries, err := os.ReadDir(root) + if err != nil { + return nil + } + var out []EDACSample + for _, e := range entries { + name := e.Name() + if !strings.HasPrefix(name, "mc") { + continue + } + base := filepath.Join(root, name) + if ce, ok := readCount(filepath.Join(base, "ce_count")); ok { + out = append(out, EDACSample{Kind: "edac_ce", Key: name, Value: ce, Unit: "count"}) + } + if ue, ok := readCount(filepath.Join(base, "ue_count")); ok { + out = append(out, EDACSample{Kind: "edac_ue", Key: name, Value: ue, Unit: "count"}) + } + } + return out +} + +// readCount reads a single decimal integer from a sysfs file and +// returns it as a float. Returns (0, false) on any failure so callers +// can skip the sample without a diagnostic. 
+func readCount(path string) (float64, bool) { + b, err := os.ReadFile(path) + if err != nil { + return 0, false + } + s := strings.TrimSpace(string(b)) + n, err := strconv.ParseInt(s, 10, 64) + if err != nil { + return 0, false + } + return float64(n), true +} diff --git a/agent/probes/firmware.go b/agent/probes/firmware.go new file mode 100644 index 0000000..db4c37e --- /dev/null +++ b/agent/probes/firmware.go @@ -0,0 +1,496 @@ +package probes + +import ( + "bufio" + "context" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "regexp" + "strings" + "time" +) + +// FirmwareSnapshot is the on-wire shape the agent POSTs alongside the +// Firmware stage result. Mirrors internal/store.FirmwareSnapshot without +// the import — the /result handler converts to the store type and +// persists. One run produces many snapshots (one per BIOS / BMC / NIC +// port / HBA / microcode / NVMe); identifier distinguishes siblings +// (e.g. "eth0" / "eth1"), version is the canonical string to diff. +type FirmwareSnapshot struct { + Component string `json:"component"` // bios|bmc|nic|hba|microcode|nvme_fw + Identifier string `json:"identifier"` + Version string `json:"version"` + Vendor string `json:"vendor,omitempty"` + Raw map[string]string `json:"raw,omitempty"` +} + +// Firmware runs every sub-probe in sequence. Each one is bounded with +// a short timeout so a hung dmidecode / ipmitool / nvme tool can't +// freeze the stage — the probe is best-effort, missing tools produce +// empty output rather than an error. Returns the aggregated slice +// along with a list of probe-level warnings (surfaced in the stage +// summary so operators see which subsystem couldn't be read). 
+func Firmware(ctx context.Context) ([]FirmwareSnapshot, []string) { + var out []FirmwareSnapshot + var warnings []string + + if snap, warn := probeBIOS(ctx); snap != nil { + out = append(out, *snap) + } else if warn != "" { + warnings = append(warnings, warn) + } + if snap, warn := probeBMC(ctx); snap != nil { + out = append(out, *snap) + } else if warn != "" { + warnings = append(warnings, warn) + } + out = append(out, probeNICFirmware(ctx)...) + out = append(out, probeNVMeFirmware(ctx)...) + out = append(out, probeHBAFirmware(ctx)...) + if snap := probeMicrocode(); snap != nil { + out = append(out, *snap) + } + + return out, warnings +} + +// runCmd executes a short-lived command with a per-call timeout. The +// timeout is intentionally aggressive (5 s) because firmware probes +// read device registers and occasionally block forever on a wedged +// controller — the stage should report "no HBA firmware readable" +// rather than hang the pipeline. +func runCmd(ctx context.Context, name string, args ...string) (string, error) { + cctx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + cmd := exec.CommandContext(cctx, name, args...) + out, err := cmd.CombinedOutput() + if err != nil { + return string(out), err + } + return string(out), nil +} + +// ----- BIOS -------------------------------------------------------------- + +// probeBIOS invokes dmidecode -t bios and parses the vendor + version +// lines. dmidecode must run as root; we let it fail gracefully when the +// agent is mis-deployed without privileges. 
+func probeBIOS(ctx context.Context) (*FirmwareSnapshot, string) { + if _, err := exec.LookPath("dmidecode"); err != nil { + return nil, "bios: dmidecode not installed" + } + out, err := runCmd(ctx, "dmidecode", "-t", "bios") + if err != nil { + return nil, fmt.Sprintf("bios: dmidecode failed: %v", trimErr(err, out)) + } + snap := parseDmidecodeBIOS(strings.NewReader(out)) + if snap == nil { + return nil, "bios: dmidecode produced no usable output" + } + return snap, "" +} + +// parseDmidecodeBIOS consumes `dmidecode -t bios` output and pulls +// Vendor / Version / Release Date. Kept as an io.Reader for unit tests. +func parseDmidecodeBIOS(r io.Reader) *FirmwareSnapshot { + kv := parseDmidecodeSection(r, "BIOS Information") + if kv == nil { + return nil + } + snap := &FirmwareSnapshot{ + Component: "bios", + Identifier: "system", + Version: firstNonEmpty(kv["Version"], kv["Firmware Revision"]), + Vendor: kv["Vendor"], + Raw: kv, + } + if snap.Version == "" { + return nil + } + return snap +} + +// parseDmidecodeSection returns the key/value map of the first dmidecode +// handle whose title matches. dmidecode blocks look like: +// Handle 0x0000, ... +// BIOS Information +// Vendor: American Megatrends +// Version: 3.0 +// ... +// With a blank line between blocks. Values like "Characteristics:" +// followed by a bulleted sub-list are collapsed into "…" so we don't +// accidentally swallow the next handle. 
+func parseDmidecodeSection(r io.Reader, title string) map[string]string { + sc := bufio.NewScanner(r) + sc.Buffer(make([]byte, 0, 64*1024), 1024*1024) + var kv map[string]string + var inside, seenTitle bool + for sc.Scan() { + line := sc.Text() + trim := strings.TrimSpace(line) + if strings.HasPrefix(line, "Handle ") { + if seenTitle && kv != nil { + return kv + } + inside = false + kv = nil + continue + } + if !inside { + if trim == title { + inside = true + seenTitle = true + kv = map[string]string{} + } + continue + } + if trim == "" { + continue + } + if k, v, ok := strings.Cut(trim, ":"); ok { + v = strings.TrimSpace(v) + if v == "" { + continue + } + kv[strings.TrimSpace(k)] = v + } + } + if seenTitle { + return kv + } + return nil +} + +// ----- BMC / IPMI -------------------------------------------------------- + +// probeBMC walks `ipmitool mc info`. Home-lab hosts often lack a BMC — +// missing binary or a non-zero exit returns a warning without failing +// the stage. We capture Firmware Revision + Manufacturer as the version. +func probeBMC(ctx context.Context) (*FirmwareSnapshot, string) { + if _, err := exec.LookPath("ipmitool"); err != nil { + return nil, "bmc: ipmitool not installed" + } + out, err := runCmd(ctx, "ipmitool", "mc", "info") + if err != nil { + return nil, fmt.Sprintf("bmc: ipmitool mc info failed: %v", trimErr(err, out)) + } + snap := parseIpmitoolMCInfo(strings.NewReader(out)) + if snap == nil { + return nil, "bmc: ipmitool output not parseable" + } + return snap, "" +} + +// parseIpmitoolMCInfo pulls "Firmware Revision" + "Manufacturer Name" +// from the textual output. Format is indented key : value lines. 
+func parseIpmitoolMCInfo(r io.Reader) *FirmwareSnapshot { + sc := bufio.NewScanner(r) + kv := map[string]string{} + for sc.Scan() { + line := strings.TrimSpace(sc.Text()) + if k, v, ok := strings.Cut(line, ":"); ok { + kv[strings.TrimSpace(k)] = strings.TrimSpace(v) + } + } + version := firstNonEmpty(kv["Firmware Revision"], kv["Aux Firmware Rev Info"]) + if version == "" { + return nil + } + return &FirmwareSnapshot{ + Component: "bmc", + Identifier: "bmc0", + Version: version, + Vendor: kv["Manufacturer Name"], + Raw: kv, + } +} + +// ----- NIC firmware ------------------------------------------------------ + +// probeNICFirmware enumerates /sys/class/net/*/device and calls +// `ethtool -i ` on each real NIC (skip lo, bridges, virtuals). +// One snapshot per interface so a mismatched port lights up in the diff +// without silencing sibling ports. +func probeNICFirmware(ctx context.Context) []FirmwareSnapshot { + if _, err := exec.LookPath("ethtool"); err != nil { + return nil + } + ifaces, err := os.ReadDir("/sys/class/net") + if err != nil { + return nil + } + var out []FirmwareSnapshot + for _, entry := range ifaces { + name := entry.Name() + if !isRealNIC(name) { + continue + } + raw, err := runCmd(ctx, "ethtool", "-i", name) + if err != nil { + continue + } + snap := parseEthtoolI(strings.NewReader(raw), name) + if snap != nil { + out = append(out, *snap) + } + } + return out +} + +// parseEthtoolI extracts driver/firmware-version from `ethtool -i` +// output. Lines are "key: value" with a consistent prefix order. 
+func parseEthtoolI(r io.Reader, iface string) *FirmwareSnapshot { + sc := bufio.NewScanner(r) + kv := map[string]string{} + for sc.Scan() { + line := sc.Text() + if k, v, ok := strings.Cut(line, ":"); ok { + kv[strings.TrimSpace(k)] = strings.TrimSpace(v) + } + } + if kv["firmware-version"] == "" && kv["driver"] == "" { + return nil + } + return &FirmwareSnapshot{ + Component: "nic", + Identifier: iface, + Version: kv["firmware-version"], + Vendor: kv["driver"], + Raw: kv, + } +} + +// isRealNIC filters out loopback, bridges, veth, and the handful of +// virtual kernel devices ethtool will refuse on. +func isRealNIC(name string) bool { + if name == "" || name == "lo" { + return false + } + for _, prefix := range []string{"docker", "br-", "veth", "virbr", "tun", "tap", "bond"} { + if strings.HasPrefix(name, prefix) { + return false + } + } + // Only accept interfaces that have a `device` link — real PCI NICs + // do; pure virtuals (dummy0, wg*) don't. + if _, err := os.Stat(filepath.Join("/sys/class/net", name, "device")); err != nil { + return false + } + return true +} + +// ----- NVMe -------------------------------------------------------------- + +// probeNVMeFirmware reads /sys/class/nvme/nvmeN/firmware_rev for every +// controller. Falls back to `nvme id-ctrl` if the sysfs file is missing +// (older kernels). Identifier is the controller path so a run with two +// drives produces two snapshots. +func probeNVMeFirmware(ctx context.Context) []FirmwareSnapshot { + entries, err := os.ReadDir("/sys/class/nvme") + if err != nil { + return nil + } + var out []FirmwareSnapshot + for _, e := range entries { + ctrl := e.Name() + rev := strings.TrimSpace(readFile(filepath.Join("/sys/class/nvme", ctrl, "firmware_rev"))) + model := strings.TrimSpace(readFile(filepath.Join("/sys/class/nvme", ctrl, "model"))) + if rev == "" { + // Fallback: nvme id-ctrl -H /dev/. Available on hosts + // where sysfs doesn't export firmware_rev. 
+ if _, err := exec.LookPath("nvme"); err == nil { + raw, _ := runCmd(ctx, "nvme", "id-ctrl", "/dev/"+ctrl) + rev = parseNVMeIDCtrl(strings.NewReader(raw), "fr") + if model == "" { + model = parseNVMeIDCtrl(strings.NewReader(raw), "mn") + } + } + } + if rev == "" { + continue + } + out = append(out, FirmwareSnapshot{ + Component: "nvme_fw", + Identifier: ctrl, + Version: rev, + Vendor: model, + Raw: map[string]string{"model": model, "firmware_rev": rev}, + }) + } + return out +} + +// parseNVMeIDCtrl pulls a single field out of `nvme id-ctrl` output. +// Format: "fr : FW1234" / "mn : Samsung SSD 980 PRO". +// Leading spaces vary, values may contain spaces. +func parseNVMeIDCtrl(r io.Reader, key string) string { + sc := bufio.NewScanner(r) + prefix := key + " " + for sc.Scan() { + line := strings.TrimSpace(sc.Text()) + if !strings.HasPrefix(line, prefix) { + continue + } + _, v, ok := strings.Cut(line, ":") + if !ok { + continue + } + return strings.TrimSpace(v) + } + return "" +} + +// ----- HBA --------------------------------------------------------------- + +var lspciClassHBA = regexp.MustCompile(`(?i)(serial attached scsi|sas controller|raid bus controller)`) + +// probeHBAFirmware looks for SAS/RAID HBAs via `lspci -Dvvnn`. The +// firmware string is typically exposed as "Product Name" + +// "Capabilities" but in practice the LSI/Broadcom driver writes a +// "revision" on the device line. We capture what's printed and rely on +// SpecValidate to diff — this keeps us off tool-specific CLIs (storcli, +// mpt-status) that aren't always installed. +func probeHBAFirmware(ctx context.Context) []FirmwareSnapshot { + if _, err := exec.LookPath("lspci"); err != nil { + return nil + } + out, err := runCmd(ctx, "lspci", "-Dvvnn") + if err != nil { + return nil + } + return parseLspciHBA(strings.NewReader(out)) +} + +// parseLspciHBA walks `lspci -Dvvnn` stanzas and picks SAS/RAID +// controllers. One snapshot per device; identifier is the PCI address. 
+// Version is the device line's revision (rev NN) or the Kernel modules +// string when no rev is printed. +func parseLspciHBA(r io.Reader) []FirmwareSnapshot { + sc := bufio.NewScanner(r) + sc.Buffer(make([]byte, 0, 64*1024), 1024*1024) + var out []FirmwareSnapshot + var cur *FirmwareSnapshot + revRe := regexp.MustCompile(`\(rev\s+([0-9a-fA-F]+)\)`) + flush := func() { + if cur != nil && cur.Version != "" { + out = append(out, *cur) + } + cur = nil + } + for sc.Scan() { + line := sc.Text() + if !strings.HasPrefix(line, "\t") && strings.Contains(line, " ") { + // New device line. + flush() + if lspciClassHBA.MatchString(line) { + addr, rest, _ := strings.Cut(line, " ") + cur = &FirmwareSnapshot{ + Component: "hba", + Identifier: addr, + Vendor: strings.TrimSpace(rest), + Raw: map[string]string{"device_line": line}, + } + if m := revRe.FindStringSubmatch(line); len(m) == 2 { + cur.Version = "rev " + m[1] + } + } + continue + } + if cur == nil { + continue + } + trim := strings.TrimSpace(line) + if strings.HasPrefix(trim, "Kernel modules:") { + cur.Raw["kernel_modules"] = strings.TrimPrefix(trim, "Kernel modules:") + } + if strings.HasPrefix(trim, "Kernel driver in use:") { + cur.Raw["kernel_driver"] = strings.TrimPrefix(trim, "Kernel driver in use:") + } + } + flush() + return out +} + +// ----- Microcode --------------------------------------------------------- + +// probeMicrocode reads /proc/cpuinfo for the "microcode" line. All +// cores report the same value post-boot, so one snapshot is enough. 
+func probeMicrocode() *FirmwareSnapshot { + f, err := os.Open("/proc/cpuinfo") + if err != nil { + return nil + } + defer func() { _ = f.Close() }() + snap := parseMicrocode(f) + return snap +} + +func parseMicrocode(r io.Reader) *FirmwareSnapshot { + sc := bufio.NewScanner(r) + version := "" + vendor := "" + for sc.Scan() { + line := sc.Text() + k, v, ok := strings.Cut(line, ":") + if !ok { + continue + } + key := strings.TrimSpace(k) + val := strings.TrimSpace(v) + switch key { + case "microcode": + if version == "" { + version = val + } + case "vendor_id": + if vendor == "" { + vendor = val + } + } + if version != "" && vendor != "" { + break + } + } + if version == "" { + return nil + } + return &FirmwareSnapshot{ + Component: "microcode", + Identifier: "cpu", + Version: version, + Vendor: vendor, + } +} + +// ----- helpers ----------------------------------------------------------- + +func firstNonEmpty(ss ...string) string { + for _, s := range ss { + if strings.TrimSpace(s) != "" { + return s + } + } + return "" +} + +func readFile(p string) string { + b, err := os.ReadFile(p) + if err != nil { + return "" + } + return string(b) +} + +// trimErr joins the underlying error with the first line of combined +// output so the warning message carries enough diagnostic context +// without dumping a screenful of dmidecode/ipmitool noise. +func trimErr(err error, out string) string { + firstLine := strings.SplitN(strings.TrimSpace(out), "\n", 2)[0] + if firstLine == "" { + return err.Error() + } + return fmt.Sprintf("%v (%s)", err, firstLine) +} diff --git a/agent/probes/firmware_test.go b/agent/probes/firmware_test.go new file mode 100644 index 0000000..44ed938 --- /dev/null +++ b/agent/probes/firmware_test.go @@ -0,0 +1,232 @@ +package probes + +import ( + "strings" + "testing" +) + +// Golden dmidecode -t bios output (trimmed, representative). A real +// host will have more lines; parse must tolerate the unknown fields. 
+const dmidecodeBIOS = `# dmidecode 3.3 +Getting SMBIOS data from sysfs. +SMBIOS 3.2.0 present. + +Handle 0x0000, DMI type 0, 26 bytes +BIOS Information + Vendor: American Megatrends Inc. + Version: 3.2 + Release Date: 07/15/2021 + Address: 0xF0000 + Runtime Size: 64 kB + ROM Size: 32 MB + Characteristics: + PCI is supported + BIOS is upgradeable + +Handle 0x0001, DMI type 1, 27 bytes +System Information + Manufacturer: Supermicro + Product Name: X11SSL-F +` + +func TestParseDmidecodeBIOS(t *testing.T) { + snap := parseDmidecodeBIOS(strings.NewReader(dmidecodeBIOS)) + if snap == nil { + t.Fatal("parseDmidecodeBIOS returned nil") + } + if snap.Component != "bios" { + t.Errorf("component = %q, want bios", snap.Component) + } + if snap.Version != "3.2" { + t.Errorf("version = %q, want 3.2", snap.Version) + } + if snap.Vendor != "American Megatrends Inc." { + t.Errorf("vendor = %q, want American Megatrends Inc.", snap.Vendor) + } + if snap.Raw["Release Date"] != "07/15/2021" { + t.Errorf("release date = %q, want 07/15/2021", snap.Raw["Release Date"]) + } +} + +func TestParseDmidecodeBIOSMissingBlock(t *testing.T) { + // No BIOS Information block → nil result, not a crash. 
+ input := "Handle 0x0001, DMI type 1, 27 bytes\nSystem Information\n\tManufacturer: Acme\n" + if snap := parseDmidecodeBIOS(strings.NewReader(input)); snap != nil { + t.Fatalf("expected nil when BIOS block absent, got %+v", snap) + } +} + +const ipmitoolMCInfo = `Device ID : 32 +Device Revision : 1 +Firmware Revision : 1.74 +IPMI Version : 2.0 +Manufacturer ID : 10876 +Manufacturer Name : Supermicro +Product ID : 2051 (0x0803) +Product Name : Unknown (0x803) +` + +func TestParseIpmitoolMCInfo(t *testing.T) { + snap := parseIpmitoolMCInfo(strings.NewReader(ipmitoolMCInfo)) + if snap == nil { + t.Fatal("parseIpmitoolMCInfo returned nil") + } + if snap.Component != "bmc" { + t.Errorf("component = %q, want bmc", snap.Component) + } + if snap.Version != "1.74" { + t.Errorf("version = %q, want 1.74", snap.Version) + } + if snap.Vendor != "Supermicro" { + t.Errorf("vendor = %q, want Supermicro", snap.Vendor) + } +} + +func TestParseIpmitoolMCInfoEmpty(t *testing.T) { + if snap := parseIpmitoolMCInfo(strings.NewReader("")); snap != nil { + t.Fatalf("expected nil on empty input, got %+v", snap) + } +} + +const ethtoolEth0 = `driver: mlx5_core +version: 5.15.0 +firmware-version: 16.32.1010 (MT_0000000008) +expansion-rom-version: +bus-info: 0000:5e:00.0 +supports-statistics: yes +` + +func TestParseEthtoolI(t *testing.T) { + snap := parseEthtoolI(strings.NewReader(ethtoolEth0), "eth0") + if snap == nil { + t.Fatal("parseEthtoolI returned nil") + } + if snap.Component != "nic" || snap.Identifier != "eth0" { + t.Errorf("component/id = %q/%q, want nic/eth0", snap.Component, snap.Identifier) + } + if snap.Version != "16.32.1010 (MT_0000000008)" { + t.Errorf("version = %q, want 16.32.1010 (MT_0000000008)", snap.Version) + } + if snap.Vendor != "mlx5_core" { + t.Errorf("vendor = %q, want mlx5_core", snap.Vendor) + } +} + +func TestParseEthtoolIEmpty(t *testing.T) { + if snap := parseEthtoolI(strings.NewReader("not a valid output"), "eth0"); snap != nil { + t.Fatalf("expected nil 
on garbage input, got %+v", snap) + } +} + +const nvmeIDCtrl = `NVME Identify Controller: +vid : 0x144d +ssvid : 0x144d +sn : S5GYNX0R500123X +mn : Samsung SSD 980 PRO 1TB +fr : 5B2QGXA7 +rab : 2 +` + +func TestParseNVMeIDCtrl(t *testing.T) { + if got := parseNVMeIDCtrl(strings.NewReader(nvmeIDCtrl), "fr"); got != "5B2QGXA7" { + t.Errorf("fr = %q, want 5B2QGXA7", got) + } + if got := parseNVMeIDCtrl(strings.NewReader(nvmeIDCtrl), "mn"); got != "Samsung SSD 980 PRO 1TB" { + t.Errorf("mn = %q, want Samsung SSD 980 PRO 1TB", got) + } + if got := parseNVMeIDCtrl(strings.NewReader(nvmeIDCtrl), "missing"); got != "" { + t.Errorf("missing key should be empty, got %q", got) + } +} + +const lspciHBA = `0000:01:00.0 Ethernet controller [0200]: Intel Corporation I350 [8086:1521] (rev 01) + Subsystem: Intel Corporation I350 [8086:0001] + Kernel driver in use: igb + Kernel modules: igb + +0000:03:00.0 Serial Attached SCSI controller [0107]: Broadcom / LSI SAS3008 PCI-Express Fusion-MPT SAS-3 [1000:0097] (rev 02) + Subsystem: Broadcom / LSI SAS9300-8i [1000:30e0] + Kernel driver in use: mpt3sas + Kernel modules: mpt3sas + +0000:04:00.0 RAID bus controller [0104]: LSI MegaRAID SAS-3 3108 [1000:005d] (rev 02) + Subsystem: LSI MegaRAID SAS 9361-8i [1000:9361] + Kernel driver in use: megaraid_sas + Kernel modules: megaraid_sas +` + +func TestParseLspciHBA(t *testing.T) { + got := parseLspciHBA(strings.NewReader(lspciHBA)) + if len(got) != 2 { + t.Fatalf("got %d HBA snapshots, want 2 (SAS + RAID; Ethernet must be skipped)", len(got)) + } + for _, s := range got { + if s.Component != "hba" { + t.Errorf("component = %q, want hba", s.Component) + } + if s.Version != "rev 02" { + t.Errorf("version = %q, want 'rev 02'", s.Version) + } + } + if got[0].Identifier != "0000:03:00.0" { + t.Errorf("first identifier = %q, want 0000:03:00.0", got[0].Identifier) + } + if got[1].Identifier != "0000:04:00.0" { + t.Errorf("second identifier = %q, want 0000:04:00.0", got[1].Identifier) + } +} + +const 
cpuinfo = `processor : 0 +vendor_id : GenuineIntel +cpu family : 6 +model : 85 +model name : Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz +stepping : 7 +microcode : 0x5003006 +cpu MHz : 2100.000 +` + +func TestParseMicrocode(t *testing.T) { + snap := parseMicrocode(strings.NewReader(cpuinfo)) + if snap == nil { + t.Fatal("parseMicrocode returned nil") + } + if snap.Version != "0x5003006" { + t.Errorf("version = %q, want 0x5003006", snap.Version) + } + if snap.Vendor != "GenuineIntel" { + t.Errorf("vendor = %q, want GenuineIntel", snap.Vendor) + } + if snap.Identifier != "cpu" { + t.Errorf("identifier = %q, want cpu", snap.Identifier) + } +} + +func TestParseMicrocodeMissing(t *testing.T) { + // A /proc/cpuinfo without a microcode line returns nil. + input := "processor\t: 0\nvendor_id\t: GenuineIntel\n" + if snap := parseMicrocode(strings.NewReader(input)); snap != nil { + t.Fatalf("expected nil when microcode line absent, got %+v", snap) + } +} + +func TestIsRealNIC(t *testing.T) { + cases := []struct { + name string + want bool // want=true means a real-looking name (the /sys/class/net//device check is skipped here) + }{ + {"lo", false}, + {"", false}, + {"docker0", false}, + {"br-abc", false}, + {"veth1234", false}, + {"virbr0", false}, + {"bond0", false}, + {"tun0", false}, + } + for _, tc := range cases { + if got := isRealNIC(tc.name); got != tc.want { + t.Errorf("isRealNIC(%q) = %v, want %v", tc.name, got, tc.want) + } + } +} diff --git a/agent/probes/netdev.go b/agent/probes/netdev.go new file mode 100644 index 0000000..f059d9a --- /dev/null +++ b/agent/probes/netdev.go @@ -0,0 +1,85 @@ +package probes + +import ( + "bufio" + "io" + "os" + "strconv" + "strings" +) + +// NetDevSnapshot is the per-interface counter row from /proc/net/dev at +// a single instant. Used by the Network stage to compute deltas across +// an iperf window — a rising rx_errors or tx_dropped during a loaded +// link is a real NIC problem, not general noise. 
+type NetDevSnapshot struct { + Iface string + RxBytes uint64 + RxErrs uint64 + RxDrop uint64 + TxBytes uint64 + TxErrs uint64 + TxDrop uint64 +} + +// NetDev reads /proc/net/dev and returns one snapshot per non-loopback +// interface. Returns nil on read/parse failure (best-effort: a missing +// /proc is survivable; the caller skips delta reporting that tick). +func NetDev() []NetDevSnapshot { + f, err := os.Open("/proc/net/dev") + if err != nil { + return nil + } + defer func() { _ = f.Close() }() + return parseNetDev(f) +} + +// parseNetDev is split from NetDev so tests can feed a fixture without +// touching the real /proc. The /proc/net/dev format is two header lines +// followed by rows of "iface: rx_bytes rx_packets rx_errs rx_drop ... tx_bytes tx_packets tx_errs tx_drop ..." +// — 16 whitespace-separated counters, of which we pull a curated six. +func parseNetDev(r io.Reader) []NetDevSnapshot { + var out []NetDevSnapshot + sc := bufio.NewScanner(r) + // Skip the two header lines (iface || bytes ... || bytes ...). 
+ for i := 0; i < 2 && sc.Scan(); i++ { + } + for sc.Scan() { + line := strings.TrimSpace(sc.Text()) + if line == "" { + continue + } + colon := strings.IndexByte(line, ':') + if colon < 0 { + continue + } + iface := strings.TrimSpace(line[:colon]) + if iface == "" || iface == "lo" { + continue + } + fields := strings.Fields(line[colon+1:]) + if len(fields) < 16 { + continue + } + // /proc/net/dev columns: + // 0 rx_bytes 1 rx_packets 2 rx_errs 3 rx_drop 4 fifo 5 frame 6 compressed 7 multicast + // 8 tx_bytes 9 tx_packets 10 tx_errs 11 tx_drop 12 fifo 13 colls 14 carrier 15 compressed + snap := NetDevSnapshot{Iface: iface} + snap.RxBytes = parseU64(fields[0]) + snap.RxErrs = parseU64(fields[2]) + snap.RxDrop = parseU64(fields[3]) + snap.TxBytes = parseU64(fields[8]) + snap.TxErrs = parseU64(fields[10]) + snap.TxDrop = parseU64(fields[11]) + out = append(out, snap) + } + return out +} + +func parseU64(s string) uint64 { + n, err := strconv.ParseUint(s, 10, 64) + if err != nil { + return 0 + } + return n +} diff --git a/agent/probes/netdev_test.go b/agent/probes/netdev_test.go new file mode 100644 index 0000000..f443a5d --- /dev/null +++ b/agent/probes/netdev_test.go @@ -0,0 +1,84 @@ +package probes + +import ( + "strings" + "testing" +) + +// TestParseNetDev_RealSample exercises parseNetDev against a synthetic +// /proc/net/dev fixture with the full 16-column layout. Confirms the +// loopback interface is dropped, headers are skipped, and each of the +// six curated counters lands in the right field. 
+func TestParseNetDev_RealSample(t *testing.T) { + // Columns after "iface:": + // 0 rx_bytes 1 rx_packets 2 rx_errs 3 rx_drop + // 4 fifo 5 frame 6 compressed 7 multicast + // 8 tx_bytes 9 tx_packets 10 tx_errs 11 tx_drop + // 12 fifo 13 colls 14 carrier 15 compressed + fixture := `Inter-| Receive | Transmit + face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed + lo: 1000000 10000 0 0 0 0 0 0 1000000 10000 0 0 0 0 0 0 + eth0: 50000000 100000 7 12 0 0 0 0 40000000 90000 3 5 0 0 0 0 + eth1: 12345 200 0 0 0 0 0 0 54321 180 0 0 0 0 0 0 +` + snaps := parseNetDev(strings.NewReader(fixture)) + if len(snaps) != 2 { + t.Fatalf("got %d snapshots, want 2 (lo should be dropped)", len(snaps)) + } + byIface := map[string]NetDevSnapshot{} + for _, s := range snaps { + byIface[s.Iface] = s + } + eth0, ok := byIface["eth0"] + if !ok { + t.Fatalf("eth0 missing from parsed snapshots") + } + if eth0.RxBytes != 50000000 { + t.Errorf("eth0 RxBytes=%d, want 50000000", eth0.RxBytes) + } + if eth0.RxErrs != 7 { + t.Errorf("eth0 RxErrs=%d, want 7", eth0.RxErrs) + } + if eth0.RxDrop != 12 { + t.Errorf("eth0 RxDrop=%d, want 12", eth0.RxDrop) + } + if eth0.TxBytes != 40000000 { + t.Errorf("eth0 TxBytes=%d, want 40000000", eth0.TxBytes) + } + if eth0.TxErrs != 3 { + t.Errorf("eth0 TxErrs=%d, want 3", eth0.TxErrs) + } + if eth0.TxDrop != 5 { + t.Errorf("eth0 TxDrop=%d, want 5", eth0.TxDrop) + } + if _, ok := byIface["lo"]; ok { + t.Errorf("lo should have been filtered out") + } +} + +// TestParseNetDev_Empty: an empty reader returns no snapshots, not a +// crash. Callers treat nil as "no data" and skip the delta step. +func TestParseNetDev_Empty(t *testing.T) { + snaps := parseNetDev(strings.NewReader("")) + if len(snaps) != 0 { + t.Errorf("got %d snapshots from empty reader, want 0", len(snaps)) + } +} + +// TestParseNetDev_MalformedRow skips rows that don't have the expected +// 16 columns rather than panicking. 
A truncated line shouldn't hide the +// good rows that follow. +func TestParseNetDev_MalformedRow(t *testing.T) { + fixture := `header line 1 +header line 2 + bad0: 123 456 + eth0: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 +` + snaps := parseNetDev(strings.NewReader(fixture)) + if len(snaps) != 1 { + t.Fatalf("got %d snapshots, want 1 (bad0 should be dropped)", len(snaps)) + } + if snaps[0].Iface != "eth0" { + t.Errorf("got iface=%q, want eth0", snaps[0].Iface) + } +} diff --git a/agent/runner.go b/agent/runner.go index 5141cd2..b567d54 100644 --- a/agent/runner.go +++ b/agent/runner.go @@ -26,6 +26,7 @@ import ( "os" "os/exec" "path/filepath" + "strings" "sync" "sync/atomic" "time" @@ -71,7 +72,10 @@ func Run(ctx context.Context, p *bootstate.Params) error { } fwd.info(fmt.Sprintf("claimed run; stages=%v current_state=%s", claim.Stages, claim.CurrentState)) - go thermalSidecar(ctx, c, fwd) + mux := NewSensorMux(ctx, c) + defer mux.Close() + + go thermalSidecar(ctx, mux, fwd) hbCh := make(chan HeartbeatResponse, 4) go heartbeatLoop(ctx, c, fwd, hbCh) @@ -101,7 +105,7 @@ func Run(ctx context.Context, p *bootstate.Params) error { default: } fwd.info("stage: starting " + nextStage) - outcome := runStageCancellable(ctx, nextStage, claim, fwd, c, overrideFlags{}) + outcome := runStageCancellable(ctx, nextStage, claim, fwd, c, mux, overrideFlags{}) if outcome.Cancelled { fwd.warn("stage cancelled by operator; posting result and exiting") _, _ = postResult(ctx, c, nextStage, outcome) @@ -119,7 +123,7 @@ func Run(ctx context.Context, p *bootstate.Params) error { return err } // Park and wait for an override directive. 
- return waitForOverride(ctx, c, fwd, hbCh, claim) + return waitForOverride(ctx, c, fwd, mux, hbCh, claim) } if resp.NextState == "Completed" || resp.NextState == "" { fwd.info("pipeline complete") @@ -144,10 +148,10 @@ func Run(ctx context.Context, p *bootstate.Params) error { // it runs the inventory probe and passes the result as the /result body // (the orchestrator persists it as an artifact). Every other stage // returns a tests.Outcome which postResult marshals generically. -func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, ovr overrideFlags) stageOutcome { +func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, mux *SensorMux, ovr overrideFlags) stageOutcome { fwd.SetStage(stage) defer fwd.ClearStage() - deps := newDeps(ctx, c, fwd, ovr, claim) + deps := newDeps(ctx, c, fwd, mux, ovr, claim, stage) switch stage { case "Inventory": fwd.info("Inventory: probing host hardware") @@ -163,6 +167,25 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF }, Inventory: inv, } + case "Firmware": + fwd.info("Firmware: probing firmware versions") + snaps, warns := probes.Firmware(ctx) + for _, w := range warns { + fwd.warn(w) + } + summary := firmwareSummary(snaps) + fwd.info("Firmware: " + summary) + return stageOutcome{ + Outcome: tests.Outcome{ + Passed: true, + Summary: summary, + Extras: map[string]any{ + "warnings": warns, + "snapshots": len(snaps), + }, + }, + Firmware: snaps, + } case "SMART": return stageOutcome{Outcome: tests.SMART(ctx, deps)} case "CPUStress": @@ -170,10 +193,19 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF case "Storage": return stageOutcome{Outcome: tests.Storage(ctx, deps)} case "Network": + duration := deps.NetworkKnobs.Duration + if duration <= 0 { + duration = 10 * time.Second + } return stageOutcome{Outcome: tests.Network(ctx, deps, tests.NetworkConfig{ OrchestratorURL: 
c.BaseURL, IperfPort: claim.IperfPort, - Duration: 10 * time.Second, + Duration: duration, + })} + case "Burn": + return stageOutcome{Outcome: tests.Burn(ctx, deps, tests.BurnConfig{ + OrchestratorURL: c.BaseURL, + IperfPort: claim.IperfPort, })} case "GPU": return stageOutcome{Outcome: tests.GPU(ctx, deps)} @@ -188,8 +220,9 @@ func runStage(ctx context.Context, stage string, claim *ClaimResponse, fwd *logF type stageOutcome struct { Outcome tests.Outcome - Inventory *spec.Inventory // only for Inventory stage - Cancelled bool // set when the stage was cut short by operator cancel + Inventory *spec.Inventory // only for Inventory stage + Firmware []probes.FirmwareSnapshot // only for Firmware stage + Cancelled bool // set when the stage was cut short by operator cancel } // runStageCancellable wraps runStage in a per-stage context so the @@ -197,14 +230,14 @@ type stageOutcome struct { // is currently running. If the derived context was cancelled while the // stage executed, the outcome is rewritten as a cancellation record so // the orchestrator has something to persist. -func runStageCancellable(parent context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, ovr overrideFlags) stageOutcome { +func runStageCancellable(parent context.Context, stage string, claim *ClaimResponse, fwd *logForwarder, c *Client, mux *SensorMux, ovr overrideFlags) stageOutcome { stageCtx, cancel := context.WithCancel(parent) stageCancel.Store(cancel) defer func() { cancel() stageCancel.Store(context.CancelFunc(nil)) }() - out := runStage(stageCtx, stage, claim, fwd, c, ovr) + out := runStage(stageCtx, stage, claim, fwd, c, mux, ovr) // If the parent is still live but the stage ctx was cancelled, the // operator fired a cancel — mark the outcome so the caller can exit // the pipeline cleanly. Plain ctx-cancel on ctx.Done (e.g. 
shutdown) @@ -235,7 +268,7 @@ type overrideFlags struct { Wipe bool `json:"wipe"` } -func newDeps(ctx context.Context, c *Client, fwd *logForwarder, ovr overrideFlags, claim *ClaimResponse) tests.Deps { +func newDeps(ctx context.Context, c *Client, fwd *logForwarder, mux *SensorMux, ovr overrideFlags, claim *ClaimResponse, stage string) tests.Deps { var expected []tests.ExpectedDisk for _, e := range claim.ExpectedDisks { expected = append(expected, tests.ExpectedDisk{Serial: e.Serial, SizeGB: e.SizeGB}) @@ -247,17 +280,73 @@ func newDeps(ctx context.Context, c *Client, fwd *logForwarder, ovr overrideFlag OverrideWipe: ovr.Wipe, NonDestructive: claim.NonDestructive, ExpectedDisks: expected, - StageTimeout: 2 * time.Minute, - Sensor: func(ctx context.Context, samples []tests.Sample) error { + StageTimeout: stageTimeout(claim, stage), + CPUStressKnobs: tests.CPUStressKnobs{ + CPUPass: parseDur(claim.StageConfig.CPUStress.CPUPass), + MemPass: parseDur(claim.StageConfig.CPUStress.MemPass), + EDACPoll: parseDur(claim.StageConfig.CPUStress.EDACPoll), + }, + StorageKnobs: tests.StorageKnobs{ + Mode: claim.StageConfig.Storage.Mode, + FioSize: claim.StageConfig.Storage.FioSize, + FioTime: parseDur(claim.StageConfig.Storage.FioTime), + FioBS: claim.StageConfig.Storage.FioBS, + FioRW: claim.StageConfig.Storage.FioRW, + Verify: claim.StageConfig.Storage.Verify, + }, + NetworkKnobs: tests.NetworkKnobs{ + Duration: parseDur(claim.StageConfig.Network.Duration), + }, + BurnKnobs: tests.BurnKnobs{ + Duration: parseDur(claim.StageConfig.Burn.Duration), + CPUWorkers: claim.StageConfig.Burn.CPUWorkers, + MemPct: claim.StageConfig.Burn.MemPct, + FioOnSpare: claim.StageConfig.Burn.FioOnSpare, + IperfParallel: claim.StageConfig.Burn.IperfParallel, + }, + Sensor: func(_ context.Context, samples []tests.Sample) error { out := make([]SensorSample, 0, len(samples)) for _, s := range samples { out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit}) } - return 
c.Sensor(ctx, out) + mux.Send(out) + return nil }, } } +// stageTimeout reads claim.StageConfig.StageTimeouts[stage] and falls +// back to 2 minutes (the pre-Phase-2 default). Malformed entries log and +// fall back — we'd rather run the stage than refuse on a typo. +func stageTimeout(claim *ClaimResponse, stage string) time.Duration { + if claim == nil || claim.StageConfig.StageTimeouts == nil { + return 2 * time.Minute + } + raw, ok := claim.StageConfig.StageTimeouts[stage] + if !ok || raw == "" { + return 2 * time.Minute + } + d, err := time.ParseDuration(raw) + if err != nil || d <= 0 { + return 2 * time.Minute + } + return d +} + +// parseDur is the permissive duration parser for the knob wire shape. +// Empty strings / parse failures yield 0 so callers can treat a zero +// value as "use the compile-time default" without a nil-check dance. +func parseDur(s string) time.Duration { + if s == "" { + return 0 + } + d, err := time.ParseDuration(s) + if err != nil || d < 0 { + return 0 + } + return d +} + // postResult marshals stageOutcome for the /result endpoint. The // Inventory shape is special-cased: it includes the inventory blob so // the orchestrator can persist it and run server-side spec diff. 
@@ -276,6 +365,9 @@ func postResult(ctx context.Context, c *Client, stage string, s stageOutcome) (* if s.Inventory != nil { body["inventory"] = s.Inventory } + if len(s.Firmware) > 0 { + body["firmware"] = s.Firmware + } if len(s.Outcome.SubSteps) > 0 { wire := make([]SubStepReport, 0, len(s.Outcome.SubSteps)) for _, ss := range s.Outcome.SubSteps { @@ -304,7 +396,7 @@ func stageForState(state string) string { switch state { case "InventoryCheck": return "Inventory" - case "SMART", "CPUStress", "Storage", "Network", "GPU", "PSU": + case "Firmware", "SMART", "CPUStress", "Storage", "Network", "Burn", "GPU", "PSU": return state } // SpecValidate and Reporting are orchestrator-owned; we never see @@ -315,7 +407,7 @@ func stageForState(state string) string { // waitForOverride parks the agent in FailedHolding. It listens for a // heartbeat directive that tells it to retry a stage (e.g. Storage // with wipe-override armed) and re-enters runStage from that point. -func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-chan HeartbeatResponse, claim *ClaimResponse) error { +func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, mux *SensorMux, hb <-chan HeartbeatResponse, claim *ClaimResponse) error { fwd.info("holding: awaiting operator decision (heartbeat directive or ctx cancel)") for { select { @@ -333,7 +425,7 @@ func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-cha if len(cmd.OverrideFlags) > 0 { _ = json.Unmarshal(cmd.OverrideFlags, &ovr) } - outcome := runStageCancellable(ctx, cmd.Stage, claim, fwd, c, ovr) + outcome := runStageCancellable(ctx, cmd.Stage, claim, fwd, c, mux, ovr) if outcome.Cancelled { fwd.warn("stage cancelled by operator; posting result and exiting") _, _ = postResult(ctx, c, cmd.Stage, outcome) @@ -362,7 +454,7 @@ func waitForOverride(ctx context.Context, c *Client, fwd *logForwarder, hb <-cha default: } fwd.info("stage: starting " + nextStage) - out := runStageCancellable(ctx, 
nextStage, claim, fwd, c, overrideFlags{}) + out := runStageCancellable(ctx, nextStage, claim, fwd, c, mux, overrideFlags{}) if out.Cancelled { fwd.warn("stage cancelled by operator; posting result and exiting") _, _ = postResult(ctx, c, nextStage, out) @@ -417,11 +509,32 @@ func inventorySummary(inv *spec.Inventory) string { len(inv.Disks), len(inv.NICs), len(inv.GPUs)) } +// firmwareSummary renders the one-liner surfaced in the stage tile: +// per-component counts so an operator can see "bios=1 nic=2 nvme_fw=1" +// without opening the report. +func firmwareSummary(snaps []probes.FirmwareSnapshot) string { + counts := map[string]int{} + for _, s := range snaps { + counts[s.Component]++ + } + if len(counts) == 0 { + return "no firmware readable" + } + keys := []string{"bios", "bmc", "nic", "hba", "nvme_fw", "microcode"} + parts := make([]string, 0, len(keys)) + for _, k := range keys { + if n := counts[k]; n > 0 { + parts = append(parts, fmt.Sprintf("%s=%d", k, n)) + } + } + return strings.Join(parts, " ") +} + // thermalSidecar posts a batch of /sys/class/hwmon samples every 5s. // Idempotent: a dead sensor just drops out of the next batch. Errors // are logged but never fatal — we'd rather have a run with partial // thermal data than kill the agent over an I/O hiccup. 
-func thermalSidecar(ctx context.Context, c *Client, fwd *logForwarder) { +func thermalSidecar(ctx context.Context, mux *SensorMux, fwd *logForwarder) { t := time.NewTicker(5 * time.Second) defer t.Stop() for { @@ -437,11 +550,7 @@ func thermalSidecar(ctx context.Context, c *Client, fwd *logForwarder) { for _, s := range samples { out = append(out, SensorSample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit}) } - sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second) - if err := c.Sensor(sendCtx, out); err != nil { - fwd.warn("thermal sidecar: " + err.Error()) - } - cancel() + mux.Send(out) } } } diff --git a/agent/sensor_mux.go b/agent/sensor_mux.go new file mode 100644 index 0000000..fd4ab7a --- /dev/null +++ b/agent/sensor_mux.go @@ -0,0 +1,139 @@ +package agent + +import ( + "context" + "log" + "sync" + "time" +) + +// SensorMux coalesces sensor samples from every stage + sidecar into a +// single batched HTTP POST stream. Without it, a Burn run that fans out +// four concurrent workloads + thermal + PSU + EDAC sidecars can push ~50 +// samples/sec, each as a separate /sensor request — enough to either +// saturate the orchestrator's request budget or stall a stage on its +// own sensor-forwarding path. +// +// Contract: +// - Send is non-blocking; a full input channel drops a batch on the +// floor and logs a warning. That's preferred over back-pressuring +// a workload goroutine and skewing its timing. +// - Flush happens every flushInterval *or* whenever the pending buffer +// exceeds maxBatch samples. Chunk-at-flush keeps each HTTP request +// bounded regardless of the incoming rate. +// - Close flushes whatever is in the buffer. Callers that need the +// final flush to reach the server should defer Close before other +// deferred shutdown work. 
+type SensorMux struct { + c *Client + in chan []SensorSample + flushInterval time.Duration + maxBatch int + + ctx context.Context + cancel context.CancelFunc + wg sync.WaitGroup +} + +// NewSensorMux starts the flush loop. Callers hand the returned mux to +// every code path that previously called Client.Sensor directly (stage +// Deps.Sensor, thermal sidecar, EDAC sidecar). The mux lives for the +// duration of the agent run. +func NewSensorMux(parent context.Context, c *Client) *SensorMux { + ctx, cancel := context.WithCancel(parent) + m := &SensorMux{ + c: c, + in: make(chan []SensorSample, 32), + flushInterval: 2 * time.Second, + maxBatch: 500, + ctx: ctx, + cancel: cancel, + } + m.wg.Add(1) + go m.loop() + return m +} + +// Send enqueues a batch for the next flush tick. Empty batches are +// silently ignored so callers with conditional sample lists don't need +// to guard the call site. +func (m *SensorMux) Send(samples []SensorSample) { + if m == nil || len(samples) == 0 { + return + } + // Copy so caller mutations don't race with the flush loop. + out := make([]SensorSample, len(samples)) + copy(out, samples) + select { + case m.in <- out: + default: + log.Printf("sensor mux: input channel full, dropping %d samples", len(out)) + } +} + +// Close stops the flush loop and flushes the residual buffer. Safe to +// call twice (the second is a no-op because the internal context is +// already cancelled). +func (m *SensorMux) Close() { + if m == nil { + return + } + m.cancel() + m.wg.Wait() +} + +func (m *SensorMux) loop() { + defer m.wg.Done() + buf := make([]SensorSample, 0, m.maxBatch) + t := time.NewTicker(m.flushInterval) + defer t.Stop() + for { + select { + case <-m.ctx.Done(): + m.flushChunks(buf) + buf = nil + // Drain whatever is still sitting in the channel so a + // workload that pushed right before Close doesn't lose + // those final samples. 
+ for { + select { + case batch := <-m.in: + m.flushChunks(batch) + default: + return + } + } + case batch := <-m.in: + buf = append(buf, batch...) + if len(buf) >= m.maxBatch { + m.flushChunks(buf) + buf = buf[:0] + } + case <-t.C: + if len(buf) > 0 { + m.flushChunks(buf) + buf = buf[:0] + } + } + } +} + +// flushChunks splits a potentially-large slice into maxBatch-sized +// HTTP requests so no single POST carries more than the configured cap. +// A 10-second per-chunk timeout keeps a stalled orchestrator from +// freezing the flush loop. +func (m *SensorMux) flushChunks(all []SensorSample) { + for len(all) > 0 { + n := len(all) + if n > m.maxBatch { + n = m.maxBatch + } + chunk := all[:n] + all = all[n:] + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + if err := m.c.Sensor(ctx, chunk); err != nil { + log.Printf("sensor mux: flush of %d samples failed: %v", len(chunk), err) + } + cancel() + } +} diff --git a/agent/sensor_mux_test.go b/agent/sensor_mux_test.go new file mode 100644 index 0000000..35c3394 --- /dev/null +++ b/agent/sensor_mux_test.go @@ -0,0 +1,144 @@ +package agent + +import ( + "context" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "strings" + "sync" + "sync/atomic" + "testing" + "time" +) + +// TestSensorMux_CloseFlushesBuffer confirms Close() empties the +// pending buffer through the HTTP client before returning. Without +// this guarantee a Burn run would drop the last 2 s of samples when +// the stage tears down, which is exactly the window that contains the +// peak-load PSU / thermal readings we care about. 
+func TestSensorMux_CloseFlushesBuffer(t *testing.T) { + var batches int32 + var totalSamples int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if !strings.HasSuffix(r.URL.Path, "/sensor") { + t.Errorf("unexpected path %s", r.URL.Path) + } + body, _ := io.ReadAll(r.Body) + var env struct { + Samples []SensorSample `json:"samples"` + } + if err := json.Unmarshal(body, &env); err != nil { + t.Errorf("decode: %v", err) + } + atomic.AddInt32(&batches, 1) + atomic.AddInt32(&totalSamples, int32(len(env.Samples))) + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + c := &Client{ + BaseURL: srv.URL, + RunID: 1, + Token: "t", + HTTP: srv.Client(), + } + mux := NewSensorMux(context.Background(), c) + mux.Send([]SensorSample{ + {Kind: "temp", Key: "cpu/0", Value: 72.5, Unit: "C"}, + {Kind: "psu_volt", Key: "+12V", Value: 12.05, Unit: "V"}, + }) + mux.Send([]SensorSample{ + {Kind: "mce", Key: "0", Value: 0, Unit: "count"}, + }) + mux.Close() + + if got := atomic.LoadInt32(&totalSamples); got != 3 { + t.Errorf("expected 3 samples flushed, got %d across %d batch(es)", got, atomic.LoadInt32(&batches)) + } + if atomic.LoadInt32(&batches) == 0 { + t.Errorf("expected at least one batch HTTP post") + } +} + +// TestSensorMux_ChunksOversizedBatch verifies flushChunks splits a +// single oversized input into maxBatch-sized HTTP requests. The plan's +// Burn stage can legitimately push a single input larger than the cap +// (e.g. a workload goroutine dumping a backlog), and a single giant +// POST would defeat the point of the multiplexer. 
+func TestSensorMux_ChunksOversizedBatch(t *testing.T) { + var batchSizes []int + var mu sync.Mutex + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + var env struct { + Samples []SensorSample `json:"samples"` + } + _ = json.Unmarshal(body, &env) + mu.Lock() + batchSizes = append(batchSizes, len(env.Samples)) + mu.Unlock() + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + c := &Client{BaseURL: srv.URL, RunID: 1, Token: "t", HTTP: srv.Client()} + mux := NewSensorMux(context.Background(), c) + + // One input with 1200 samples → expect chunks of 500 + 500 + 200 + // given the default maxBatch of 500. + big := make([]SensorSample, 1200) + for i := range big { + big[i] = SensorSample{Kind: "burn/throughput_mbps", Key: "eth0", Value: float64(i), Unit: "Mbps"} + } + mux.Send(big) + mux.Close() + + mu.Lock() + defer mu.Unlock() + total := 0 + for _, n := range batchSizes { + total += n + if n > 500 { + t.Errorf("batch size %d exceeds maxBatch=500", n) + } + } + if total != 1200 { + t.Errorf("sum of batch sizes = %d, want 1200 (sizes=%v)", total, batchSizes) + } + if len(batchSizes) < 3 { + t.Errorf("expected at least 3 chunks for a 1200-sample input, got %d (%v)", len(batchSizes), batchSizes) + } +} + +// TestSensorMux_EmptyAndNilSafe covers the defensive guards around +// Send(nil) / Send([]) / a nil *SensorMux. Callers with conditional +// sample lists (storage probe that skipped a disk, GPU stage with no +// devices) should be able to call Send unconditionally without adding +// their own nil check. +func TestSensorMux_EmptyAndNilSafe(t *testing.T) { + var batches int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + atomic.AddInt32(&batches, 1) + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + // Nil receiver must be a no-op. 
+ var nilMux *SensorMux + nilMux.Send([]SensorSample{{Kind: "x", Key: "y"}}) + nilMux.Close() + + c := &Client{BaseURL: srv.URL, RunID: 1, Token: "t", HTTP: srv.Client()} + mux := NewSensorMux(context.Background(), c) + mux.Send(nil) + mux.Send([]SensorSample{}) + mux.Close() + + // Give any spurious goroutine a chance to surprise us. + time.Sleep(50 * time.Millisecond) + if atomic.LoadInt32(&batches) != 0 { + t.Errorf("empty/nil Send must not produce HTTP batches, got %d", atomic.LoadInt32(&batches)) + } +} diff --git a/agent/tests/burn.go b/agent/tests/burn.go new file mode 100644 index 0000000..625bdef --- /dev/null +++ b/agent/tests/burn.go @@ -0,0 +1,486 @@ +package tests + +import ( + "context" + "encoding/json" + "fmt" + "os/exec" + "runtime" + "strconv" + "strings" + "sync" + "time" + + "vetting/agent/probes" +) + +// BurnConfig is what the agent passes to Burn: the orchestrator's iperf3 +// server address and port. Durations + concurrency knobs come from +// Deps.BurnKnobs so they scale with profile. +type BurnConfig struct { + OrchestratorURL string + IperfPort int // 0 = 5201 +} + +// Burn is the concurrent soak stage. Unlike CPUStress (serial +// CPU→memory) or Storage (serial per disk) it fans out every workload +// at once: stress-ng hammers CPU + memory, fio drives the allow-listed +// disks, iperf3 pushes sustained NIC traffic, and two sidecars poll +// EDAC + PSU rails for the duration of the window. +// +// This is where PSU rails actually matter: 12V sag under simultaneous +// CPU + disk + NIC load is exactly the failure a thermal/power +// regression produces, and it's invisible to any stage that loads one +// subsystem at a time. The PSU stage that follows Burn in the pipeline +// re-samples rails post-window to confirm they settle back to nominal. +// +// Burn stays inside the stage framework — it doesn't spawn a parallel +// stage runner. 
The goroutine fan-out is local; the stage converges +// before returning an Outcome so every invariant the orchestrator +// relies on (serial stage order, single in-flight stage per run) still +// holds. +func Burn(ctx context.Context, d Deps, cfg BurnConfig) Outcome { + duration := d.BurnKnobs.Duration + if duration <= 0 { + duration = 2 * time.Minute + } + cpuWorkers := resolveCPUWorkers(d.BurnKnobs.CPUWorkers) + memPct := clampMemPct(d.BurnKnobs.MemPct) + iperfParallel := d.BurnKnobs.IperfParallel + if iperfParallel <= 0 { + iperfParallel = 2 + } + d.Info(fmt.Sprintf("Burn: window=%s cpu_workers=%d mem_pct=%d iperf_parallel=%d fio_on_spare=%v", + duration, cpuWorkers, memPct, iperfParallel, d.BurnKnobs.FioOnSpare)) + + // Sidecars run for the lifetime of the window and are cancelled on + // return so the main stage converges cleanly. EDAC catches DIMM + // bit-flips that appear only under concurrent load; PSU catches + // rail sag that only appears when CPU + disk + NIC pull current + // simultaneously. + sideCtx, sideCancel := context.WithCancel(ctx) + defer sideCancel() + var sideWG sync.WaitGroup + sideWG.Add(2) + go runEDACSidecar(sideCtx, &sideWG, d) + go runPSUSidecar(sideCtx, &sideWG, d) + + runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second) + defer cancel() + + results := make(chan burnSubResult, 4) + var wg sync.WaitGroup + + wg.Add(1) + go func() { + defer wg.Done() + results <- runBurnCPU(runCtx, d, duration, cpuWorkers) + }() + + wg.Add(1) + go func() { + defer wg.Done() + results <- runBurnMemory(runCtx, d, duration, memPct) + }() + + // fio runs only when explicitly enabled *and* there are allow-listed + // disks *and* the run wasn't marked non-destructive. Any of those + // missing records a Skipped sub-step so the operator sees why. 
+ if d.BurnKnobs.FioOnSpare && len(d.ExpectedDisks) > 0 && !d.NonDestructive { + wg.Add(1) + go func() { + defer wg.Done() + results <- runBurnFio(runCtx, d, duration) + }() + } else { + reason := burnFioSkipReason(d) + results <- burnSubResult{Name: "Burn fio", Skipped: true, Reason: reason} + } + + // iperf requires an orchestrator host. Lab hosts run with the + // bundled iperf3 server; without a base URL we can't derive a + // target so we skip rather than fail the stage. + if cfg.OrchestratorURL != "" { + wg.Add(1) + go func() { + defer wg.Done() + results <- runBurnIperf(runCtx, d, duration, cfg.OrchestratorURL, cfg.IperfPort, iperfParallel) + }() + } else { + results <- burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "no orchestrator host"} + } + + wg.Wait() + sideCancel() + sideWG.Wait() + close(results) + + subs, samples, failures := collectBurnResults(results) + if d.Sensor != nil && len(samples) > 0 { + _ = d.Sensor(ctx, samples) + } + + extras := map[string]any{ + "duration": duration.String(), + "cpu_workers": cpuWorkers, + "mem_pct": memPct, + "iperf_parallel": iperfParallel, + "fio_on_spare": d.BurnKnobs.FioOnSpare, + } + if len(failures) > 0 { + msg := "Burn workloads failed: " + strings.Join(failures, ", ") + d.Error(msg) + return Outcome{ + Passed: false, + Message: msg, + Summary: fmt.Sprintf("Burn failed (%d of %d workloads)", len(failures), len(subs)), + Extras: extras, + SubSteps: subs, + } + } + d.Info(fmt.Sprintf("Burn: %s window passed; %d workloads converged", duration, len(subs))) + return Outcome{ + Passed: true, + Summary: fmt.Sprintf("Burn %s passed (%d workloads)", duration, len(subs)), + Extras: extras, + SubSteps: subs, + } +} + +// burnSubResult is the per-workload return type used by the fan-out +// goroutines. Sample slice is merged into the stage's final /sensor +// batch; SubStep becomes a row on the /result sub-steps list. 
+type burnSubResult struct { + Name string + Passed bool + Skipped bool + Reason string // why a workload was skipped + Err string // why a workload failed + Samples []Sample + SubStep SubStepReport +} + +func collectBurnResults(ch <-chan burnSubResult) ([]SubStepReport, []Sample, []string) { + var subs []SubStepReport + var samples []Sample + var failures []string + for r := range ch { + // Non-skipped goroutines populate SubStep directly. Skipped slots + // get a synthesized row here so the /result shape stays stable. + if r.Skipped { + stamp := time.Now().UTC() + subs = append(subs, SubStepReport{ + Name: r.Name, + Skipped: true, + StartedAt: stamp, + CompletedAt: stamp, + SummaryJSON: mustJSON(map[string]any{"skipped": true, "reason": r.Reason}), + }) + continue + } + subs = append(subs, r.SubStep) + samples = append(samples, r.Samples...) + if !r.Passed { + reason := r.Err + if reason == "" { + reason = "unknown" + } + failures = append(failures, r.Name+": "+reason) + } + } + return subs, samples, failures +} + +func burnFioSkipReason(d Deps) string { + if !d.BurnKnobs.FioOnSpare { + return "fio_on_spare knob disabled" + } + if d.NonDestructive { + return "non-destructive run" + } + if len(d.ExpectedDisks) == 0 { + return "no allowlisted disks" + } + return "disabled" +} + +// runBurnCPU hammers all CPU cores with stress-ng for the window. Same +// shape as CPUStress pass 1 but with shorter label so the sub-step row +// doesn't collide with the earlier stage's "CPU pass". 
+func runBurnCPU(ctx context.Context, d Deps, duration time.Duration, workers int) burnSubResult { + if _, err := exec.LookPath("stress-ng"); err != nil { + return burnSubResult{Name: "Burn CPU", Err: "stress-ng missing"} + } + args := []string{ + "--cpu", strconv.Itoa(workers), + "--cpu-method", "all", + "--timeout", durationSeconds(duration), + "--metrics-brief", + "--verify", + } + d.Info(fmt.Sprintf("Burn: stress-ng %s", strings.Join(args, " "))) + pass := runStressPass(ctx, d, "Burn CPU", duration, args) + return burnSubResult{ + Name: "Burn CPU", + Passed: pass.Passed, + Err: pass.Err, + SubStep: subStepFromPass("Burn CPU", pass), + } +} + +// runBurnMemory drives a single --vm worker sized at memPct of +// MemAvailable, capped so the kernel + agent + other workloads still +// have headroom. Clamping happens here rather than in resolveBurnKnobs +// so the cap is computed against real live memory each run. +func runBurnMemory(ctx context.Context, d Deps, duration time.Duration, memPct int) burnSubResult { + if _, err := exec.LookPath("stress-ng"); err != nil { + return burnSubResult{Name: "Burn memory", Err: "stress-ng missing"} + } + avail, err := memAvailableBytes() + if err != nil { + return burnSubResult{Name: "Burn memory", Err: "read MemAvailable: " + err.Error()} + } + // Budget = avail * memPct / 100, then subtract the standard headroom. + // If the result is below the memory-pass floor we record a skipped + // row instead — the window is too tight to be meaningful on this box. 
+ budget := int64(float64(avail) * float64(memPct) / 100.0) + cap := budget - memHeadroomBytes + if cap < memFloorBytes { + return burnSubResult{ + Name: "Burn memory", + Skipped: true, + Reason: fmt.Sprintf("budget %s below floor %s after headroom", humanBytes(budget), humanBytes(memFloorBytes)), + } + } + args := []string{ + "--vm", "1", + "--vm-bytes", strconv.FormatInt(cap, 10), + "--vm-keep", + "--timeout", durationSeconds(duration), + "--metrics-brief", + "--verify", + } + d.Info(fmt.Sprintf("Burn: stress-ng memory cap=%s (%d%% of MemAvailable)", humanBytes(cap), memPct)) + pass := runStressPass(ctx, d, "Burn memory", duration, args) + return burnSubResult{ + Name: "Burn memory", + Passed: pass.Passed, + Err: pass.Err, + SubStep: subStepFromPass(fmt.Sprintf("Burn memory (cap %s)", humanBytes(cap)), pass), + } +} + +// runBurnFio runs fio_sample against the first allow-listed disk for +// the window. Reuses runFioVerify + parseFioJSON so the samples line +// up with what Storage emits. Using fio_sample (bounded by --size) +// keeps Burn's write volume predictable regardless of profile. 
+func runBurnFio(ctx context.Context, d Deps, duration time.Duration) burnSubResult { + if _, err := exec.LookPath("fio"); err != nil { + return burnSubResult{Name: "Burn fio", Err: "fio missing"} + } + targets := resolveTargets(d.ExpectedDisks) + if len(targets) == 0 { + return burnSubResult{Name: "Burn fio", Skipped: true, Reason: "no allow-listed disks present"} + } + t := targets[0] + opts := fioOpts{ + Mode: "fio_sample", + Size: "512MiB", + Runtime: duration, + BS: "4k", + RW: "randrw", + Verify: "md5", + } + start := time.Now() + d.Info(fmt.Sprintf("Burn: fio %s on %s (%s window)", opts.Mode, t.Device, duration)) + fr := runFioVerify(ctx, t.Device, opts) + end := time.Now() + + sub := SubStepReport{ + Name: "Burn fio " + t.Device, + Passed: fr.Error == "", + StartedAt: start, + CompletedAt: end, + SummaryJSON: mustJSON(fr), + } + out := burnSubResult{Name: "Burn fio", SubStep: sub, Passed: fr.Error == "", Err: fr.Error} + if fr.Error == "" { + out.Samples = append(out.Samples, + Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"}, + Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"}, + ) + if fr.ReadP99Us > 0 { + out.Samples = append(out.Samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"}) + } + if fr.WriteP99Us > 0 { + out.Samples = append(out.Samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"}) + } + } + return out +} + +// runBurnIperf drives iperf3 -P N for the window. Reuses parseIperfJSON +// so the same (mbps, retrans, bytesSent) extraction the Network stage +// uses applies here too. Samples emitted as Burn-scoped keys so the +// dashboard can tell at-a-glance which window they came from. 
+func runBurnIperf(ctx context.Context, d Deps, duration time.Duration, orchestratorURL string, port, parallel int) burnSubResult {
+	if _, err := exec.LookPath("iperf3"); err != nil {
+		return burnSubResult{Name: "Burn iperf", Err: "iperf3 missing"}
+	}
+	host, err := deriveHost(orchestratorURL)
+	if err != nil || host == "" {
+		return burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "can't derive orchestrator host"}
+	}
+	if port == 0 {
+		port = 5201
+	}
+	if parallel < 1 {
+		parallel = 1
+	}
+	// int(duration.Seconds()) truncates, so a sub-second (or zero) window
+	// would be passed as "-t 0". iperf3 treats a zero time as "no limit",
+	// which would leave the client running until runCtx kills it and turn
+	// a short window into a reported client error. Ask for at least 1s.
+	secs := int(duration.Seconds())
+	if secs < 1 {
+		secs = 1
+	}
+	args := []string{
+		"-c", host,
+		"-p", strconv.Itoa(port),
+		"-t", strconv.Itoa(secs),
+		"-P", strconv.Itoa(parallel),
+		"-J",
+	}
+	// Hard stop 30s after the requested window in case the client hangs.
+	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
+	defer cancel()
+	start := time.Now()
+	out, err := exec.CommandContext(runCtx, "iperf3", args...).Output()
+	end := time.Now()
+	if err != nil {
+		// NOTE(review): Output() returns the command's stdout, so the
+		// "stderr_tail" field below actually carries the tail of stdout.
+		// The key is left as-is because the Network stage reports the
+		// same field the same way.
+		return burnSubResult{
+			Name: "Burn iperf",
+			Err:  "iperf3 client error: " + err.Error(),
+			SubStep: SubStepReport{
+				Name:        "Burn iperf",
+				StartedAt:   start,
+				CompletedAt: end,
+				SummaryJSON: mustJSON(map[string]any{"error": err.Error(), "stderr_tail": tailLines(string(out), 20)}),
+			},
+		}
+	}
+	mbps, retrans, bytesSent, _, perr := parseIperfJSON(out)
+	if perr != nil {
+		return burnSubResult{
+			Name: "Burn iperf",
+			Err:  "parse iperf3 json: " + perr.Error(),
+			SubStep: SubStepReport{
+				Name:        "Burn iperf",
+				StartedAt:   start,
+				CompletedAt: end,
+				SummaryJSON: mustJSON(map[string]any{"error": perr.Error()}),
+			},
+		}
+	}
+
+	samples := []Sample{{Kind: "iperf", Key: "burn/throughput_mbps", Value: mbps, Unit: "Mbps"}}
+	if bytesSent > 0 {
+		// Approximate the packet count from bytes at 1460 payload bytes
+		// per packet, the same estimate the Network stage uses, and
+		// report retransmits as a fraction of that.
+		packets := float64(bytesSent) / 1460.0
+		samples = append(samples, Sample{
+			Kind: "nic_retrans", Key: "burn/rate",
+			Value: float64(retrans) / packets, Unit: "rate",
+		})
+	}
+	passed := mbps > 0
+	errMsg := ""
+	if !passed {
+		errMsg = "zero throughput from iperf3"
+	}
+	return burnSubResult{
+		Name:    "Burn iperf",
+		Passed:  passed,
+		Err:     errMsg,
+		Samples: samples,
+		SubStep: SubStepReport{
+			Name:        fmt.Sprintf("Burn iperf (P=%d)", parallel),
+			Passed:      passed,
+			StartedAt:   start,
+			CompletedAt: end,
+			SummaryJSON: mustJSON(map[string]any{
+				"throughput_mbps": mbps,
+				"retransmits":     retrans,
+				"bytes_sent":      bytesSent,
+				"parallel":        parallel,
+			}),
+		},
+	}
+}
+
+// runPSUSidecar polls /sys/class/hwmon rails every 5s for the duration
+// of the Burn window, piping each read into the stage's sensor channel
+// as a psu_volt sample. The threshold evaluator then applies the same
+// within_pct gates used by the PSU stage — a 12V rail sagging to 10.5V
+// under load will fire the critical threshold mid-Burn and the run
+// will flip into FailedHolding without waiting for the post-Burn PSU
+// stage to catch it.
+//
+// The function always calls wg.Done on return and is a no-op when
+// d.Sensor is nil. It returns when ctx is cancelled.
+func runPSUSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
+	defer wg.Done()
+	if d.Sensor == nil {
+		return
+	}
+	t := time.NewTicker(5 * time.Second)
+	defer t.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-t.C:
+			rails := scanPSURails()
+			if len(rails) == 0 {
+				continue
+			}
+			batch := make([]Sample, 0, len(rails))
+			for _, r := range rails {
+				batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"})
+			}
+			// Bound each post so a slow sensor endpoint cannot stall
+			// the sidecar; a failed post is logged and the next tick
+			// simply tries again.
+			sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+			if err := d.Sensor(sendCtx, batch); err != nil {
+				d.Warn("Burn: PSU sample post: " + err.Error())
+			}
+			cancel()
+		}
+	}
+}
+
+// resolveCPUWorkers turns the cpu_workers knob into a worker count.
+// Empty or "all" (any case) selects runtime.NumCPU(); a positive
+// integer is used as-is; anything else (zero, negative, garbage) also
+// falls back to runtime.NumCPU(). Surrounding whitespace is ignored so
+// a value like " 4 " is honoured instead of silently becoming NumCPU.
+func resolveCPUWorkers(raw string) int {
+	raw = strings.TrimSpace(raw)
+	if raw == "" || strings.EqualFold(raw, "all") {
+		return runtime.NumCPU()
+	}
+	if n, err := strconv.Atoi(raw); err == nil && n > 0 {
+		return n
+	}
+	return runtime.NumCPU()
+}
+
+// clampMemPct keeps the knob in a sane band. Zero or a negative value
+// means "use default 50%"; above 90 would crowd the kernel + agent +
+// fio + iperf3 workers off the page cache. Any other value outside
+// [10, 90] is clamped.
+func clampMemPct(pct int) int {
+	switch {
+	case pct <= 0:
+		// Unset (or nonsensical) knob: fall back to the default.
+		return 50
+	case pct < 10:
+		return 10
+	case pct > 90:
+		return 90
+	default:
+		return pct
+	}
+}
+
+// mustJSON marshals v for embedding in a sub-step summary. It never
+// fails: when v cannot be marshaled the result is a small JSON object
+// of the form {"marshal_error": "<message>"} instead.
+func mustJSON(v any) json.RawMessage {
+	b, err := json.Marshal(v)
+	if err != nil {
+		// Marshal the fallback as well rather than splicing the message
+		// into a string: error text containing quotes or backslashes
+		// would otherwise produce invalid JSON. A map[string]string is
+		// always encodable, so the second error can be ignored.
+		fallback, _ := json.Marshal(map[string]string{"marshal_error": err.Error()})
+		return fallback
+	}
+	return b
+}
+
+// Ensure the probes package import stays anchored — the Burn sidecars
+// use probes.EDAC + the PSU rail scanner defined in psu.go which
+// otherwise wouldn't pull probes in on its own.
+var _ = probes.EDAC
diff --git a/agent/tests/burn_test.go b/agent/tests/burn_test.go
new file mode 100644
index 0000000..ebe8c38
--- /dev/null
+++ b/agent/tests/burn_test.go
@@ -0,0 +1,58 @@
+package tests
+
+import (
+	"runtime"
+	"testing"
+)
+
+// TestResolveCPUWorkers covers the three parse branches: empty/"all"
+// falls back to NumCPU, a valid integer is used verbatim, and garbage
+// also falls back to NumCPU rather than returning zero. Zero workers
+// would make stress-ng a no-op and silently defeat Burn's CPU load.
+func TestResolveCPUWorkers(t *testing.T) {
+	np := runtime.NumCPU()
+	cases := []struct {
+		name string
+		in   string
+		want int
+	}{
+		{"empty defaults to NumCPU", "", np},
+		{"all defaults to NumCPU", "all", np},
+		{"ALL is case-insensitive", "ALL", np},
+		{"explicit integer", "3", 3},
+		{"negative falls back", "-1", np},
+		{"zero falls back", "0", np},
+		{"garbage falls back", "lots", np},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := resolveCPUWorkers(tc.in); got != tc.want {
+				t.Errorf("resolveCPUWorkers(%q) = %d, want %d", tc.in, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestClampMemPct ensures the mem_pct knob never drives the memory
+// burner into OOM territory (upper clamp) or into uselessness (lower
+// clamp). Zero is treated as "use default 50" so a missing knob in an
+// older orchestrator's claim response doesn't collapse the workload.
+func TestClampMemPct(t *testing.T) { + cases := []struct { + in, want int + }{ + {0, 50}, // default + {-10, 50}, // negative treated as default + {5, 10}, // below lower band → clamp up + {10, 10}, + {50, 50}, + {90, 90}, + {95, 90}, // above upper band → clamp down + {1000, 90}, + } + for _, tc := range cases { + if got := clampMemPct(tc.in); got != tc.want { + t.Errorf("clampMemPct(%d) = %d, want %d", tc.in, got, tc.want) + } + } +} diff --git a/agent/tests/cpustress.go b/agent/tests/cpustress.go index 857d007..dabda54 100644 --- a/agent/tests/cpustress.go +++ b/agent/tests/cpustress.go @@ -11,7 +11,10 @@ import ( "runtime" "strconv" "strings" + "sync" "time" + + "vetting/agent/probes" ) // CPUStress runs stress-ng as two serial passes. The previous shape @@ -55,11 +58,28 @@ func CPUStress(ctx context.Context, d Deps) Outcome { extras := map[string]any{"cores": cores} var subs []SubStepReport + // EDAC sidecar runs for the lifetime of the stage; cancelled on + // return. It polls /sys/devices/system/edac/mc/*/{ce,ue}_count and + // posts the current counters so the server-side threshold evaluator + // can gate edac_ue > 0 → fail the run. Zero-valued poll falls back + // to 10s — the same cadence rasdaemon uses by default. + sideCtx, sideCancel := context.WithCancel(ctx) + defer sideCancel() + var sideWG sync.WaitGroup + sideWG.Add(1) + go runEDACSidecar(sideCtx, &sideWG, d) + + // Per-profile durations come from Deps; zero values (missing knobs + // or legacy orchestrator) fall back to the package default so the + // stage always has a defined budget. 
+ cpuDur := nonzeroDur(d.CPUStressKnobs.CPUPass, cpuPassDuration) + memDur := nonzeroDur(d.CPUStressKnobs.MemPass, memPassDuration) + // Pass 1: CPU - cpu := runStressPass(ctx, d, "CPU", cpuPassDuration, []string{ + cpu := runStressPass(ctx, d, "CPU", cpuDur, []string{ "--cpu", strconv.Itoa(cores), "--cpu-method", "all", - "--timeout", durationSeconds(cpuPassDuration), + "--timeout", durationSeconds(cpuDur), "--metrics-brief", "--verify", }) @@ -104,11 +124,11 @@ func CPUStress(ctx context.Context, d Deps) Outcome { SubSteps: subs, } } - mem := runStressPass(ctx, d, "memory", memPassDuration, []string{ + mem := runStressPass(ctx, d, "memory", memDur, []string{ "--vm", "1", "--vm-bytes", strconv.FormatInt(cap, 10), "--vm-keep", - "--timeout", durationSeconds(memPassDuration), + "--timeout", durationSeconds(memDur), "--metrics-brief", "--verify", }) @@ -133,6 +153,64 @@ func CPUStress(ctx context.Context, d Deps) Outcome { } } +// runEDACSidecar polls /sys EDAC counters on d.CPUStressKnobs.EDACPoll +// cadence (or 10s fallback) for the lifetime of the stage ctx, emitting +// one sample per (memory-controller × {ce,ue}) pair on each tick. A +// single failing read is tolerated: the next tick picks up the counter. +// +// This is where the critical edac_ue threshold becomes a hard-fail: as +// soon as a UE counter advances past 0, the server-side evaluator trips +// and flips the run into FailedHolding. The sidecar emits whether or +// not stress-ng is still running; that keeps the signal live during +// inter-pass gaps. +// +// MCE counts are intentionally not sampled here — they require +// rasdaemon or mcelog and vary by live-image packaging. The threshold +// rule for mce stays seeded (so the DB shape is stable) but only fires +// once a matching kind lands, which is a follow-up. 
+func runEDACSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
+	defer wg.Done()
+	if d.Sensor == nil {
+		return
+	}
+	poll := d.CPUStressKnobs.EDACPoll
+	if poll <= 0 {
+		poll = 10 * time.Second
+	}
+	t := time.NewTicker(poll)
+	defer t.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-t.C:
+			edac := probes.EDAC()
+			if len(edac) == 0 {
+				continue
+			}
+			batch := make([]Sample, 0, len(edac))
+			for _, s := range edac {
+				batch = append(batch, Sample{Kind: s.Kind, Key: s.Key, Value: s.Value, Unit: s.Unit})
+			}
+			sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+			if err := d.Sensor(sendCtx, batch); err != nil {
+				d.Warn("CPUStress: edac sample post: " + err.Error())
+			}
+			cancel()
+		}
+	}
+}
+
+// nonzeroDur picks override over fallback, but only when override is
+// strictly positive. Lets callers pass a zero-value duration to mean
+// "no override; use fallback" without a separate ok return.
+func nonzeroDur(override, fallback time.Duration) time.Duration {
+	if override > 0 {
+		return override
+	}
+	return fallback
+}
+
 // subStepFromPass projects a stressPass into a SubStepReport — shared by
 // both passes and by the mid-stage early-return paths so the UI always
 // sees exactly one row per pass, even on failure.
diff --git a/agent/tests/fakes/dmidecode/main.go b/agent/tests/fakes/dmidecode/main.go
new file mode 100644
index 0000000..c5545bb
--- /dev/null
+++ b/agent/tests/fakes/dmidecode/main.go
@@ -0,0 +1,24 @@
+// fake_dmidecode simulates `dmidecode -t bios` for unit tests of the
+// firmware probe's BIOS parser. Prints deterministic output modeled on
+// a real Supermicro host; exits 0 regardless of flags.
+package main
+
+import "fmt"
+
+func main() {
+	fmt.Println(`# dmidecode 3.3
+Getting SMBIOS data from sysfs.
+SMBIOS 3.2.0 present.
+
+Handle 0x0000, DMI type 0, 26 bytes
+BIOS Information
+	Vendor: American Megatrends Inc.
+	Version: 3.2
+	Release Date: 07/15/2021
+	Address: 0xF0000
+	Runtime Size: 64 kB
+	ROM Size: 32 MB
+	Characteristics:
+		PCI is supported
+		BIOS is upgradeable`)
+}
diff --git a/agent/tests/fakes/doc.go b/agent/tests/fakes/doc.go
new file mode 100644
index 0000000..01541d2
--- /dev/null
+++ b/agent/tests/fakes/doc.go
@@ -0,0 +1,22 @@
+// Package fakes is the umbrella for deterministic stand-ins for
+// external probe binaries that Vetting's stage code normally shells
+// out to (stress-ng, fio, iperf3, dmidecode, ethtool, nvidia-smi,
+// mcelog, nvme). Each real binary gets its own subpackage under
+// fakes/<name>/ with `package main` and a main() that prints golden
+// output — build with `go build -o <dir>/<name> ./agent/tests/fakes/<name>`
+// and point a test's tests.Deps.LookPath at <dir>/<name>.
+//
+// The seam in tests is tests.Deps.LookPath: when non-nil the stage
+// code uses it instead of os/exec.LookPath. Outside tests, nil
+// LookPath means "use the real binary on $PATH" — stages continue to
+// work on production hosts without the fakes package around.
+//
+// How to add a new fake:
+//  1. Create agent/tests/fakes/<name>/main.go.
+//  2. Write `package main` with a main() that prints exactly the
+//     bytes the real tool would produce for the input you care to
+//     simulate. Determinism > completeness — tests want a known
+//     sample, not a realistic one.
+//  3. Reference the fake from the unit test with `go test` compiling
+//     it via t.TempDir() + `go build -o` before the test body runs.
+package fakes
diff --git a/agent/tests/fakes/stress_ng/main.go b/agent/tests/fakes/stress_ng/main.go
new file mode 100644
index 0000000..b7f5178
--- /dev/null
+++ b/agent/tests/fakes/stress_ng/main.go
@@ -0,0 +1,18 @@
+// fake_stress_ng simulates stress-ng for unit tests. Accepts (and
+// ignores) any flag, sleeps briefly so callers that measure wall-clock
+// see a non-zero elapsed, and prints the "passed" lines CPUStress
+// expects. Exits 0.
+package main + +import ( + "fmt" + "os" + "time" +) + +func main() { + fmt.Fprintln(os.Stderr, "fake_stress_ng invoked:", os.Args[1:]) + time.Sleep(50 * time.Millisecond) + fmt.Println("stress-ng: info: [1] dispatching hogs: 1 cpu") + fmt.Println("stress-ng: info: [1] successful run completed in 0.05s") +} diff --git a/agent/tests/network.go b/agent/tests/network.go index 089dc89..e673150 100644 --- a/agent/tests/network.go +++ b/agent/tests/network.go @@ -9,19 +9,27 @@ import ( "strconv" "strings" "time" + + "vetting/agent/probes" ) // NetworkConfig is what the agent passes to Network: the orchestrator's -// iperf3 server address and port. We derive host from OrchestratorURL. +// iperf3 server address, port, and the per-profile duration. type NetworkConfig struct { OrchestratorURL string IperfPort int // 0 = 5201 Duration time.Duration } -// Network runs iperf3 against the orchestrator's bundled server. Records -// bandwidth as a measurement; fails if iperf3 is missing, the server -// isn't reachable, or throughput is zero. +// Network runs iperf3 against the orchestrator's bundled server for +// the profile-configured duration. Records throughput as a measurement; +// records per-interface rx/tx error-rate deltas as nic_retrans samples +// so the server-side threshold gate (`nic_retrans rate < 0.001`) fires +// on a flaky PHY or a wire that drops half its packets under load. +// +// Failure cases: iperf3 missing, server unreachable, zero throughput. +// Zero throughput is treated as a hard failure — an iperf that finished +// cleanly but pushed zero bytes is indistinguishable from a bad run. func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome { if _, err := exec.LookPath("iperf3"); err != nil { // Live image ships iperf3; absence means packaging regression. 
@@ -51,6 +59,11 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome { duration = 10 * time.Second } + // Snapshot /proc/net/dev before the test so we can attribute any + // error-count growth to *this stage's* traffic. The same snapshot + // taken after iperf returns is the end of the window. + netStart := indexNetDev(probes.NetDev()) + args := []string{ "-c", host, "-p", strconv.Itoa(port), @@ -72,7 +85,7 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome { Extras: map[string]any{"stderr_tail": tailLines(string(out), 20)}, } } - mbps, parsed, err := parseIperfJSON(out) + mbps, retrans, bytesSent, parsed, err := parseIperfJSON(out) if err != nil { d.Error("Network: parse iperf3 output: " + err.Error()) return Outcome{ @@ -82,12 +95,58 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome { Extras: map[string]any{"raw": string(out)}, } } + + netEnd := indexNetDev(probes.NetDev()) + netDelta := diffNetDev(netStart, netEnd) + + samples := []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}} + + // iperf-derived retrans rate: retrans_count / packet_count_estimate. + // TCP typical MTU 1500; payload ~1460. We divide bytes by 1460 to + // approximate packets. This keeps the rate bounded in [0, 1]. + if bytesSent > 0 { + packets := float64(bytesSent) / 1460.0 + if packets > 0 { + samples = append(samples, Sample{ + Kind: "nic_retrans", + Key: "iperf/rate", + Value: float64(retrans) / packets, + Unit: "rate", + }) + } + } + + // Per-interface error-rate deltas. A flaky cable typically surfaces + // as tx_errs or tx_drop on the originating interface, not inside + // iperf's own tally. 
+ for iface, delta := range netDelta { + if delta.TxBytes > 0 { + packets := float64(delta.TxBytes) / 1460.0 + if packets > 0 { + rate := float64(delta.TxErrs+delta.TxDrop) / packets + samples = append(samples, Sample{ + Kind: "nic_retrans", Key: iface + "/rate", Value: rate, Unit: "rate", + }) + } + } + // Diagnostic raw counts so the report can show which interface + // bled. These don't fire a threshold today but are useful for + // post-mortem. + samples = append(samples, + Sample{Kind: "nic_errs", Key: iface + "/rx", Value: float64(delta.RxErrs + delta.RxDrop), Unit: "count"}, + Sample{Kind: "nic_errs", Key: iface + "/tx", Value: float64(delta.TxErrs + delta.TxDrop), Unit: "count"}, + ) + } + if d.Sensor != nil { - _ = d.Sensor(ctx, []Sample{{Kind: "iperf", Key: "throughput_mbps", Value: mbps, Unit: "Mbps"}}) + _ = d.Sensor(ctx, samples) } extras := map[string]any{ "throughput_mbps": mbps, + "retransmits": retrans, + "bytes_sent": bytesSent, + "net_delta": netDelta, "iperf_end": parsed, } if mbps <= 0 { @@ -98,14 +157,55 @@ func Network(ctx context.Context, d Deps, cfg NetworkConfig) Outcome { Extras: extras, } } - d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps", mbps)) + d.Info(fmt.Sprintf("Network: iperf3 PASSED: %.1f Mbps (retransmits=%d)", mbps, retrans)) return Outcome{ Passed: true, - Summary: fmt.Sprintf("%.1f Mbps to %s", mbps, host), + Summary: fmt.Sprintf("%.1f Mbps to %s (retransmits=%d)", mbps, host, retrans), Extras: extras, } } +// indexNetDev flattens a NetDev slice into a map keyed by interface +// name so diffNetDev can pair start/end by name without O(n²) scans. +func indexNetDev(snaps []probes.NetDevSnapshot) map[string]probes.NetDevSnapshot { + out := map[string]probes.NetDevSnapshot{} + for _, s := range snaps { + out[s.Iface] = s + } + return out +} + +// diffNetDev computes end − start for each interface present in both +// snapshots. An interface that dropped away mid-run is dropped from +// the result (can't compute a delta). 
Underflow (end < start, rare +// after a counter reset) is clamped to 0. +func diffNetDev(start, end map[string]probes.NetDevSnapshot) map[string]probes.NetDevSnapshot { + out := map[string]probes.NetDevSnapshot{} + for iface, e := range end { + s, ok := start[iface] + if !ok { + continue + } + out[iface] = probes.NetDevSnapshot{ + Iface: iface, + RxBytes: subU64(e.RxBytes, s.RxBytes), + RxErrs: subU64(e.RxErrs, s.RxErrs), + RxDrop: subU64(e.RxDrop, s.RxDrop), + TxBytes: subU64(e.TxBytes, s.TxBytes), + TxErrs: subU64(e.TxErrs, s.TxErrs), + TxDrop: subU64(e.TxDrop, s.TxDrop), + } + } + return out +} + +func subU64(a, b uint64) uint64 { + if a < b { + return 0 + } + return a - b +} + // deriveHost pulls the hostname out of an https://host:port base URL. func deriveHost(raw string) (string, error) { if raw == "" { @@ -119,18 +219,22 @@ func deriveHost(raw string) (string, error) { return strings.TrimSpace(h), nil } -// parseIperfJSON pulls end.sum_sent.bits_per_second out of iperf3 -J. -// Returns (Mbps, full-json-map, err). -func parseIperfJSON(b []byte) (float64, map[string]any, error) { +// parseIperfJSON pulls end.sum_sent.bits_per_second and retransmits out +// of iperf3 -J. Returns (Mbps, retransmits, bytes_sent, full-end-map, err). +func parseIperfJSON(b []byte) (float64, int64, int64, map[string]any, error) { var top map[string]any if err := json.Unmarshal(b, &top); err != nil { - return 0, nil, err + return 0, 0, 0, nil, err } end, ok := top["end"].(map[string]any) if !ok { - return 0, top, fmt.Errorf("missing end") + return 0, 0, 0, nil, fmt.Errorf("missing end") } - // iperf3 reports either sum_sent (when -R not set) or sum_received. + // Pull the first sum that carries bits_per_second; retransmits + + // bytes live there too for TCP. 
+ var mbps float64 + var retrans int64 + var bytesSent int64 for _, key := range []string{"sum_sent", "sum_received", "sum"} { sum, ok := end[key].(map[string]any) if !ok { @@ -140,7 +244,17 @@ func parseIperfJSON(b []byte) (float64, map[string]any, error) { if !ok { continue } - return bps / 1_000_000, end, nil + mbps = bps / 1_000_000 + if r, ok := sum["retransmits"].(float64); ok { + retrans = int64(r) + } + if bs, ok := sum["bytes"].(float64); ok { + bytesSent = int64(bs) + } + break } - return 0, end, fmt.Errorf("no bits_per_second in end.sum_*") + if mbps == 0 { + return 0, 0, 0, end, fmt.Errorf("no bits_per_second in end.sum_*") + } + return mbps, retrans, bytesSent, end, nil } diff --git a/agent/tests/network_test.go b/agent/tests/network_test.go new file mode 100644 index 0000000..7ee5e63 --- /dev/null +++ b/agent/tests/network_test.go @@ -0,0 +1,192 @@ +package tests + +import ( + "encoding/json" + "testing" + + "vetting/agent/probes" +) + +// TestParseIperfJSON_SumSent confirms we pull throughput, retransmits, +// and bytes_sent from end.sum_sent. Real iperf3 -J output nests these +// three under end.sum_sent for TCP streams. +func TestParseIperfJSON_SumSent(t *testing.T) { + raw := `{ + "end": { + "sum_sent": { + "bits_per_second": 950000000, + "retransmits": 42, + "bytes": 1187500000 + } + } + }` + mbps, retrans, bytesSent, _, err := parseIperfJSON([]byte(raw)) + if err != nil { + t.Fatalf("parseIperfJSON: %v", err) + } + if mbps != 950 { + t.Errorf("mbps = %v, want 950", mbps) + } + if retrans != 42 { + t.Errorf("retransmits = %d, want 42", retrans) + } + if bytesSent != 1187500000 { + t.Errorf("bytesSent = %d, want 1187500000", bytesSent) + } +} + +// TestParseIperfJSON_MissingEnd fails cleanly when iperf returned +// something without an end block (partial/aborted run). 
+func TestParseIperfJSON_MissingEnd(t *testing.T) {
+	payload := []byte(`{"start": {}}`)
+	_, _, _, _, err := parseIperfJSON(payload)
+	if err != nil {
+		return // rejected, as required
+	}
+	t.Errorf("expected error on iperf output missing end block")
+}
+
+// TestParseIperfJSON_ZeroBps: a report whose throughput is exactly zero
+// must be rejected by the parser. An iperf run that exited cleanly but
+// moved no bits cannot be told apart from a broken run, so it must not
+// be allowed to pass.
+func TestParseIperfJSON_ZeroBps(t *testing.T) {
+	payload := []byte(`{"end": {"sum_sent": {"bits_per_second": 0}}}`)
+	_, _, _, _, err := parseIperfJSON(payload)
+	if err != nil {
+		return // rejected, as required
+	}
+	t.Errorf("expected error when bits_per_second is 0")
+}
+
+// TestParseIperfJSON_FallsBackToSumReceived: when end.sum_sent is absent
+// (UDP tests and some edge cases) the parser moves on through
+// sum_received and then sum, taking the first block that carries a
+// throughput figure.
+func TestParseIperfJSON_FallsBackToSumReceived(t *testing.T) {
+	payload := []byte(`{
+		"end": {
+			"sum_received": {"bits_per_second": 500000000}
+		}
+	}`)
+	gotMbps, _, _, _, parseErr := parseIperfJSON(payload)
+	if parseErr != nil {
+		t.Fatalf("parseIperfJSON: %v", parseErr)
+	}
+	if gotMbps != 500 {
+		t.Errorf("mbps = %v, want 500", gotMbps)
+	}
+}
+
+// TestDiffNetDev_HappyPath confirms end − start on a shared interface
+// produces the delta we expect. eth0 pushed 10k bytes and accumulated
+// 3 tx errors during the window.
+func TestDiffNetDev_HappyPath(t *testing.T) {
+	before := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", RxBytes: 1000, RxErrs: 0, TxBytes: 5000, TxErrs: 1},
+	}
+	after := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", RxBytes: 2000, RxErrs: 0, TxBytes: 15000, TxErrs: 4},
+	}
+	eth0, present := diffNetDev(before, after)["eth0"]
+	if !present {
+		t.Fatalf("eth0 missing from diff output")
+	}
+	// Each counter is checked independently so one wrong delta does not
+	// hide another.
+	checks := []struct {
+		field     string
+		got, want uint64
+	}{
+		{"RxBytes", eth0.RxBytes, 1000},
+		{"TxBytes", eth0.TxBytes, 10000},
+		{"TxErrs", eth0.TxErrs, 3},
+	}
+	for _, c := range checks {
+		if c.got != c.want {
+			t.Errorf("%s delta=%d, want %d", c.field, c.got, c.want)
+		}
+	}
+}
+
+// TestDiffNetDev_InterfaceVanished: an interface that exists in the
+// start snapshot but not in the end snapshot is left out of the diff
+// instead of being reported with a negative or stale number.
+func TestDiffNetDev_InterfaceVanished(t *testing.T) {
+	before := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", TxBytes: 1000},
+		"eth1": {Iface: "eth1", TxBytes: 500},
+	}
+	after := map[string]probes.NetDevSnapshot{
+		"eth0": {Iface: "eth0", TxBytes: 2000},
+	}
+	diff := diffNetDev(before, after)
+	if _, stillThere := diff["eth1"]; stillThere {
+		t.Errorf("eth1 should have been dropped (gone at end)")
+	}
+	if got := diff["eth0"].TxBytes; got != 1000 {
+		t.Errorf("eth0 TxBytes delta=%d, want 1000", got)
+	}
+}
+
+// TestDiffNetDev_CounterReset: if a counter resets between snapshots
+// (kernel restart, wrap-around on a 32-bit counter) we clamp to 0
+// rather than underflow a uint64.
+func TestDiffNetDev_CounterReset(t *testing.T) { + start := map[string]probes.NetDevSnapshot{ + "eth0": {Iface: "eth0", TxBytes: 9999, TxErrs: 5}, + } + end := map[string]probes.NetDevSnapshot{ + "eth0": {Iface: "eth0", TxBytes: 100, TxErrs: 0}, + } + delta := diffNetDev(start, end) + if delta["eth0"].TxBytes != 0 { + t.Errorf("reset TxBytes delta=%d, want 0 (clamped)", delta["eth0"].TxBytes) + } + if delta["eth0"].TxErrs != 0 { + t.Errorf("reset TxErrs delta=%d, want 0 (clamped)", delta["eth0"].TxErrs) + } +} + +// TestDeriveHost: orchestrator URL → host extraction is how the agent +// picks the iperf3 server target. Handles both https://host and +// https://host:port shapes. +func TestDeriveHost(t *testing.T) { + cases := []struct { + raw string + want string + }{ + {"https://orch.local", "orch.local"}, + {"https://orch.local:8443", "orch.local"}, + {"http://10.0.0.5:8080", "10.0.0.5"}, + } + for _, c := range cases { + got, err := deriveHost(c.raw) + if err != nil { + t.Errorf("deriveHost(%q) error: %v", c.raw, err) + continue + } + if got != c.want { + t.Errorf("deriveHost(%q) = %q, want %q", c.raw, got, c.want) + } + } +} + +func TestDeriveHost_Empty(t *testing.T) { + if _, err := deriveHost(""); err == nil { + t.Errorf("deriveHost(\"\") should error") + } +} + +// TestParseIperfJSON_ParsesEndMap confirms the full end map is returned +// so extras can show every field iperf produced, not just the three we +// extract by hand. +func TestParseIperfJSON_ParsesEndMap(t *testing.T) { + raw := `{ + "end": { + "sum_sent": {"bits_per_second": 1000000, "retransmits": 0, "bytes": 125000}, + "cpu_utilization_percent": {"host_total": 12.3} + } + }` + _, _, _, endMap, err := parseIperfJSON([]byte(raw)) + if err != nil { + t.Fatalf("parseIperfJSON: %v", err) + } + if endMap == nil { + t.Fatalf("endMap is nil") + } + // Sanity: both keys round-trip via json. 
+ b, _ := json.Marshal(endMap) + if len(b) == 0 { + t.Errorf("endMap marshaled to empty") + } +} diff --git a/agent/tests/psu.go b/agent/tests/psu.go index 8e8991e..7bedecb 100644 --- a/agent/tests/psu.go +++ b/agent/tests/psu.go @@ -7,12 +7,20 @@ import ( "path/filepath" "strconv" "strings" + "time" ) // PSU walks /sys/class/hwmon for in*_input (mV) and in*_label to find -// PSU rails. In home-lab hosts the kernel surfaces a handful of named -// rails (12V, 5V, 3V3). No rails → auto-skip. Any rail outside a ±10% -// window of its nominal value → fail. +// PSU rails, then samples each rail every psuSampleInterval for a +// window sized by the stage timeout. During Burn a separate sidecar +// (see burn.go) runs the same probe concurrently with workload — the +// PSU stage itself catches slow post-load sag that only surfaces once +// the 12V rail starts recovering from a brownout under concurrent CPU +// + fio + iperf load. +// +// Any rail outside ±10% of its nominal value at any tick fires the +// critical threshold (server-side) and fails the stage. A host with no +// PSU rails wired to hwmon auto-skips. func PSU(ctx context.Context, d Deps) Outcome { rails := scanPSURails() if len(rails) == 0 { @@ -24,39 +32,150 @@ func PSU(ctx context.Context, d Deps) Outcome { } } - var samples []Sample - problems := []string{} - for _, rail := range rails { - samples = append(samples, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"}) - if ok, why := voltageInRange(rail); !ok { - problems = append(problems, fmt.Sprintf("%s=%.2fV (%s)", rail.Label, rail.Volts, why)) + window := resolvePSUWindow(d.StageTimeout) + deadline := time.Now().Add(window) + interval := psuSampleInterval + if window < interval*2 { + // Tiny window (tests, pathological stage_timeout) — at least two + // ticks so aggregate stats are meaningful. 
+ interval = window / 2 + if interval < time.Second { + interval = time.Second } } - if d.Sensor != nil { - _ = d.Sensor(ctx, samples) + + // Per-label tracking: min/max across the window, count of out-of-range + // hits, last-observed value (shown in the summary). + type railStats struct { + label string + minV float64 + maxV float64 + lastV float64 + ticks int + breaches int + reason string + } + stats := map[string]*railStats{} + + tick := time.NewTicker(interval) + defer tick.Stop() + // Start with an immediate sample so a sub-45s window still produces + // at least one reading. + sampleOnce := func() { + cur := scanPSURails() + if len(cur) == 0 { + return + } + batch := make([]Sample, 0, len(cur)) + for _, r := range cur { + s, ok := stats[r.Label] + if !ok { + s = &railStats{label: r.Label, minV: r.Volts, maxV: r.Volts} + stats[r.Label] = s + } + s.ticks++ + s.lastV = r.Volts + if r.Volts < s.minV { + s.minV = r.Volts + } + if r.Volts > s.maxV { + s.maxV = r.Volts + } + if ok, why := voltageInRange(r); !ok { + s.breaches++ + if s.reason == "" { + s.reason = why + } + } + batch = append(batch, Sample{Kind: "psu_volt", Key: r.Label, Value: r.Volts, Unit: "V"}) + } + if d.Sensor != nil && len(batch) > 0 { + sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + _ = d.Sensor(sendCtx, batch) + cancel() + } + } + sampleOnce() +sampling: + for time.Now().Before(deadline) { + select { + case <-ctx.Done(): + break sampling + case <-tick.C: + sampleOnce() + } + } + + // Build the outcome. Extras carry per-rail rollup so the report can + // show "12V min=11.1 max=12.05 (3/120 ticks out of range)". 
+ type railRollup struct { + Label string `json:"label"` + MinV float64 `json:"min_v"` + MaxV float64 `json:"max_v"` + LastV float64 `json:"last_v"` + Ticks int `json:"ticks"` + Breaches int `json:"breaches"` + Reason string `json:"reason,omitempty"` + } + rollups := make([]railRollup, 0, len(stats)) + problems := []string{} + for _, s := range stats { + rollups = append(rollups, railRollup{ + Label: s.label, MinV: s.minV, MaxV: s.maxV, LastV: s.lastV, + Ticks: s.ticks, Breaches: s.breaches, Reason: s.reason, + }) + if s.breaches > 0 { + problems = append(problems, fmt.Sprintf("%s min=%.2fV max=%.2fV (%s)", s.label, s.minV, s.maxV, s.reason)) + } } extras := map[string]any{ - "rails": rails, - "problems": problems, + "rails": rollups, + "problems": problems, + "window": window.String(), + "interval": interval.String(), } if len(problems) > 0 { - d.Error("PSU: out-of-range rails: " + strings.Join(problems, ", ")) + d.Error("PSU: out-of-range rails: " + strings.Join(problems, "; ")) return Outcome{ Passed: false, - Message: "PSU rails out of range: " + strings.Join(problems, ", "), - Summary: fmt.Sprintf("%d rails, %d failing", len(rails), len(problems)), + Message: "PSU rails out of range: " + strings.Join(problems, "; "), + Summary: fmt.Sprintf("%d rails, %d failing", len(rollups), len(problems)), Extras: extras, } } - d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal", len(rails))) + d.Info(fmt.Sprintf("PSU: %d rails within ±10%% nominal across %s window", len(rollups), window)) return Outcome{ Passed: true, - Summary: fmt.Sprintf("%d rails nominal", len(rails)), + Summary: fmt.Sprintf("%d rails nominal (%s)", len(rollups), window), Extras: extras, } } +// psuSampleInterval is the default tick for post-Burn rail sampling. +// Five seconds is slow enough to stay under the HTTP budget and fast +// enough to catch rail recovery transients. +const psuSampleInterval = 5 * time.Second + +// resolvePSUWindow maps the stage timeout to the sampling window. 
+// With no timeout (tests / pre-Phase-2 orchestrator), stay snapshot- +// like at 30 s. Otherwise take stage_timeout - 5 s to leave headroom +// for sensor flush + result post, capped at 10 min so a 24 h soak +// doesn't spend all day in PSU. +func resolvePSUWindow(stageTimeout time.Duration) time.Duration { + if stageTimeout <= 0 { + return 30 * time.Second + } + w := stageTimeout - 5*time.Second + if w < 30*time.Second { + w = 30 * time.Second + } + if w > 10*time.Minute { + w = 10 * time.Minute + } + return w +} + type psuRail struct { Label string `json:"label"` Volts float64 `json:"volts"` diff --git a/agent/tests/psu_test.go b/agent/tests/psu_test.go new file mode 100644 index 0000000..3bc9e03 --- /dev/null +++ b/agent/tests/psu_test.go @@ -0,0 +1,112 @@ +package tests + +import ( + "testing" + "time" +) + +// TestIsPSULabel keeps the allowlist narrow enough that CPU VRM rails +// don't get misclassified as PSU-out-of-range failures but wide enough +// that common SuperMicro/Intel hwmon labels land in the Yes bucket. +func TestIsPSULabel(t *testing.T) { + cases := []struct { + label string + want bool + }{ + {"+12V", true}, + {"12V", true}, + {"+5V", true}, + {"5V", true}, + {"+3.3V", true}, + {"3V3", true}, + {"VCCIN", true}, + {"vccin", true}, + {"Vcore", false}, + {"CPU VCORE", false}, + {"AVCC", false}, + {"", false}, + } + for _, tc := range cases { + if got := isPSULabel(tc.label); got != tc.want { + t.Errorf("isPSULabel(%q) = %v, want %v", tc.label, got, tc.want) + } + } +} + +// TestNominalFor maps rail labels back to expected nominal voltages. +// Unknown labels must return 0 so voltageInRange short-circuits — an +// accidental nominal would invent out-of-range failures. 
+func TestNominalFor(t *testing.T) { + cases := []struct { + label string + want float64 + }{ + {"+12V", 12.0}, + {"12V", 12.0}, + {"+5V", 5.0}, + {"+3.3V", 3.3}, + {"3V3", 3.3}, + {"VCCIN", 0}, + {"unknown", 0}, + } + for _, tc := range cases { + if got := nominalFor(tc.label); got != tc.want { + t.Errorf("nominalFor(%q) = %v, want %v", tc.label, got, tc.want) + } + } +} + +// TestVoltageInRange verifies the ±10% band: 12V passes in [10.8, +// 13.2], fails anywhere outside. Labels with no known nominal always +// pass (nominalFor returns 0 for them). +func TestVoltageInRange(t *testing.T) { + cases := []struct { + rail psuRail + ok bool + }{ + {psuRail{Label: "+12V", Volts: 12.0}, true}, + {psuRail{Label: "+12V", Volts: 10.8}, true}, // exactly at the band + {psuRail{Label: "+12V", Volts: 13.2}, true}, // exactly at the band + {psuRail{Label: "+12V", Volts: 10.7}, false}, // just below + {psuRail{Label: "+12V", Volts: 13.3}, false}, // just above + {psuRail{Label: "+12V", Volts: 10.5}, false}, // real sag + {psuRail{Label: "+5V", Volts: 4.6}, true}, // 8% low on 5V still in band + {psuRail{Label: "+5V", Volts: 4.4}, false}, // 12% low on 5V — out of band + {psuRail{Label: "+5V", Volts: 5.0}, true}, + {psuRail{Label: "VCCIN", Volts: 1.8}, true}, // unknown nominal → pass + } + for _, tc := range cases { + got, _ := voltageInRange(tc.rail) + if got != tc.ok { + t.Errorf("voltageInRange(%+v) = %v, want %v", tc.rail, got, tc.ok) + } + } +} + +// TestResolvePSUWindow maps stage timeouts to the sampling window. +// Quick's 1m stage_timeout → 55s window; deep's 10m → 9m55s; longer +// timeouts cap at 10m; missing/zero → 30s (test / legacy orchestrator +// path); sub-35s → at least 30s so aggregates are non-trivial. 
+func TestResolvePSUWindow(t *testing.T) { + cases := []struct { + name string + in time.Duration + want time.Duration + }{ + {"zero → snapshot fallback", 0, 30 * time.Second}, + {"negative → snapshot fallback", -1 * time.Second, 30 * time.Second}, + {"tiny timeout clamps up to 30s floor", 10 * time.Second, 30 * time.Second}, + {"35s - 5s = 30s", 35 * time.Second, 30 * time.Second}, + {"1m quick → 55s", time.Minute, 55 * time.Second}, + {"10m deep → 9m55s", 10 * time.Minute, 9*time.Minute + 55*time.Second}, + {"15m soak → capped at 10m", 15 * time.Minute, 10 * time.Minute}, + {"1h → capped at 10m", time.Hour, 10 * time.Minute}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := resolvePSUWindow(tc.in); got != tc.want { + t.Errorf("resolvePSUWindow(%s) = %s, want %s", tc.in, got, tc.want) + } + }) + } +} diff --git a/agent/tests/stage.go b/agent/tests/stage.go index 4acffdd..5f4ac09 100644 --- a/agent/tests/stage.go +++ b/agent/tests/stage.go @@ -59,6 +59,11 @@ func (o Outcome) MarshalSummary() (json.RawMessage, error) { // Deps bundles what stages need without pulling in the whole agent. // Logger methods print to stdout + forward to the orchestrator; Sensor // drops numeric samples; OverrideFlags carries operator-set bypasses. +// +// CPUStressKnobs / StorageKnobs / NetworkKnobs / BurnKnobs are Phase-2 +// profile knobs. Zero-valued fields mean "fall back to the compile-time +// default" — that keeps the stages runnable even when the runner can't +// materialize a profile (tests, legacy orchestrator, etc.). 
type Deps struct { Info func(string) Warn func(string) @@ -68,6 +73,58 @@ type Deps struct { NonDestructive bool // skip wipe-probe + writes in Storage ExpectedDisks []ExpectedDisk // serials + sizes from host.expected_spec StageTimeout time.Duration + CPUStressKnobs CPUStressKnobs + StorageKnobs StorageKnobs + NetworkKnobs NetworkKnobs + BurnKnobs BurnKnobs + // LookPath is the unit-test seam for swapping a real external + // binary (stress-ng, fio, iperf3, dmidecode, …) for a fake. When + // nil the stage falls back to os/exec.LookPath — production and + // existing tests keep working unchanged. Tests under + // agent/tests/fakes/ populate this to redirect lookups to a built + // fake binary in a tempdir. + LookPath func(name string) (string, error) +} + +// CPUStressKnobs parameterizes the CPUStress stage. Zero durations fall +// back to the package's compile-time defaults (cpuPassDuration etc). +type CPUStressKnobs struct { + CPUPass time.Duration + MemPass time.Duration + EDACPoll time.Duration +} + +// StorageKnobs parameterizes the Storage stage. Mode picks between +// "fio_sample" (bounded tempfile inside the device, quick profile) and +// "full_disk" (whole-device write verify, deep/soak). Empty strings +// fall back to the stage's safe defaults. +type StorageKnobs struct { + Mode string + FioSize string + FioTime time.Duration + FioBS string + FioRW string + Verify string +} + +// NetworkKnobs parameterizes the Network stage. +type NetworkKnobs struct { + Duration time.Duration +} + +// BurnKnobs parameterizes the Burn super-stage. Duration is the total +// Burn window; sub-workloads run concurrently inside that window. +// CPUWorkers is "all" (runtime.NumCPU) or a numeric string. MemPct is a +// percentage of MemAvailable to allocate for the memory burner (clamped +// 0-90 by the stage). IperfParallel feeds iperf3 -P to generate sustained +// NIC load. 
FioOnSpare gates the storage sub-workload: true = fio runs +// against the allow-listed disks for the same window; false = skip fio. +type BurnKnobs struct { + Duration time.Duration + CPUWorkers string + MemPct int + FioOnSpare bool + IperfParallel int } // Sample mirrors the server's SensorSample but lives in the tests diff --git a/agent/tests/storage.go b/agent/tests/storage.go index 0c5e78e..6f29889 100644 --- a/agent/tests/storage.go +++ b/agent/tests/storage.go @@ -5,24 +5,36 @@ import ( "encoding/json" "fmt" "os/exec" + "strconv" "strings" "time" ) -// Storage is the destructive stage: badblocks (write-mode sample) + fio -// random IO, persisting IOPS + latency as measurements. Pre-gates: +// Storage is the destructive stage. Phase 2 replaced the old +// badblocks + 128 MiB fio combo with a single fio run per disk that +// writes, verifies md5 of what it wrote, and reports p99 latency. +// Modes: +// +// - fio_sample (quick): bounded 1 GiB write per disk, ~3 min runtime. +// - full_disk (deep/soak): writes the whole device, time-bounded by +// the fio_time knob (2 h deep, 6 h soak). +// +// Pre-gates kept from Phase 1: // // 1. Device allowlist: only act on /dev/ where the kernel-reported -// serial matches one of Deps.ExpectedDisks. This is the operator's -// contract for what can be written to. USB sticks and unexpected +// serial matches one of Deps.ExpectedDisks. USB sticks and unexpected // drives are excluded. // 2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem -// signatures, partition tables, or LVM metadata → fail with +// signature, partition table, or LVM metadata → fail with // UnexpectedData unless Deps.OverrideWipe is set. // -// Only after those pass does the stage run `badblocks -b 4096 -c 64 -w` -// and `fio` in write mode. This matches the plan's "destructive disk -// tests are always-on, gated by layered safety." 
+// After fio, the stage captures a SMART diff (start snapshot taken +// before any writes; end snapshot after all writes finish) and posts +// deltas on attributes like Reallocated_Sector_Ct and Current_Pending_Sector. +// The threshold evaluator isn't seeded to gate smart_delta out of the +// box — those samples are diagnostic for the report. Fio's p99 latency +// posts as fio_p99_us so the per-stage Storage warning threshold can +// fire on a latency cliff. func Storage(ctx context.Context, d Deps) Outcome { if len(d.ExpectedDisks) == 0 { d.Info("Storage: no expected disks in spec — skipping stage") @@ -44,10 +56,10 @@ func Storage(ctx context.Context, d Deps) Outcome { } } - // Non-destructive runs skip wipe-probe (nothing to refuse), badblocks - // -w, and write-mode fio. Every expected disk is still asserted - // present + readable by listing /sys/block and reading SMART-accessible - // identity; the per-disk map flags the shortcut so the report is clear. + // Non-destructive runs skip wipe-probe (nothing to refuse), fio + // writes, and SMART delta (nothing changed so no delta to report). + // Every expected disk is still asserted present so a vanished drive + // still fails the stage. 
if d.NonDestructive { perDisk := map[string]any{} for _, t := range targets { @@ -79,9 +91,9 @@ func Storage(ctx context.Context, d Deps) Outcome { Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)", Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)), Extras: map[string]any{ - "wipe_probe": probes, - "override_hint": "click 'Override wipe & retry' in the held tile", - "dirty_devices": dirty, + "wipe_probe": probes, + "override_hint": "click 'Override wipe & retry' in the held tile", + "dirty_devices": dirty, }, } } @@ -89,64 +101,80 @@ func Storage(ctx context.Context, d Deps) Outcome { d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", ")) } - // Per target: short badblocks write sample + fio random-read/write. + // Capture start-of-stage SMART attributes before we write anything + // so the delta is attributable to *this* stage's writes and not the + // host's prior history. Per-disk failures are tolerated (e.g. the + // device doesn't expose SMART); we just can't emit a delta for it. 
+ startSMART := captureSMARTAttrs(ctx, targets) + + fioOpts := resolveFioOpts(d.StorageKnobs) + d.Info(fmt.Sprintf("Storage: fio mode=%s size=%s runtime=%s bs=%s rw=%s verify=%s", + fioOpts.Mode, fioOpts.Size, fioOpts.Runtime, fioOpts.BS, fioOpts.RW, fioOpts.Verify)) + var samples []Sample var subs []SubStepReport perDisk := map[string]any{} + failed := "" for _, t := range targets { - d.Info("Storage: running badblocks write sample on " + t.Device) - bbStart := time.Now() - bb := runBadblocks(ctx, t.Device) - bbEnd := time.Now() - bbSummary, _ := json.Marshal(bb) - subs = append(subs, SubStepReport{ - Name: fmt.Sprintf("badblocks %s", t.Device), - Passed: bb.OK, - StartedAt: bbStart, - CompletedAt: bbEnd, - SummaryJSON: bbSummary, - }) - - d.Info(fmt.Sprintf("Storage: running fio random rw on %s", t.Device)) + d.Info(fmt.Sprintf("Storage: running fio %s on %s", fioOpts.Mode, t.Device)) fioStart := time.Now() - fr := runFio(ctx, t.Device) + fr := runFioVerify(ctx, t.Device, fioOpts) fioEnd := time.Now() fioSummary, _ := json.Marshal(fr) subs = append(subs, SubStepReport{ - Name: fmt.Sprintf("fio %s", t.Device), + Name: fmt.Sprintf("fio %s %s", fioOpts.Mode, t.Device), Passed: fr.Error == "", StartedAt: fioStart, CompletedAt: fioEnd, SummaryJSON: fioSummary, }) + perDisk[t.Device] = map[string]any{"fio": fr} - perDisk[t.Device] = map[string]any{ - "badblocks": bb, - "fio": fr, - } - samples = append(samples, - Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"}, - Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"}, - ) - if !bb.OK { - return Outcome{ - Passed: false, - Message: "badblocks found errors on " + t.Device, - Summary: "badblocks failed on " + t.Device, - Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes}, - SubSteps: subs, + if fr.Error == "" { + samples = append(samples, + Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"}, + Sample{Kind: 
"fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"}, + ) + if fr.ReadP99Us > 0 { + samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"}) } + if fr.WriteP99Us > 0 { + samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"}) + } + } else if failed == "" { + failed = t.Device } } - if d.Sensor != nil { + + // End-of-stage SMART snapshot + diff. We capture whether or not fio + // succeeded — a mid-run failure still produces attributable deltas, + // which is often more interesting than the stage outcome itself. + endSMART := captureSMARTAttrs(ctx, targets) + deltas := diffSMARTAttrs(startSMART, endSMART) + for dev, attrs := range deltas { + for attr, delta := range attrs { + samples = append(samples, Sample{Kind: "smart_delta", Key: dev + "/" + attr, Value: delta, Unit: "count"}) + } + } + if d.Sensor != nil && len(samples) > 0 { _ = d.Sensor(ctx, samples) } - d.Info(fmt.Sprintf("Storage: %d disk(s) passed badblocks + fio", len(targets))) + if failed != "" { + return Outcome{ + Passed: false, + Message: "fio verify failed on " + failed, + Summary: "fio failed on " + failed, + Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts}, + SubSteps: subs, + } + } + + d.Info(fmt.Sprintf("Storage: %d disk(s) passed fio --verify", len(targets))) return Outcome{ Passed: true, - Summary: fmt.Sprintf("%d disks passed", len(targets)), - Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes}, + Summary: fmt.Sprintf("%d disks passed (%s)", len(targets), fioOpts.Mode), + Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts}, SubSteps: subs, } } @@ -229,8 +257,8 @@ type wipeProbeResult struct { // probeWipe runs blkid + wipefs -n. Any non-empty output from either is // a "has data" signal. 
This is deliberately conservative: we'd rather -// halt on a bare ext4 signature than hand badblocks a disk with real -// bytes on it. +// halt on a bare ext4 signature than hand fio a disk with real bytes on +// it. func probeWipe(ctx context.Context, device string) wipeProbeResult { out := wipeProbeResult{Device: device} @@ -257,84 +285,269 @@ func probeWipe(ctx context.Context, device string) wipeProbeResult { return out } -// ---------- badblocks ---------- +// ---------- fio ---------- -type badblocksResult struct { - OK bool `json:"ok"` - Elapsed string `json:"elapsed"` - Error string `json:"error,omitempty"` - OutputTail string `json:"output_tail,omitempty"` +// fioOpts resolves the probe knobs into the concrete flag values fio +// needs. Defaults match the quick profile's fio_sample shape so callers +// with zero knobs still run something bounded. +type fioOpts struct { + Mode string `json:"mode"` // "fio_sample" | "full_disk" + Size string `json:"size"` // "1GiB"; only used for fio_sample + Runtime time.Duration `json:"runtime"` // bounding time + BS string `json:"bs"` // "4k" + RW string `json:"rw"` // "randrw" + Verify string `json:"verify"` // "md5" | "" } -func runBadblocks(ctx context.Context, device string) badblocksResult { - // -c 64 blocks per check, -w destructive write, -b 4096 block size, - // -t pattern. We only sample 256MiB (65536 × 4k) so the stage stays - // bounded. A real burn-in would run the whole disk; that belongs in - // a separate "deep" stage. - args := []string{"-b", "4096", "-c", "64", "-w", "-t", "random", device, "65536"} - start := time.Now() - runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) +// resolveFioOpts normalizes the knobs into a runnable config. Zero- +// valued fields fall back to the quick defaults so a stage that's +// missing its knobs still has coherent behavior (safer than refusing). 
+func resolveFioOpts(k StorageKnobs) fioOpts { + o := fioOpts{ + Mode: firstNonEmpty(k.Mode, "fio_sample"), + Size: firstNonEmpty(k.FioSize, "1GiB"), + Runtime: k.FioTime, + BS: firstNonEmpty(k.FioBS, "4k"), + RW: firstNonEmpty(k.FioRW, "randrw"), + Verify: firstNonEmpty(k.Verify, "md5"), + } + if o.Runtime <= 0 { + o.Runtime = 3 * time.Minute + } + return o +} + +func firstNonEmpty(vs ...string) string { + for _, v := range vs { + if v != "" { + return v + } + } + return "" +} + +type fioResult struct { + Mode string `json:"mode"` + ReadIOPS float64 `json:"read_iops"` + WriteIOPS float64 `json:"write_iops"` + ReadBWKBps float64 `json:"read_bw_kbps"` + WriteBWKBps float64 `json:"write_bw_kbps"` + ReadP99Us float64 `json:"read_p99_us,omitempty"` + WriteP99Us float64 `json:"write_p99_us,omitempty"` + Error string `json:"error,omitempty"` + OutputTail string `json:"output_tail,omitempty"` +} + +// runFioVerify invokes fio with md5-verify semantics. fio_sample mode +// caps the IO at opts.Size; full_disk drives the whole device bounded +// by runtime. Both use direct IO to bypass the page cache — we want +// real disk latency, not Linux' cheerful buffer. +func runFioVerify(ctx context.Context, device string, opts fioOpts) fioResult { + // 30s grace over runtime so fio has time to flush + close cleanly. + runCtx, cancel := context.WithTimeout(ctx, opts.Runtime+30*time.Second) defer cancel() - cmd := exec.CommandContext(runCtx, "badblocks", args...) 
- out, err := cmd.CombinedOutput() - r := badblocksResult{Elapsed: time.Since(start).Round(time.Second).String(), OutputTail: tailLines(string(out), 10)} + + args := []string{ + "--name=verify-" + strings.TrimPrefix(device, "/dev/"), + "--filename=" + device, + "--rw=" + opts.RW, + "--bs=" + opts.BS, + "--numjobs=1", + "--direct=1", + "--group_reporting", + "--output-format=json", + "--runtime=" + strconv.Itoa(int(opts.Runtime.Seconds())), + } + if opts.Verify != "" { + args = append(args, + "--verify="+opts.Verify, + "--verify_pattern=random", + "--do_verify=1", + ) + } + switch opts.Mode { + case "full_disk": + // Time-bounded across the full device — fio uses the device's + // full size when --size is omitted on a block device. + args = append(args, "--time_based=1") + default: + // fio_sample: bounded write. Setting --size= limits the IO + // volume regardless of runtime. + args = append(args, "--size="+opts.Size, "--time_based=0") + } + + cmd := exec.CommandContext(runCtx, "fio", args...) + out, err := cmd.Output() + r := fioResult{Mode: opts.Mode, OutputTail: tailLines(string(out), 20)} if err != nil { r.Error = err.Error() return r } - // badblocks prints each bad block to stdout. Empty output = clean. - if strings.TrimSpace(string(out)) == "" { - r.OK = true - } else { - r.Error = "bad blocks found" + parsed, perr := parseFioJSON(out) + if perr != nil { + r.Error = "parse fio json: " + perr.Error() + return r } + r.ReadIOPS = parsed.ReadIOPS + r.WriteIOPS = parsed.WriteIOPS + r.ReadBWKBps = parsed.ReadBWKBps + r.WriteBWKBps = parsed.WriteBWKBps + r.ReadP99Us = parsed.ReadP99Us + r.WriteP99Us = parsed.WriteP99Us return r } -// ---------- fio ---------- - -type fioResult struct { - ReadIOPS float64 `json:"read_iops"` - WriteIOPS float64 `json:"write_iops"` - ReadBWKBps float64 `json:"read_bw_kbps"` - WriteBWKBps float64 `json:"write_bw_kbps"` - Error string `json:"error,omitempty"` -} - -// runFio kicks off a tiny random-rw job: 2 jobs × 64MB × 4k blocks. 
-// This is a health bar, not a benchmark — we want to know the disk -// services IO, not how fast it is at p99. -func runFio(ctx context.Context, device string) fioResult { - runCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) - defer cancel() - args := []string{ - "--name=health", "--filename=" + device, "--rw=randrw", - "--bs=4k", "--size=64M", "--numjobs=2", "--time_based=0", - "--group_reporting", "--output-format=json", "--direct=1", - } - cmd := exec.CommandContext(runCtx, "fio", args...) - out, err := cmd.Output() - if err != nil { - return fioResult{Error: err.Error()} - } +// parseFioJSON extracts the bits we care about from fio's --output-format=json. +// Latency percentiles live at .jobs[0].read.clat_ns.percentile["99.000000"]; +// we convert nanoseconds to microseconds for the fio_p99_us sample. +func parseFioJSON(out []byte) (fioResult, error) { var top struct { Jobs []struct { - Read struct { + Read struct { IOPS float64 `json:"iops"` BW float64 `json:"bw"` + CLat struct { + Percentile map[string]float64 `json:"percentile"` + } `json:"clat_ns"` } `json:"read"` Write struct { IOPS float64 `json:"iops"` BW float64 `json:"bw"` + CLat struct { + Percentile map[string]float64 `json:"percentile"` + } `json:"clat_ns"` } `json:"write"` } `json:"jobs"` } - if err := json.Unmarshal(out, &top); err != nil || len(top.Jobs) == 0 { - return fioResult{Error: "parse fio json: " + fmt.Sprint(err)} + if err := json.Unmarshal(out, &top); err != nil { + return fioResult{}, err + } + if len(top.Jobs) == 0 { + return fioResult{}, fmt.Errorf("no jobs in fio output") } j := top.Jobs[0] - return fioResult{ + r := fioResult{ ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS, ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW, } + if p := j.Read.CLat.Percentile["99.000000"]; p > 0 { + r.ReadP99Us = p / 1000.0 + } + if p := j.Write.CLat.Percentile["99.000000"]; p > 0 { + r.WriteP99Us = p / 1000.0 + } + return r, nil +} + +// ---------- SMART delta ---------- + +// smartAttrMap: 
device → attribute → raw counter value. ATA drives +// populate named attributes (Reallocated_Sector_Ct etc); NVMe drives +// populate a flatter nvme-specific map. We track a curated whitelist +// of wear indicators — anything else is diagnostic and drops to the raw +// report output. +type smartAttrMap map[string]map[string]float64 + +// captureSMARTAttrs runs smartctl -aj on each target and pulls the +// whitelisted attributes. Per-device failures (virtio, permission +// issues) degrade silently — the delta step just shows no data for +// that device. +func captureSMARTAttrs(ctx context.Context, targets []diskTarget) smartAttrMap { + out := smartAttrMap{} + for _, t := range targets { + parsed, err := runSmartctl(ctx, t.Device) + if err != nil { + continue + } + attrs := extractSMARTAttrs(parsed) + if len(attrs) > 0 { + out[t.Device] = attrs + } + } + return out +} + +// smartAttributeWhitelist is the set of attributes we diff across a +// stage. Diffing start/end snapshots isolates *this stage's* IO damage +// from cumulative drive history. Adding attributes is cheap — ones a +// drive doesn't report simply drop out of the delta. +var smartAttributeWhitelist = map[string]bool{ + // ATA SMART attribute names (smartctl normalizes to these) + "Reallocated_Sector_Ct": true, + "Current_Pending_Sector": true, + "Offline_Uncorrectable": true, + "UDMA_CRC_Error_Count": true, + "Reported_Uncorrect": true, + "Raw_Read_Error_Rate": true, + // NVMe log fields (flat keys at top of nvme_smart_health_information_log) + "media_errors": true, + "num_err_log_entries": true, + "percentage_used": true, +} + +// extractSMARTAttrs walks smartctl's JSON for whitelisted attribute +// values. Handles both the ATA shape (ata_smart_attributes.table[]) and +// the NVMe shape (nvme_smart_health_information_log). Returns a map +// keyed by the canonical attribute name. 
+func extractSMARTAttrs(raw map[string]any) map[string]float64 { + out := map[string]float64{} + // ATA attributes are in ata_smart_attributes.table[] — each element + // has {"name": "Reallocated_Sector_Ct", "raw": {"value": N}}. + if ata, ok := raw["ata_smart_attributes"].(map[string]any); ok { + if tbl, ok := ata["table"].([]any); ok { + for _, row := range tbl { + rm, ok := row.(map[string]any) + if !ok { + continue + } + name, _ := rm["name"].(string) + if !smartAttributeWhitelist[name] { + continue + } + if r, ok := rm["raw"].(map[string]any); ok { + if v, ok := r["value"].(float64); ok { + out[name] = v + } + } + } + } + } + // NVMe attributes live flat under nvme_smart_health_information_log. + if nvme, ok := raw["nvme_smart_health_information_log"].(map[string]any); ok { + for k, v := range nvme { + if !smartAttributeWhitelist[k] { + continue + } + if n, ok := v.(float64); ok { + out[k] = n + } + } + } + return out +} + +// diffSMARTAttrs subtracts start from end per (device, attribute). +// Only attributes present in both ends produce a delta; missing +// attributes drop out (can't attribute a zero-to-present delta safely). +// Negative deltas are kept so a drive that resets a counter is visible. 
+func diffSMARTAttrs(start, end smartAttrMap) map[string]map[string]float64 { + out := map[string]map[string]float64{} + for dev, endAttrs := range end { + startAttrs, ok := start[dev] + if !ok { + continue + } + devOut := map[string]float64{} + for attr, endV := range endAttrs { + startV, ok := startAttrs[attr] + if !ok { + continue + } + devOut[attr] = endV - startV + } + if len(devOut) > 0 { + out[dev] = devOut + } + } + return out } diff --git a/agent/tests/storage_test.go b/agent/tests/storage_test.go new file mode 100644 index 0000000..1e52d64 --- /dev/null +++ b/agent/tests/storage_test.go @@ -0,0 +1,218 @@ +package tests + +import ( + "encoding/json" + "testing" + "time" +) + +// TestParseFioJSON_ATAReadWrite confirms we pull IOPS, BW, and p99 +// latency from both read and write sides. P99 is read from clat_ns and +// converted ns → us (the unit we emit to the threshold evaluator). +func TestParseFioJSON_ATAReadWrite(t *testing.T) { + raw := `{ + "jobs": [{ + "read": {"iops": 1234.5, "bw": 5000, "clat_ns": {"percentile": {"99.000000": 250000}}}, + "write": {"iops": 432.1, "bw": 2000, "clat_ns": {"percentile": {"99.000000": 500000}}} + }] + }` + r, err := parseFioJSON([]byte(raw)) + if err != nil { + t.Fatalf("parseFioJSON: %v", err) + } + if r.ReadIOPS != 1234.5 { + t.Errorf("ReadIOPS = %v, want 1234.5", r.ReadIOPS) + } + if r.WriteIOPS != 432.1 { + t.Errorf("WriteIOPS = %v, want 432.1", r.WriteIOPS) + } + if r.ReadBWKBps != 5000 { + t.Errorf("ReadBWKBps = %v, want 5000", r.ReadBWKBps) + } + // 250000 ns → 250 us + if r.ReadP99Us != 250 { + t.Errorf("ReadP99Us = %v, want 250", r.ReadP99Us) + } + // 500000 ns → 500 us + if r.WriteP99Us != 500 { + t.Errorf("WriteP99Us = %v, want 500", r.WriteP99Us) + } +} + +// TestParseFioJSON_ReadOnlyJob: if only one side has p99 populated the +// other stays zero (not emitted as a sample). Mirrors a randread job. 
+func TestParseFioJSON_ReadOnlyJob(t *testing.T) { + raw := `{ + "jobs": [{ + "read": {"iops": 1000, "bw": 4000, "clat_ns": {"percentile": {"99.000000": 100000}}}, + "write": {"iops": 0, "bw": 0} + }] + }` + r, err := parseFioJSON([]byte(raw)) + if err != nil { + t.Fatalf("parseFioJSON: %v", err) + } + if r.WriteP99Us != 0 { + t.Errorf("WriteP99Us = %v on read-only job, want 0", r.WriteP99Us) + } + if r.ReadP99Us != 100 { + t.Errorf("ReadP99Us = %v, want 100", r.ReadP99Us) + } +} + +// TestParseFioJSON_NoJobs fails rather than reporting zeroes silently. +// An empty jobs array means fio didn't run anything. +func TestParseFioJSON_NoJobs(t *testing.T) { + raw := `{"jobs": []}` + if _, err := parseFioJSON([]byte(raw)); err == nil { + t.Errorf("expected error on empty jobs array") + } +} + +// TestExtractSMARTAttrs_ATA picks attributes out of ata_smart_attributes.table +// when present. Attributes outside the whitelist drop out silently. +func TestExtractSMARTAttrs_ATA(t *testing.T) { + raw := map[string]any{} + smartJSON := `{ + "ata_smart_attributes": { + "table": [ + {"name": "Reallocated_Sector_Ct", "raw": {"value": 7}}, + {"name": "Current_Pending_Sector", "raw": {"value": 3}}, + {"name": "Spin_Retry_Count", "raw": {"value": 99}} + ] + } + }` + if err := json.Unmarshal([]byte(smartJSON), &raw); err != nil { + t.Fatalf("unmarshal fixture: %v", err) + } + out := extractSMARTAttrs(raw) + if out["Reallocated_Sector_Ct"] != 7 { + t.Errorf("Reallocated_Sector_Ct = %v, want 7", out["Reallocated_Sector_Ct"]) + } + if out["Current_Pending_Sector"] != 3 { + t.Errorf("Current_Pending_Sector = %v, want 3", out["Current_Pending_Sector"]) + } + if _, ok := out["Spin_Retry_Count"]; ok { + t.Errorf("Spin_Retry_Count should not appear (not in whitelist)") + } +} + +// TestExtractSMARTAttrs_NVMe picks media_errors and friends from the +// nvme health log shape, which is a flat map at the top of the JSON. 
+func TestExtractSMARTAttrs_NVMe(t *testing.T) { + raw := map[string]any{} + smartJSON := `{ + "nvme_smart_health_information_log": { + "media_errors": 2, + "num_err_log_entries": 15, + "percentage_used": 7, + "temperature": 42 + } + }` + if err := json.Unmarshal([]byte(smartJSON), &raw); err != nil { + t.Fatalf("unmarshal fixture: %v", err) + } + out := extractSMARTAttrs(raw) + if out["media_errors"] != 2 { + t.Errorf("media_errors = %v, want 2", out["media_errors"]) + } + if out["num_err_log_entries"] != 15 { + t.Errorf("num_err_log_entries = %v, want 15", out["num_err_log_entries"]) + } + if out["percentage_used"] != 7 { + t.Errorf("percentage_used = %v, want 7", out["percentage_used"]) + } + if _, ok := out["temperature"]; ok { + t.Errorf("temperature should not appear (not in whitelist)") + } +} + +// TestDiffSMARTAttrs: end − start per (device, attr). Only attrs in +// both snapshots yield a delta; an attribute seen in just one snapshot +// drops out instead of being diffed against an implicit zero. +func TestDiffSMARTAttrs(t *testing.T) { + start := smartAttrMap{ + "/dev/sda": {"Reallocated_Sector_Ct": 5, "Current_Pending_Sector": 0}, + } + end := smartAttrMap{ + "/dev/sda": {"Reallocated_Sector_Ct": 8, "Current_Pending_Sector": 2, "UDMA_CRC_Error_Count": 1}, + } + out := diffSMARTAttrs(start, end) + if out["/dev/sda"]["Reallocated_Sector_Ct"] != 3 { + t.Errorf("Reallocated_Sector_Ct delta = %v, want 3", out["/dev/sda"]["Reallocated_Sector_Ct"]) + } + if out["/dev/sda"]["Current_Pending_Sector"] != 2 { + t.Errorf("Current_Pending_Sector delta = %v, want 2", out["/dev/sda"]["Current_Pending_Sector"]) + } + if _, ok := out["/dev/sda"]["UDMA_CRC_Error_Count"]; ok { + t.Errorf("UDMA_CRC_Error_Count should not appear (missing at start)") + } +} + +// TestDiffSMARTAttrs_DeviceNewAtEnd: a device only present in the end +// snapshot (drive hot-plugged mid-run, or SMART read succeeded only at +// end) is dropped from the diff — no start baseline to subtract from. 
+func TestDiffSMARTAttrs_DeviceNewAtEnd(t *testing.T) { + start := smartAttrMap{} + end := smartAttrMap{ + "/dev/sda": {"Reallocated_Sector_Ct": 10}, + } + out := diffSMARTAttrs(start, end) + if _, ok := out["/dev/sda"]; ok { + t.Errorf("/dev/sda should drop from diff when absent at start") + } +} + +// TestResolveFioOpts_Defaults: zero-valued knobs resolve to the quick +// profile's fio_sample shape. Any stage that's missing per-profile +// knobs (legacy claim response, test harness) still has coherent +// bounded defaults — we won't accidentally fall into unbounded writes. +func TestResolveFioOpts_Defaults(t *testing.T) { + o := resolveFioOpts(StorageKnobs{}) + if o.Mode != "fio_sample" { + t.Errorf("Mode = %q, want fio_sample", o.Mode) + } + if o.Size != "1GiB" { + t.Errorf("Size = %q, want 1GiB", o.Size) + } + if o.Runtime != 3*time.Minute { + t.Errorf("Runtime = %v, want 3m", o.Runtime) + } + if o.BS != "4k" { + t.Errorf("BS = %q, want 4k", o.BS) + } + if o.RW != "randrw" { + t.Errorf("RW = %q, want randrw", o.RW) + } + if o.Verify != "md5" { + t.Errorf("Verify = %q, want md5", o.Verify) + } +} + +// TestResolveFioOpts_FullDiskOverride confirms the deep/soak shape +// round-trips. FioTime as 2h overrides the 3-minute default. +func TestResolveFioOpts_FullDiskOverride(t *testing.T) { + k := StorageKnobs{ + Mode: "full_disk", + FioTime: 2 * time.Hour, + FioBS: "64k", + FioRW: "write", + } + o := resolveFioOpts(k) + if o.Mode != "full_disk" { + t.Errorf("Mode = %q, want full_disk", o.Mode) + } + if o.Runtime != 2*time.Hour { + t.Errorf("Runtime = %v, want 2h", o.Runtime) + } + if o.BS != "64k" { + t.Errorf("BS = %q, want 64k", o.BS) + } + if o.RW != "write" { + t.Errorf("RW = %q, want write", o.RW) + } + // Verify should fall back to md5 default since knob was empty. 
+ if o.Verify != "md5" { + t.Errorf("Verify = %q, want md5 (default)", o.Verify) + } +} diff --git a/cmd/vetting/main.go b/cmd/vetting/main.go index 7a0df9b..e361235 100644 --- a/cmd/vetting/main.go +++ b/cmd/vetting/main.go @@ -60,6 +60,8 @@ func main() { artifactStore := &store.Artifacts{DB: conn} specDiffStore := &store.SpecDiffs{DB: conn} measurementStore := &store.Measurements{DB: conn} + thresholdStore := &store.Thresholds{DB: conn} + firmwareStore := &store.Firmware{DB: conn} hub := events.NewHub() @@ -99,17 +101,19 @@ func main() { } ui := &api.UI{ - Hosts: hostStore, - Runs: runStore, - Stages: stageStore, - SubSteps: subStepStore, - SpecDiffs: specDiffStore, - Artifacts: artifactStore, - EventHub: hub, - Logs: logHub, - Runner: runner, - Tiles: tiles, - PublicURL: cfg.Server.PublicURL, + Hosts: hostStore, + Runs: runStore, + Stages: stageStore, + SubSteps: subStepStore, + SpecDiffs: specDiffStore, + Artifacts: artifactStore, + Thresholds: thresholdStore, + Profiles: cfg.Profiles, + EventHub: hub, + Logs: logHub, + Runner: runner, + Tiles: tiles, + PublicURL: cfg.Server.PublicURL, } // Inject the host-page + run-page fragment renderers. Each reuses @@ -157,6 +161,9 @@ func main() { Artifacts: artifactStore, SpecDiffs: specDiffStore, Measurements: measurementStore, + Thresholds: thresholdStore, + Firmware: firmwareStore, + Profiles: cfg.Profiles, Runner: runner, EventHub: hub, Logs: logHub, diff --git a/deploy/vetting.example.yaml b/deploy/vetting.example.yaml index 373efd2..b9db53d 100644 --- a/deploy/vetting.example.yaml +++ b/deploy/vetting.example.yaml @@ -85,3 +85,54 @@ agent: notifiers: [] routes: [] + +# Vetting pipeline shared defaults. Every profile (quick/deep/soak) +# walks the same stage list; only per-stage durations differ. +# Thresholds here apply to every profile — a 92°C CPU fails a +# 2-minute quick run and a 12-hour soak run alike. 
+vetting: + stages: [Inventory, SpecValidate, SMART, CPUStress, Storage, Network, GPU, PSU, Reporting] + thresholds: + - { stage: "*", kind: temp, key: "cpu/*", op: lt, value: 92, unit: C, severity: critical } + - { stage: PSU, kind: psu_volt, key: "+12V", op: within_pct, value: 5, nominal: 12.0, severity: critical } + - { stage: PSU, kind: psu_volt, key: "+5V", op: within_pct, value: 5, nominal: 5.0, severity: critical } + - { stage: PSU, kind: psu_volt, key: "+3.3V", op: within_pct, value: 5, nominal: 3.3, severity: critical } + - { stage: Storage, kind: fio_p99_us, key: "*", op: lt, value: 50000, severity: warning } + - { stage: Network, kind: iperf, key: throughput_mbps, op: gte, value: 900, severity: critical } + - { stage: Network, kind: nic_retrans, key: "*/rate", op: lt, value: 0.001, severity: warning } + - { stage: CPUStress, kind: edac_ue, key: "*", op: lte, value: 0, severity: critical } + - { stage: CPUStress, kind: mce, key: "*", op: lte, value: 0, severity: critical } + +# Per-profile durations + probe knobs. Only the *durations* scale across +# profiles — every profile exercises every probe and gate. Quick is a +# ~10-minute same-day sanity check; deep is the 8–12 h overnight soak; +# soak is the opt-in 36–40 h extreme run. 
+profiles: + quick: + stage_timeouts: + CPUStress: 5m + Storage: 5m + Network: 2m + defaults: + cpustress: { cpu_pass: 2m, mem_pass: 2m, edac_poll: 10s } + storage: { mode: fio_sample, fio_size: 1GiB, fio_time: 3m, fio_bs: 4k, fio_rw: randrw, verify: md5 } + network: { duration: 60s } + deep: + stage_timeouts: + CPUStress: 2h + Storage: 4h + Network: 35m + defaults: + cpustress: { cpu_pass: 60m, mem_pass: 60m, edac_poll: 10s } + storage: { mode: full_disk, fio_time: 2h, fio_bs: 4k, fio_rw: randrw, verify: md5 } + network: { duration: 30m } + soak: + inherit: deep + stage_timeouts: + CPUStress: 14h + Storage: 8h + Network: 2h30m + defaults: + cpustress: { cpu_pass: 12h } + storage: { mode: full_disk, fio_time: 6h } + network: { duration: 2h } diff --git a/deploy/vetting.production.yaml b/deploy/vetting.production.yaml index 2191661..7f16f0d 100644 --- a/deploy/vetting.production.yaml +++ b/deploy/vetting.production.yaml @@ -75,3 +75,41 @@ agent: notifiers: [] routes: [] + +# Vetting pipeline shared defaults. Every profile (quick/deep/soak) +# walks the same stage list; only per-stage durations differ. +# Thresholds apply to every profile — critical breaches fail a run +# regardless of which profile the operator picked. 
+vetting: + stages: [Inventory, SpecValidate, SMART, CPUStress, Storage, Network, GPU, PSU, Reporting] + thresholds: + - { stage: "*", kind: temp, key: "cpu/*", op: lt, value: 92, unit: C, severity: critical } + - { stage: PSU, kind: psu_volt, key: "+12V", op: within_pct, value: 5, nominal: 12.0, severity: critical } + - { stage: PSU, kind: psu_volt, key: "+5V", op: within_pct, value: 5, nominal: 5.0, severity: critical } + - { stage: PSU, kind: psu_volt, key: "+3.3V", op: within_pct, value: 5, nominal: 3.3, severity: critical } + - { stage: Storage, kind: fio_p99_us, key: "*", op: lt, value: 50000, severity: warning } + - { stage: Network, kind: iperf, key: throughput_mbps, op: gte, value: 900, severity: critical } + - { stage: Network, kind: nic_retrans, key: "*/rate", op: lt, value: 0.001, severity: warning } + - { stage: CPUStress, kind: edac_ue, key: "*", op: lte, value: 0, severity: critical } + - { stage: CPUStress, kind: mce, key: "*", op: lte, value: 0, severity: critical } + +profiles: + quick: + stage_timeouts: { CPUStress: 5m, Storage: 5m, Network: 2m } + defaults: + cpustress: { cpu_pass: 2m, mem_pass: 2m, edac_poll: 10s } + storage: { mode: fio_sample, fio_size: 1GiB, fio_time: 3m, fio_bs: 4k, fio_rw: randrw, verify: md5 } + network: { duration: 60s } + deep: + stage_timeouts: { CPUStress: 2h, Storage: 4h, Network: 35m } + defaults: + cpustress: { cpu_pass: 60m, mem_pass: 60m, edac_poll: 10s } + storage: { mode: full_disk, fio_time: 2h, fio_bs: 4k, fio_rw: randrw, verify: md5 } + network: { duration: 30m } + soak: + inherit: deep + stage_timeouts: { CPUStress: 14h, Storage: 8h, Network: 2h30m } + defaults: + cpustress: { cpu_pass: 12h } + storage: { mode: full_disk, fio_time: 6h } + network: { duration: 2h } diff --git a/internal/api/agent_handlers.go b/internal/api/agent_handlers.go index 04215f6..dd164e3 100644 --- a/internal/api/agent_handlers.go +++ b/internal/api/agent_handlers.go @@ -19,6 +19,7 @@ import ( "github.com/go-chi/chi/v5" + 
"vetting/internal/config" "vetting/internal/events" "vetting/internal/hold" "vetting/internal/logs" @@ -41,6 +42,9 @@ type Agent struct { Artifacts *store.Artifacts SpecDiffs *store.SpecDiffs Measurements *store.Measurements + Thresholds *store.Thresholds // Phase 1: seeded per run; consulted on each /sensor batch + Firmware *store.Firmware // Phase 4: firmware snapshots (unused before then) + Profiles *config.ProfileRegistry // Phase 2: /claim resolves the run's profile → stage knobs Runner *orchestrator.Runner EventHub *events.Hub Logs *logs.Hub @@ -216,6 +220,21 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) { if iperfPort == 0 { iperfPort = 5201 } + + // Resolve the run's profile → agent-visible stage knobs. The agent + // reads these to size CPUStress / Storage / Network work. An empty + // profile (legacy runs seeded before Phase 1) falls back to "quick". + profileName := run.Profile + if profileName == "" { + profileName = config.ProfileQuick + } + var stageCfg config.StageConfig + if a.Profiles != nil { + stageCfg = a.Profiles.ResolveStageConfig(profileName) + } else { + stageCfg = config.StageConfig{Profile: profileName} + } + writeJSON(w, http.StatusOK, map[string]any{ "ok": true, "run_id": runID, @@ -224,6 +243,7 @@ func (a *Agent) Claim(w http.ResponseWriter, r *http.Request) { "iperf_port": iperfPort, "non_destructive": run.NonDestructive, "current_state": string(currentState), + "stage_config": stageCfg, }) } @@ -398,10 +418,24 @@ type StageResult struct { Passed bool `json:"passed"` Summary json.RawMessage `json:"summary,omitempty"` Inventory *spec.Inventory `json:"inventory,omitempty"` + Firmware []FirmwareLine `json:"firmware,omitempty"` Message string `json:"message,omitempty"` SubSteps []SubStepResultLine `json:"sub_steps,omitempty"` } +// FirmwareLine is a single firmware snapshot POSTed alongside the +// Firmware stage's /result body. Mirrors agent/probes.FirmwareSnapshot. 
+// The server converts each line to a store.FirmwareSnapshot and persists +// it under the run — SpecValidate reads these back to diff against the +// host's expected_firmware. +type FirmwareLine struct { + Component string `json:"component"` + Identifier string `json:"identifier"` + Version string `json:"version"` + Vendor string `json:"vendor,omitempty"` + Raw map[string]string `json:"raw,omitempty"` +} + // SubStepResultLine is one entry in StageResult.SubSteps. Ordinal is // assigned from slice index server-side; the agent doesn't set it. type SubStepResultLine struct { @@ -476,6 +510,20 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) { return } + // Aggregate threshold gate: flip Passed=false server-side when any + // critical breach landed for this stage. The agent's verdict is + // advisory — a stage-executor can miss a runaway sample that the + // sidecar caught. We check this *before* writing the stage state + // so the DB reflects the server-side decision. + thresholdDetail := "" + if body.Passed { + if breached, detail := a.stageHadCriticalBreach(r.Context(), runID, body.Stage); breached { + body.Passed = false + thresholdDetail = detail + a.appendLog(runID, "error", fmt.Sprintf("%s reported passed but %s — flipping to failed", body.Stage, detail)) + } + } + stageState := model.StagePassed if !body.Passed { stageState = model.StageFailed @@ -488,6 +536,9 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) { http.Error(w, "complete stage: "+err.Error(), http.StatusInternalServerError) return } + if thresholdDetail != "" && body.Message == "" { + body.Message = thresholdDetail + } // Agent-authored sub-steps: persist in slice order (ordinal = index) // and fan out a per-row SSE event each so the detail pane shows them @@ -502,6 +553,14 @@ func (a *Agent) Result(w http.ResponseWriter, r *http.Request) { } } + // Firmware-specific: persist each snapshot into firmware_snapshots. 
+ // SpecValidate reads them back to diff against expected_firmware. + if body.Stage == "Firmware" && len(body.Firmware) > 0 { + if err := a.persistFirmware(r.Context(), runID, body.Firmware); err != nil { + log.Printf("persist firmware run %d: %v", runID, err) + } + } + if !body.Passed { if err := a.Runs.SetFailedStage(r.Context(), runID, body.Stage); err != nil { log.Printf("set failed stage: %v", err) @@ -615,6 +674,34 @@ func parseResultTime(s string) *time.Time { return nil } +// persistFirmware writes the reported snapshots. A nil/unset a.Firmware +// store is a no-op so tests that don't wire it up stay green; a mid-run +// persist error is logged but doesn't fail the stage (Firmware is +// advisory — SpecValidate is the gate). +func (a *Agent) persistFirmware(ctx context.Context, runID int64, lines []FirmwareLine) error { + if a.Firmware == nil || len(lines) == 0 { + return nil + } + rows := make([]store.FirmwareSnapshot, 0, len(lines)) + for _, l := range lines { + raw := "{}" + if len(l.Raw) > 0 { + if b, err := json.Marshal(l.Raw); err == nil { + raw = string(b) + } + } + rows = append(rows, store.FirmwareSnapshot{ + RunID: runID, + Component: l.Component, + Identifier: l.Identifier, + Version: l.Version, + Vendor: l.Vendor, + RawJSON: raw, + }) + } + return a.Firmware.CreateBatch(ctx, rows) +} + func (a *Agent) persistInventory(r *http.Request, run *model.Run, inv *spec.Inventory) error { dir := filepath.Join(a.ArtifactsDir, fmt.Sprintf("run-%d", run.ID)) if err := os.MkdirAll(dir, 0o755); err != nil { @@ -667,6 +754,22 @@ func (a *Agent) resolveSpecValidate(r *http.Request, runID int64) { return } diffs := spec.Diff(expected, inv) + if a.Firmware != nil && len(expected.Firmware) > 0 { + snaps, err := a.Firmware.ListForRun(r.Context(), runID) + if err != nil { + log.Printf("specvalidate: list firmware: %v", err) + } else { + observed := make([]spec.FirmwareObserved, 0, len(snaps)) + for _, s := range snaps { + observed = append(observed, 
spec.FirmwareObserved{ + Component: s.Component, + Identifier: s.Identifier, + Version: s.Version, + }) + } + diffs = append(diffs, spec.DiffFirmware(expected.Firmware, observed)...) + } + } if err := a.SpecDiffs.ReplaceForRun(r.Context(), runID, diffs); err != nil { log.Printf("specvalidate: write diffs: %v", err) } @@ -884,13 +987,17 @@ type SensorSample struct { } // Sensor persists a batch of numeric samples. The thermal sidecar hits -// this on a tick; stage executors (iperf, fio) also drop here. +// this on a tick; stage executors (iperf, fio) also drop here. Each +// sample is evaluated against the run's seeded thresholds — critical +// breaches fail the run immediately (thermal runaway, EDAC UE, voltage +// sag); warning breaches are recorded for the report only. func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) { runID, ok := runIDFromURL(w, r) if !ok { return } - if _, ok := a.authenticate(w, r, runID); !ok { + run, ok := a.authenticate(w, r, runID) + if !ok { return } if a.Measurements == nil { @@ -903,8 +1010,12 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) { return } rows := make([]model.Measurement, 0, len(body.Samples)) + sampleStages := make([]string, 0, len(body.Samples)) for _, s := range body.Samples { ts, _ := time.Parse(time.RFC3339Nano, s.TS) + if ts.IsZero() { + ts = time.Now().UTC() + } rows = append(rows, model.Measurement{ RunID: runID, TS: ts, @@ -913,12 +1024,139 @@ func (a *Agent) Sensor(w http.ResponseWriter, r *http.Request) { Value: s.Value, Unit: s.Unit, }) + // Stage the sample belongs to drives threshold selector + // matching. We use the run's current state — the agent does + // not tag samples with a stage. 
+ sampleStages = append(sampleStages, orchestrator.StageNameForState(run.State)) } if err := a.Measurements.CreateBatch(r.Context(), rows); err != nil { http.Error(w, "write samples: "+err.Error(), http.StatusInternalServerError) return } - writeJSON(w, http.StatusOK, map[string]any{"ok": true, "written": len(rows)}) + critical := a.evaluateSensorBatch(r.Context(), runID, rows, sampleStages) + writeJSON(w, http.StatusOK, map[string]any{ + "ok": true, + "written": len(rows), + "breach": critical != "", + "breach_kind": critical, + }) + if critical != "" { + a.failRunOnCriticalBreach(r, run, critical) + } +} + +// evaluateSensorBatch runs each sample through the run's thresholds, +// persists evaluations, and returns a short human-readable label for +// the first critical breach it sees (empty when all samples pass or +// only hit warning-severity rules). +func (a *Agent) evaluateSensorBatch(ctx context.Context, runID int64, rows []model.Measurement, sampleStages []string) string { + if a.Thresholds == nil || len(rows) == 0 { + return "" + } + rules, err := a.Thresholds.ListForRun(ctx, runID) + if err != nil { + log.Printf("sensor: list thresholds run %d: %v", runID, err) + return "" + } + if len(rules) == 0 { + return "" + } + evalRules := make([]orchestrator.Threshold, 0, len(rules)) + for _, r := range rules { + evalRules = append(evalRules, orchestrator.Threshold{ + ID: r.ID, + Stage: r.Stage, + Kind: r.Kind, + Key: r.Key, + Op: orchestrator.ThresholdOp(r.Op), + Value: r.Threshold, + Nominal: r.Nominal, + Severity: orchestrator.ThresholdSeverity(r.Severity), + }) + } + evals := make([]store.ThresholdEvaluation, 0, len(rows)) + critical := "" + for i, m := range rows { + sample := orchestrator.Sample{ + Stage: sampleStages[i], + Kind: m.Kind, + Key: m.Key, + Value: m.Value, + } + for _, res := range orchestrator.Evaluate(sample, evalRules) { + evals = append(evals, store.ThresholdEvaluation{ + RunID: runID, + ThresholdID: res.Threshold.ID, + Stage: sample.Stage, + 
Kind: sample.Kind, + Key: sample.Key, + TS: m.TS, + Observed: res.Observed, + Passed: res.Passed, + }) + if critical == "" && res.CriticalBreach() { + critical = fmt.Sprintf("%s %s=%g breached %s %g", + res.Threshold.Kind, sample.Key, res.Observed, res.Threshold.Op, res.Threshold.Value) + } + } + } + if err := a.Thresholds.RecordBatch(ctx, evals); err != nil { + log.Printf("sensor: record evals run %d: %v", runID, err) + } + return critical +} + +// stageHadCriticalBreach returns true if any critical-severity +// threshold evaluation for this run matched samples attributed to the +// given stage (stage selector "*" or exact). Called at /result close +// so even an agent that reports Passed=true gets overridden when the +// aggregate view says the stage tripped a gate. +func (a *Agent) stageHadCriticalBreach(ctx context.Context, runID int64, stage string) (bool, string) { + if a.Thresholds == nil { + return false, "" + } + breaches, err := a.Thresholds.CriticalBreaches(ctx, runID) + if err != nil { + log.Printf("result: list breaches run %d: %v", runID, err) + return false, "" + } + for _, b := range breaches { + if b.Stage == stage || b.Stage == "" || b.Stage == "*" { + return true, fmt.Sprintf("critical threshold breach: %s %s=%g", b.Kind, b.Key, b.Observed) + } + } + return false, "" +} + +// failRunOnCriticalBreach flips the run to FailedHolding in response +// to a live threshold breach (thermal runaway, EDAC UE, rail sag). +// The agent's pending /result for the current stage may still arrive — +// the silent-skip guard handles that by refusing to double-transition. 
+func (a *Agent) failRunOnCriticalBreach(r *http.Request, run *model.Run, detail string) { + stage := orchestrator.StageNameForState(run.State) + if stage == "" { + stage = "threshold" + } + if err := a.Runs.SetFailedStage(r.Context(), run.ID, stage+" (threshold)"); err != nil { + log.Printf("sensor: set failed stage run %d: %v", run.ID, err) + } + if _, err := a.Runner.Transition(r.Context(), run.ID, orchestrator.TriggerStageFailed); err != nil { + // If we're already in FailedHolding the transition errors — + // that's fine, the first breach wins. + log.Printf("sensor: fail-transition run %d: %v", run.ID, err) + return + } + hostName := a.hostNameFor(r.Context(), run.HostID) + a.dispatchEvent(notify.Event{ + Kind: notify.KindStageFailed, + Severity: notify.SeverityCritical, + RunID: run.ID, + HostName: hostName, + Title: fmt.Sprintf("[vetting] %s FAILED: %s (threshold)", hostName, stage), + Body: fmt.Sprintf("Run %d on %s tripped a critical threshold during %s: %s", run.ID, hostName, stage, detail), + URL: a.runLinkURL(run.ID), + }) + a.appendLog(run.ID, "error", fmt.Sprintf("threshold breach during %s: %s — run parked in FailedHolding", stage, detail)) } // resolveReporting runs when the pipeline advances into StateReporting. @@ -956,12 +1194,20 @@ func (a *Agent) resolveReporting(r *http.Request, runID int64) { log.Printf("reporting: list measurements: %v", err) } } + var firmware []store.FirmwareSnapshot + if a.Firmware != nil { + firmware, err = a.Firmware.ListForRun(ctx, runID) + if err != nil { + log.Printf("reporting: list firmware: %v", err) + } + } bundle := map[string]any{ "run": run, "host": host, "stages": stages, "spec_diffs": diffs, "measurements": measurements, + "firmware": firmware, "generated_at": time.Now().UTC().Format(time.RFC3339), } buf, err := json.MarshalIndent(bundle, "", " ") @@ -993,6 +1239,15 @@ func (a *Agent) resolveReporting(r *http.Request, runID int64) { // Also render the operator-facing HTML summary alongside the JSON. 
// Failures here are non-fatal — the JSON is the source of truth. if host != nil { + fwRows := make([]report.FirmwareSnapshot, 0, len(firmware)) + for _, f := range firmware { + fwRows = append(fwRows, report.FirmwareSnapshot{ + Component: f.Component, + Identifier: f.Identifier, + Version: f.Version, + Vendor: f.Vendor, + }) + } htmlData := report.Data{ GeneratedAt: time.Now().UTC(), Run: *run, @@ -1000,6 +1255,7 @@ func (a *Agent) resolveReporting(r *http.Request, runID int64) { Stages: stages, SpecDiffs: diffs, Aggregates: report.AggregateMeasurements(measurements), + Firmware: fwRows, } if htmlBuf, err := report.RenderHTML(htmlData); err != nil { log.Printf("reporting: render html: %v", err) diff --git a/internal/api/run_page_test.go b/internal/api/run_page_test.go index 586a509..7cc61dc 100644 --- a/internal/api/run_page_test.go +++ b/internal/api/run_page_test.go @@ -108,7 +108,7 @@ func TestRunPage_DefaultStep_Running(t *testing.T) { }) runID, _ := runs.Create(ctx, id, "rr", false) _ = ui.Stages.Seed(ctx, runID) - for _, name := range []string{"Inventory", "SpecValidate"} { + for _, name := range []string{"Inventory", "Firmware", "SpecValidate"} { _ = ui.Stages.StartByName(ctx, runID, name) _ = ui.Stages.CompleteByName(ctx, runID, name, model.StagePassed, "") } @@ -135,7 +135,7 @@ func TestRunPage_DefaultStep_Failed(t *testing.T) { }) runID, _ := runs.Create(ctx, id, "rf", false) _ = ui.Stages.Seed(ctx, runID) - for _, name := range []string{"Inventory", "SpecValidate", "SMART"} { + for _, name := range []string{"Inventory", "Firmware", "SpecValidate", "SMART"} { _ = ui.Stages.StartByName(ctx, runID, name) _ = ui.Stages.CompleteByName(ctx, runID, name, model.StagePassed, "") } diff --git a/internal/api/sensor_thresholds_test.go b/internal/api/sensor_thresholds_test.go new file mode 100644 index 0000000..d49973b --- /dev/null +++ b/internal/api/sensor_thresholds_test.go @@ -0,0 +1,169 @@ +package api_test + +import ( + "context" + "encoding/json" + "net/http" 
+ "net/http/httptest" + "path/filepath" + "strconv" + "testing" + + "vetting/internal/api" + "vetting/internal/db" + "vetting/internal/events" + "vetting/internal/model" + "vetting/internal/orchestrator" + "vetting/internal/store" +) + +// setupAgentWithThresholds builds an Agent wired up to the thresholds +// store + a Runner so the /sensor handler can drive the state machine. +// Seeds one critical thermal threshold and parks the run in CPUStress +// so the handler will stamp a stage-relevant failed_stage. +func setupAgentWithThresholds(t *testing.T) (*api.Agent, int64, string) { + t.Helper() + path := filepath.Join(t.TempDir(), "vetting.db") + conn, err := db.Open(path) + if err != nil { + t.Fatalf("open db: %v", err) + } + t.Cleanup(func() { _ = conn.Close() }) + + hosts := &store.Hosts{DB: conn} + runs := &store.Runs{DB: conn} + stages := &store.Stages{DB: conn} + meas := &store.Measurements{DB: conn} + thresholds := &store.Thresholds{DB: conn} + hub := events.NewHub() + runner := &orchestrator.Runner{Runs: runs, Hosts: hosts, Stages: stages, EventHub: hub} + + hostID, err := hosts.Create(context.Background(), model.Host{ + Name: "thresh-host", + MAC: "aa:bb:cc:dd:ee:aa", + WoLBroadcastIP: "10.0.0.255", + WoLPort: 9, + ExpectedSpecYAML: "memory:\n total_gib: 16\n", + }) + if err != nil { + t.Fatalf("create host: %v", err) + } + plain, hash, err := orchestrator.IssueRunToken() + if err != nil { + t.Fatalf("issue token: %v", err) + } + runID, err := runs.Create(context.Background(), hostID, hash, false) + if err != nil { + t.Fatalf("create run: %v", err) + } + if err := stages.Seed(context.Background(), runID); err != nil { + t.Fatalf("seed stages: %v", err) + } + // Park the run where a real thermal sidecar would be posting samples. + if err := runs.SetState(context.Background(), runID, model.StateCPUStress); err != nil { + t.Fatalf("set state: %v", err) + } + // Seed one critical thermal threshold. 
+ if _, err := thresholds.SeedForRun(context.Background(), runID, []store.ThresholdSpec{ + {Stage: "*", Kind: "temp", Key: "cpu/*", Op: "lt", Value: 92, Unit: "C", Severity: "critical", Source: "profile"}, + }); err != nil { + t.Fatalf("seed thresholds: %v", err) + } + return &api.Agent{ + Hosts: hosts, + Runs: runs, + Stages: stages, + Measurements: meas, + Thresholds: thresholds, + Runner: runner, + }, runID, plain +} + +// TestSensor_ThermalRunawayFailsRun: a sample that breaches a critical +// threshold lands in threshold_evaluations (passed=0) and flips the +// run into FailedHolding with failed_stage naming the current stage. +// This is the Phase-1 behavior gate — without the evaluator, the sample +// would just sit in measurements and the run would happily march on. +func TestSensor_ThermalRunawayFailsRun(t *testing.T) { + a, runID, token := setupAgentWithThresholds(t) + batch := api.SensorBatch{Samples: []api.SensorSample{ + {Kind: "temp", Key: "cpu/0", Value: 95.3, Unit: "C"}, + }} + buf, _ := json.Marshal(batch) + req := routedRequest(runID, http.MethodPost, + "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/sensor", buf) + req.Header.Set("Authorization", "Bearer "+token) + req.Header.Set("Content-Type", "application/json") + + rr := httptest.NewRecorder() + a.Sensor(rr, req) + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, body = %q", rr.Code, rr.Body.String()) + } + var resp struct { + OK bool `json:"ok"` + Breach bool `json:"breach"` + Kind string `json:"breach_kind"` + } + if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil { + t.Fatalf("decode: %v", err) + } + if !resp.Breach { + t.Fatalf("expected breach=true, got %+v", resp) + } + run, err := a.Runs.Get(context.Background(), runID) + if err != nil { + t.Fatalf("get run: %v", err) + } + if run.State != model.StateFailedHolding { + t.Fatalf("state = %s, want FailedHolding", run.State) + } + if run.FailedStage == "" { + t.Fatalf("failed_stage empty; want stage-named breach") + } + 
evals, err := a.Thresholds.ListEvaluations(context.Background(), runID) + if err != nil { + t.Fatalf("list evaluations: %v", err) + } + if len(evals) != 1 { + t.Fatalf("want 1 evaluation recorded, got %d", len(evals)) + } + if evals[0].Passed { + t.Fatalf("evaluation recorded as passed for 95.3C sample against <92C rule") + } +} + +// TestSensor_WithinThresholdPasses: a sample comfortably inside the +// threshold writes an evaluation row with passed=1 and leaves the run +// state untouched. +func TestSensor_WithinThresholdPasses(t *testing.T) { + a, runID, token := setupAgentWithThresholds(t) + batch := api.SensorBatch{Samples: []api.SensorSample{ + {Kind: "temp", Key: "cpu/0", Value: 55.0, Unit: "C"}, + }} + buf, _ := json.Marshal(batch) + req := routedRequest(runID, http.MethodPost, + "/api/v1/runs/"+strconv.FormatInt(runID, 10)+"/sensor", buf) + req.Header.Set("Authorization", "Bearer "+token) + req.Header.Set("Content-Type", "application/json") + + rr := httptest.NewRecorder() + a.Sensor(rr, req) + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, body = %q", rr.Code, rr.Body.String()) + } + run, err := a.Runs.Get(context.Background(), runID) + if err != nil { + t.Fatalf("get run: %v", err) + } + if run.State != model.StateCPUStress { + t.Fatalf("state = %s, want CPUStress unchanged", run.State) + } + evals, err := a.Thresholds.ListEvaluations(context.Background(), runID) + if err != nil { + t.Fatalf("list evaluations: %v", err) + } + if len(evals) != 1 || !evals[0].Passed { + t.Fatalf("want 1 passed evaluation, got %+v", evals) + } +} diff --git a/internal/api/smoke_test.go b/internal/api/smoke_test.go index cba8ea4..46f6dec 100644 --- a/internal/api/smoke_test.go +++ b/internal/api/smoke_test.go @@ -75,6 +75,12 @@ func newCaptureRegistry(c *captureNotifier) *notify.Registry { // (agent, runID, plainTokenForBearer). Caller is responsible for // transitioning the run out of Queued. 
func fullAgent(t *testing.T) (*api.Agent, int64, string) { + return fullAgentWithSpec(t, "") +} + +// fullAgentWithSpec is the same as fullAgent but seeds the host with +// an ExpectedSpecYAML so SpecValidate can pick up diffs in the test. +func fullAgentWithSpec(t *testing.T, expectedSpecYAML string) (*api.Agent, int64, string) { t.Helper() tmp := t.TempDir() conn, err := db.Open(filepath.Join(tmp, "vetting.db")) @@ -89,6 +95,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) { artifactStore := &store.Artifacts{DB: conn} specDiffStore := &store.SpecDiffs{DB: conn} measurementStore := &store.Measurements{DB: conn} + firmwareStore := &store.Firmware{DB: conn} hub := events.NewHub() logHub, err := logs.NewHub(filepath.Join(tmp, "logs"), hub) @@ -109,7 +116,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) { MAC: "aa:bb:cc:dd:ee:10", WoLBroadcastIP: "10.0.0.255", WoLPort: 9, - ExpectedSpecYAML: "", // empty spec → no diffs + ExpectedSpecYAML: expectedSpecYAML, }) if err != nil { t.Fatalf("create host: %v", err) @@ -132,6 +139,7 @@ func fullAgent(t *testing.T) (*api.Agent, int64, string) { Artifacts: artifactStore, SpecDiffs: specDiffStore, Measurements: measurementStore, + Firmware: firmwareStore, Runner: runner, EventHub: hub, Logs: logHub, @@ -195,20 +203,24 @@ func TestFullPipelineToCompleted(t *testing.T) { Memory: spec.MemorySpec{TotalGiB: 16}, } next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}) - // After Inventory → SpecValidate resolves inline → SMART - if next != "SMART" { - t.Fatalf("after Inventory, next_state = %q, want SMART", next) + // After Inventory → Firmware + if next != "Firmware" { + t.Fatalf("after Inventory, next_state = %q, want Firmware", next) } - // The remaining stages advance one-for-one in order. + // The remaining stages advance one-for-one in order. 
After Firmware + // the inline SpecValidate resolver advances through SpecValidate to + // SMART without a dedicated /result POST for SpecValidate. walkPlan := []struct { stage string expected string }{ + {"Firmware", "SMART"}, {"SMART", "CPUStress"}, {"CPUStress", "Storage"}, {"Storage", "Network"}, - {"Network", "GPU"}, + {"Network", "Burn"}, + {"Burn", "GPU"}, {"GPU", "PSU"}, {"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed } @@ -287,8 +299,11 @@ func TestFaultInjectionSMART(t *testing.T) { } inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}} - if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "SMART" { - t.Fatalf("after Inventory, next = %q want SMART", next) + if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "Firmware" { + t.Fatalf("after Inventory, next = %q want Firmware", next) + } + if next := walkStage(t, a, runID, token, "Firmware", true, nil); next != "SMART" { + t.Fatalf("after Firmware, next = %q want SMART (inline SpecValidate)", next) } // Fake SMART failure → expect FailedHolding. @@ -316,3 +331,76 @@ func TestFaultInjectionSMART(t *testing.T) { t.Errorf("StageFailed severity = %q, want critical", ev.Severity) } } + +// TestFirmwarePersistAndSpecMismatch exercises the Phase 4 firmware +// integration: the agent POSTs Firmware snapshots; server persists; the +// following SpecValidate diff picks up a firmware mismatch and parks +// the run in FailedHolding with FailedStage=SpecValidate. +func TestFirmwarePersistAndSpecMismatch(t *testing.T) { + // Host demands BIOS 3.3; agent will POST 3.2 → one critical firmware diff. 
+ yaml := "firmware:\n - component: bios\n version: \"3.3\"\n" + a, runID, token := fullAgentWithSpec(t, yaml) + a.Notify = newCaptureRegistry(&captureNotifier{name: "capture"}) + + if err := a.Runs.SetState(context.Background(), runID, model.StateInventoryCheck); err != nil { + t.Fatalf("set state: %v", err) + } + + inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}} + if next := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); next != "Firmware" { + t.Fatalf("after Inventory, next = %q want Firmware", next) + } + + // Firmware stage: agent reports actual BIOS 3.2 → one row persisted. + fw := []map[string]any{ + {"component": "bios", "identifier": "system", "version": "3.2", "vendor": "AMI"}, + } + next := walkStage(t, a, runID, token, "Firmware", true, map[string]any{"firmware": fw}) + // Inline SpecValidate should detect the firmware mismatch and send + // the run to FailedHolding without the agent posting SpecValidate. + if next != "FailedHolding" { + t.Fatalf("after Firmware mismatch, next = %q want FailedHolding", next) + } + + run, err := a.Runs.Get(context.Background(), runID) + if err != nil { + t.Fatalf("get run: %v", err) + } + if run.State != model.StateFailedHolding { + t.Fatalf("run.State = %q, want FailedHolding", run.State) + } + if run.FailedStage != "SpecValidate" { + t.Fatalf("run.FailedStage = %q, want SpecValidate", run.FailedStage) + } + + // Persistence: row landed in firmware_snapshots. + snaps, err := a.Firmware.ListForRun(context.Background(), runID) + if err != nil { + t.Fatalf("ListForRun firmware: %v", err) + } + if len(snaps) != 1 { + t.Fatalf("firmware rows = %d, want 1: %+v", len(snaps), snaps) + } + if snaps[0].Component != "bios" || snaps[0].Version != "3.2" { + t.Errorf("persisted snapshot = %+v", snaps[0]) + } + + // Diff row: SpecDiffs has a firmware-specific entry (rather than + // only CPU/memory/disk rows) and is critical. 
+ diffs, err := a.SpecDiffs.ListForRun(context.Background(), runID) + if err != nil { + t.Fatalf("ListForRun specdiffs: %v", err) + } + found := false + for _, d := range diffs { + if strings.HasPrefix(d.Field, "firmware[") { + found = true + if d.Severity != "critical" { + t.Errorf("firmware diff severity = %q, want critical", d.Severity) + } + } + } + if !found { + t.Fatalf("no firmware[...] entry in spec diffs: %+v", diffs) + } +} diff --git a/internal/api/ui_handlers.go b/internal/api/ui_handlers.go index c3f2a9f..745b6db 100644 --- a/internal/api/ui_handlers.go +++ b/internal/api/ui_handlers.go @@ -16,6 +16,7 @@ import ( "github.com/go-chi/chi/v5" "gopkg.in/yaml.v3" + "vetting/internal/config" "vetting/internal/events" "vetting/internal/logs" "vetting/internal/model" @@ -26,17 +27,19 @@ import ( ) type UI struct { - Hosts *store.Hosts - Runs *store.Runs - Stages *store.Stages - SubSteps *store.SubSteps - SpecDiffs *store.SpecDiffs - Artifacts *store.Artifacts - EventHub *events.Hub - Logs *logs.Hub - Runner *orchestrator.Runner - Tiles *TileEnricher - PublicURL string // user-visible base URL baked into the quick-register one-liner + Hosts *store.Hosts + Runs *store.Runs + Stages *store.Stages + SubSteps *store.SubSteps + SpecDiffs *store.SpecDiffs + Artifacts *store.Artifacts + Thresholds *store.Thresholds // Phase 1: seeded at StartRun from Profiles + Profiles *config.ProfileRegistry + EventHub *events.Hub + Logs *logs.Hub + Runner *orchestrator.Runner + Tiles *TileEnricher + PublicURL string // user-visible base URL baked into the quick-register one-liner // PXE, when non-nil, gets Reload()ed after host create/delete so // dnsmasq's dhcp-host= allowlist reflects the current registry. 
// Without this, a newly-registered host PXE-boots and gets @@ -316,23 +319,71 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) { } nonDestructive := r.PostFormValue("non_destructive") == "1" + profile := strings.TrimSpace(r.PostFormValue("profile")) + if profile == "" { + profile = config.ProfileQuick + } + if !config.IsValidProfile(profile) { + http.Error(w, "unknown profile: "+profile, http.StatusBadRequest) + return + } _, hash, err := orchestrator.IssueRunToken() if err != nil { http.Error(w, "token: "+err.Error(), http.StatusInternalServerError) return } - runID, err := u.Runs.Create(r.Context(), hostID, hash, nonDestructive) + runID, err := u.Runs.CreateWithProfile(r.Context(), hostID, hash, nonDestructive, profile) if err != nil { http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError) return } - log.Printf("ui: created run %d for host %d (state=Queued)", runID, hostID) + if err := u.seedThresholds(r.Context(), runID, host, profile); err != nil { + // A threshold-seed failure shouldn't orphan a run row — log + // and continue. Samples will just accumulate without a gate + // until the operator retries, same as before Phase 1. + log.Printf("ui: seed thresholds run %d: %v", runID, err) + } + log.Printf("ui: created run %d for host %d profile=%s (state=Queued)", runID, hostID, profile) // Send the operator straight to the new run — the button they clicked // was "Start vetting", the thing they want next is to watch it. http.Redirect(w, r, fmt.Sprintf("/runs/%d", runID), http.StatusSeeOther) } +// seedThresholds materializes the per-run threshold table from the +// ProfileRegistry. The shared vetting.thresholds block applies to +// every profile; future per-profile overrides will layer on top here, +// and per-host overrides (Phase 1 extra) land via ExpectedSpecYAML in +// a later iteration. Safe to skip silently when Thresholds or the +// registry isn't wired — tests do not always build one. 
+func (u *UI) seedThresholds(ctx context.Context, runID int64, host *model.Host, profile string) error { + if u.Thresholds == nil || u.Profiles == nil { + return nil + } + _ = host // reserved for per-host override layer + _ = profile // reserved for per-profile override layer + defaults := u.Profiles.Vetting.Thresholds + if len(defaults) == 0 { + return nil + } + specs := make([]store.ThresholdSpec, 0, len(defaults)) + for _, d := range defaults { + specs = append(specs, store.ThresholdSpec{ + Stage: d.Stage, + Kind: d.Kind, + Key: d.Key, + Op: d.Op, + Value: d.Value, + Nominal: d.Nominal, + Unit: d.Unit, + Severity: d.Severity, + Source: "profile", + }) + } + _, err := u.Thresholds.SeedForRun(ctx, runID, specs) + return err +} + func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) { _ = templates.Registration(templates.RegistrationForm{ QuickRegisterURL: u.baseURL(r), diff --git a/internal/config/config.go b/internal/config/config.go index a064f20..1c0460b 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -20,6 +20,13 @@ type Config struct { Agent Agent `yaml:"agent"` Notifiers []Notifier `yaml:"notifiers"` Routes []Route `yaml:"routes"` + + // Profiles holds the Phase-1 quick/deep/soak registry (stage order, + // threshold defaults, per-profile stage timeouts + probe knobs). + // Populated from the `vetting:` and `profiles:` top-level blocks + // during Load. Nil is never returned — Load installs a default + // registry when those blocks are absent. + Profiles *ProfileRegistry `yaml:"-"` } type Server struct { @@ -111,6 +118,20 @@ func Load(path string) (*Config, error) { if err := yaml.Unmarshal(b, &c); err != nil { return nil, fmt.Errorf("parse config: %w", err) } + // The `vetting:` + `profiles:` blocks live alongside the existing + // fields but we decode them into the raw shape because YAML + // durations arrive as strings. 
Reusing the same byte buffer is + // safe: yaml.Unmarshal is happy to ignore keys the target doesn't + // know about. + var rawProfiles rawProfilesBlock + if err := yaml.Unmarshal(b, &rawProfiles); err != nil { + return nil, fmt.Errorf("parse profiles: %w", err) + } + reg, err := buildProfileRegistry(rawProfiles) + if err != nil { + return nil, fmt.Errorf("profiles: %w", err) + } + c.Profiles = reg if c.Server.Bind == "" { c.Server.Bind = "127.0.0.1:8080" } diff --git a/internal/config/profiles.go b/internal/config/profiles.go new file mode 100644 index 0000000..f68a76c --- /dev/null +++ b/internal/config/profiles.go @@ -0,0 +1,441 @@ +package config + +import ( + "fmt" + "strings" + "time" +) + +// ProfileName is the set of legal values for a Run's profile column. +// Exposed as constants so callers (UI handler, tests, agent) don't +// sprinkle literal strings. +const ( + ProfileQuick = "quick" + ProfileDeep = "deep" + ProfileSoak = "soak" +) + +// AllProfiles is the canonical ordering shown in the picker. Leftmost +// is the default; rightmost is the longest-running. +var AllProfiles = []string{ProfileQuick, ProfileDeep, ProfileSoak} + +// IsValidProfile returns true when name is one of the known profile +// identifiers. Used at the UI boundary to reject malformed POSTs and in +// store code as a fallback guard. +func IsValidProfile(name string) bool { + for _, p := range AllProfiles { + if p == name { + return true + } + } + return false +} + +// Vetting holds the stage order + threshold defaults that are shared +// across all profiles. Only the per-stage durations/concurrency differ +// between quick/deep/soak; gates like "CPU > 92C fails the run" apply +// to a 2-minute quick run and a 12-hour soak alike. +type Vetting struct { + Stages []string `yaml:"stages"` + Thresholds []ThresholdDefaults `yaml:"thresholds"` +} + +// ThresholdDefaults is the YAML shape of a threshold declaration. 
One +// stanza can declare a per-stage rule ("stage: Network") or a global +// rule ("stage: *") — the threshold evaluator applies both to samples +// with matching (stage, kind, key). +type ThresholdDefaults struct { + Stage string `yaml:"stage"` + Kind string `yaml:"kind"` + Key string `yaml:"key"` + Op string `yaml:"op"` // lt|lte|gt|gte|within_pct + Value float64 `yaml:"value"` + Nominal float64 `yaml:"nominal"` // only used by within_pct (e.g. 12.0 for +12V rail) + Unit string `yaml:"unit"` + Severity string `yaml:"severity"` // critical|warning +} + +// ProfileRegistry is the in-memory view of the `profiles:` block in +// vetting.yaml. The orchestrator queries it at run creation time to +// seed thresholds and (in Phase 3+) to scale per-stage durations. +type ProfileRegistry struct { + // Shared stage ordering + threshold defaults. Every profile walks + // the same list; only durations/concurrency differ. + Vetting Vetting + + // Profiles is keyed by name ("quick"/"deep"/"soak"). Inherit is + // already resolved at load time — a caller sees a flattened view. + Profiles map[string]Profile +} + +// Profile is a loaded profile. StageTimeouts is keyed by stage name. +// Defaults carries the free-form knobs each probe reads. +type Profile struct { + Name string + Inherit string + StageTimeouts map[string]time.Duration + Defaults map[string]map[string]any +} + +// StageConfig is the flat view of a profile's knobs, shipped on the +// claim response so the agent can size CPUStress/Storage/Network/Burn +// work without parsing YAML. Empty values mean "fall back to the +// agent's compile-time default" — an older orchestrator that doesn't +// set these fields keeps working unchanged. 
+type StageConfig struct { + Profile string `json:"profile"` + StageTimeouts map[string]string `json:"stage_timeouts,omitempty"` + CPUStress CPUStressKnobs `json:"cpustress"` + Storage StorageKnobs `json:"storage"` + Network NetworkKnobs `json:"network"` + Burn BurnKnobs `json:"burn"` +} + +// CPUStressKnobs parallels the `cpustress:` block under `profiles.<name>.defaults`. +// Durations are YAML duration strings ("2m", "60m", "12h"). +type CPUStressKnobs struct { + CPUPass string `json:"cpu_pass,omitempty"` + MemPass string `json:"mem_pass,omitempty"` + EDACPoll string `json:"edac_poll,omitempty"` +} + +// StorageKnobs parallels `storage:` defaults. Mode is "fio_sample" (quick) +// or "full_disk" (deep/soak). Verify names the integrity mode ("md5" or ""). +type StorageKnobs struct { + Mode string `json:"mode,omitempty"` + FioSize string `json:"fio_size,omitempty"` + FioTime string `json:"fio_time,omitempty"` + FioBS string `json:"fio_bs,omitempty"` + FioRW string `json:"fio_rw,omitempty"` + Verify string `json:"verify,omitempty"` +} + +// NetworkKnobs parallels `network:` defaults. Duration is a YAML string. +type NetworkKnobs struct { + Duration string `json:"duration,omitempty"` +} + +// BurnKnobs parallels `burn:` defaults. Duration is the total Burn window. +// CPUWorkers is "all" (agent picks runtime.NumCPU) or a numeric string. +// MemPct is a percentage of MemAvailable to stress. FioOnSpare gates +// whether fio runs inside Burn (set false if operator lacks a spare +// partition). IperfParallel is the parallel stream count fed to iperf3 -P. +type BurnKnobs struct { + Duration string `json:"duration,omitempty"` + CPUWorkers string `json:"cpu_workers,omitempty"` + MemPct int `json:"mem_pct,omitempty"` + FioOnSpare bool `json:"fio_on_spare,omitempty"` + IperfParallel int `json:"iperf_parallel,omitempty"` +} + +// ResolveStageConfig flattens the named profile into the wire shape the +// claim handler ships.
Missing keys render as empty strings so the agent +// falls back to its own defaults. +func (pr *ProfileRegistry) ResolveStageConfig(name string) StageConfig { + if pr == nil { + return StageConfig{Profile: name} + } + p, err := pr.Lookup(name) + if err != nil { + return StageConfig{Profile: name} + } + out := StageConfig{Profile: p.Name} + if len(p.StageTimeouts) > 0 { + out.StageTimeouts = make(map[string]string, len(p.StageTimeouts)) + for k, v := range p.StageTimeouts { + out.StageTimeouts[k] = v.String() + } + } + cpu := p.Defaults["cpustress"] + out.CPUStress.CPUPass = yamlString(cpu, "cpu_pass") + out.CPUStress.MemPass = yamlString(cpu, "mem_pass") + out.CPUStress.EDACPoll = yamlString(cpu, "edac_poll") + st := p.Defaults["storage"] + out.Storage.Mode = yamlString(st, "mode") + out.Storage.FioSize = yamlString(st, "fio_size") + out.Storage.FioTime = yamlString(st, "fio_time") + out.Storage.FioBS = yamlString(st, "fio_bs") + out.Storage.FioRW = yamlString(st, "fio_rw") + out.Storage.Verify = yamlString(st, "verify") + net := p.Defaults["network"] + out.Network.Duration = yamlString(net, "duration") + burn := p.Defaults["burn"] + out.Burn.Duration = yamlString(burn, "duration") + out.Burn.CPUWorkers = yamlString(burn, "cpu_workers") + out.Burn.MemPct = yamlInt(burn, "mem_pct") + out.Burn.FioOnSpare = yamlBool(burn, "fio_on_spare") + out.Burn.IperfParallel = yamlInt(burn, "iperf_parallel") + return out +} + +// yamlInt coerces a map[string]any entry to int. Accepts native int, +// float64 (JSON numbers round-trip as float), or numeric string. Missing +// / malformed values return 0 so the agent falls back to its default. +func yamlInt(m map[string]any, key string) int { + v, ok := m[key] + if !ok || v == nil { + return 0 + } + switch x := v.(type) { + case int: + return x + case int64: + return int(x) + case float64: + return int(x) + case string: + // Best-effort string → int. Empty and non-numeric fall through + // to zero. 
+ var n int + if _, err := fmt.Sscanf(x, "%d", &n); err == nil { + return n + } + } + return 0 +} + +// yamlBool accepts native bool or "true"/"false" strings. Anything else +// (missing key, numeric, typo) returns false — a safer default than +// "true" for a destructive knob like fio_on_spare. +func yamlBool(m map[string]any, key string) bool { + v, ok := m[key] + if !ok || v == nil { + return false + } + switch x := v.(type) { + case bool: + return x + case string: + return strings.EqualFold(x, "true") + } + return false +} + +// yamlString coerces a map[string]any entry to its string form. YAML +// durations like "2m" parse as strings; numeric literals like 5 parse as +// int. We format non-string scalars with fmt.Sprint so the agent can +// still interpret them. +func yamlString(m map[string]any, key string) string { + v, ok := m[key] + if !ok || v == nil { + return "" + } + if s, ok := v.(string); ok { + return s + } + return fmt.Sprint(v) +} + +// Lookup returns the profile with the given name. Falls back to the +// default profile (quick) if the name is empty. Returns an error when +// the name is non-empty but unknown so the caller can surface it. +func (pr *ProfileRegistry) Lookup(name string) (Profile, error) { + if name == "" { + name = ProfileQuick + } + p, ok := pr.Profiles[name] + if !ok { + return Profile{}, fmt.Errorf("unknown profile %q", name) + } + return p, nil +} + +// Names returns the registry's profile names in the canonical +// picker order (quick/deep/soak). Profiles present in the config but +// unknown to AllProfiles are appended after, in map-iteration order.
+func (pr *ProfileRegistry) Names() []string { + out := make([]string, 0, len(pr.Profiles)) + seen := map[string]bool{} + for _, n := range AllProfiles { + if _, ok := pr.Profiles[n]; ok { + out = append(out, n) + seen[n] = true + } + } + for n := range pr.Profiles { + if !seen[n] { + out = append(out, n) + } + } + return out +} + +// Stages returns the shared stage order, or a safe default when the +// config didn't declare one — keeps tests that don't build a full +// ProfileRegistry from tripping over a nil slice. +func (pr *ProfileRegistry) Stages() []string { + if len(pr.Vetting.Stages) == 0 { + return DefaultStages() + } + out := make([]string, len(pr.Vetting.Stages)) + copy(out, pr.Vetting.Stages) + return out +} + +// DefaultStages is the canonical stage list the orchestrator walks +// when no config is loaded. Mirrored in the vetting.yaml shipped with +// the repo so edits to the slice and the file stay in sync. +func DefaultStages() []string { + return []string{ + "Inventory", + "Firmware", + "SpecValidate", + "SMART", + "CPUStress", + "Storage", + "Network", + "Burn", + "GPU", + "PSU", + "Reporting", + } +} + +// rawProfile is the YAML shape before inherit resolution. Durations +// arrive as strings (e.g. "2h") so we can parse them with +// time.ParseDuration instead of rolling our own. +type rawProfile struct { + Inherit string `yaml:"inherit"` + StageTimeouts map[string]string `yaml:"stage_timeouts"` + Defaults map[string]map[string]any `yaml:"defaults"` +} + +type rawProfilesBlock struct { + Vetting Vetting `yaml:"vetting"` + Profiles map[string]rawProfile `yaml:"profiles"` +} + +// buildProfileRegistry flattens a rawProfilesBlock into a ProfileRegistry. +// Resolves `inherit:` by recursive merge (child keys win), parses +// stage_timeouts strings into time.Durations, and returns an error if +// the inherit chain loops or references an unknown profile. 
+func buildProfileRegistry(raw rawProfilesBlock) (*ProfileRegistry, error) { + if len(raw.Profiles) == 0 { + raw.Profiles = defaultRawProfiles() + } + out := &ProfileRegistry{ + Vetting: raw.Vetting, + Profiles: make(map[string]Profile, len(raw.Profiles)), + } + if len(out.Vetting.Stages) == 0 { + out.Vetting.Stages = DefaultStages() + } + for name := range raw.Profiles { + resolved, err := resolveProfile(raw.Profiles, name, nil) + if err != nil { + return nil, err + } + out.Profiles[name] = resolved + } + return out, nil +} + +// resolveProfile recursively walks inherit chains, depth-first. The +// visited slice is a cycle guard — we add the current name before +// recursing and bail if we ever see it again. +func resolveProfile(all map[string]rawProfile, name string, visited []string) (Profile, error) { + for _, v := range visited { + if v == name { + return Profile{}, fmt.Errorf("profile inherit cycle: %s -> %s", strings.Join(visited, " -> "), name) + } + } + raw, ok := all[name] + if !ok { + return Profile{}, fmt.Errorf("unknown profile %q", name) + } + base := Profile{ + Name: name, + Inherit: raw.Inherit, + StageTimeouts: map[string]time.Duration{}, + Defaults: map[string]map[string]any{}, + } + if raw.Inherit != "" { + parent, err := resolveProfile(all, raw.Inherit, append(visited, name)) + if err != nil { + return Profile{}, err + } + for k, v := range parent.StageTimeouts { + base.StageTimeouts[k] = v + } + for k, v := range parent.Defaults { + copyMap := make(map[string]any, len(v)) + for kk, vv := range v { + copyMap[kk] = vv + } + base.Defaults[k] = copyMap + } + } + for stage, s := range raw.StageTimeouts { + d, err := time.ParseDuration(s) + if err != nil { + return Profile{}, fmt.Errorf("profile %s stage_timeouts[%s]: %w", name, stage, err) + } + base.StageTimeouts[stage] = d + } + for group, kv := range raw.Defaults { + dest, ok := base.Defaults[group] + if !ok { + dest = map[string]any{} + base.Defaults[group] = dest + } + for k, v := range kv { + 
dest[k] = v + } + } + return base, nil +} + +// defaultRawProfiles returns sane per-profile durations + probe knobs +// used when vetting.yaml omits the `profiles:` block entirely. Matches +// the plan's per-stage budget table so the agent still gets coherent +// CPUStress/Storage/Network knobs without any operator-visible config. +func defaultRawProfiles() map[string]rawProfile { + return map[string]rawProfile{ + ProfileQuick: { + StageTimeouts: map[string]string{ + "CPUStress": "5m", + "Storage": "5m", + "Network": "2m", + "Burn": "3m", + "PSU": "1m", + }, + Defaults: map[string]map[string]any{ + "cpustress": {"cpu_pass": "2m", "mem_pass": "2m", "edac_poll": "10s"}, + "storage": {"mode": "fio_sample", "fio_size": "1GiB", "fio_time": "3m", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"}, + "network": {"duration": "60s"}, + "burn": {"duration": "2m", "cpu_workers": "all", "mem_pct": 50, "fio_on_spare": true, "iperf_parallel": 2}, + }, + }, + ProfileDeep: { + StageTimeouts: map[string]string{ + "CPUStress": "2h", + "Storage": "4h", + "Network": "35m", + "Burn": "3h", + "PSU": "10m", + }, + Defaults: map[string]map[string]any{ + "cpustress": {"cpu_pass": "60m", "mem_pass": "60m", "edac_poll": "10s"}, + "storage": {"mode": "full_disk", "fio_time": "2h", "fio_bs": "4k", "fio_rw": "randrw", "verify": "md5"}, + "network": {"duration": "30m"}, + "burn": {"duration": "2h", "cpu_workers": "all", "mem_pct": 70, "fio_on_spare": true, "iperf_parallel": 4}, + }, + }, + ProfileSoak: { + Inherit: ProfileDeep, + StageTimeouts: map[string]string{ + "CPUStress": "14h", + "Storage": "8h", + "Network": "2h30m", + "Burn": "20h", + "PSU": "15m", + }, + Defaults: map[string]map[string]any{ + "cpustress": {"cpu_pass": "12h"}, + "storage": {"mode": "full_disk", "fio_time": "6h"}, + "network": {"duration": "2h"}, + "burn": {"duration": "18h", "iperf_parallel": 8}, + }, + }, + } +} diff --git a/internal/db/migrations/0005_profiles_thresholds_firmware.sql 
b/internal/db/migrations/0005_profiles_thresholds_firmware.sql new file mode 100644 index 0000000..61bd253 --- /dev/null +++ b/internal/db/migrations/0005_profiles_thresholds_firmware.sql @@ -0,0 +1,57 @@ +-- Phase-1 groundwork for profile-aware, threshold-gated vetting. +-- +-- Adds: +-- * runs.profile — which profile the run is executing +-- (quick|deep|soak; defaults to quick for +-- backfill of older rows + tests). +-- * thresholds — seeded per run at creation from the +-- ProfileRegistry + per-host overrides; +-- immutable for that run so a late config +-- edit can't retroactively pass/fail it. +-- * threshold_evaluations — one row per observed sample vs threshold; +-- drives the report + pipeline badges. +-- * firmware_snapshots — per-run BIOS/BMC/NIC/HBA/microcode/NVMe +-- version captures used by SpecValidate +-- diffing in Phase 4. + +ALTER TABLE runs ADD COLUMN profile TEXT NOT NULL DEFAULT 'quick'; + +CREATE TABLE IF NOT EXISTS thresholds ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE, + stage_name TEXT NOT NULL, -- "*" matches any stage + kind TEXT NOT NULL, -- temp|psu_volt|iperf|fio_p99_us|nic_retrans|edac_ce|edac_ue|mce|... 
+ key TEXT NOT NULL, -- "*" or glob-ish match (prefix* / *suffix / exact) + op TEXT NOT NULL, -- lt|lte|gt|gte|within_pct + threshold REAL NOT NULL, + nominal REAL NOT NULL DEFAULT 0, -- used by within_pct; 0 elsewhere + unit TEXT NOT NULL DEFAULT '', + severity TEXT NOT NULL, -- critical|warning + source TEXT NOT NULL -- profile|host_override +); +CREATE INDEX IF NOT EXISTS idx_thresholds_run ON thresholds(run_id); +CREATE INDEX IF NOT EXISTS idx_thresholds_kind ON thresholds(run_id, stage_name, kind); + +CREATE TABLE IF NOT EXISTS threshold_evaluations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE, + threshold_id INTEGER NOT NULL REFERENCES thresholds(id) ON DELETE CASCADE, + stage_name TEXT NOT NULL, + kind TEXT NOT NULL, + key TEXT NOT NULL, + ts TIMESTAMP NOT NULL, + observed REAL NOT NULL, + passed INTEGER NOT NULL -- 1 = sample within threshold, 0 = breach +); +CREATE INDEX IF NOT EXISTS idx_threshold_evals_run ON threshold_evaluations(run_id, passed); + +CREATE TABLE IF NOT EXISTS firmware_snapshots ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE, + component TEXT NOT NULL, -- bios|bmc|nic|hba|microcode|nvme_fw + identifier TEXT NOT NULL, -- slot/serial/device path that distinguishes this component + version TEXT NOT NULL, + vendor TEXT NOT NULL DEFAULT '', + raw_json TEXT NOT NULL DEFAULT '{}' +); +CREATE INDEX IF NOT EXISTS idx_firmware_run ON firmware_snapshots(run_id, component); diff --git a/internal/model/model.go b/internal/model/model.go index a896d01..85543ba 100644 --- a/internal/model/model.go +++ b/internal/model/model.go @@ -26,11 +26,13 @@ const ( StateWaitingReboot RunState = "WaitingReboot" StateBooting RunState = "Booting" StateInventoryCheck RunState = "InventoryCheck" + StateFirmware RunState = "Firmware" StateSpecValidate RunState = "SpecValidate" StateSMART RunState = "SMART" StateCPUStress RunState = "CPUStress" 
StateStorage RunState = "Storage" StateNetwork RunState = "Network" + StateBurn RunState = "Burn" StateGPU RunState = "GPU" StatePSU RunState = "PSU" StateReporting RunState = "Reporting" @@ -63,6 +65,7 @@ type Run struct { HoldIP string OverrideFlagsJSON string NonDestructive bool + Profile string // quick|deep|soak; empty is treated as "quick" } type StageState string diff --git a/internal/orchestrator/dispatcher.go b/internal/orchestrator/dispatcher.go index 85c637f..cc255bb 100644 --- a/internal/orchestrator/dispatcher.go +++ b/internal/orchestrator/dispatcher.go @@ -119,9 +119,9 @@ func (d *Dispatcher) pickNext(ctx context.Context) { queued = &runs[i] } case model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting, - model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART, + model.StateInventoryCheck, model.StateFirmware, model.StateSpecValidate, model.StateSMART, model.StateCPUStress, model.StateStorage, model.StateNetwork, - model.StateGPU, model.StatePSU, model.StateReporting: + model.StateBurn, model.StateGPU, model.StatePSU, model.StateReporting: inFlight++ } } diff --git a/internal/orchestrator/statemachine.go b/internal/orchestrator/statemachine.go index 5e3e57b..c94c96d 100644 --- a/internal/orchestrator/statemachine.go +++ b/internal/orchestrator/statemachine.go @@ -30,11 +30,13 @@ const ( // "InventoryCheck". Later stages share a name with their state. var stageStates = map[string]model.RunState{ "Inventory": model.StateInventoryCheck, + "Firmware": model.StateFirmware, "SpecValidate": model.StateSpecValidate, "SMART": model.StateSMART, "CPUStress": model.StateCPUStress, "Storage": model.StateStorage, "Network": model.StateNetwork, + "Burn": model.StateBurn, "GPU": model.StateGPU, "PSU": model.StatePSU, "Reporting": model.StateReporting, @@ -44,11 +46,13 @@ var stageStates = map[string]model.RunState{ // first stage to Completed. Kept in sync with store.DefaultStageOrder. 
var stageOrder = []model.RunState{ model.StateInventoryCheck, + model.StateFirmware, model.StateSpecValidate, model.StateSMART, model.StateCPUStress, model.StateStorage, model.StateNetwork, + model.StateBurn, model.StateGPU, model.StatePSU, model.StateReporting, @@ -143,9 +147,9 @@ func nextStageState(current model.RunState) (model.RunState, error) { func allActiveStates() []model.RunState { return []model.RunState{ model.StateQueued, model.StateWaitingWoL, model.StateWaitingReboot, model.StateBooting, - model.StateInventoryCheck, model.StateSpecValidate, model.StateSMART, + model.StateInventoryCheck, model.StateFirmware, model.StateSpecValidate, model.StateSMART, model.StateCPUStress, model.StateStorage, model.StateNetwork, - model.StateGPU, model.StatePSU, model.StateReporting, + model.StateBurn, model.StateGPU, model.StatePSU, model.StateReporting, } } diff --git a/internal/orchestrator/statemachine_test.go b/internal/orchestrator/statemachine_test.go index 50ecf0b..32231a9 100644 --- a/internal/orchestrator/statemachine_test.go +++ b/internal/orchestrator/statemachine_test.go @@ -80,11 +80,13 @@ func TestTriggerAgentClaimedFromWaitingReboot(t *testing.T) { func TestTriggerStageMismatch(t *testing.T) { stageStates := []model.RunState{ model.StateInventoryCheck, + model.StateFirmware, model.StateSpecValidate, model.StateSMART, model.StateCPUStress, model.StateStorage, model.StateNetwork, + model.StateBurn, model.StateGPU, model.StatePSU, model.StateReporting, @@ -114,11 +116,13 @@ func TestTriggerStageMismatch(t *testing.T) { func TestStageNameForState(t *testing.T) { pairs := map[string]model.RunState{ "Inventory": model.StateInventoryCheck, + "Firmware": model.StateFirmware, "SpecValidate": model.StateSpecValidate, "SMART": model.StateSMART, "CPUStress": model.StateCPUStress, "Storage": model.StateStorage, "Network": model.StateNetwork, + "Burn": model.StateBurn, "GPU": model.StateGPU, "PSU": model.StatePSU, "Reporting": model.StateReporting, @@ -143,11 +147,13 
@@ func TestNextStageWalk(t *testing.T) { // one in the canonical order, and from Reporting onto Completed. chain := []model.RunState{ model.StateInventoryCheck, + model.StateFirmware, model.StateSpecValidate, model.StateSMART, model.StateCPUStress, model.StateStorage, model.StateNetwork, + model.StateBurn, model.StateGPU, model.StatePSU, model.StateReporting, diff --git a/internal/orchestrator/thresholds.go b/internal/orchestrator/thresholds.go new file mode 100644 index 0000000..0b3be00 --- /dev/null +++ b/internal/orchestrator/thresholds.go @@ -0,0 +1,182 @@ +package orchestrator + +import ( + "fmt" + "strings" +) + +// ThresholdOp is one of the comparison operators a threshold supports. +// within_pct is the only one that cares about a "nominal" value for +// the key — used for PSU rails ("+12V within 5% of 12.0"). +type ThresholdOp string + +const ( + OpLT ThresholdOp = "lt" + OpLTE ThresholdOp = "lte" + OpGT ThresholdOp = "gt" + OpGTE ThresholdOp = "gte" + OpWithinPct ThresholdOp = "within_pct" +) + +// ThresholdSeverity routes a breach to either "fail the run" or "just +// surface a warning in the report". The evaluator returns it alongside +// the Pass flag so the caller can decide whether to transition the run. +type ThresholdSeverity string + +const ( + SeverityCritical ThresholdSeverity = "critical" + SeverityWarning ThresholdSeverity = "warning" +) + +// Threshold is the evaluator's view of a stored threshold row. It's a +// flat, already-parsed value-object — the evaluator doesn't look at +// the DB and the store doesn't look at the evaluator. +type Threshold struct { + ID int64 + Stage string // "*" matches any stage + Kind string + Key string // glob-ish: "*" / "prefix*" / "*suffix" / exact + Op ThresholdOp + Value float64 + Nominal float64 // for within_pct (nominal voltage/frequency) + Severity ThresholdSeverity +} + +// Sample is a single observation the evaluator tests against matching +// thresholds. 
Stage may be empty when the agent doesn't know which +// stage posted it (e.g. the thermal sidecar running across stages) — +// empty-stage samples only match thresholds with Stage == "*". +type Sample struct { + Stage string + Kind string + Key string + Value float64 +} + +// EvalResult is the per-sample outcome of a threshold evaluation: +// which threshold was consulted, whether the sample passed, and the +// severity so the caller can fast-fail on critical breaches. +type EvalResult struct { + Threshold Threshold + Passed bool + Observed float64 +} + +// Breached returns true when the sample violated the threshold. +func (r EvalResult) Breached() bool { return !r.Passed } + +// CriticalBreach returns true only for critical-severity breaches — +// the "fail the run right now" case. +func (r EvalResult) CriticalBreach() bool { + return r.Breached() && r.Threshold.Severity == SeverityCritical +} + +// Evaluate runs a single sample through every threshold that applies +// to it. A sample may match more than one threshold (a generic "*" +// rule + a stage-specific override); each match produces its own +// EvalResult in the returned slice so both get persisted. +func Evaluate(sample Sample, thresholds []Threshold) []EvalResult { + out := make([]EvalResult, 0, 1) + for _, t := range thresholds { + if !thresholdMatchesSample(t, sample) { + continue + } + passed, err := evaluateOp(t.Op, sample.Value, t.Value, t.Nominal) + if err != nil { + // Unknown operator — skip. The caller could validate on + // insert; here we prefer to drop the threshold than to + // return an error that forces every Sensor write to 500. + continue + } + out = append(out, EvalResult{ + Threshold: t, + Passed: passed, + Observed: sample.Value, + }) + } + return out +} + +// thresholdMatchesSample applies the stage + kind + key filter. Kind +// is always literal — there's no "any kind" threshold and if there +// ever is we'll add a `kind: *` escape hatch. 
Stage accepts "*" or an exact name; key +// supports glob-ish matching. +func thresholdMatchesSample(t Threshold, s Sample) bool { + if t.Kind != s.Kind { + return false + } + if !stageMatches(t.Stage, s.Stage) { + return false + } + if !keyMatches(t.Key, s.Key) { + return false + } + return true +} + +// stageMatches returns true if the threshold's stage selector applies +// to the sample's stage. "*" matches everything; empty threshold +// selector is treated as "*" so a threshold declared without a stage +// key isn't accidentally inert. A sample without a stage only matches +// the "*" (or empty) selector — we don't guess. +func stageMatches(selector, sampleStage string) bool { + if selector == "" || selector == "*" { + return true + } + return selector == sampleStage +} + +// keyMatches handles "*", "prefix*", "*suffix", "*infix*", and exact match. We +// avoid pulling in filepath.Match so Windows `\`-vs-`/` rules don't +// leak into the sample namespace (key "eth0/rx_errors" is not a path). +func keyMatches(pattern, key string) bool { + if pattern == "" || pattern == "*" { + return true + } + hasPrefix := strings.HasPrefix(pattern, "*") + hasSuffix := strings.HasSuffix(pattern, "*") + switch { + case hasPrefix && hasSuffix: + inner := strings.TrimPrefix(strings.TrimSuffix(pattern, "*"), "*") + return strings.Contains(key, inner) + case hasSuffix: + return strings.HasPrefix(key, strings.TrimSuffix(pattern, "*")) + case hasPrefix: + return strings.HasSuffix(key, strings.TrimPrefix(pattern, "*")) + default: + return pattern == key + } +} + +// evaluateOp does the numeric comparison. within_pct is the oddball: +// it tests |observed - nominal| <= (pct / 100) * nominal. Returns an +// error for unknown operators so the caller can log + drop. 
+func evaluateOp(op ThresholdOp, observed, threshold, nominal float64) (bool, error) { + switch op { + case OpLT: + return observed < threshold, nil + case OpLTE: + return observed <= threshold, nil + case OpGT: + return observed > threshold, nil + case OpGTE: + return observed >= threshold, nil + case OpWithinPct: + if nominal == 0 { + // within_pct against a 0 nominal is meaningless. Treat as + // pass so a misconfigured rule doesn't spuriously fail. + return true, nil + } + allowed := (threshold / 100.0) * nominal + if allowed < 0 { + allowed = -allowed + } + diff := observed - nominal + if diff < 0 { + diff = -diff + } + return diff <= allowed, nil + default: + return false, fmt.Errorf("unknown op %q", op) + } +} diff --git a/internal/orchestrator/thresholds_test.go b/internal/orchestrator/thresholds_test.go new file mode 100644 index 0000000..47117ec --- /dev/null +++ b/internal/orchestrator/thresholds_test.go @@ -0,0 +1,152 @@ +package orchestrator + +import "testing" + +// TestEvaluate_Ops covers every operator against the boundary case +// (equal to threshold) plus one clearly-inside and one clearly-outside +// value. Table-driven because the logic is regular. 
+func TestEvaluate_Ops(t *testing.T) { + cases := []struct { + name string + op ThresholdOp + value float64 + nominal float64 + observed float64 + want bool + }{ + {"lt strict below", OpLT, 10, 0, 5, true}, + {"lt equal fails", OpLT, 10, 0, 10, false}, + {"lt above fails", OpLT, 10, 0, 15, false}, + + {"lte below", OpLTE, 10, 0, 5, true}, + {"lte equal passes", OpLTE, 10, 0, 10, true}, + {"lte above fails", OpLTE, 10, 0, 11, false}, + + {"gt below fails", OpGT, 900, 0, 800, false}, + {"gt equal fails", OpGT, 900, 0, 900, false}, + {"gt above passes", OpGT, 900, 0, 950, true}, + + {"gte equal passes", OpGTE, 900, 0, 900, true}, + {"gte below fails", OpGTE, 900, 0, 800, false}, + + {"within_pct exact", OpWithinPct, 5, 12.0, 12.0, true}, + {"within_pct inside", OpWithinPct, 5, 12.0, 11.7, true}, + {"within_pct outside low", OpWithinPct, 5, 12.0, 11.0, false}, + {"within_pct outside high", OpWithinPct, 5, 12.0, 12.7, false}, + {"within_pct zero nominal passes", OpWithinPct, 5, 0, 99, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + rules := []Threshold{{ + Stage: "*", Kind: "k", Key: "k", Op: tc.op, + Value: tc.value, Nominal: tc.nominal, Severity: SeverityCritical, + }} + res := Evaluate(Sample{Stage: "Any", Kind: "k", Key: "k", Value: tc.observed}, rules) + if len(res) != 1 { + t.Fatalf("expected 1 match, got %d", len(res)) + } + if res[0].Passed != tc.want { + t.Fatalf("op=%s observed=%v want passed=%v got %v", tc.op, tc.observed, tc.want, res[0].Passed) + } + }) + } +} + +// TestEvaluate_StageMatching: a Burn-scoped rule ignores samples +// stamped with other stages. Global "*" catches everything. +func TestEvaluate_StageMatching(t *testing.T) { + rules := []Threshold{ + {Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 92, Severity: SeverityCritical}, + {Stage: "Burn", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 88, Severity: SeverityCritical}, + } + // Sample from CPUStress — only the global rule applies. 
+ res := Evaluate(Sample{Stage: "CPUStress", Kind: "temp", Key: "cpu/0", Value: 89}, rules) + if len(res) != 1 { + t.Fatalf("cpustress sample: expected 1 match, got %d", len(res)) + } + if res[0].Threshold.Value != 92 { + t.Fatalf("cpustress sample matched wrong rule: %+v", res[0].Threshold) + } + + // Sample from Burn — both rules match. The stricter one breaches. + res = Evaluate(Sample{Stage: "Burn", Kind: "temp", Key: "cpu/0", Value: 89}, rules) + if len(res) != 2 { + t.Fatalf("burn sample: expected 2 matches, got %d", len(res)) + } + var globalPassed, burnPassed bool + for _, r := range res { + switch r.Threshold.Value { + case 92: + globalPassed = r.Passed + case 88: + burnPassed = r.Passed + } + } + if !globalPassed { + t.Fatalf("global 92C rule should pass at 89C") + } + if burnPassed { + t.Fatalf("burn 88C rule should breach at 89C") + } +} + +// TestEvaluate_KeyWildcards covers "*" / "prefix*" / "*suffix". +func TestEvaluate_KeyWildcards(t *testing.T) { + cases := []struct { + pattern string + key string + match bool + }{ + {"*", "anything", true}, + {"", "anything", true}, + {"cpu/*", "cpu/0", true}, + {"cpu/*", "gpu/0", false}, + {"*/rate", "eth0/rate", true}, + {"*/rate", "eth0/count", false}, + {"exact", "exact", true}, + {"exact", "exactly", false}, + } + for _, tc := range cases { + t.Run(tc.pattern+"_vs_"+tc.key, func(t *testing.T) { + got := keyMatches(tc.pattern, tc.key) + if got != tc.match { + t.Fatalf("keyMatches(%q, %q) = %v, want %v", tc.pattern, tc.key, got, tc.match) + } + }) + } +} + +// TestEvaluate_SeverityDispatch: only critical breaches flip +// CriticalBreach; warning-severity breaches stay advisory. 
+func TestEvaluate_SeverityDispatch(t *testing.T) { + rules := []Threshold{ + {Stage: "*", Kind: "temp", Key: "cpu", Op: OpLT, Value: 92, Severity: SeverityCritical}, + {Stage: "*", Kind: "fio", Key: "p99", Op: OpLT, Value: 50000, Severity: SeverityWarning}, + } + res := Evaluate(Sample{Stage: "CPU", Kind: "temp", Key: "cpu", Value: 95}, rules) + if len(res) != 1 || !res[0].CriticalBreach() { + t.Fatalf("critical breach not detected: %+v", res) + } + res = Evaluate(Sample{Stage: "Storage", Kind: "fio", Key: "p99", Value: 80000}, rules) + if len(res) != 1 { + t.Fatalf("expected 1 match, got %d", len(res)) + } + if res[0].CriticalBreach() { + t.Fatalf("warning-severity breach should not be critical") + } + if !res[0].Breached() { + t.Fatalf("warning-severity rule should still show breach=true") + } +} + +// TestEvaluate_NoMatchingThreshold: a sample that doesn't hit any rule +// produces an empty result slice — callers treat that as "advisory". +func TestEvaluate_NoMatchingThreshold(t *testing.T) { + rules := []Threshold{ + {Stage: "*", Kind: "temp", Key: "cpu/*", Op: OpLT, Value: 92, Severity: SeverityCritical}, + } + res := Evaluate(Sample{Stage: "Network", Kind: "iperf", Key: "throughput", Value: 950}, rules) + if len(res) != 0 { + t.Fatalf("unmatched sample should yield 0 results, got %d", len(res)) + } +} diff --git a/internal/report/report.go b/internal/report/report.go index 2370ec2..37f709e 100644 --- a/internal/report/report.go +++ b/internal/report/report.go @@ -28,7 +28,17 @@ type Data struct { Host model.Host Stages []model.Stage SpecDiffs []model.SpecDiff - Aggregates []Aggregate // flattened measurement summary; see Aggregate + Aggregates []Aggregate // flattened measurement summary; see Aggregate + Firmware []FirmwareSnapshot // captured firmware versions, empty if none +} + +// FirmwareSnapshot is the report-facing view of one firmware row. +// Package-local so the HTML template stays decoupled from store types. 
+type FirmwareSnapshot struct { + Component string + Identifier string + Version string + Vendor string } // Aggregate is a per (kind, key) summary of a run's measurements. Min/ @@ -196,6 +206,27 @@ const htmlTemplate = ` +
+

Firmware ({{len .Firmware}})

+{{if .Firmware}} + + + + {{range .Firmware}} + + + + + + + {{end}} + +
ComponentIdentifierVersionVendor
{{.Component}}{{.Identifier}}{{.Version}}{{.Vendor}}
+{{else}} +

No firmware snapshots captured.

+{{end}} +
+

Spec diffs ({{len .SpecDiffs}})

{{if .SpecDiffs}} diff --git a/internal/spec/spec.go b/internal/spec/spec.go index c433665..108de80 100644 --- a/internal/spec/spec.go +++ b/internal/spec/spec.go @@ -21,11 +21,36 @@ import ( ) type Spec struct { - CPU *CPUSpec `yaml:"cpu,omitempty"` - Memory *MemorySpec `yaml:"memory,omitempty"` - Disks []DiskSpec `yaml:"disks,omitempty"` - NICs []NICSpec `yaml:"nics,omitempty"` - GPUs []GPUSpec `yaml:"gpus,omitempty"` + CPU *CPUSpec `yaml:"cpu,omitempty"` + Memory *MemorySpec `yaml:"memory,omitempty"` + Disks []DiskSpec `yaml:"disks,omitempty"` + NICs []NICSpec `yaml:"nics,omitempty"` + GPUs []GPUSpec `yaml:"gpus,omitempty"` + Firmware []FirmwareSpec `yaml:"firmware,omitempty"` +} + +// FirmwareSpec is one row in the expected-spec YAML's `firmware:` block. +// Component is one of bios|bmc|nic|hba|microcode|nvme_fw (matches the +// on-wire value from agent/probes.FirmwareSnapshot.Component). Identifier +// is optional — when empty the rule applies to every observed snapshot +// of that component (use for single-instance things like BIOS/microcode); +// when set it pins the check to a specific NIC port / NVMe controller / +// PCI address. Version is the expected string; comparison is +// case-insensitive after trimming whitespace. +type FirmwareSpec struct { + Component string `yaml:"component"` + Identifier string `yaml:"identifier,omitempty"` + Version string `yaml:"version"` +} + +// FirmwareObserved is what the agent reported, in a spec-package-local +// shape so callers don't need to thread store types through the diff. +// The server converts store.FirmwareSnapshot → FirmwareObserved before +// calling DiffFirmware. +type FirmwareObserved struct { + Component string + Identifier string + Version string } type CPUSpec struct { @@ -175,6 +200,73 @@ func diffNICs(expected, actual []NICSpec) []model.SpecDiff { return out } +// DiffFirmware returns a SpecDiff per firmware expectation that doesn't +// find a matching observed snapshot. 
Matching rules: +// - An expected rule with Identifier set matches by (component, id); +// a missing observed snapshot yields a "present=false" diff. +// - An expected rule with Identifier empty applies to every observed +// snapshot of that component — useful for "all NICs must run fw +// 8.30" without listing each port. Zero observed snapshots of the +// component yields a single "present=false" diff, not N. +// - Version mismatch emits an expected→actual diff. Versions are +// compared case-insensitively after trimming whitespace (strings.EqualFold). +func DiffFirmware(expected []FirmwareSpec, actual []FirmwareObserved) []model.SpecDiff { + if len(expected) == 0 { + return nil + } + byCompIdent := map[string]FirmwareObserved{} + byComp := map[string][]FirmwareObserved{} + for _, o := range actual { + byCompIdent[fwKey(o.Component, o.Identifier)] = o + byComp[o.Component] = append(byComp[o.Component], o) + } + var out []model.SpecDiff + for _, exp := range expected { + comp := strings.TrimSpace(exp.Component) + if comp == "" || strings.TrimSpace(exp.Version) == "" { + continue + } + label := "firmware[" + comp + if exp.Identifier != "" { + label += "/" + exp.Identifier + } + label += "]" + if exp.Identifier != "" { + got, ok := byCompIdent[fwKey(comp, exp.Identifier)] + if !ok { + out = append(out, diff(label+".present", "true", "false")) + continue + } + if !strings.EqualFold(strings.TrimSpace(got.Version), strings.TrimSpace(exp.Version)) { + out = append(out, diff(label+".version", exp.Version, got.Version)) + } + continue + } + // No identifier: fan out across every observed snapshot of this + // component. Missing is one diff; a mismatching port/controller + // emits one diff per mismatch. 
+ observed := byComp[comp] + if len(observed) == 0 { + out = append(out, diff(label+".present", "true", "false")) + continue + } + for _, got := range observed { + if !strings.EqualFold(strings.TrimSpace(got.Version), strings.TrimSpace(exp.Version)) { + slot := got.Identifier + if slot == "" { + slot = "*" + } + out = append(out, diff("firmware["+comp+"/"+slot+"].version", exp.Version, got.Version)) + } + } + } + return out +} + +func fwKey(component, identifier string) string { + return strings.ToLower(component) + "|" + strings.ToLower(identifier) +} + func diffGPUs(expected, actual []GPUSpec) []model.SpecDiff { if len(expected) == 0 { return nil diff --git a/internal/spec/spec_test.go b/internal/spec/spec_test.go index 761c83a..c97fb47 100644 --- a/internal/spec/spec_test.go +++ b/internal/spec/spec_test.go @@ -119,3 +119,96 @@ func TestDiffSeverityAlwaysCritical(t *testing.T) { } } } + +func TestDiffFirmwareIdentifierMatch(t *testing.T) { + exp := []FirmwareSpec{{Component: "bios", Version: "3.2"}} + obs := []FirmwareObserved{{Component: "bios", Identifier: "system", Version: "3.2"}} + if d := DiffFirmware(exp, obs); len(d) != 0 { + t.Fatalf("matching bios version should produce no diff, got %+v", d) + } +} + +func TestDiffFirmwareVersionMismatch(t *testing.T) { + exp := []FirmwareSpec{{Component: "bios", Version: "3.3"}} + obs := []FirmwareObserved{{Component: "bios", Identifier: "system", Version: "3.2"}} + d := DiffFirmware(exp, obs) + if len(d) != 1 { + t.Fatalf("want 1 diff, got %d: %+v", len(d), d) + } + if d[0].Expected != "3.3" || d[0].Actual != "3.2" { + t.Fatalf("diff expected/actual = %q/%q, want 3.3/3.2", d[0].Expected, d[0].Actual) + } + if d[0].Severity != "critical" { + t.Errorf("severity = %q, want critical", d[0].Severity) + } +} + +func TestDiffFirmwareMissingComponentPresent(t *testing.T) { + // Expected rule with no identifier + zero observed snapshots → + // single "present=false" diff, not N. 
+ exp := []FirmwareSpec{{Component: "bmc", Version: "1.74"}} + d := DiffFirmware(exp, nil) + if len(d) != 1 { + t.Fatalf("want 1 diff for missing BMC, got %d: %+v", len(d), d) + } + if d[0].Field != "firmware[bmc].present" || d[0].Expected != "true" || d[0].Actual != "false" { + t.Fatalf("missing-BMC diff = %+v", d[0]) + } +} + +func TestDiffFirmwareWildcardFanOut(t *testing.T) { + // Expected rule with empty identifier fans across every observed + // snapshot of the component — one port matches, one doesn't → one diff. + exp := []FirmwareSpec{{Component: "nic", Version: "16.32.1010"}} + obs := []FirmwareObserved{ + {Component: "nic", Identifier: "eth0", Version: "16.32.1010"}, + {Component: "nic", Identifier: "eth1", Version: "14.28.0000"}, + } + d := DiffFirmware(exp, obs) + if len(d) != 1 { + t.Fatalf("want 1 diff (mismatched eth1 only), got %d: %+v", len(d), d) + } + if d[0].Field != "firmware[nic/eth1].version" { + t.Errorf("field = %q, want firmware[nic/eth1].version", d[0].Field) + } +} + +func TestDiffFirmwareIdentifierPin(t *testing.T) { + // Identifier set: pins the rule to a specific port. Other ports + // with mismatched firmware are not evaluated by this rule. + exp := []FirmwareSpec{{Component: "nic", Identifier: "eth0", Version: "1.0"}} + obs := []FirmwareObserved{ + {Component: "nic", Identifier: "eth0", Version: "1.0"}, + {Component: "nic", Identifier: "eth1", Version: "9.9"}, + } + if d := DiffFirmware(exp, obs); len(d) != 0 { + t.Fatalf("pinned rule should ignore other ports, got %+v", d) + } +} + +func TestDiffFirmwareIdentifierPinMissing(t *testing.T) { + // Pinned rule with no matching observed snapshot → present=false diff. 
+ exp := []FirmwareSpec{{Component: "nic", Identifier: "eth0", Version: "1.0"}} + if d := DiffFirmware(exp, nil); len(d) != 1 || d[0].Field != "firmware[nic/eth0].present" { + t.Fatalf("want present=false for pinned rule, got %+v", d) + } +} + +func TestDiffFirmwareEmptyRuleSkipped(t *testing.T) { + // Empty component or empty version silently skip rather than panic. + exp := []FirmwareSpec{{Component: "", Version: "x"}, {Component: "bios", Version: ""}} + obs := []FirmwareObserved{{Component: "bios", Identifier: "system", Version: "3.2"}} + if d := DiffFirmware(exp, obs); len(d) != 0 { + t.Fatalf("empty rules should skip, got %+v", d) + } +} + +func TestDiffFirmwareCaseInsensitive(t *testing.T) { + // Version match is case-insensitive after trim; avoids spurious diff + // from ethtool's "FW1234" vs expected YAML's "fw1234". + exp := []FirmwareSpec{{Component: "nvme_fw", Identifier: "nvme0", Version: "fw1234"}} + obs := []FirmwareObserved{{Component: "nvme_fw", Identifier: "nvme0", Version: "FW1234"}} + if d := DiffFirmware(exp, obs); len(d) != 0 { + t.Fatalf("case-insensitive match expected, got %+v", d) + } +} diff --git a/internal/store/firmware.go b/internal/store/firmware.go new file mode 100644 index 0000000..bd431f6 --- /dev/null +++ b/internal/store/firmware.go @@ -0,0 +1,97 @@ +package store + +import ( + "context" + "database/sql" + "fmt" +) + +// FirmwareSnapshot is one row in firmware_snapshots. A run captures +// many (one per BIOS/BMC/NIC/HBA/microcode/NVMe) so SpecValidate can +// diff them against the host's expected spec in Phase 4. +type FirmwareSnapshot struct { + ID int64 + RunID int64 + Component string // bios|bmc|nic|hba|microcode|nvme_fw + Identifier string // slot/serial/device path + Version string + Vendor string + RawJSON string +} + +// Firmware is the CRUD seam. The agent's Phase-4 probe POSTs captured +// rows; the orchestrator stores them. SpecValidate reads them back. 
+type Firmware struct { + DB *sql.DB +} + +// Create inserts a single firmware snapshot. One call per (run, component, +// identifier) — the agent probe owns dedup/formatting. +func (f *Firmware) Create(ctx context.Context, s FirmwareSnapshot) (int64, error) { + raw := s.RawJSON + if raw == "" { + raw = "{}" + } + res, err := f.DB.ExecContext(ctx, ` + INSERT INTO firmware_snapshots(run_id, component, identifier, version, vendor, raw_json) + VALUES(?,?,?,?,?,?) + `, s.RunID, s.Component, s.Identifier, s.Version, s.Vendor, raw) + if err != nil { + return 0, fmt.Errorf("insert firmware: %w", err) + } + return res.LastInsertId() +} + +// CreateBatch persists a slice of snapshots under one transaction. +// Agent probe enumerates all components in one pass, so batching wins. +func (f *Firmware) CreateBatch(ctx context.Context, rows []FirmwareSnapshot) error { + if len(rows) == 0 { + return nil + } + tx, err := f.DB.BeginTx(ctx, nil) + if err != nil { + return err + } + defer func() { _ = tx.Rollback() }() + stmt, err := tx.PrepareContext(ctx, ` + INSERT INTO firmware_snapshots(run_id, component, identifier, version, vendor, raw_json) + VALUES(?,?,?,?,?,?) + `) + if err != nil { + return fmt.Errorf("prepare firmware insert: %w", err) + } + defer func() { _ = stmt.Close() }() + for _, s := range rows { + raw := s.RawJSON + if raw == "" { + raw = "{}" + } + if _, err := stmt.ExecContext(ctx, s.RunID, s.Component, s.Identifier, s.Version, s.Vendor, raw); err != nil { + return fmt.Errorf("insert firmware %s/%s: %w", s.Component, s.Identifier, err) + } + } + return tx.Commit() +} + +// ListForRun returns every firmware snapshot for a run in stable order. +// Report page + SpecValidate both read this. +func (f *Firmware) ListForRun(ctx context.Context, runID int64) ([]FirmwareSnapshot, error) { + rows, err := f.DB.QueryContext(ctx, ` + SELECT id, run_id, component, identifier, version, vendor, raw_json + FROM firmware_snapshots WHERE run_id = ? 
ORDER BY id + `, runID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []FirmwareSnapshot + for rows.Next() { + var s FirmwareSnapshot + if err := rows.Scan(&s.ID, &s.RunID, &s.Component, &s.Identifier, + &s.Version, &s.Vendor, &s.RawJSON); err != nil { + return nil, err + } + out = append(out, s) + } + return out, rows.Err() +} diff --git a/internal/store/runs.go b/internal/store/runs.go index 59ab104..1d142e7 100644 --- a/internal/store/runs.go +++ b/internal/store/runs.go @@ -14,16 +14,30 @@ type Runs struct { DB *sql.DB } +// Create inserts a new run using the default "quick" profile. Older +// call sites (and most tests) target this form — the profile column's +// DEFAULT 'quick' on runs takes care of the backfill. func (r *Runs) Create(ctx context.Context, hostID int64, tokenHash string, nonDestructive bool) (int64, error) { + return r.CreateWithProfile(ctx, hostID, tokenHash, nonDestructive, "quick") +} + +// CreateWithProfile inserts a new run with an explicit profile +// ("quick"|"deep"|"soak"). The UI handler is the authoritative caller; +// empty profile falls back to "quick" so a misconfigured form doesn't +// leave a row with a blank profile column. +func (r *Runs) CreateWithProfile(ctx context.Context, hostID int64, tokenHash string, nonDestructive bool, profile string) (int64, error) { + if profile == "" { + profile = "quick" + } now := time.Now().UTC() nd := 0 if nonDestructive { nd = 1 } res, err := r.DB.ExecContext(ctx, ` - INSERT INTO runs(host_id, state, agent_token_hash, next_boot_target, started_at, non_destructive) - VALUES(?,?,?,?,?,?) - `, hostID, string(model.StateQueued), tokenHash, "linux", now, nd) + INSERT INTO runs(host_id, state, agent_token_hash, next_boot_target, started_at, non_destructive, profile) + VALUES(?,?,?,?,?,?,?) 
+ `, hostID, string(model.StateQueued), tokenHash, "linux", now, nd, profile) if err != nil { return 0, fmt.Errorf("insert run: %w", err) } @@ -107,14 +121,15 @@ func (r *Runs) Get(ctx context.Context, id int64) (*model.Run, error) { SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''), COALESCE(next_boot_target,''), agent_token_hash, started_at, completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''), - COALESCE(override_flags_json,''), COALESCE(non_destructive,0) + COALESCE(override_flags_json,''), COALESCE(non_destructive,0), + COALESCE(profile,'quick') FROM runs WHERE id = ? `, id) var run model.Run var completedAt sql.NullTime err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, - &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive) + &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile) if errors.Is(err, sql.ErrNoRows) { return nil, ErrNotFound } @@ -133,7 +148,8 @@ func (r *Runs) LatestForHost(ctx context.Context, hostID int64) (*model.Run, err SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''), COALESCE(next_boot_target,''), agent_token_hash, started_at, completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''), - COALESCE(override_flags_json,''), COALESCE(non_destructive,0) + COALESCE(override_flags_json,''), COALESCE(non_destructive,0), + COALESCE(profile,'quick') FROM runs WHERE host_id = ? 
ORDER BY id DESC LIMIT 1 `, hostID) @@ -141,7 +157,7 @@ func (r *Runs) LatestForHost(ctx context.Context, hostID int64) (*model.Run, err var completedAt sql.NullTime err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, - &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive) + &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile) if errors.Is(err, sql.ErrNoRows) { return nil, nil } @@ -165,7 +181,8 @@ func (r *Runs) ListForHost(ctx context.Context, hostID int64, limit int) ([]mode SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''), COALESCE(next_boot_target,''), agent_token_hash, started_at, completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''), - COALESCE(override_flags_json,''), COALESCE(non_destructive,0) + COALESCE(override_flags_json,''), COALESCE(non_destructive,0), + COALESCE(profile,'quick') FROM runs WHERE host_id = ? 
ORDER BY id DESC @@ -181,7 +198,7 @@ func (r *Runs) ListForHost(ctx context.Context, hostID int64, limit int) ([]mode var completedAt sql.NullTime if err := rows.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, - &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive); err != nil { + &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile); err != nil { return nil, err } if completedAt.Valid { @@ -206,7 +223,8 @@ func (r *Runs) Active(ctx context.Context) ([]model.Run, error) { SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''), COALESCE(next_boot_target,''), agent_token_hash, started_at, completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''), - COALESCE(override_flags_json,''), COALESCE(non_destructive,0) + COALESCE(override_flags_json,''), COALESCE(non_destructive,0), + COALESCE(profile,'quick') FROM runs WHERE state NOT IN ('Completed','Released','Cancelled') ORDER BY id @@ -221,7 +239,7 @@ func (r *Runs) Active(ctx context.Context) ([]model.Run, error) { var completedAt sql.NullTime if err := rows.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, - &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive); err != nil { + &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile); err != nil { return nil, err } if completedAt.Valid { @@ -275,7 +293,7 @@ func (r *Runs) FindActiveByMAC(ctx context.Context, mac string) (*model.Run, err var completedAt sql.NullTime err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage, &run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt, - &completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive) + &completedAt, &run.ReportPath, 
&run.HoldIP, &run.OverrideFlagsJSON, &run.NonDestructive, &run.Profile) if errors.Is(err, sql.ErrNoRows) { return nil, nil } diff --git a/internal/store/stages.go b/internal/store/stages.go index 63189e6..537b620 100644 --- a/internal/store/stages.go +++ b/internal/store/stages.go @@ -17,11 +17,13 @@ type Stages struct { // reaches Inventory; later phases add more executors but the list is fixed. var DefaultStageOrder = []string{ "Inventory", + "Firmware", "SpecValidate", "SMART", "CPUStress", "Storage", "Network", + "Burn", "GPU", "PSU", "Reporting", diff --git a/internal/store/thresholds.go b/internal/store/thresholds.go new file mode 100644 index 0000000..22c75d9 --- /dev/null +++ b/internal/store/thresholds.go @@ -0,0 +1,280 @@ +package store + +import ( + "context" + "database/sql" + "fmt" + "time" +) + +// Threshold is the DB view of a per-run threshold row. Mirrors the +// orchestrator.Threshold value-object but keeps Severity/Op as strings +// so callers higher up don't force this package to import orchestrator. +type Threshold struct { + ID int64 + RunID int64 + Stage string + Kind string + Key string + Op string + Threshold float64 + Nominal float64 + Unit string + Severity string + Source string // profile|host_override +} + +// ThresholdEvaluation is one recorded comparison — the evaluator calls +// this for every sample that matched a threshold, whether it passed +// or breached. The report page aggregates these to show the operator +// why a run failed (or was flagged as warning-only). +type ThresholdEvaluation struct { + ID int64 + RunID int64 + ThresholdID int64 + Stage string + Kind string + Key string + TS time.Time + Observed float64 + Passed bool +} + +// Thresholds is the CRUD seam. Kept intentionally narrow: seed at run +// creation, list for evaluation on each sensor batch, record eval +// results, aggregate for the report. 
+type Thresholds struct { + DB *sql.DB +} + +// ThresholdSpec is the caller-supplied shape for seeding — a flat +// value-object that carries the threshold rule plus its source so +// the ProfileRegistry-driven seed and per-host overrides converge +// on one insert path. Kept here (not in config) so the store layer +// doesn't have to import config. +type ThresholdSpec struct { + Stage string + Kind string + Key string + Op string + Value float64 + Nominal float64 + Unit string + Severity string + Source string +} + +// SeedForRun converts the caller's specs into Threshold rows for the +// given run and bulk-inserts them. Returns the inserted rows with IDs +// populated so the evaluator can pin evaluations without a re-read. +func (t *Thresholds) SeedForRun(ctx context.Context, runID int64, specs []ThresholdSpec) ([]Threshold, error) { + rows := make([]Threshold, 0, len(specs)) + for _, s := range specs { + rows = append(rows, Threshold{ + RunID: runID, + Stage: s.Stage, + Kind: s.Kind, + Key: s.Key, + Op: s.Op, + Threshold: s.Value, + Nominal: s.Nominal, + Unit: s.Unit, + Severity: s.Severity, + Source: s.Source, + }) + } + return t.CreateBatch(ctx, rows) +} + +// Create inserts a single threshold row — used by the seed path when +// the orchestrator materializes per-run rules from the ProfileRegistry. +// Returns the row's ID so the evaluator can pin evaluations to it. +func (t *Thresholds) Create(ctx context.Context, th Threshold) (int64, error) { + res, err := t.DB.ExecContext(ctx, ` + INSERT INTO thresholds(run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source) + VALUES(?,?,?,?,?,?,?,?,?,?) + `, th.RunID, th.Stage, th.Kind, th.Key, th.Op, th.Threshold, th.Nominal, th.Unit, th.Severity, th.Source) + if err != nil { + return 0, fmt.Errorf("insert threshold: %w", err) + } + return res.LastInsertId() +} + +// CreateBatch is the fast path for run seeding — one transaction per +// run, one row per threshold. 
Returns the inserted rows with IDs set +// so the caller can drop them into the in-memory evaluator without a +// follow-up read. +func (t *Thresholds) CreateBatch(ctx context.Context, rows []Threshold) ([]Threshold, error) { + if len(rows) == 0 { + return nil, nil + } + tx, err := t.DB.BeginTx(ctx, nil) + if err != nil { + return nil, err + } + defer func() { _ = tx.Rollback() }() + stmt, err := tx.PrepareContext(ctx, ` + INSERT INTO thresholds(run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source) + VALUES(?,?,?,?,?,?,?,?,?,?) + `) + if err != nil { + return nil, fmt.Errorf("prepare threshold insert: %w", err) + } + defer func() { _ = stmt.Close() }() + out := make([]Threshold, 0, len(rows)) + for _, th := range rows { + res, err := stmt.ExecContext(ctx, th.RunID, th.Stage, th.Kind, th.Key, th.Op, + th.Threshold, th.Nominal, th.Unit, th.Severity, th.Source) + if err != nil { + return nil, fmt.Errorf("insert threshold %s/%s: %w", th.Stage, th.Key, err) + } + id, err := res.LastInsertId() + if err != nil { + return nil, err + } + th.ID = id + out = append(out, th) + } + if err := tx.Commit(); err != nil { + return nil, err + } + return out, nil +} + +// ListForRun returns every threshold seeded for a run, in stable ID +// order. Evaluator expects this to be cheap (few tens of rows per run) +// and pulls it on each /sensor batch. +func (t *Thresholds) ListForRun(ctx context.Context, runID int64) ([]Threshold, error) { + rows, err := t.DB.QueryContext(ctx, ` + SELECT id, run_id, stage_name, kind, key, op, threshold, nominal, unit, severity, source + FROM thresholds WHERE run_id = ? 
ORDER BY id + `, runID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []Threshold + for rows.Next() { + var th Threshold + if err := rows.Scan(&th.ID, &th.RunID, &th.Stage, &th.Kind, &th.Key, + &th.Op, &th.Threshold, &th.Nominal, &th.Unit, &th.Severity, &th.Source); err != nil { + return nil, err + } + out = append(out, th) + } + return out, rows.Err() +} + +// RecordEvaluation persists a single evaluation outcome. Called per +// matching sample so the run's report has a full audit trail ("temp +// hit 95 at 14:22:03" rather than just "temp failed"). +func (t *Thresholds) RecordEvaluation(ctx context.Context, ev ThresholdEvaluation) error { + passed := 0 + if ev.Passed { + passed = 1 + } + if ev.TS.IsZero() { + ev.TS = time.Now().UTC() + } + _, err := t.DB.ExecContext(ctx, ` + INSERT INTO threshold_evaluations(run_id, threshold_id, stage_name, kind, key, ts, observed, passed) + VALUES(?,?,?,?,?,?,?,?) + `, ev.RunID, ev.ThresholdID, ev.Stage, ev.Kind, ev.Key, ev.TS, ev.Observed, passed) + if err != nil { + return fmt.Errorf("record evaluation: %w", err) + } + return nil +} + +// RecordBatch persists a slice of evaluations in one transaction. The +// agent-handler hot path builds these one per sample and batches them +// under the same Sensor POST so we take one round-trip rather than N. +func (t *Thresholds) RecordBatch(ctx context.Context, evals []ThresholdEvaluation) error { + if len(evals) == 0 { + return nil + } + tx, err := t.DB.BeginTx(ctx, nil) + if err != nil { + return err + } + defer func() { _ = tx.Rollback() }() + stmt, err := tx.PrepareContext(ctx, ` + INSERT INTO threshold_evaluations(run_id, threshold_id, stage_name, kind, key, ts, observed, passed) + VALUES(?,?,?,?,?,?,?,?) 
+ `) + if err != nil { + return fmt.Errorf("prepare eval insert: %w", err) + } + defer func() { _ = stmt.Close() }() + for _, ev := range evals { + passed := 0 + if ev.Passed { + passed = 1 + } + if ev.TS.IsZero() { + ev.TS = time.Now().UTC() + } + if _, err := stmt.ExecContext(ctx, ev.RunID, ev.ThresholdID, ev.Stage, ev.Kind, ev.Key, ev.TS, ev.Observed, passed); err != nil { + return fmt.Errorf("insert eval: %w", err) + } + } + return tx.Commit() +} + +// ListEvaluations returns the evaluation history for a run, newest +// last. Bounded at a sane cap so a pathological run with a sample-per- +// second sidecar doesn't blow up the report page. +func (t *Thresholds) ListEvaluations(ctx context.Context, runID int64) ([]ThresholdEvaluation, error) { + rows, err := t.DB.QueryContext(ctx, ` + SELECT id, run_id, threshold_id, stage_name, kind, key, ts, observed, passed + FROM threshold_evaluations WHERE run_id = ? + ORDER BY id LIMIT 5000 + `, runID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []ThresholdEvaluation + for rows.Next() { + var ev ThresholdEvaluation + var passed int + if err := rows.Scan(&ev.ID, &ev.RunID, &ev.ThresholdID, &ev.Stage, &ev.Kind, + &ev.Key, &ev.TS, &ev.Observed, &passed); err != nil { + return nil, err + } + ev.Passed = passed == 1 + out = append(out, ev) + } + return out, rows.Err() +} + +// CriticalBreaches returns the evaluations that fire the "fail the +// run" gate — critical-severity thresholds with passed=0. The +// agent-handler calls this at /result close so an aggregate breach +// (p99 latency > bound) still flips the run to FailedHolding even if +// no single sample tripped the fast-fail path. 
+func (t *Thresholds) CriticalBreaches(ctx context.Context, runID int64) ([]ThresholdEvaluation, error) { + rows, err := t.DB.QueryContext(ctx, ` + SELECT e.id, e.run_id, e.threshold_id, e.stage_name, e.kind, e.key, e.ts, e.observed, e.passed + FROM threshold_evaluations e + JOIN thresholds t ON t.id = e.threshold_id + WHERE e.run_id = ? AND e.passed = 0 AND t.severity = 'critical' + ORDER BY e.id + `, runID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []ThresholdEvaluation + for rows.Next() { + var ev ThresholdEvaluation + var passed int + if err := rows.Scan(&ev.ID, &ev.RunID, &ev.ThresholdID, &ev.Stage, &ev.Kind, + &ev.Key, &ev.TS, &ev.Observed, &passed); err != nil { + return nil, err + } + ev.Passed = passed == 1 + out = append(out, ev) + } + return out, rows.Err() +} diff --git a/internal/web/static/app.css b/internal/web/static/app.css index 5d94577..7a19397 100644 --- a/internal/web/static/app.css +++ b/internal/web/static/app.css @@ -636,6 +636,21 @@ body.bare main { max-width: none; } .run-failed-stage { color: var(--danger); } .run-failed-stage strong { font-family: var(--mono); } .run-diffs { color: var(--danger); } +.run-profile-chip { + display: inline-block; + font-family: var(--mono); + font-size: 11px; + text-transform: uppercase; + letter-spacing: .04em; + padding: 2px 8px; + border-radius: 999px; + border: 1px solid rgba(255,255,255,.15); + background: rgba(255,255,255,.05); + color: var(--text-dim); +} +.run-profile-quick { color: var(--accent); border-color: rgba(60,130,246,.45); background: rgba(60,130,246,.08); } +.run-profile-deep { color: #e5b94f; border-color: rgba(229,185,79,.45); background: rgba(229,185,79,.08); } +.run-profile-soak { color: #d97a57; border-color: rgba(217,122,87,.45); background: rgba(217,122,87,.08); } .hold-banner { background: rgba(229,100,102,.1); @@ -890,6 +905,17 @@ body.bare main { max-width: none; } .host-actions { padding: 0; } .host-actions-row { display: flex; gap: 10px; flex-wrap: 
wrap; align-items: center; } .host-nd-toggle { display: inline-flex; gap: 6px; align-items: center; color: var(--text-dim); font-size: 13px; } +.host-profile-picker { + border: 1px solid var(--border); + border-radius: var(--radius); + padding: 6px 10px; + display: inline-flex; + gap: 12px; + align-items: center; + margin: 0 8px 0 0; +} +.host-profile-picker legend { font-size: 11px; color: var(--text-dim); text-transform: uppercase; letter-spacing: .05em; padding: 0 4px; } +.host-profile-picker label { display: inline-flex; gap: 4px; align-items: center; font-family: var(--mono); font-size: 13px; cursor: pointer; } .in-flight-banner-wrap { display: contents; } .in-flight-banner { diff --git a/internal/web/templates/active_step_templ.go b/internal/web/templates/active_step_templ.go index 4c7c13b..1e0dea7 100644 --- a/internal/web/templates/active_step_templ.go +++ b/internal/web/templates/active_step_templ.go @@ -65,7 +65,7 @@ func ActiveStep(d ActiveStepData) templ.Component { var templ_7745c5c3_Var3 string templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String()) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 1, Col: 0} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 1, Col: 0} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3)) if templ_7745c5c3_Err != nil { @@ -88,7 +88,7 @@ func ActiveStep(d ActiveStepData) templ.Component { var templ_7745c5c3_Var4 string templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(d.Stage.Name) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 28, Col: 102} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 28, Col: 102} } _, templ_7745c5c3_Err = 
templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4)) if templ_7745c5c3_Err != nil { @@ -110,7 +110,7 @@ func ActiveStep(d ActiveStepData) templ.Component { var templ_7745c5c3_Var6 string templ_7745c5c3_Var6, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var5).String()) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 1, Col: 0} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 1, Col: 0} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var6)) if templ_7745c5c3_Err != nil { @@ -123,7 +123,7 @@ func ActiveStep(d ActiveStepData) templ.Component { var templ_7745c5c3_Var7 string templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(stageMarker(string(d.Stage.State))) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 30, Col: 105} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 30, Col: 105} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7)) if templ_7745c5c3_Err != nil { @@ -136,7 +136,7 @@ func ActiveStep(d ActiveStepData) templ.Component { var templ_7745c5c3_Var8 string templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(d.Stage.Name) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 31, Col: 41} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 31, Col: 41} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8)) if templ_7745c5c3_Err != nil { @@ -149,7 +149,7 @@ func ActiveStep(d ActiveStepData) templ.Component { var templ_7745c5c3_Var9 string templ_7745c5c3_Var9, templ_7745c5c3_Err = 
templ.JoinStringErrs(stageDurationFromStage(d.Stage)) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 32, Col: 64} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 32, Col: 64} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9)) if templ_7745c5c3_Err != nil { @@ -182,7 +182,7 @@ func ActiveStep(d ActiveStepData) templ.Component { var templ_7745c5c3_Var10 string templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(d.Stage.Name) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 43, Col: 99} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 43, Col: 99} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10)) if templ_7745c5c3_Err != nil { @@ -195,7 +195,7 @@ func ActiveStep(d ActiveStepData) templ.Component { var templ_7745c5c3_Var11 string templ_7745c5c3_Var11, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("log-%d-%s", d.RunID, d.Stage.Name)) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 47, Col: 56} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 47, Col: 56} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var11)) if templ_7745c5c3_Err != nil { @@ -208,7 +208,7 @@ func ActiveStep(d ActiveStepData) templ.Component { var templ_7745c5c3_Var12 string templ_7745c5c3_Var12, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("log-%d-%s", d.RunID, d.Stage.Name)) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/active_step.templ`, Line: 48, Col: 62} + return templ.Error{Err: 
templ_7745c5c3_Err, FileName: `active_step.templ`, Line: 48, Col: 62} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var12)) if templ_7745c5c3_Err != nil { diff --git a/internal/web/templates/host_page.templ b/internal/web/templates/host_page.templ index 5e29e50..d0f711b 100644 --- a/internal/web/templates/host_page.templ +++ b/internal/web/templates/host_page.templ @@ -102,6 +102,21 @@ templ HostActions(d HostPageData) {
if hostCanStart(d) {
+
+ Profile + + + +
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + } else { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 38, "
") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -553,138 +588,138 @@ func RunSpecDiffs(d RunPageData) templ.Component { }() } ctx = templ.InitializeContext(ctx) - templ_7745c5c3_Var28 := templ.GetChildren(ctx) - if templ_7745c5c3_Var28 == nil { - templ_7745c5c3_Var28 = templ.NopComponent + templ_7745c5c3_Var31 := templ.GetChildren(ctx) + if templ_7745c5c3_Var31 == nil { + templ_7745c5c3_Var31 = templ.NopComponent } ctx = templ.ClearChildren(ctx) - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 38, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 43, "\" hx-swap=\"outerHTML\">") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } if len(d.SpecDiffs) > 0 { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 41, "

Spec diffs (") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 46, ">

Spec diffs (") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - var templ_7745c5c3_Var31 string - templ_7745c5c3_Var31, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", len(d.SpecDiffs))) + var templ_7745c5c3_Var34 string + templ_7745c5c3_Var34, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("%d", len(d.SpecDiffs))) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 154, Col: 66} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 155, Col: 66} } - _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var31)) + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var34)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 44, ")

    ") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, ")

    ") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } for _, diff := range d.SpecDiffs { - var templ_7745c5c3_Var32 = []any{"diff-row", "diff-" + diff.Severity} - templ_7745c5c3_Err = templ.RenderCSSItems(ctx, templ_7745c5c3_Buffer, templ_7745c5c3_Var32...) + var templ_7745c5c3_Var35 = []any{"diff-row", "diff-" + diff.Severity} + templ_7745c5c3_Err = templ.RenderCSSItems(ctx, templ_7745c5c3_Buffer, templ_7745c5c3_Var35...) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 45, "
  • ") - if templ_7745c5c3_Err != nil { - return templ_7745c5c3_Err - } - var templ_7745c5c3_Var34 string - templ_7745c5c3_Var34, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Field) - if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 158, Col: 43} - } - _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var34)) - if templ_7745c5c3_Err != nil { - return templ_7745c5c3_Err - } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, "
    expected: ") - if templ_7745c5c3_Err != nil { - return templ_7745c5c3_Err - } - var templ_7745c5c3_Var35 string - templ_7745c5c3_Var35, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Expected) - if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 159, Col: 65} - } - _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var35)) - if templ_7745c5c3_Err != nil { - return templ_7745c5c3_Err - } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 48, "
    actual: ") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 48, "
  • ") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 49, "\">
    ") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var37 string + templ_7745c5c3_Var37, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Field) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 159, Col: 43} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var37)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 50, "
    expected: ") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var38 string + templ_7745c5c3_Var38, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Expected) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 160, Col: 65} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var38)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 51, "
    actual: ") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + var templ_7745c5c3_Var39 string + templ_7745c5c3_Var39, templ_7745c5c3_Err = templ.JoinStringErrs(diff.Actual) + if templ_7745c5c3_Err != nil { + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/run_detail.templ`, Line: 161, Col: 59} + } + _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var39)) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 52, "
    ") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 50, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 53, "") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 51, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 54, "") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } diff --git a/internal/web/templates/substep_row_templ.go b/internal/web/templates/substep_row_templ.go index 02cae74..d0c9c1a 100644 --- a/internal/web/templates/substep_row_templ.go +++ b/internal/web/templates/substep_row_templ.go @@ -99,7 +99,7 @@ func SubStepRow(ss model.SubStep) templ.Component { var templ_7745c5c3_Var3 string templ_7745c5c3_Var3, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("substep-%d-%s-%d", ss.RunID, ss.StageName, ss.Ordinal)) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 63, Col: 74} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 63, Col: 74} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var3)) if templ_7745c5c3_Err != nil { @@ -112,7 +112,7 @@ func SubStepRow(ss model.SubStep) templ.Component { var templ_7745c5c3_Var4 string templ_7745c5c3_Var4, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var2).String()) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 1, Col: 0} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 1, Col: 0} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var4)) if templ_7745c5c3_Err != nil { @@ -125,7 +125,7 @@ func SubStepRow(ss model.SubStep) templ.Component { var templ_7745c5c3_Var5 string templ_7745c5c3_Var5, templ_7745c5c3_Err = templ.JoinStringErrs(fmt.Sprintf("substep-%d-%s-%d", ss.RunID, ss.StageName, ss.Ordinal)) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 65, Col: 80} + return templ.Error{Err: 
templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 65, Col: 80} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var5)) if templ_7745c5c3_Err != nil { @@ -147,7 +147,7 @@ func SubStepRow(ss model.SubStep) templ.Component { var templ_7745c5c3_Var7 string templ_7745c5c3_Var7, templ_7745c5c3_Err = templ.JoinStringErrs(templ.CSSClasses(templ_7745c5c3_Var6).String()) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 1, Col: 0} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 1, Col: 0} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var7)) if templ_7745c5c3_Err != nil { @@ -160,7 +160,7 @@ func SubStepRow(ss model.SubStep) templ.Component { var templ_7745c5c3_Var8 string templ_7745c5c3_Var8, templ_7745c5c3_Err = templ.JoinStringErrs(subStepMarker(ss.State)) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 68, Col: 96} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 68, Col: 96} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var8)) if templ_7745c5c3_Err != nil { @@ -173,7 +173,7 @@ func SubStepRow(ss model.SubStep) templ.Component { var templ_7745c5c3_Var9 string templ_7745c5c3_Var9, templ_7745c5c3_Err = templ.JoinStringErrs(ss.Name) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 69, Col: 38} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 69, Col: 38} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var9)) if templ_7745c5c3_Err != nil { @@ -186,7 +186,7 @@ func SubStepRow(ss model.SubStep) templ.Component { var 
templ_7745c5c3_Var10 string templ_7745c5c3_Var10, templ_7745c5c3_Err = templ.JoinStringErrs(subStepDuration(ss)) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/web/templates/substep_row.templ`, Line: 70, Col: 54} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `substep_row.templ`, Line: 70, Col: 54} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var10)) if templ_7745c5c3_Err != nil {