deep profile + threshold gating + firmware stage + Burn super-stage
CI / Lint + build + test (push) Failing after 1m57s
Release / release (push) Has been cancelled

Ships all five phases of the deep-profile overhaul together. Runs now
carry a profile (quick/deep/soak); every profile walks the same
11-stage order — Inventory → Firmware → SpecValidate → SMART →
CPUStress → Storage → Network → Burn → GPU → PSU → Reporting —
with only per-stage durations and concurrency scaled.

Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile
column + CreateWithProfile; threshold table + evaluator seeded per-run
from the shared vetting.thresholds block; breach flips result at
/sensor + /result.

Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify +
EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta),
Network (sustained iperf + /proc/net/dev deltas) with per-profile
knobs from Deps.

Phase 3: Burn super-stage with goroutine fan-out for CPU + memory +
fio + iperf, PSU rails sampled across the Burn window, SensorMux
(2 s flush, 500-sample cap) to absorb backpressure.

Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode
(BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl),
lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into
SpecValidate with pin-by-identifier and fan-out-across-component
matching; mismatches park the run in FailedHolding.

Phase 5: profile radio on the host start form, profile chip on the
run header, Firmware section in the HTML report, coverage artifact
uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath
seam + stress_ng and dmidecode example fakes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
+64 -13
View File
@@ -16,6 +16,7 @@ import (
"github.com/go-chi/chi/v5"
"gopkg.in/yaml.v3"
"vetting/internal/config"
"vetting/internal/events"
"vetting/internal/logs"
"vetting/internal/model"
@@ -26,17 +27,19 @@ import (
)
type UI struct {
Hosts *store.Hosts
Runs *store.Runs
Stages *store.Stages
SubSteps *store.SubSteps
SpecDiffs *store.SpecDiffs
Artifacts *store.Artifacts
EventHub *events.Hub
Logs *logs.Hub
Runner *orchestrator.Runner
Tiles *TileEnricher
PublicURL string // user-visible base URL baked into the quick-register one-liner
Hosts *store.Hosts
Runs *store.Runs
Stages *store.Stages
SubSteps *store.SubSteps
SpecDiffs *store.SpecDiffs
Artifacts *store.Artifacts
Thresholds *store.Thresholds // Phase 1: seeded at StartRun from Profiles
Profiles *config.ProfileRegistry
EventHub *events.Hub
Logs *logs.Hub
Runner *orchestrator.Runner
Tiles *TileEnricher
PublicURL string // user-visible base URL baked into the quick-register one-liner
// PXE, when non-nil, gets Reload()ed after host create/delete so
// dnsmasq's dhcp-host= allowlist reflects the current registry.
// Without this, a newly-registered host PXE-boots and gets
@@ -316,23 +319,71 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
}
nonDestructive := r.PostFormValue("non_destructive") == "1"
profile := strings.TrimSpace(r.PostFormValue("profile"))
if profile == "" {
profile = config.ProfileQuick
}
if !config.IsValidProfile(profile) {
http.Error(w, "unknown profile: "+profile, http.StatusBadRequest)
return
}
_, hash, err := orchestrator.IssueRunToken()
if err != nil {
http.Error(w, "token: "+err.Error(), http.StatusInternalServerError)
return
}
runID, err := u.Runs.Create(r.Context(), hostID, hash, nonDestructive)
runID, err := u.Runs.CreateWithProfile(r.Context(), hostID, hash, nonDestructive, profile)
if err != nil {
http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError)
return
}
log.Printf("ui: created run %d for host %d (state=Queued)", runID, hostID)
if err := u.seedThresholds(r.Context(), runID, host, profile); err != nil {
// A threshold-seed failure shouldn't orphan a run row — log
// and continue. Samples will just accumulate without a gate
// until the operator retries, same as before Phase 1.
log.Printf("ui: seed thresholds run %d: %v", runID, err)
}
log.Printf("ui: created run %d for host %d profile=%s (state=Queued)", runID, hostID, profile)
// Send the operator straight to the new run — the button they clicked
// was "Start vetting", the thing they want next is to watch it.
http.Redirect(w, r, fmt.Sprintf("/runs/%d", runID), http.StatusSeeOther)
}
// seedThresholds populates the threshold table for runID from the
// shared vetting.thresholds defaults held by the ProfileRegistry.
// Every seeded row is tagged with Source "profile".
//
// The shared block currently applies to every profile. The host and
// profile parameters are accepted but not consulted yet: they are
// placeholders for the per-host and per-profile override layers that
// are planned to stack on top of the shared defaults.
//
// It returns nil without touching the store when Thresholds or
// Profiles is not wired (tests do not always build one) or when the
// registry carries no defaults. Otherwise the error from SeedForRun is
// returned unchanged.
func (u *UI) seedThresholds(ctx context.Context, runID int64, host *model.Host, profile string) error {
	// Not used yet; kept in the signature so call sites stay stable
	// once the override layers arrive.
	_, _ = host, profile

	if u.Thresholds == nil || u.Profiles == nil {
		return nil
	}

	shared := u.Profiles.Vetting.Thresholds
	count := len(shared)
	if count == 0 {
		return nil
	}

	// Translate each configured default into the store's row shape,
	// copying fields one-for-one.
	rows := make([]store.ThresholdSpec, 0, count)
	for _, def := range shared {
		row := store.ThresholdSpec{
			Stage:    def.Stage,
			Kind:     def.Kind,
			Key:      def.Key,
			Op:       def.Op,
			Value:    def.Value,
			Nominal:  def.Nominal,
			Unit:     def.Unit,
			Severity: def.Severity,
			Source:   "profile",
		}
		rows = append(rows, row)
	}

	_, err := u.Thresholds.SeedForRun(ctx, runID, rows)
	return err
}
func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) {
_ = templates.Registration(templates.RegistrationForm{
QuickRegisterURL: u.baseURL(r),