deep profile + threshold gating + firmware stage + Burn super-stage

Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: config.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-18 22:50:57 -04:00
parent fbb21cbafd
commit 23c689aa5b
60 changed files with 5911 additions and 527 deletions
@@ -16,6 +16,7 @@ import (
 	"github.com/go-chi/chi/v5"
 	"gopkg.in/yaml.v3"

+	"vetting/internal/config"
 	"vetting/internal/events"
 	"vetting/internal/logs"
 	"vetting/internal/model"
@@ -26,17 +27,19 @@ import (
 )

 type UI struct {
-	Hosts     *store.Hosts
-	Runs      *store.Runs
-	Stages    *store.Stages
-	SubSteps  *store.SubSteps
-	SpecDiffs *store.SpecDiffs
-	Artifacts *store.Artifacts
-	EventHub  *events.Hub
-	Logs      *logs.Hub
-	Runner    *orchestrator.Runner
-	Tiles     *TileEnricher
-	PublicURL string // user-visible base URL baked into the quick-register one-liner
+	Hosts      *store.Hosts
+	Runs       *store.Runs
+	Stages     *store.Stages
+	SubSteps   *store.SubSteps
+	SpecDiffs  *store.SpecDiffs
+	Artifacts  *store.Artifacts
+	Thresholds *store.Thresholds // Phase 1: seeded at StartRun from Profiles
+	Profiles   *config.ProfileRegistry
+	EventHub   *events.Hub
+	Logs       *logs.Hub
+	Runner     *orchestrator.Runner
+	Tiles      *TileEnricher
+	PublicURL  string // user-visible base URL baked into the quick-register one-liner
 	// PXE, when non-nil, gets Reload()ed after host create/delete so
 	// dnsmasq's dhcp-host= allowlist reflects the current registry.
 	// Without this, a newly-registered host PXE-boots and gets
@@ -316,23 +319,71 @@ func (u *UI) StartRun(w http.ResponseWriter, r *http.Request) {
 	}

 	nonDestructive := r.PostFormValue("non_destructive") == "1"
+	profile := strings.TrimSpace(r.PostFormValue("profile"))
+	if profile == "" {
+		profile = config.ProfileQuick
+	}
+	if !config.IsValidProfile(profile) {
+		http.Error(w, "unknown profile: "+profile, http.StatusBadRequest)
+		return
+	}

 	_, hash, err := orchestrator.IssueRunToken()
 	if err != nil {
 		http.Error(w, "token: "+err.Error(), http.StatusInternalServerError)
 		return
 	}
-	runID, err := u.Runs.Create(r.Context(), hostID, hash, nonDestructive)
+	runID, err := u.Runs.CreateWithProfile(r.Context(), hostID, hash, nonDestructive, profile)
 	if err != nil {
 		http.Error(w, "create run: "+err.Error(), http.StatusInternalServerError)
 		return
 	}
-	log.Printf("ui: created run %d for host %d (state=Queued)", runID, hostID)
+	if err := u.seedThresholds(r.Context(), runID, host, profile); err != nil {
+		// A threshold-seed failure shouldn't orphan a run row — log
+		// and continue. Samples will just accumulate without a gate
+		// until the operator retries, same as before Phase 1.
+		log.Printf("ui: seed thresholds run %d: %v", runID, err)
+	}
+	log.Printf("ui: created run %d for host %d profile=%s (state=Queued)", runID, hostID, profile)
 	// Send the operator straight to the new run — the button they clicked
 	// was "Start vetting", the thing they want next is to watch it.
 	http.Redirect(w, r, fmt.Sprintf("/runs/%d", runID), http.StatusSeeOther)
 }

+// seedThresholds copies the shared u.Profiles.Vetting.Thresholds defaults into
+// per-run threshold specs via u.Thresholds.SeedForRun, each stamped with Source
+// "profile". The same defaults apply to every profile today: host and profile are
+// unused, reserved for the planned per-host (ExpectedSpecYAML) and per-profile
+// override layers. Returns nil without seeding when Thresholds or Profiles is not
+// wired (tests do not always build them) or when no defaults are configured.
+func (u *UI) seedThresholds(ctx context.Context, runID int64, host *model.Host, profile string) error {
+	if u.Thresholds == nil || u.Profiles == nil {
+		return nil
+	}
+	_ = host    // reserved for per-host override layer
+	_ = profile // reserved for per-profile override layer
+	defaults := u.Profiles.Vetting.Thresholds
+	if len(defaults) == 0 {
+		return nil
+	}
+	specs := make([]store.ThresholdSpec, 0, len(defaults))
+	for _, d := range defaults {
+		specs = append(specs, store.ThresholdSpec{
+			Stage:    d.Stage,
+			Kind:     d.Kind,
+			Key:      d.Key,
+			Op:       d.Op,
+			Value:    d.Value,
+			Nominal:  d.Nominal,
+			Unit:     d.Unit,
+			Severity: d.Severity,
+			Source:   "profile",
+		})
+	}
+	_, err := u.Thresholds.SeedForRun(ctx, runID, specs) // first result is discarded; only the error is propagated
+	return err
+}
+
 func (u *UI) NewHostForm(w http.ResponseWriter, r *http.Request) {
 	_ = templates.Registration(templates.RegistrationForm{
 		QuickRegisterURL: u.baseURL(r),