23c689aa5b
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
280 lines
8.1 KiB
Go
280 lines
8.1 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"crypto/tls"
|
|
"errors"
|
|
"flag"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"os/signal"
|
|
"path/filepath"
|
|
"syscall"
|
|
"time"
|
|
|
|
"vetting/internal/api"
|
|
"vetting/internal/config"
|
|
"vetting/internal/db"
|
|
"vetting/internal/events"
|
|
"vetting/internal/httpserver"
|
|
"vetting/internal/janitor"
|
|
"vetting/internal/logs"
|
|
"vetting/internal/model"
|
|
"vetting/internal/notify"
|
|
"vetting/internal/orchestrator"
|
|
"vetting/internal/pxe"
|
|
"vetting/internal/store"
|
|
"vetting/internal/web/templates"
|
|
)
|
|
|
|
func main() {
|
|
configPath := flag.String("config", "deploy/vetting.example.yaml", "path to vetting.yaml")
|
|
flag.Parse()
|
|
|
|
cfg, err := config.Load(*configPath)
|
|
if err != nil {
|
|
log.Fatalf("load config: %v", err)
|
|
}
|
|
|
|
for _, dir := range []string{
|
|
filepath.Dir(cfg.Database.Path),
|
|
cfg.Artifacts.Dir,
|
|
cfg.Logs.Dir,
|
|
} {
|
|
if err := os.MkdirAll(dir, 0o755); err != nil {
|
|
log.Fatalf("mkdir %s: %v", dir, err)
|
|
}
|
|
}
|
|
|
|
conn, err := db.Open(cfg.Database.Path)
|
|
if err != nil {
|
|
log.Fatalf("open db: %v", err)
|
|
}
|
|
defer func() { _ = conn.Close() }()
|
|
|
|
hostStore := &store.Hosts{DB: conn}
|
|
runStore := &store.Runs{DB: conn}
|
|
stageStore := &store.Stages{DB: conn}
|
|
subStepStore := &store.SubSteps{DB: conn}
|
|
artifactStore := &store.Artifacts{DB: conn}
|
|
specDiffStore := &store.SpecDiffs{DB: conn}
|
|
measurementStore := &store.Measurements{DB: conn}
|
|
thresholdStore := &store.Thresholds{DB: conn}
|
|
firmwareStore := &store.Firmware{DB: conn}
|
|
|
|
hub := events.NewHub()
|
|
|
|
logHub, err := logs.NewHub(cfg.Logs.Dir, hub)
|
|
if err != nil {
|
|
log.Fatalf("logs hub: %v", err)
|
|
}
|
|
defer logHub.Close()
|
|
|
|
runner := &orchestrator.Runner{
|
|
Runs: runStore,
|
|
Hosts: hostStore,
|
|
Stages: stageStore,
|
|
EventHub: hub,
|
|
}
|
|
|
|
tiles := &api.TileEnricher{
|
|
Runs: runStore,
|
|
Artifacts: artifactStore,
|
|
SpecDiffs: specDiffStore,
|
|
}
|
|
|
|
// Inject a templ renderer so the Runner can publish tile-refresh
|
|
// fragments via SSE without pulling web/templates into the
|
|
// orchestrator package. The closure enriches the tile with spec-
|
|
// diff count and hold-key path so every tile render shows the
|
|
// same data, whether it came from /events or an initial page load.
|
|
orchestrator.TileRenderer = func(ctx context.Context, host model.Host, latest *model.Run) string {
|
|
return templates.RenderTileString(tiles.Build(ctx, host, latest))
|
|
}
|
|
orchestrator.PipelineRenderer = templates.RenderPipelineString
|
|
orchestrator.SubStepRenderer = templates.RenderSubStepRowString
|
|
|
|
notifyReg, err := notify.BuildRegistry(cfg.Notifiers, cfg.Routes)
|
|
if err != nil {
|
|
log.Fatalf("notify: %v", err)
|
|
}
|
|
|
|
ui := &api.UI{
|
|
Hosts: hostStore,
|
|
Runs: runStore,
|
|
Stages: stageStore,
|
|
SubSteps: subStepStore,
|
|
SpecDiffs: specDiffStore,
|
|
Artifacts: artifactStore,
|
|
Thresholds: thresholdStore,
|
|
Profiles: cfg.Profiles,
|
|
EventHub: hub,
|
|
Logs: logHub,
|
|
Runner: runner,
|
|
Tiles: tiles,
|
|
PublicURL: cfg.Server.PublicURL,
|
|
}
|
|
|
|
// Inject the host-page + run-page fragment renderers. Each reuses
|
|
// the matching LoadHostPageData / LoadRunPageData so SSE-pushed HTML
|
|
// matches an initial page load byte-for-byte, then hands each region
|
|
// to its Render*String helper.
|
|
orchestrator.HostPageRenderer = func(ctx context.Context, hostID int64) (orchestrator.HostPageFragments, bool) {
|
|
d, err := ui.LoadHostPageData(ctx, hostID)
|
|
if err != nil {
|
|
return orchestrator.HostPageFragments{}, false
|
|
}
|
|
rows := make(map[int64]string, len(d.Runs))
|
|
for _, r := range d.Runs {
|
|
rows[r.ID] = templates.RenderRunRowString(templates.RunRowData{
|
|
Run: r,
|
|
Stages: d.RunStages[r.ID],
|
|
Live: d.ActiveRun != nil && d.ActiveRun.ID == r.ID,
|
|
})
|
|
}
|
|
return orchestrator.HostPageFragments{
|
|
Summary: templates.RenderHostSummaryString(d),
|
|
Actions: templates.RenderHostActionsString(d),
|
|
InFlightBanner: templates.RenderInFlightBannerString(d),
|
|
RunRows: rows,
|
|
}, true
|
|
}
|
|
|
|
orchestrator.RunPageRenderer = func(ctx context.Context, runID int64) (orchestrator.RunPageFragments, bool) {
|
|
d, err := ui.LoadRunPageData(ctx, runID)
|
|
if err != nil {
|
|
return orchestrator.RunPageFragments{}, false
|
|
}
|
|
return orchestrator.RunPageFragments{
|
|
Header: templates.RenderRunHeaderString(d),
|
|
Hold: templates.RenderHoldBannerString(d),
|
|
SpecDiffs: templates.RenderRunSpecDiffsString(d),
|
|
}, true
|
|
}
|
|
|
|
agentAPI := &api.Agent{
|
|
Hosts: hostStore,
|
|
Runs: runStore,
|
|
Stages: stageStore,
|
|
SubSteps: subStepStore,
|
|
Artifacts: artifactStore,
|
|
SpecDiffs: specDiffStore,
|
|
Measurements: measurementStore,
|
|
Thresholds: thresholdStore,
|
|
Firmware: firmwareStore,
|
|
Profiles: cfg.Profiles,
|
|
Runner: runner,
|
|
EventHub: hub,
|
|
Logs: logHub,
|
|
Notify: notifyReg,
|
|
ArtifactsDir: cfg.Artifacts.Dir,
|
|
OrchestratorURL: cfg.PXE.OrchestratorURL,
|
|
PublicURL: cfg.Server.PublicURL,
|
|
IperfPort: cfg.Network.IperfPort,
|
|
}
|
|
agentAPI.LiveKernelURL, agentAPI.LiveInitrdURL = pxe.BuildLiveURLs(cfg.PXE.OrchestratorURL)
|
|
|
|
dispatcher := orchestrator.NewDispatcher(cfg.Dispatcher.MaxConcurrentRuns, runStore, hostStore, runner, logHub)
|
|
iperfSup := orchestrator.NewIperfSupervisor(cfg.Network.IperfPort)
|
|
|
|
janitorSvc := janitor.New(janitor.Config{
|
|
ArtifactRetention: time.Duration(cfg.Artifacts.RetentionDays) * 24 * time.Hour,
|
|
LogRetention: time.Duration(cfg.Logs.RetentionDays) * 24 * time.Hour,
|
|
Interval: time.Duration(cfg.Janitor.IntervalMinutes) * time.Minute,
|
|
}, &janitor.StoreAdapter{Runs: runStore, Artifacts: artifactStore, Logs: logHub})
|
|
|
|
// Anchor tftp_root and the pxe runtime dir under artifacts.dir's
|
|
// parent (typically /var/lib/vetting), not logs.dir's parent. The
|
|
// production systemd unit's ReadWritePaths=/var/lib/vetting /var/log/vetting
|
|
// sandbox forbids writing outside those trees, so deriving from
|
|
// /var/log/vetting would land us at /var/log/{tftp,pxe} — unwritable.
|
|
stateRoot := filepath.Dir(cfg.Artifacts.Dir)
|
|
tftpRoot := cfg.PXE.TFTPRoot
|
|
if tftpRoot == "" {
|
|
tftpRoot = filepath.Join(stateRoot, "tftp")
|
|
}
|
|
var supervisor *pxe.Supervisor
|
|
if cfg.PXE.Enabled {
|
|
supervisor = pxe.NewSupervisor(pxe.SupervisorConfig{
|
|
Enabled: true,
|
|
Interface: cfg.PXE.Interface,
|
|
Subnet: cfg.PXE.Subnet,
|
|
OrchestratorURL: cfg.PXE.OrchestratorURL,
|
|
RuntimeDir: filepath.Join(stateRoot, "pxe"),
|
|
TFTPRoot: tftpRoot,
|
|
LiveDir: cfg.PXE.LiveDir,
|
|
})
|
|
ui.PXE = supervisor
|
|
}
|
|
|
|
router := httpserver.NewRouter(httpserver.Deps{
|
|
UI: ui,
|
|
Agent: agentAPI,
|
|
LiveDir: cfg.PXE.LiveDir,
|
|
AgentAssetDir: cfg.Agent.AssetDir,
|
|
})
|
|
|
|
srv := &http.Server{
|
|
Addr: cfg.Server.Bind,
|
|
Handler: router,
|
|
ReadHeaderTimeout: 10 * time.Second,
|
|
}
|
|
if cfg.Server.TLS.Enabled {
|
|
srv.TLSConfig = &tls.Config{MinVersion: tls.VersionTLS12}
|
|
}
|
|
|
|
shutdown := make(chan os.Signal, 1)
|
|
signal.Notify(shutdown, os.Interrupt, syscall.SIGTERM)
|
|
|
|
rootCtx, cancelRoot := context.WithCancel(context.Background())
|
|
defer cancelRoot()
|
|
|
|
dispatcher.Start(rootCtx)
|
|
janitorSvc.Start(rootCtx)
|
|
|
|
if err := iperfSup.Start(rootCtx); err != nil {
|
|
log.Fatalf("start iperf3: %v", err)
|
|
}
|
|
|
|
if supervisor != nil {
|
|
hosts, err := hostStore.List(rootCtx)
|
|
if err != nil {
|
|
log.Fatalf("list hosts for dnsmasq: %v", err)
|
|
}
|
|
if err := supervisor.Start(rootCtx, hosts); err != nil {
|
|
log.Fatalf("start dnsmasq: %v", err)
|
|
}
|
|
}
|
|
|
|
go func() {
|
|
log.Printf("vetting listening on %s (tls=%v, db=%s)", cfg.Server.Bind, cfg.Server.TLS.Enabled, cfg.Database.Path)
|
|
var err error
|
|
if cfg.Server.TLS.Enabled {
|
|
err = srv.ListenAndServeTLS(cfg.Server.TLS.CertFile, cfg.Server.TLS.KeyFile)
|
|
} else {
|
|
err = srv.ListenAndServe()
|
|
}
|
|
if err != nil && !errors.Is(err, http.ErrServerClosed) {
|
|
log.Fatalf("server: %v", err)
|
|
}
|
|
}()
|
|
|
|
<-shutdown
|
|
log.Printf("shutting down")
|
|
|
|
dispatcher.Stop()
|
|
janitorSvc.Stop()
|
|
_ = iperfSup.Shutdown(3 * time.Second)
|
|
if supervisor != nil {
|
|
_ = supervisor.Shutdown(5 * time.Second)
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
if err := srv.Shutdown(ctx); err != nil {
|
|
log.Printf("server shutdown: %v", err)
|
|
}
|
|
_ = hub.Shutdown(ctx)
|
|
}
|