Initial commit: full Phases 1-6 implementation

Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
@@ -0,0 +1,33 @@
+package janitor
+
+import (
+	"context"
+	"time"
+
+	"vetting/internal/logs"
+	"vetting/internal/store"
+)
+
+// StoreAdapter bridges the concrete orchestrator stores to the Janitor's
+// dependency interface. Kept in the janitor package so the orchestrator
+// wire-up stays a single line: janitor.New(cfg, &janitor.StoreAdapter{...}).
+//
+// Runs and Artifacts are used by the adapter's methods without a nil
+// check; Logs may be left nil, in which case LogPathFor returns "".
+type StoreAdapter struct {
+	Runs      *store.Runs      // backs CompletedOlderThan
+	Artifacts *store.Artifacts // backs DeleteArtifactsForRun
+	Logs      *logs.Hub        // optional; backs LogPathFor
+}
+
+// CompletedOlderThan forwards the query to the Runs store and hands its
+// result back unchanged.
+func (a *StoreAdapter) CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error) {
+	ids, err := a.Runs.CompletedOlderThan(ctx, cutoff)
+	return ids, err
+}
+
+// DeleteArtifactsForRun forwards to the Artifacts store's DeleteForRun
+// and hands its result back unchanged.
+func (a *StoreAdapter) DeleteArtifactsForRun(ctx context.Context, runID int64) ([]store.Artifact, error) {
+	removed, err := a.Artifacts.DeleteForRun(ctx, runID)
+	return removed, err
+}
+
+// LogPathFor asks the log hub for the run's log path. With no hub
+// configured it reports "", which the janitor treats as "nothing to
+// remove".
+func (a *StoreAdapter) LogPathFor(runID int64) string {
+	if hub := a.Logs; hub != nil {
+		return hub.PathFor(runID)
+	}
+	return ""
+}
@@ -0,0 +1,171 @@
+// Package janitor garbage-collects on-disk run data. A completed or
+// released run produces an HTML report, a JSON report, a log file, and
+// potentially several artifact blobs (fio output, iperf output, hold
+// pubkey, inventory JSON). None of these need to stay on disk
+// indefinitely — once the operator's looked at the report and closed
+// the tile, disk pressure is the only cost.
+//
+// The DB row for the run is kept (so historical counts and host
+// histories survive); only the on-disk files and their artifact rows
+// are pruned. The janitor ticks on a fixed interval and is safe to
+// run concurrently with live runs — it only touches runs in terminal
+// states past a cutoff, which by definition are not being written to.
+package janitor
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log"
+	"os"
+	"sync"
+	"time"
+
+	"vetting/internal/store"
+)
+
+// Config carries the retention knobs. A retention of zero (or less)
+// means "keep forever" for that class of data; an Interval of zero (or
+// less) is replaced with 1h by New.
+type Config struct {
+	ArtifactRetention time.Duration // artifact cleanup runs only when this is positive
+	LogRetention      time.Duration // log-file cleanup runs only when this is positive
+	Interval          time.Duration // period of the background sweep ticker
+}
+
+// Stores is the subset of the store layer the janitor needs. Defined as
+// an interface so tests can fake it without spinning up SQLite.
+type Stores interface {
+	// CompletedOlderThan lists the IDs of the runs a sweep should
+	// process. Sweep passes "now minus the longer retention" as cutoff.
+	CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error)
+	// DeleteArtifactsForRun returns the artifacts for runID; the janitor
+	// then unlinks each returned artifact's non-empty Path from disk.
+	DeleteArtifactsForRun(ctx context.Context, runID int64) ([]store.Artifact, error)
+	// LogPathFor returns the log file path for runID. An empty string
+	// tells the janitor there is nothing to remove.
+	LogPathFor(runID int64) string
+}
+
+// Janitor owns the ticker goroutine. Repeated Start calls while running
+// and repeated Stop calls while stopped are no-ops; Stop waits for the
+// in-flight pass to finish so tests can assert post-state.
+//
+// A Janitor is single-use: the stop channel is created once by New and
+// closed by Stop, so a loop started after Stop exits right after its
+// initial sweep.
+type Janitor struct {
+	cfg     Config         // retention settings, Interval already defaulted by New
+	s       Stores         // store-layer dependency
+	stop    chan struct{}  // closed by Stop to end the loop
+	wg      sync.WaitGroup // tracks the loop goroutine
+	mu      sync.Mutex     // guards running
+	running bool           // true between Start and Stop
+}
+
+// New builds a Janitor around the given stores. A non-positive
+// cfg.Interval is replaced with one hour. The returned Janitor is idle
+// until Start is called.
+func New(cfg Config, s Stores) *Janitor {
+	j := &Janitor{s: s, stop: make(chan struct{})}
+	if cfg.Interval <= 0 {
+		cfg.Interval = time.Hour
+	}
+	j.cfg = cfg
+	return j
+}
+
+// Start launches the ticker goroutine; calling it while the janitor is
+// already running is a no-op. Retention zeros mean no cleanup is needed;
+// in that case the ticker still runs but each Sweep is a no-op.
+//
+// ctx is handed to the loop: cancelling it ends the loop as well.
+func (j *Janitor) Start(ctx context.Context) {
+	j.mu.Lock()
+	defer j.mu.Unlock()
+	if j.running {
+		return
+	}
+	j.running = true
+	// Register with the WaitGroup while still holding mu. Stop takes mu
+	// before it calls wg.Wait, so it can never observe running == true
+	// with a zero counter and return before the loop has been accounted
+	// for.
+	j.wg.Add(1)
+	go j.loop(ctx)
+}
+
+// Stop signals the loop to exit by closing the stop channel and then
+// blocks until the loop goroutine has returned. Calling Stop when the
+// janitor is not running does nothing.
+func (j *Janitor) Stop() {
+	j.mu.Lock()
+	wasRunning := j.running
+	if wasRunning {
+		j.running = false
+		close(j.stop)
+	}
+	j.mu.Unlock()
+
+	// Wait outside the lock, and only if this call did the stopping.
+	if wasRunning {
+		j.wg.Wait()
+	}
+}
+
+// loop is the body of the background goroutine. It sweeps once right
+// away, then once per ticker tick, until either ctx is cancelled or the
+// stop channel is closed. Sweep errors are logged, never fatal.
+func (j *Janitor) loop(ctx context.Context) {
+	defer j.wg.Done()
+
+	// Sweep immediately so anything that aged out while the
+	// orchestrator was down is cleaned at startup.
+	if err := j.Sweep(ctx, time.Now().UTC()); err != nil {
+		log.Printf("janitor: initial sweep: %v", err)
+	}
+
+	ticker := time.NewTicker(j.cfg.Interval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case tick := <-ticker.C:
+			if err := j.Sweep(ctx, tick.UTC()); err != nil {
+				log.Printf("janitor: sweep: %v", err)
+			}
+		case <-j.stop:
+			return
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+// Sweep runs one cleanup pass as of now. It is exported so tests can
+// drive a single pass deterministically.
+//
+// A single store query is issued with the *longer* of the two
+// retentions as the cutoff, so (assuming the store honours the cutoff)
+// every returned run is past every enabled retention window. Each
+// enabled class (retention > 0) is then cleaned for every returned run.
+// The trade-off of the single query is that the class with the shorter
+// retention is effectively kept for the longer one; nothing is ever
+// removed early.
+//
+// When both retentions are non-positive Sweep does nothing and never
+// touches the store. It returns an error if listing runs fails or if
+// ctx is cancelled between runs; per-run cleanup failures are only
+// logged by the helpers and do not abort the pass.
+func (j *Janitor) Sweep(ctx context.Context, now time.Time) error {
+	pruneArtifacts := j.cfg.ArtifactRetention > 0
+	pruneLogs := j.cfg.LogRetention > 0
+	if !pruneArtifacts && !pruneLogs {
+		return nil
+	}
+	cutoff := now.Add(-longer(j.cfg.ArtifactRetention, j.cfg.LogRetention))
+	runs, err := j.s.CompletedOlderThan(ctx, cutoff)
+	if err != nil {
+		return fmt.Errorf("list old runs: %w", err)
+	}
+	for _, runID := range runs {
+		// Stop promptly on shutdown instead of unlinking logs and
+		// logging one store error per remaining run.
+		if err := ctx.Err(); err != nil {
+			return fmt.Errorf("sweep interrupted: %w", err)
+		}
+		if pruneArtifacts {
+			j.cleanArtifacts(ctx, runID)
+		}
+		if pruneLogs {
+			j.cleanLog(runID)
+		}
+	}
+	return nil
+}
+
+// cleanArtifacts asks the store to delete the run's artifacts and then
+// unlinks the file behind each returned artifact. Failures are logged
+// and otherwise ignored; a file that is already gone is not an error.
+func (j *Janitor) cleanArtifacts(ctx context.Context, runID int64) {
+	removed, err := j.s.DeleteArtifactsForRun(ctx, runID)
+	if err != nil {
+		log.Printf("janitor: delete artifacts for run %d: %v", runID, err)
+		return
+	}
+	for i := range removed {
+		path := removed[i].Path
+		if path == "" {
+			// No on-disk file recorded for this artifact.
+			continue
+		}
+		rmErr := os.Remove(path)
+		if rmErr == nil || errors.Is(rmErr, os.ErrNotExist) {
+			continue
+		}
+		log.Printf("janitor: unlink %s: %v", path, rmErr)
+	}
+}
+
+// cleanLog unlinks the run's log file when the store reports a path for
+// it. A missing file is fine; any other removal error is logged.
+func (j *Janitor) cleanLog(runID int64) {
+	logPath := j.s.LogPathFor(runID)
+	if logPath != "" {
+		err := os.Remove(logPath)
+		if err != nil && !errors.Is(err, os.ErrNotExist) {
+			log.Printf("janitor: unlink log %s: %v", logPath, err)
+		}
+	}
+}
+
+// longer returns whichever of the two durations is greater.
+func longer(a, b time.Duration) time.Duration {
+	if b >= a {
+		return b
+	}
+	return a
+}
@@ -0,0 +1,133 @@
+package janitor
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"vetting/internal/store"
+)
+
+// fakeStores is a test double that records what the janitor asked for
+// and hands back canned runs/artifacts. It lets us verify both the
+// cleanup contract (files deleted, rows deleted) and that the janitor
+// honours a zero retention as a no-op.
+type fakeStores struct {
+	cutoffSeen    time.Time                  // cutoff from the latest CompletedOlderThan call
+	runsOlder     []int64                    // canned result of CompletedOlderThan
+	artifactsByID map[int64][]store.Artifact // canned result of DeleteArtifactsForRun, keyed by run ID
+	deleted       map[int64]bool             // run IDs passed to DeleteArtifactsForRun
+	logs          map[int64]string           // canned result of LogPathFor, keyed by run ID
+}
+
+// CompletedOlderThan remembers the cutoff it was called with and
+// returns the canned run IDs.
+func (f *fakeStores) CompletedOlderThan(_ context.Context, cutoff time.Time) ([]int64, error) {
+	ids := f.runsOlder
+	f.cutoffSeen = cutoff
+	return ids, nil
+}
+
+// DeleteArtifactsForRun marks runID as deleted (allocating the map on
+// first use) and returns the canned artifacts for that run.
+func (f *fakeStores) DeleteArtifactsForRun(_ context.Context, runID int64) ([]store.Artifact, error) {
+	if f.deleted == nil {
+		f.deleted = make(map[int64]bool)
+	}
+	f.deleted[runID] = true
+	arts := f.artifactsByID[runID]
+	return arts, nil
+}
+
+// LogPathFor returns the canned log path for runID, or "" when none was
+// configured.
+func (f *fakeStores) LogPathFor(runID int64) string {
+	path := f.logs[runID]
+	return path
+}
+
+// writeTempFile creates dir/name containing a single byte and returns
+// its full path, failing the test immediately if the write fails.
+func writeTempFile(t *testing.T, dir, name string) string {
+	t.Helper()
+	target := filepath.Join(dir, name)
+	err := os.WriteFile(target, []byte("x"), 0o644)
+	if err != nil {
+		t.Fatalf("write %s: %v", target, err)
+	}
+	return target
+}
+
+func TestSweepDeletesArtifactsAndLogs(t *testing.T) {
+	dir := t.TempDir()
+	p1 := writeTempFile(t, dir, "artifact-1.bin")
+	p2 := writeTempFile(t, dir, "artifact-2.json")
+	log1 := writeTempFile(t, dir, "run-1.log")
+
+	s := &fakeStores{
+		runsOlder: []int64{1},
+		artifactsByID: map[int64][]store.Artifact{
+			1: {{ID: 10, RunID: 1, Path: p1}, {ID: 11, RunID: 1, Path: p2}},
+		},
+		logs: map[int64]string{1: log1},
+	}
+	j := New(Config{
+		ArtifactRetention: 24 * time.Hour,
+		LogRetention:      24 * time.Hour,
+		Interval:          time.Minute,
+	}, s)
+	if err := j.Sweep(context.Background(), time.Now().UTC()); err != nil {
+		t.Fatalf("sweep: %v", err)
+	}
+	if !s.deleted[1] {
+		t.Fatalf("run 1 not passed to DeleteArtifactsForRun")
+	}
+	for _, p := range []string{p1, p2, log1} {
+		if _, err := os.Stat(p); !os.IsNotExist(err) {
+			t.Errorf("file %s still exists (err=%v)", p, err)
+		}
+	}
+}
+
+func TestSweepIsNoopWhenRetentionsAreZero(t *testing.T) {
+	dir := t.TempDir()
+	p := writeTempFile(t, dir, "keep.bin")
+	s := &fakeStores{
+		runsOlder: []int64{1},
+		artifactsByID: map[int64][]store.Artifact{
+			1: {{ID: 10, RunID: 1, Path: p}},
+		},
+		logs: map[int64]string{1: p},
+	}
+	j := New(Config{}, s) // all zero
+	if err := j.Sweep(context.Background(), time.Now().UTC()); err != nil {
+		t.Fatalf("sweep: %v", err)
+	}
+	if s.deleted[1] {
+		t.Fatalf("expected no deletion for zero retention")
+	}
+	if _, err := os.Stat(p); err != nil {
+		t.Fatalf("file should still exist: %v", err)
+	}
+}
+
+func TestSweepSkipsMissingFilesGracefully(t *testing.T) {
+	s := &fakeStores{
+		runsOlder: []int64{7},
+		artifactsByID: map[int64][]store.Artifact{
+			7: {{ID: 99, RunID: 7, Path: "/nonexistent/path.bin"}},
+		},
+		logs: map[int64]string{7: "/nonexistent/run-7.log"},
+	}
+	j := New(Config{ArtifactRetention: time.Hour, LogRetention: time.Hour}, s)
+	if err := j.Sweep(context.Background(), time.Now().UTC()); err != nil {
+		t.Fatalf("sweep: %v", err)
+	}
+	if !s.deleted[7] {
+		t.Fatalf("run 7 should have been processed")
+	}
+}
+
+func TestSweepUsesTheLongerCutoff(t *testing.T) {
+	s := &fakeStores{}
+	j := New(Config{
+		ArtifactRetention: 72 * time.Hour,
+		LogRetention:      24 * time.Hour,
+	}, s)
+	now := time.Date(2026, 4, 17, 12, 0, 0, 0, time.UTC)
+	if err := j.Sweep(context.Background(), now); err != nil {
+		t.Fatalf("sweep: %v", err)
+	}
+	want := now.Add(-72 * time.Hour)
+	if !s.cutoffSeen.Equal(want) {
+		t.Fatalf("cutoff = %v, want %v (the longer of the two retentions)", s.cutoffSeen, want)
+	}
+}