Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
This commit is contained in:
@@ -0,0 +1,33 @@
|
||||
package janitor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"vetting/internal/logs"
|
||||
"vetting/internal/store"
|
||||
)
|
||||
|
||||
// StoreAdapter bridges the concrete orchestrator stores to the Janitor's
|
||||
// dependency interface. Kept in the janitor package so the orchestrator
|
||||
// wire-up stays a single-line: janitor.New(cfg, &janitor.StoreAdapter{...}).
|
||||
type StoreAdapter struct {
|
||||
Runs *store.Runs
|
||||
Artifacts *store.Artifacts
|
||||
Logs *logs.Hub
|
||||
}
|
||||
|
||||
func (a *StoreAdapter) CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error) {
|
||||
return a.Runs.CompletedOlderThan(ctx, cutoff)
|
||||
}
|
||||
|
||||
func (a *StoreAdapter) DeleteArtifactsForRun(ctx context.Context, runID int64) ([]store.Artifact, error) {
|
||||
return a.Artifacts.DeleteForRun(ctx, runID)
|
||||
}
|
||||
|
||||
func (a *StoreAdapter) LogPathFor(runID int64) string {
|
||||
if a.Logs == nil {
|
||||
return ""
|
||||
}
|
||||
return a.Logs.PathFor(runID)
|
||||
}
|
||||
@@ -0,0 +1,171 @@
|
||||
// Package janitor garbage-collects on-disk run data. A completed or
|
||||
// released run produces an HTML report, a JSON report, a log file, and
|
||||
// potentially several artifact blobs (fio output, iperf output, hold
|
||||
// pubkey, inventory JSON). None of these need to stay on disk
|
||||
// indefinitely — once the operator's looked at the report and closed
|
||||
// the tile, disk pressure is the only cost.
|
||||
//
|
||||
// The DB row for the run is kept (so historical counts and host
|
||||
// histories survive); only the on-disk files and their artifact rows
|
||||
// are pruned. The janitor ticks on a fixed interval and is safe to
|
||||
// run concurrently with live runs — it only touches runs in terminal
|
||||
// states past a cutoff, which by definition are not being written to.
|
||||
package janitor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"vetting/internal/store"
|
||||
)
|
||||
|
||||
// Config holds the janitor's retention settings.
//
// A retention of zero (or less) disables cleanup for that class of data,
// meaning it is kept forever. When both retentions are set, Sweep selects
// runs using the longer of the two. A zero or negative Interval is
// replaced with one hour by New.
type Config struct {
	// ArtifactRetention is the retention applied to a run's artifacts.
	ArtifactRetention time.Duration
	// LogRetention is the retention applied to a run's log file.
	LogRetention time.Duration
	// Interval is the period between sweeps of the ticker loop.
	Interval time.Duration
}
|
||||
|
||||
// Stores is the subset of the store layer the janitor needs. Defined as
|
||||
// an interface so tests can fake it without spinning up SQLite.
|
||||
type Stores interface {
|
||||
CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error)
|
||||
DeleteArtifactsForRun(ctx context.Context, runID int64) ([]store.Artifact, error)
|
||||
LogPathFor(runID int64) string
|
||||
}
|
||||
|
||||
// Janitor owns the ticker goroutine. Start/Stop are idempotent; Stop
|
||||
// waits for the in-flight pass to finish so tests can assert post-state.
|
||||
type Janitor struct {
|
||||
cfg Config
|
||||
s Stores
|
||||
stop chan struct{}
|
||||
wg sync.WaitGroup
|
||||
mu sync.Mutex
|
||||
running bool
|
||||
}
|
||||
|
||||
func New(cfg Config, s Stores) *Janitor {
|
||||
if cfg.Interval <= 0 {
|
||||
cfg.Interval = time.Hour
|
||||
}
|
||||
return &Janitor{cfg: cfg, s: s, stop: make(chan struct{})}
|
||||
}
|
||||
|
||||
// Start launches the ticker. Retention zeros mean no cleanup is needed;
|
||||
// in that case the ticker still runs but each Sweep is a no-op.
|
||||
func (j *Janitor) Start(ctx context.Context) {
|
||||
j.mu.Lock()
|
||||
if j.running {
|
||||
j.mu.Unlock()
|
||||
return
|
||||
}
|
||||
j.running = true
|
||||
j.mu.Unlock()
|
||||
j.wg.Add(1)
|
||||
go j.loop(ctx)
|
||||
}
|
||||
|
||||
func (j *Janitor) Stop() {
|
||||
j.mu.Lock()
|
||||
if !j.running {
|
||||
j.mu.Unlock()
|
||||
return
|
||||
}
|
||||
j.running = false
|
||||
close(j.stop)
|
||||
j.mu.Unlock()
|
||||
j.wg.Wait()
|
||||
}
|
||||
|
||||
func (j *Janitor) loop(ctx context.Context) {
|
||||
defer j.wg.Done()
|
||||
// Run one sweep immediately so startup cleans up anything that
|
||||
// aged out while the orchestrator was down.
|
||||
if err := j.Sweep(ctx, time.Now().UTC()); err != nil {
|
||||
log.Printf("janitor: initial sweep: %v", err)
|
||||
}
|
||||
t := time.NewTicker(j.cfg.Interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-j.stop:
|
||||
return
|
||||
case now := <-t.C:
|
||||
if err := j.Sweep(ctx, now.UTC()); err != nil {
|
||||
log.Printf("janitor: sweep: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sweep is exported so tests can drive a single pass deterministically.
|
||||
// It picks the *more aggressive* cutoff between the two retentions so a
|
||||
// single DB query covers both classes, then does the per-class work.
|
||||
func (j *Janitor) Sweep(ctx context.Context, now time.Time) error {
|
||||
if j.cfg.ArtifactRetention <= 0 && j.cfg.LogRetention <= 0 {
|
||||
return nil
|
||||
}
|
||||
cutoff := now.Add(-longer(j.cfg.ArtifactRetention, j.cfg.LogRetention))
|
||||
runs, err := j.s.CompletedOlderThan(ctx, cutoff)
|
||||
if err != nil {
|
||||
return fmt.Errorf("list old runs: %w", err)
|
||||
}
|
||||
artifactCutoff := now.Add(-j.cfg.ArtifactRetention)
|
||||
logCutoff := now.Add(-j.cfg.LogRetention)
|
||||
for _, runID := range runs {
|
||||
// The query above used the longer cutoff — each retention is
|
||||
// re-checked per-run against its actual cutoff via the run's
|
||||
// completed_at, but since we don't round-trip that here we
|
||||
// just process both at their own cutoff using the single
|
||||
// query's cheap filter (run is old enough for at least one).
|
||||
if j.cfg.ArtifactRetention > 0 && !artifactCutoff.IsZero() {
|
||||
j.cleanArtifacts(ctx, runID)
|
||||
}
|
||||
if j.cfg.LogRetention > 0 && !logCutoff.IsZero() {
|
||||
j.cleanLog(runID)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (j *Janitor) cleanArtifacts(ctx context.Context, runID int64) {
|
||||
arts, err := j.s.DeleteArtifactsForRun(ctx, runID)
|
||||
if err != nil {
|
||||
log.Printf("janitor: delete artifacts for run %d: %v", runID, err)
|
||||
return
|
||||
}
|
||||
for _, a := range arts {
|
||||
if a.Path == "" {
|
||||
continue
|
||||
}
|
||||
if err := os.Remove(a.Path); err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
log.Printf("janitor: unlink %s: %v", a.Path, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (j *Janitor) cleanLog(runID int64) {
|
||||
path := j.s.LogPathFor(runID)
|
||||
if path == "" {
|
||||
return
|
||||
}
|
||||
if err := os.Remove(path); err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
log.Printf("janitor: unlink log %s: %v", path, err)
|
||||
}
|
||||
}
|
||||
|
||||
// longer returns the greater of the two durations.
func longer(a, b time.Duration) time.Duration {
	if b >= a {
		return b
	}
	return a
}
|
||||
@@ -0,0 +1,133 @@
|
||||
package janitor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"vetting/internal/store"
|
||||
)
|
||||
|
||||
// fakeStores is a test double that records what the janitor asked for
|
||||
// and hands back canned runs/artifacts. It lets us verify both the
|
||||
// cleanup contract (files deleted, rows deleted) and that the janitor
|
||||
// honours a zero retention as a no-op.
|
||||
type fakeStores struct {
|
||||
cutoffSeen time.Time
|
||||
runsOlder []int64
|
||||
artifactsByID map[int64][]store.Artifact
|
||||
deleted map[int64]bool
|
||||
logs map[int64]string
|
||||
}
|
||||
|
||||
func (f *fakeStores) CompletedOlderThan(_ context.Context, cutoff time.Time) ([]int64, error) {
|
||||
f.cutoffSeen = cutoff
|
||||
return f.runsOlder, nil
|
||||
}
|
||||
|
||||
func (f *fakeStores) DeleteArtifactsForRun(_ context.Context, runID int64) ([]store.Artifact, error) {
|
||||
if f.deleted == nil {
|
||||
f.deleted = map[int64]bool{}
|
||||
}
|
||||
f.deleted[runID] = true
|
||||
return f.artifactsByID[runID], nil
|
||||
}
|
||||
|
||||
// LogPathFor returns the canned log path for runID, or the empty string
// when none is configured.
func (f *fakeStores) LogPathFor(runID int64) string {
	path := f.logs[runID]
	return path
}
|
||||
|
||||
// writeTempFile creates dir/name containing the single byte "x" and
// returns the full path. Any write error fails the calling test at once.
func writeTempFile(t *testing.T, dir, name string) string {
	t.Helper()

	path := filepath.Join(dir, name)
	err := os.WriteFile(path, []byte("x"), 0o644)
	if err != nil {
		t.Fatalf("write %s: %v", path, err)
	}
	return path
}
|
||||
|
||||
func TestSweepDeletesArtifactsAndLogs(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
p1 := writeTempFile(t, dir, "artifact-1.bin")
|
||||
p2 := writeTempFile(t, dir, "artifact-2.json")
|
||||
log1 := writeTempFile(t, dir, "run-1.log")
|
||||
|
||||
s := &fakeStores{
|
||||
runsOlder: []int64{1},
|
||||
artifactsByID: map[int64][]store.Artifact{
|
||||
1: {{ID: 10, RunID: 1, Path: p1}, {ID: 11, RunID: 1, Path: p2}},
|
||||
},
|
||||
logs: map[int64]string{1: log1},
|
||||
}
|
||||
j := New(Config{
|
||||
ArtifactRetention: 24 * time.Hour,
|
||||
LogRetention: 24 * time.Hour,
|
||||
Interval: time.Minute,
|
||||
}, s)
|
||||
if err := j.Sweep(context.Background(), time.Now().UTC()); err != nil {
|
||||
t.Fatalf("sweep: %v", err)
|
||||
}
|
||||
if !s.deleted[1] {
|
||||
t.Fatalf("run 1 not passed to DeleteArtifactsForRun")
|
||||
}
|
||||
for _, p := range []string{p1, p2, log1} {
|
||||
if _, err := os.Stat(p); !os.IsNotExist(err) {
|
||||
t.Errorf("file %s still exists (err=%v)", p, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSweepIsNoopWhenRetentionsAreZero(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
p := writeTempFile(t, dir, "keep.bin")
|
||||
s := &fakeStores{
|
||||
runsOlder: []int64{1},
|
||||
artifactsByID: map[int64][]store.Artifact{
|
||||
1: {{ID: 10, RunID: 1, Path: p}},
|
||||
},
|
||||
logs: map[int64]string{1: p},
|
||||
}
|
||||
j := New(Config{}, s) // all zero
|
||||
if err := j.Sweep(context.Background(), time.Now().UTC()); err != nil {
|
||||
t.Fatalf("sweep: %v", err)
|
||||
}
|
||||
if s.deleted[1] {
|
||||
t.Fatalf("expected no deletion for zero retention")
|
||||
}
|
||||
if _, err := os.Stat(p); err != nil {
|
||||
t.Fatalf("file should still exist: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSweepSkipsMissingFilesGracefully(t *testing.T) {
|
||||
s := &fakeStores{
|
||||
runsOlder: []int64{7},
|
||||
artifactsByID: map[int64][]store.Artifact{
|
||||
7: {{ID: 99, RunID: 7, Path: "/nonexistent/path.bin"}},
|
||||
},
|
||||
logs: map[int64]string{7: "/nonexistent/run-7.log"},
|
||||
}
|
||||
j := New(Config{ArtifactRetention: time.Hour, LogRetention: time.Hour}, s)
|
||||
if err := j.Sweep(context.Background(), time.Now().UTC()); err != nil {
|
||||
t.Fatalf("sweep: %v", err)
|
||||
}
|
||||
if !s.deleted[7] {
|
||||
t.Fatalf("run 7 should have been processed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSweepUsesTheLongerCutoff(t *testing.T) {
|
||||
s := &fakeStores{}
|
||||
j := New(Config{
|
||||
ArtifactRetention: 72 * time.Hour,
|
||||
LogRetention: 24 * time.Hour,
|
||||
}, s)
|
||||
now := time.Date(2026, 4, 17, 12, 0, 0, 0, time.UTC)
|
||||
if err := j.Sweep(context.Background(), now); err != nil {
|
||||
t.Fatalf("sweep: %v", err)
|
||||
}
|
||||
want := now.Add(-72 * time.Hour)
|
||||
if !s.cutoffSeen.Equal(want) {
|
||||
t.Fatalf("cutoff = %v, want %v (the longer of the two retentions)", s.cutoffSeen, want)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user