Initial commit: full Phases 1-6 implementation

Post-repair hardware validation pipeline for Proxmox cluster hosts. Go orchestrator + in-image agent + mkosi live image + bundled dnsmasq PXE + SQLite + HTMX/SSE UI + notify registry + janitor + full docs.
2026-04-17 21:32:10 -04:00
commit 9bb4b09a04
98 changed files with 11960 additions and 0 deletions
@@ -0,0 +1,126 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+
+	"vetting/internal/model"
+)
+
+// Artifact mirrors one row of the artifacts table (see the column list in
+// Artifacts.Create): a file recorded for a run, with its path, SHA-256
+// string and size.
+type Artifact struct {
+	ID        int64
+	RunID     int64
+	// StageID is optional; nil is written to the database as NULL.
+	StageID   *int64
+	Kind      string // inventory|spec_diff|hold_key|report|log|fio|iperf|smart
+	Path      string
+	SHA256    string
+	SizeBytes int64
+}
+
+// Artifacts provides access to the artifacts table through DB.
+type Artifacts struct {
+	DB *sql.DB
+}
+
+// Create inserts art into the artifacts table and returns the new row ID
+// reported by LastInsertId. art.ID is not written; a nil art.StageID is
+// stored as NULL.
+func (a *Artifacts) Create(ctx context.Context, art Artifact) (int64, error) {
+	res, err := a.DB.ExecContext(ctx, `
+		INSERT INTO artifacts(run_id, stage_id, kind, path, sha256, size_bytes)
+		VALUES(?,?,?,?,?,?)
+	`, art.RunID, nullInt64(art.StageID), art.Kind, art.Path, art.SHA256, art.SizeBytes)
+	if err != nil {
+		return 0, fmt.Errorf("insert artifact: %w", err)
+	}
+	return res.LastInsertId()
+}
+
+// DeleteForRun removes the artifact rows of a run and returns the rows it
+// removed so the caller can unlink the on-disk files. Used by the janitor;
+// ordinary flow treats artifacts as append-only.
+//
+// The rows are listed first and then deleted by ID inside one transaction.
+// A row inserted between the listing and the delete is therefore left in
+// place rather than being dropped from the table without its path ever
+// reaching the caller, which would orphan the file on disk. Either every
+// listed row is deleted or, on error, none is.
+//
+// Returns (nil, nil) when the run has no artifacts.
+func (a *Artifacts) DeleteForRun(ctx context.Context, runID int64) ([]Artifact, error) {
+	arts, err := a.ListForRun(ctx, runID)
+	if err != nil {
+		return nil, fmt.Errorf("list artifacts for run %d: %w", runID, err)
+	}
+	if len(arts) == 0 {
+		return nil, nil
+	}
+	tx, err := a.DB.BeginTx(ctx, nil)
+	if err != nil {
+		return nil, fmt.Errorf("delete artifacts for run %d: %w", runID, err)
+	}
+	// After a successful Commit this Rollback only returns sql.ErrTxDone,
+	// which is safe to ignore.
+	defer func() { _ = tx.Rollback() }()
+	for _, art := range arts {
+		if _, err := tx.ExecContext(ctx, `DELETE FROM artifacts WHERE id = ?`, art.ID); err != nil {
+			return nil, fmt.Errorf("delete artifacts for run %d: %w", runID, err)
+		}
+	}
+	if err := tx.Commit(); err != nil {
+		return nil, fmt.Errorf("delete artifacts for run %d: %w", runID, err)
+	}
+	return arts, nil
+}
+
+// ListForRun returns the artifacts rows for runID ordered by id. A NULL
+// stage_id is returned as a nil StageID. The result is a nil slice when
+// the run has no artifacts.
+func (a *Artifacts) ListForRun(ctx context.Context, runID int64) ([]Artifact, error) {
+	rows, err := a.DB.QueryContext(ctx, `
+		SELECT id, run_id, stage_id, kind, path, sha256, size_bytes
+		FROM artifacts WHERE run_id = ? ORDER BY id
+	`, runID)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+	var out []Artifact
+	for rows.Next() {
+		var ar Artifact
+		var stageID sql.NullInt64
+		if err := rows.Scan(&ar.ID, &ar.RunID, &stageID, &ar.Kind, &ar.Path, &ar.SHA256, &ar.SizeBytes); err != nil {
+			return nil, err
+		}
+		if stageID.Valid {
+			// Copy into a fresh variable so each row gets its own pointer.
+			v := stageID.Int64
+			ar.StageID = &v
+		}
+		out = append(out, ar)
+	}
+	// rows.Err reports an iteration failure that ended the loop early.
+	return out, rows.Err()
+}
+
+// SpecDiffs provides access to the spec_diffs table through DB.
+type SpecDiffs struct {
+	DB *sql.DB
+}
+
+// ReplaceForRun deletes every spec_diffs row for runID and inserts diffs in
+// their place. Both steps run in one transaction, so on any error the
+// previous rows are left untouched. Rows are stored under the runID
+// argument; the ID and RunID fields of each element are not consulted.
+//
+// NOTE(review): ignored is always written as 0, so d.Ignored is not
+// persisted by this method — confirm that resetting it is intended.
+func (s *SpecDiffs) ReplaceForRun(ctx context.Context, runID int64, diffs []model.SpecDiff) error {
+	tx, err := s.DB.BeginTx(ctx, nil)
+	if err != nil {
+		return err
+	}
+	// Undoes the work on every early return; after Commit it is a no-op.
+	defer func() { _ = tx.Rollback() }()
+	if _, err := tx.ExecContext(ctx, `DELETE FROM spec_diffs WHERE run_id = ?`, runID); err != nil {
+		return err
+	}
+	for _, d := range diffs {
+		if _, err := tx.ExecContext(ctx, `
+			INSERT INTO spec_diffs(run_id, field, expected, actual, severity, ignored)
+			VALUES(?,?,?,?,?,?)
+		`, runID, d.Field, d.Expected, d.Actual, d.Severity, 0); err != nil {
+			return err
+		}
+	}
+	return tx.Commit()
+}
+
+// ListForRun returns the spec_diffs rows for runID ordered by id. NULL
+// expected/actual values come back as empty strings, and the integer
+// ignored column is mapped to a bool (any non-zero value is true).
+func (s *SpecDiffs) ListForRun(ctx context.Context, runID int64) ([]model.SpecDiff, error) {
+	rows, err := s.DB.QueryContext(ctx, `
+		SELECT id, run_id, field, COALESCE(expected,''), COALESCE(actual,''), severity, ignored
+		FROM spec_diffs WHERE run_id = ? ORDER BY id
+	`, runID)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+	var out []model.SpecDiff
+	for rows.Next() {
+		var d model.SpecDiff
+		var ignored int
+		if err := rows.Scan(&d.ID, &d.RunID, &d.Field, &d.Expected, &d.Actual, &d.Severity, &ignored); err != nil {
+			return nil, err
+		}
+		d.Ignored = ignored != 0
+		out = append(out, d)
+	}
+	return out, rows.Err()
+}
+
+// nullInt64 turns an optional int64 into a query argument: a nil pointer
+// becomes nil (bound as SQL NULL), anything else becomes the pointed-to
+// value.
+func nullInt64(p *int64) any {
+	if p == nil {
+		return nil
+	}
+	return *p
+}
@@ -0,0 +1,98 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"fmt"
+	"strings"
+
+	"vetting/internal/model"
+)
+
+// Hosts provides access to the hosts table through DB.
+type Hosts struct {
+	DB *sql.DB
+}
+
+// ErrNotFound is returned by lookups and deletes in this package when no
+// row matches the requested ID (for example Hosts.Get and Hosts.Delete).
+var ErrNotFound = errors.New("not found")
+
+// Create inserts in as a new host and returns the new row ID. The MAC is
+// passed through normalizeMAC before it is stored, and empty
+// PDUConfigJSON / IPMIConfigJSON values are stored as NULL. in.ID,
+// in.CreatedAt and in.UpdatedAt are not written by this statement.
+func (h *Hosts) Create(ctx context.Context, in model.Host) (int64, error) {
+	in.MAC = normalizeMAC(in.MAC)
+	res, err := h.DB.ExecContext(ctx, `
+		INSERT INTO hosts(name, mac, wol_broadcast_ip, wol_port, expected_spec_yaml, pdu_config_json, ipmi_config_json, notes)
+		VALUES(?,?,?,?,?,?,?,?)
+	`, in.Name, in.MAC, in.WoLBroadcastIP, in.WoLPort, in.ExpectedSpecYAML, nullIfEmpty(in.PDUConfigJSON), nullIfEmpty(in.IPMIConfigJSON), in.Notes)
+	if err != nil {
+		return 0, fmt.Errorf("insert host: %w", err)
+	}
+	return res.LastInsertId()
+}
+
+// List returns every host, ordered by name without regard to case
+// (COLLATE NOCASE). NULL pdu_config_json / ipmi_config_json values are
+// returned as empty strings. The result is a nil slice when the table is
+// empty.
+func (h *Hosts) List(ctx context.Context) ([]model.Host, error) {
+	rows, err := h.DB.QueryContext(ctx, `
+		SELECT id, name, mac, wol_broadcast_ip, wol_port, expected_spec_yaml,
+		       COALESCE(pdu_config_json,''), COALESCE(ipmi_config_json,''),
+		       notes, created_at, updated_at
+		FROM hosts
+		ORDER BY name COLLATE NOCASE
+	`)
+	if err != nil {
+		return nil, fmt.Errorf("list hosts: %w", err)
+	}
+	defer rows.Close()
+
+	var out []model.Host
+	for rows.Next() {
+		var host model.Host
+		if err := rows.Scan(&host.ID, &host.Name, &host.MAC, &host.WoLBroadcastIP, &host.WoLPort,
+			&host.ExpectedSpecYAML, &host.PDUConfigJSON, &host.IPMIConfigJSON,
+			&host.Notes, &host.CreatedAt, &host.UpdatedAt); err != nil {
+			return nil, fmt.Errorf("scan host: %w", err)
+		}
+		out = append(out, host)
+	}
+	return out, rows.Err()
+}
+
+// Get returns the host with the given id. It returns ErrNotFound when no
+// such row exists and a wrapped error for any other failure. NULL
+// pdu_config_json / ipmi_config_json values are returned as empty strings.
+func (h *Hosts) Get(ctx context.Context, id int64) (*model.Host, error) {
+	row := h.DB.QueryRowContext(ctx, `
+		SELECT id, name, mac, wol_broadcast_ip, wol_port, expected_spec_yaml,
+		       COALESCE(pdu_config_json,''), COALESCE(ipmi_config_json,''),
+		       notes, created_at, updated_at
+		FROM hosts WHERE id = ?
+	`, id)
+	var host model.Host
+	err := row.Scan(&host.ID, &host.Name, &host.MAC, &host.WoLBroadcastIP, &host.WoLPort,
+		&host.ExpectedSpecYAML, &host.PDUConfigJSON, &host.IPMIConfigJSON,
+		&host.Notes, &host.CreatedAt, &host.UpdatedAt)
+	// Translate the driver's "no rows" into the package-level sentinel.
+	if errors.Is(err, sql.ErrNoRows) {
+		return nil, ErrNotFound
+	}
+	if err != nil {
+		return nil, fmt.Errorf("get host: %w", err)
+	}
+	return &host, nil
+}
+
+// Delete removes the host with the given id. It returns ErrNotFound when
+// no row matched, and a wrapped error when the statement fails or the
+// affected-row count cannot be read.
+func (h *Hosts) Delete(ctx context.Context, id int64) error {
+	res, err := h.DB.ExecContext(ctx, `DELETE FROM hosts WHERE id = ?`, id)
+	if err != nil {
+		return fmt.Errorf("delete host: %w", err)
+	}
+	// Surface a RowsAffected failure instead of treating it as n == 0,
+	// which would misreport a successful delete as ErrNotFound.
+	n, err := res.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("delete host: rows affected: %w", err)
+	}
+	if n == 0 {
+		return ErrNotFound
+	}
+	return nil
+}
+
+// normalizeMAC trims surrounding whitespace from m and lower-cases it. It
+// does not validate the address or rewrite separators, so "AA-BB-…" and
+// "aa:bb:…" remain distinct values.
+func normalizeMAC(m string) string {
+	return strings.ToLower(strings.TrimSpace(m))
+}
+
+// nullIfEmpty turns a string into a query argument: the empty string
+// becomes nil (bound as SQL NULL), anything else is passed through as is.
+func nullIfEmpty(s string) any {
+	if s == "" {
+		return nil
+	}
+	return s
+}
@@ -0,0 +1,85 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"time"
+
+	"vetting/internal/model"
+)
+
+// Measurements persists timestamped numeric samples: temps, fan speeds,
+// PSU voltages, fio IOPS, iperf throughput, SMART attributes. The schema
+// stores (kind, key, value, unit) so Phase 5 reports can group freely
+// without new tables per source. Each sample belongs to a run and,
+// optionally, to a stage of that run.
+type Measurements struct {
+	DB *sql.DB
+}
+
+// Create inserts a single sample and returns the new row ID. A zero in.TS
+// is replaced with the current UTC time; a nil in.StageID is stored as
+// NULL. in.ID is not written.
+func (m *Measurements) Create(ctx context.Context, in model.Measurement) (int64, error) {
+	if in.TS.IsZero() {
+		in.TS = time.Now().UTC()
+	}
+	res, err := m.DB.ExecContext(ctx, `
+		INSERT INTO measurements(run_id, stage_id, ts, kind, key, value, unit)
+		VALUES(?,?,?,?,?,?,?)
+	`, in.RunID, nullInt64(in.StageID), in.TS, in.Kind, in.Key, in.Value, in.Unit)
+	if err != nil {
+		return 0, fmt.Errorf("insert measurement: %w", err)
+	}
+	return res.LastInsertId()
+}
+
+// CreateBatch inserts a batch in one transaction. The sensor endpoint
+// hands us ~5–20 samples per tick; a single commit keeps SQLite happy.
+//
+// Samples with a zero TS all receive the same timestamp, taken once per
+// call. An empty batch is a no-op. If any insert fails the whole batch is
+// rolled back and the error is returned.
+func (m *Measurements) CreateBatch(ctx context.Context, rows []model.Measurement) error {
+	if len(rows) == 0 {
+		return nil
+	}
+	tx, err := m.DB.BeginTx(ctx, nil)
+	if err != nil {
+		return fmt.Errorf("begin measurement batch: %w", err)
+	}
+	// Undoes the work on every early return; after Commit it is a no-op.
+	defer func() { _ = tx.Rollback() }()
+	// The INSERT text is identical for every sample, so prepare it once
+	// rather than handing the same SQL to the driver on each iteration.
+	stmt, err := tx.PrepareContext(ctx, `
+		INSERT INTO measurements(run_id, stage_id, ts, kind, key, value, unit)
+		VALUES(?,?,?,?,?,?,?)
+	`)
+	if err != nil {
+		return fmt.Errorf("prepare measurement insert: %w", err)
+	}
+	defer stmt.Close()
+	now := time.Now().UTC()
+	for _, r := range rows {
+		// r is a copy, so defaulting TS here never touches the caller's slice.
+		if r.TS.IsZero() {
+			r.TS = now
+		}
+		if _, err := stmt.ExecContext(ctx, r.RunID, nullInt64(r.StageID), r.TS, r.Kind, r.Key, r.Value, r.Unit); err != nil {
+			return fmt.Errorf("insert measurement: %w", err)
+		}
+	}
+	return tx.Commit()
+}
+
+// ListForRun returns all measurements for a run. Callers filter by kind
+// in memory; the row count is small per run (≈thousands). Rows are
+// ordered by ts and then id; a NULL unit is returned as an empty string
+// and a NULL stage_id as a nil StageID.
+func (m *Measurements) ListForRun(ctx context.Context, runID int64) ([]model.Measurement, error) {
+	rows, err := m.DB.QueryContext(ctx, `
+		SELECT id, run_id, stage_id, ts, kind, key, value, COALESCE(unit,'')
+		FROM measurements WHERE run_id = ? ORDER BY ts, id
+	`, runID)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+	var out []model.Measurement
+	for rows.Next() {
+		var meas model.Measurement
+		var stageID sql.NullInt64
+		if err := rows.Scan(&meas.ID, &meas.RunID, &stageID, &meas.TS, &meas.Kind, &meas.Key, &meas.Value, &meas.Unit); err != nil {
+			return nil, err
+		}
+		if stageID.Valid {
+			// Copy into a fresh variable so each row gets its own pointer.
+			v := stageID.Int64
+			meas.StageID = &v
+		}
+		out = append(out, meas)
+	}
+	return out, rows.Err()
+}
@@ -0,0 +1,226 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"fmt"
+	"time"
+
+	"vetting/internal/model"
+)
+
+// Runs provides access to the runs table through DB.
+type Runs struct {
+	DB *sql.DB
+}
+
+// Create inserts a new run for hostID and returns its ID. The run starts
+// in model.StateQueued with next_boot_target "linux", started_at set to
+// the current UTC time, and tokenHash stored as agent_token_hash.
+func (r *Runs) Create(ctx context.Context, hostID int64, tokenHash string) (int64, error) {
+	now := time.Now().UTC()
+	res, err := r.DB.ExecContext(ctx, `
+		INSERT INTO runs(host_id, state, agent_token_hash, next_boot_target, started_at)
+		VALUES(?,?,?,?,?)
+	`, hostID, string(model.StateQueued), tokenHash, "linux", now)
+	if err != nil {
+		return 0, fmt.Errorf("insert run: %w", err)
+	}
+	return res.LastInsertId()
+}
+
+// SetState overwrites the run's state column. No transition checks are
+// made here, and an unknown runID is not reported: the UPDATE simply
+// matches no rows and nil is returned.
+func (r *Runs) SetState(ctx context.Context, runID int64, state model.RunState) error {
+	_, err := r.DB.ExecContext(ctx, `UPDATE runs SET state = ? WHERE id = ?`, string(state), runID)
+	return err
+}
+
+// RotateTokenHash replaces the stored token hash. Called on each iPXE
+// fetch so only the most-recently-booted agent can claim the run.
+// An unknown runID is not reported as an error.
+func (r *Runs) RotateTokenHash(ctx context.Context, runID int64, hash string) error {
+	_, err := r.DB.ExecContext(ctx, `UPDATE runs SET agent_token_hash = ? WHERE id = ?`, hash, runID)
+	return err
+}
+
+// SetHoldIP records the agent's LAN IP so the UI can show the ssh
+// command. Called when the agent POSTs /hold. The ip string is stored as
+// given; an unknown runID is not reported as an error.
+func (r *Runs) SetHoldIP(ctx context.Context, runID int64, ip string) error {
+	_, err := r.DB.ExecContext(ctx, `UPDATE runs SET hold_ip = ? WHERE id = ?`, ip, runID)
+	return err
+}
+
+// SetFailedStage records which stage tripped the run; used by the tile
+// and by reports. Does not change state. An unknown runID is not
+// reported as an error.
+func (r *Runs) SetFailedStage(ctx context.Context, runID int64, stage string) error {
+	_, err := r.DB.ExecContext(ctx, `UPDATE runs SET failed_stage = ? WHERE id = ?`, stage, runID)
+	return err
+}
+
+// ClearFailedStage wipes the failed_stage marker (sets it to NULL). Called
+// when the operator overrides a stage and the run re-enters the pipeline.
+// No other column is touched.
+func (r *Runs) ClearFailedStage(ctx context.Context, runID int64) error {
+	_, err := r.DB.ExecContext(ctx, `UPDATE runs SET failed_stage = NULL WHERE id = ?`, runID)
+	return err
+}
+
+// SetOverrideFlags persists the operator's override decisions (JSON blob
+// like `{"wipe":true}`). Passed back to the agent on the next heartbeat
+// so it can resume the held stage with the gate bypassed. flagsJSON is
+// stored verbatim; it is not parsed or validated here.
+func (r *Runs) SetOverrideFlags(ctx context.Context, runID int64, flagsJSON string) error {
+	_, err := r.DB.ExecContext(ctx, `UPDATE runs SET override_flags_json = ? WHERE id = ?`, flagsJSON, runID)
+	return err
+}
+
+// MarkFailed moves the run to model.StateFailedHolding with result 'fail'
+// and, in the same UPDATE, records failedStage, holdIP and a completed_at
+// of the current UTC time. An unknown runID is not reported as an error.
+func (r *Runs) MarkFailed(ctx context.Context, runID int64, failedStage, holdIP string) error {
+	now := time.Now().UTC()
+	_, err := r.DB.ExecContext(ctx, `
+		UPDATE runs SET state = ?, result = 'fail', failed_stage = ?, hold_ip = ?, completed_at = ?
+		WHERE id = ?
+	`, string(model.StateFailedHolding), failedStage, holdIP, now, runID)
+	return err
+}
+
+// MarkCompleted moves the run to model.StateCompleted with result 'pass'
+// and, in the same UPDATE, records reportPath and a completed_at of the
+// current UTC time. An unknown runID is not reported as an error.
+func (r *Runs) MarkCompleted(ctx context.Context, runID int64, reportPath string) error {
+	now := time.Now().UTC()
+	_, err := r.DB.ExecContext(ctx, `
+		UPDATE runs SET state = ?, result = 'pass', report_path = ?, completed_at = ?
+		WHERE id = ?
+	`, string(model.StateCompleted), reportPath, now, runID)
+	return err
+}
+
+// Get returns the run with the given id, or ErrNotFound when no such row
+// exists. Nullable text columns are returned as empty strings, and
+// CompletedAt stays nil while completed_at is NULL.
+func (r *Runs) Get(ctx context.Context, id int64) (*model.Run, error) {
+	row := r.DB.QueryRowContext(ctx, `
+		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
+		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
+		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
+		       COALESCE(override_flags_json,'')
+		FROM runs WHERE id = ?
+	`, id)
+	var run model.Run
+	var completedAt sql.NullTime
+	err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
+		&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
+		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON)
+	if errors.Is(err, sql.ErrNoRows) {
+		return nil, ErrNotFound
+	}
+	if err != nil {
+		return nil, fmt.Errorf("get run: %w", err)
+	}
+	if completedAt.Valid {
+		run.CompletedAt = &completedAt.Time
+	}
+	return &run, nil
+}
+
+// LatestForHost returns the most recent run for a host, or nil if none.
+// "Most recent" means the highest run id. Unlike Get, a missing row is
+// reported as (nil, nil) rather than ErrNotFound, so callers must check
+// the pointer.
+func (r *Runs) LatestForHost(ctx context.Context, hostID int64) (*model.Run, error) {
+	row := r.DB.QueryRowContext(ctx, `
+		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
+		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
+		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
+		       COALESCE(override_flags_json,'')
+		FROM runs WHERE host_id = ?
+		ORDER BY id DESC LIMIT 1
+	`, hostID)
+	var run model.Run
+	var completedAt sql.NullTime
+	err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
+		&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
+		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON)
+	if errors.Is(err, sql.ErrNoRows) {
+		return nil, nil
+	}
+	if err != nil {
+		return nil, fmt.Errorf("latest run: %w", err)
+	}
+	if completedAt.Valid {
+		run.CompletedAt = &completedAt.Time
+	}
+	return &run, nil
+}
+
+// Active returns all runs whose state is neither 'Completed' nor
+// 'Released', ordered by id. Note that FailedHolding runs are included
+// here even though CompletedOlderThan counts that state as terminal.
+func (r *Runs) Active(ctx context.Context) ([]model.Run, error) {
+	rows, err := r.DB.QueryContext(ctx, `
+		SELECT id, host_id, state, COALESCE(result,''), COALESCE(failed_stage,''),
+		       COALESCE(next_boot_target,''), agent_token_hash, started_at,
+		       completed_at, COALESCE(report_path,''), COALESCE(hold_ip,''),
+		       COALESCE(override_flags_json,'')
+		FROM runs
+		WHERE state NOT IN ('Completed','Released')
+		ORDER BY id
+	`)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+	var out []model.Run
+	for rows.Next() {
+		var run model.Run
+		// Declared per iteration, so each run's CompletedAt points at its own value.
+		var completedAt sql.NullTime
+		if err := rows.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
+			&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
+			&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON); err != nil {
+			return nil, err
+		}
+		if completedAt.Valid {
+			run.CompletedAt = &completedAt.Time
+		}
+		out = append(out, run)
+	}
+	return out, rows.Err()
+}
+
+// CompletedOlderThan returns run IDs for terminal (Completed/Released/
+// FailedHolding) runs whose completed_at is older than cutoff. Runs with
+// a NULL completed_at fall back to started_at so a stuck run doesn't get
+// garbage-collected out from under its own logs. Used by the janitor.
+// The comparison is strict (<) and the IDs are returned in ascending order.
+func (r *Runs) CompletedOlderThan(ctx context.Context, cutoff time.Time) ([]int64, error) {
+	rows, err := r.DB.QueryContext(ctx, `
+		SELECT id FROM runs
+		WHERE state IN ('Completed','Released','FailedHolding')
+		  AND COALESCE(completed_at, started_at) < ?
+		ORDER BY id
+	`, cutoff)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+	var out []int64
+	for rows.Next() {
+		var id int64
+		if err := rows.Scan(&id); err != nil {
+			return nil, err
+		}
+		out = append(out, id)
+	}
+	return out, rows.Err()
+}
+
+// FindActiveByMAC returns the newest run (highest id) whose state is
+// neither 'Completed' nor 'Released' for the host with the given MAC, or
+// nil if the MAC is unknown or has no such run.
+//
+// The MAC is normalized the same way Hosts.Create normalizes it before
+// storing, so the lookup is insensitive to letter case and surrounding
+// whitespace in the caller's input.
+func (r *Runs) FindActiveByMAC(ctx context.Context, mac string) (*model.Run, error) {
+	row := r.DB.QueryRowContext(ctx, `
+		SELECT r.id, r.host_id, r.state, COALESCE(r.result,''), COALESCE(r.failed_stage,''),
+		       COALESCE(r.next_boot_target,''), r.agent_token_hash, r.started_at,
+		       r.completed_at, COALESCE(r.report_path,''), COALESCE(r.hold_ip,''),
+		       COALESCE(r.override_flags_json,'')
+		FROM runs r
+		JOIN hosts h ON h.id = r.host_id
+		WHERE h.mac = ? AND r.state NOT IN ('Completed','Released')
+		ORDER BY r.id DESC LIMIT 1
+	`, normalizeMAC(mac))
+	var run model.Run
+	var completedAt sql.NullTime
+	err := row.Scan(&run.ID, &run.HostID, &run.State, &run.Result, &run.FailedStage,
+		&run.NextBootTarget, &run.AgentTokenHash, &run.StartedAt,
+		&completedAt, &run.ReportPath, &run.HoldIP, &run.OverrideFlagsJSON)
+	// "No rows" is an expected outcome here, not an error.
+	if errors.Is(err, sql.ErrNoRows) {
+		return nil, nil
+	}
+	if err != nil {
+		return nil, fmt.Errorf("find active run by mac: %w", err)
+	}
+	if completedAt.Valid {
+		run.CompletedAt = &completedAt.Time
+	}
+	return &run, nil
+}
@@ -0,0 +1,91 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"time"
+
+	"vetting/internal/model"
+)
+
+// Stages provides access to the stages table through DB.
+type Stages struct {
+	DB *sql.DB
+}
+
+// DefaultStageOrder is the canonical sequence for every run. Phase 2 only
+// reaches Inventory; later phases add more executors but the list is fixed.
+// Seed stores each entry's slice index as the stage's ordinal, so
+// reordering this list changes the order ListForRun returns for new runs.
+var DefaultStageOrder = []string{
+	"Inventory",
+	"SpecValidate",
+	"SMART",
+	"CPUStress",
+	"Storage",
+	"Network",
+	"GPU",
+	"PSU",
+	"Reporting",
+}
+
+// Seed creates one pending row per stage for the given run. Stages are
+// taken from DefaultStageOrder with the slice index as ordinal, and all
+// inserts share one transaction, so a failure leaves no partial set.
+func (s *Stages) Seed(ctx context.Context, runID int64) error {
+	tx, err := s.DB.BeginTx(ctx, nil)
+	if err != nil {
+		return err
+	}
+	// Undoes the work on every early return; after Commit it is a no-op.
+	defer func() { _ = tx.Rollback() }()
+	for i, name := range DefaultStageOrder {
+		if _, err := tx.ExecContext(ctx,
+			`INSERT INTO stages(run_id, name, ordinal, state) VALUES(?,?,?,?)`,
+			runID, name, i, string(model.StagePending)); err != nil {
+			return fmt.Errorf("seed stage %s: %w", name, err)
+		}
+	}
+	return tx.Commit()
+}
+
+// ListForRun returns the stages of a run ordered by ordinal. StartedAt and
+// CompletedAt stay nil while the corresponding column is NULL, and a NULL
+// summary_json is returned as an empty string.
+func (s *Stages) ListForRun(ctx context.Context, runID int64) ([]model.Stage, error) {
+	rows, err := s.DB.QueryContext(ctx, `
+		SELECT id, run_id, name, ordinal, state, started_at, completed_at, COALESCE(summary_json,'')
+		FROM stages WHERE run_id = ? ORDER BY ordinal
+	`, runID)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+	var out []model.Stage
+	for rows.Next() {
+		var st model.Stage
+		// Declared per iteration, so each stage's time pointers are distinct.
+		var started, completed sql.NullTime
+		if err := rows.Scan(&st.ID, &st.RunID, &st.Name, &st.Ordinal, &st.State,
+			&started, &completed, &st.SummaryJSON); err != nil {
+			return nil, err
+		}
+		if started.Valid {
+			st.StartedAt = &started.Time
+		}
+		if completed.Valid {
+			st.CompletedAt = &completed.Time
+		}
+		out = append(out, st)
+	}
+	return out, rows.Err()
+}
+
+// StartByName marks the named stage of a run as model.StageRunning and
+// stamps started_at with the current UTC time. If no row matches the
+// (runID, name) pair nothing is updated and nil is returned.
+func (s *Stages) StartByName(ctx context.Context, runID int64, name string) error {
+	now := time.Now().UTC()
+	_, err := s.DB.ExecContext(ctx, `
+		UPDATE stages SET state = ?, started_at = ?
+		WHERE run_id = ? AND name = ?
+	`, string(model.StageRunning), now, runID, name)
+	return err
+}
+
+// CompleteByName sets the named stage of a run to state, stamps
+// completed_at with the current UTC time and stores summaryJSON (an empty
+// string is stored as NULL). The state value is written as given. If no
+// row matches the (runID, name) pair nothing is updated and nil is
+// returned.
+func (s *Stages) CompleteByName(ctx context.Context, runID int64, name string, state model.StageState, summaryJSON string) error {
+	now := time.Now().UTC()
+	_, err := s.DB.ExecContext(ctx, `
+		UPDATE stages SET state = ?, completed_at = ?, summary_json = ?
+		WHERE run_id = ? AND name = ?
+	`, string(state), now, nullIfEmpty(summaryJSON), runID, name)
+	return err
+}
@@ -0,0 +1,229 @@
+package store_test
+
+import (
+	"context"
+	"path/filepath"
+	"testing"
+
+	"vetting/internal/db"
+	"vetting/internal/model"
+	"vetting/internal/store"
+)
+
+// newDB opens a fresh database file under t.TempDir() via db.Open and
+// returns a Runs store on it; the connection is closed on test cleanup.
+// The other stores in these tests are built from the returned value's DB
+// field. Presumably db.Open also creates the schema — the tests insert
+// rows straight after opening.
+func newDB(t *testing.T) *store.Runs {
+	t.Helper()
+	path := filepath.Join(t.TempDir(), "vetting.db")
+	conn, err := db.Open(path)
+	if err != nil {
+		t.Fatalf("open db: %v", err)
+	}
+	t.Cleanup(func() { _ = conn.Close() })
+	return &store.Runs{DB: conn}
+}
+
+// seedRun inserts a host + a run and returns (hostID, runID). Every
+// subsequent store test builds on this so run_id foreign keys resolve.
+// The host name and MAC are fixed, so it is meant to be called once per
+// database.
+func seedRun(t *testing.T, runs *store.Runs) (int64, int64) {
+	t.Helper()
+	hosts := &store.Hosts{DB: runs.DB}
+	hostID, err := hosts.Create(context.Background(), model.Host{
+		Name:             "t-host",
+		MAC:              "aa:bb:cc:dd:ee:ff",
+		WoLBroadcastIP:   "10.0.0.255",
+		WoLPort:          9,
+		ExpectedSpecYAML: "memory:\n  total_gib: 16\n",
+	})
+	if err != nil {
+		t.Fatalf("create host: %v", err)
+	}
+	runID, err := runs.Create(context.Background(), hostID, "deadbeef")
+	if err != nil {
+		t.Fatalf("create run: %v", err)
+	}
+	return hostID, runID
+}
+
+// TestArtifactsRoundtrip creates two artifacts without a stage on one run
+// and checks that ListForRun returns both, in insertion order, with the
+// stored path intact.
+func TestArtifactsRoundtrip(t *testing.T) {
+	runs := newDB(t)
+	_, runID := seedRun(t, runs)
+	arts := &store.Artifacts{DB: runs.DB}
+
+	id, err := arts.Create(context.Background(), store.Artifact{
+		RunID:     runID,
+		Kind:      "inventory",
+		Path:      "/var/artifacts/run-1/inventory.json",
+		SHA256:    "abc123",
+		SizeBytes: 42,
+	})
+	if err != nil {
+		t.Fatalf("Create: %v", err)
+	}
+	if id == 0 {
+		t.Fatalf("expected non-zero id")
+	}
+
+	// Hold key on the same run — ListForRun should return both in
+	// insertion order and TileEnricher picks the hold_key row.
+	if _, err := arts.Create(context.Background(), store.Artifact{
+		RunID: runID, Kind: "hold_key", Path: "/var/artifacts/run-1/hold.key", SHA256: "def456", SizeBytes: 400,
+	}); err != nil {
+		t.Fatalf("Create hold_key: %v", err)
+	}
+
+	list, err := arts.ListForRun(context.Background(), runID)
+	if err != nil {
+		t.Fatalf("ListForRun: %v", err)
+	}
+	if len(list) != 2 {
+		t.Fatalf("ListForRun returned %d, want 2", len(list))
+	}
+	if list[0].Kind != "inventory" || list[1].Kind != "hold_key" {
+		t.Fatalf("unexpected order: %+v", list)
+	}
+	if list[1].Path != "/var/artifacts/run-1/hold.key" {
+		t.Fatalf("hold_key path lost: %q", list[1].Path)
+	}
+}
+
+// TestSpecDiffsReplaceForRun writes three diffs, then one, and checks that
+// the second call replaced the first set instead of appending to it.
+func TestSpecDiffsReplaceForRun(t *testing.T) {
+	runs := newDB(t)
+	_, runID := seedRun(t, runs)
+	sd := &store.SpecDiffs{DB: runs.DB}
+	ctx := context.Background()
+
+	// First write: three diffs.
+	err := sd.ReplaceForRun(ctx, runID, []model.SpecDiff{
+		{RunID: runID, Field: "cpu.model", Expected: "Xeon", Actual: "EPYC", Severity: "critical"},
+		{RunID: runID, Field: "memory.total_gib", Expected: "16", Actual: "8", Severity: "critical"},
+		{RunID: runID, Field: "note", Expected: "", Actual: "dusty", Severity: "info"},
+	})
+	if err != nil {
+		t.Fatalf("ReplaceForRun: %v", err)
+	}
+
+	list, err := sd.ListForRun(ctx, runID)
+	if err != nil {
+		t.Fatalf("ListForRun: %v", err)
+	}
+	if len(list) != 3 {
+		t.Fatalf("got %d rows, want 3", len(list))
+	}
+
+	// Second write replaces, doesn't append — otherwise a re-run would
+	// double-count spec diffs and the tile badge would grow without bound.
+	err = sd.ReplaceForRun(ctx, runID, []model.SpecDiff{
+		{RunID: runID, Field: "cpu.model", Expected: "Xeon", Actual: "Xeon Gold", Severity: "info"},
+	})
+	if err != nil {
+		t.Fatalf("second ReplaceForRun: %v", err)
+	}
+	list, err = sd.ListForRun(ctx, runID)
+	if err != nil {
+		t.Fatalf("ListForRun after replace: %v", err)
+	}
+	if len(list) != 1 {
+		t.Fatalf("expected 1 row after replace, got %d", len(list))
+	}
+	if list[0].Severity != "info" {
+		t.Fatalf("expected severity info, got %q", list[0].Severity)
+	}
+}
+
+// TestMeasurementsBatchAndList inserts three samples with CreateBatch,
+// confirms an empty batch is accepted, and checks that ListForRun returns
+// all three including the iperf value.
+func TestMeasurementsBatchAndList(t *testing.T) {
+	runs := newDB(t)
+	_, runID := seedRun(t, runs)
+	meas := &store.Measurements{DB: runs.DB}
+	ctx := context.Background()
+
+	// TS is left zero on purpose so CreateBatch fills it in.
+	err := meas.CreateBatch(ctx, []model.Measurement{
+		{RunID: runID, Kind: "thermal", Key: "cpu", Value: 52.5, Unit: "C"},
+		{RunID: runID, Kind: "iperf", Key: "throughput_mbps", Value: 940.1, Unit: "Mbps"},
+		{RunID: runID, Kind: "psu", Key: "in0", Value: 12.04, Unit: "V"},
+	})
+	if err != nil {
+		t.Fatalf("CreateBatch: %v", err)
+	}
+
+	// Zero-length batch must be a no-op, not an error.
+	if err := meas.CreateBatch(ctx, nil); err != nil {
+		t.Fatalf("empty CreateBatch: %v", err)
+	}
+
+	rows, err := meas.ListForRun(ctx, runID)
+	if err != nil {
+		t.Fatalf("ListForRun: %v", err)
+	}
+	if len(rows) != 3 {
+		t.Fatalf("got %d rows, want 3", len(rows))
+	}
+	foundIperf := false
+	for _, r := range rows {
+		if r.Kind == "iperf" && r.Key == "throughput_mbps" && r.Value > 900 {
+			foundIperf = true
+		}
+	}
+	if !foundIperf {
+		t.Fatalf("iperf row missing or wrong value: %+v", rows)
+	}
+}
+
+// TestRunsOverrideFlagsAndClearFailedStage checks that failed_stage and
+// override_flags_json round-trip through Get, and that ClearFailedStage
+// empties the former without touching the latter.
+func TestRunsOverrideFlagsAndClearFailedStage(t *testing.T) {
+	runs := newDB(t)
+	_, runID := seedRun(t, runs)
+	ctx := context.Background()
+
+	if err := runs.SetFailedStage(ctx, runID, "Storage"); err != nil {
+		t.Fatalf("SetFailedStage: %v", err)
+	}
+	if err := runs.SetOverrideFlags(ctx, runID, `{"wipe":true}`); err != nil {
+		t.Fatalf("SetOverrideFlags: %v", err)
+	}
+	run, err := runs.Get(ctx, runID)
+	if err != nil {
+		t.Fatalf("Get: %v", err)
+	}
+	if run.OverrideFlagsJSON != `{"wipe":true}` {
+		t.Fatalf("OverrideFlagsJSON = %q, want {\"wipe\":true}", run.OverrideFlagsJSON)
+	}
+	if run.FailedStage != "Storage" {
+		t.Fatalf("FailedStage = %q, want Storage", run.FailedStage)
+	}
+	if err := runs.ClearFailedStage(ctx, runID); err != nil {
+		t.Fatalf("ClearFailedStage: %v", err)
+	}
+	run, err = runs.Get(ctx, runID)
+	if err != nil {
+		t.Fatalf("Get after clear: %v", err)
+	}
+	if run.FailedStage != "" {
+		t.Fatalf("FailedStage not cleared: %q", run.FailedStage)
+	}
+	// override_flags_json should persist across ClearFailedStage so the
+	// agent can still read it on its next heartbeat.
+	if run.OverrideFlagsJSON != `{"wipe":true}` {
+		t.Fatalf("OverrideFlagsJSON lost after ClearFailedStage: %q", run.OverrideFlagsJSON)
+	}
+}
+
+// TestRunsHoldAndFailedStage checks that SetHoldIP and SetFailedStage are
+// both visible through Get on the same run.
+func TestRunsHoldAndFailedStage(t *testing.T) {
+	runs := newDB(t)
+	_, runID := seedRun(t, runs)
+	ctx := context.Background()
+
+	if err := runs.SetHoldIP(ctx, runID, "10.0.0.42"); err != nil {
+		t.Fatalf("SetHoldIP: %v", err)
+	}
+	if err := runs.SetFailedStage(ctx, runID, "SpecValidate"); err != nil {
+		t.Fatalf("SetFailedStage: %v", err)
+	}
+	run, err := runs.Get(ctx, runID)
+	if err != nil {
+		t.Fatalf("Get: %v", err)
+	}
+	if run.HoldIP != "10.0.0.42" {
+		t.Fatalf("HoldIP = %q, want 10.0.0.42", run.HoldIP)
+	}
+	if run.FailedStage != "SpecValidate" {
+		t.Fatalf("FailedStage = %q, want SpecValidate", run.FailedStage)
+	}
+}