// Vetting/internal/api/smoke_test.go

package api_test

import (
	"bytes"
	"context"
	"encoding/json"
	"net/http"
	"net/http/httptest"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/go-chi/chi/v5"

	"vetting/internal/api"
	"vetting/internal/db"
	"vetting/internal/events"
	"vetting/internal/logs"
	"vetting/internal/model"
	"vetting/internal/notify"
	"vetting/internal/orchestrator"
	"vetting/internal/spec"
	"vetting/internal/store"
)

// captureNotifier is a testing-only Notifier that records every Event
// sent to it, under a mutex so concurrent Dispatch goroutines are safe.
// Tests read the recorded events back through awaitKind.
type captureNotifier struct {
	mu   sync.Mutex     // guards evs
	name string         // returned by Name; also used as the route target in newCaptureRegistry
	evs  []notify.Event // events appended by Send, in arrival order
}

// Name returns the name this capture notifier was given in its name field.
func (c *captureNotifier) Name() string {
	return c.name
}

// Send appends ev to the recorded events while holding the mutex. The
// context argument is ignored and the returned error is always nil.
func (c *captureNotifier) Send(_ context.Context, ev notify.Event) error {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.evs = append(c.evs, ev)
	return nil
}

// awaitKind polls the recorded events until one whose Kind equals k is
// present and returns a copy of the earliest such event. It re-checks
// every 5ms; once 2s have passed without a match it fails the test with
// t.Fatalf.
func (c *captureNotifier) awaitKind(t *testing.T, k notify.Kind) notify.Event {
	t.Helper()

	const (
		timeout  = 2 * time.Second
		interval = 5 * time.Millisecond
	)

	// firstMatch scans everything captured so far while holding the lock,
	// so it never races with a concurrent Send.
	firstMatch := func() (notify.Event, bool) {
		c.mu.Lock()
		defer c.mu.Unlock()
		for i := range c.evs {
			if c.evs[i].Kind == k {
				return c.evs[i], true
			}
		}
		var none notify.Event
		return none, false
	}

	// The events are always checked at least once before the deadline is
	// consulted; the sleep runs only between attempts.
	for deadline := time.Now().Add(timeout); ; time.Sleep(interval) {
		if hit, ok := firstMatch(); ok {
			return hit
		}
		if time.Now().After(deadline) {
			t.Fatalf("no %q event received within timeout", k)
		}
	}
}

// newCaptureRegistry builds a notify.Registry that has c registered and
// one route pointing at c by name. The time.Second passed to NewRegistry
// is presumably a dispatch/send timeout — confirm against the notify
// package.
func newCaptureRegistry(c *captureNotifier) *notify.Registry {
	registry := notify.NewRegistry(time.Second)
	registry.Register(c)

	// Only the Notifier field is set; the original author marks this as
	// the wildcard route.
	catchAll := notify.Route{Notifier: c.name}
	registry.AddRoute(catchAll)

	return registry
}

// fullAgent builds a fully-wired Agent against a fresh sqlite DB and
// returns (agent, runID, plainTokenForBearer). It is fullAgentWithSpec
// with an empty expected-spec YAML. Caller is responsible for
// transitioning the run out of Queued.
func fullAgent(t *testing.T) (*api.Agent, int64, string) {
	// Mark this wrapper as a helper too; otherwise a setup failure inside
	// fullAgentWithSpec is attributed to the call below rather than to the
	// test that asked for the agent.
	t.Helper()
	return fullAgentWithSpec(t, "")
}

// fullAgentWithSpec does the same setup as fullAgent, but the seeded
// host carries expectedSpecYAML as its ExpectedSpecYAML so SpecValidate
// can pick up diffs in the test.
//
// Everything lives under t.TempDir(): the database file "vetting.db",
// the "logs" directory handed to the logs hub, and the "artifacts"
// directory recorded on the Agent. The DB connection and the logs hub
// are closed through t.Cleanup. One host ("smoke-host") and one run are
// created, and the run's stages are seeded.
//
// Returns the Agent, the run ID, and the plain-text run token.
func fullAgentWithSpec(t *testing.T, expectedSpecYAML string) (*api.Agent, int64, string) {
	t.Helper()

	ctx := context.Background()
	workDir := t.TempDir()

	conn, err := db.Open(filepath.Join(workDir, "vetting.db"))
	if err != nil {
		t.Fatalf("open db: %v", err)
	}
	t.Cleanup(func() { _ = conn.Close() })

	eventHub := events.NewHub()
	logHub, err := logs.NewHub(filepath.Join(workDir, "logs"), eventHub)
	if err != nil {
		t.Fatalf("logs hub: %v", err)
	}
	t.Cleanup(func() { logHub.Close() })

	// These three stores are shared between the Runner and the Agent.
	hosts := &store.Hosts{DB: conn}
	runs := &store.Runs{DB: conn}
	stages := &store.Stages{DB: conn}

	hostID, err := hosts.Create(ctx, model.Host{
		Name:             "smoke-host",
		MAC:              "aa:bb:cc:dd:ee:10",
		WoLBroadcastIP:   "10.0.0.255",
		WoLPort:          9,
		ExpectedSpecYAML: expectedSpecYAML,
	})
	if err != nil {
		t.Fatalf("create host: %v", err)
	}

	// The run is created with the token hash; the plain token goes back
	// to the caller for use as the Bearer credential.
	plain, hash, err := orchestrator.IssueRunToken()
	if err != nil {
		t.Fatalf("issue token: %v", err)
	}
	// NOTE(review): the meaning of the trailing false argument is not
	// visible from this file — confirm against store.Runs.Create.
	runID, err := runs.Create(ctx, hostID, hash, false)
	if err != nil {
		t.Fatalf("create run: %v", err)
	}
	if err := stages.Seed(ctx, runID); err != nil {
		t.Fatalf("seed stages: %v", err)
	}

	runner := &orchestrator.Runner{
		Runs:     runs,
		Hosts:    hosts,
		Stages:   stages,
		EventHub: eventHub,
	}
	agent := &api.Agent{
		Hosts:        hosts,
		Runs:         runs,
		Stages:       stages,
		Artifacts:    &store.Artifacts{DB: conn},
		SpecDiffs:    &store.SpecDiffs{DB: conn},
		Measurements: &store.Measurements{DB: conn},
		Firmware:     &store.Firmware{DB: conn},
		Runner:       runner,
		EventHub:     eventHub,
		Logs:         logHub,
		ArtifactsDir: filepath.Join(workDir, "artifacts"),
		PublicURL:    "https://vetting.example",
	}
	return agent, runID, plain
}

// walkStage simulates the agent reporting a single stage's outcome by
// POSTing a JSON body to the Agent's Result handler.
//
// The body always carries "stage" and "passed"; every entry of extras
// (which may be nil) is merged on top and can therefore override them.
// The request is authenticated with token as a Bearer credential.
//
// The test fails immediately if the body cannot be marshalled, the
// handler answers with anything other than 200 OK, or the response is
// not decodable JSON. Returns the next_state field of the response,
// i.e. the state the orchestrator decided to advance to.
func walkStage(t *testing.T, a *api.Agent, runID int64, token, stage string, passed bool, extras map[string]any) string {
	t.Helper()
	body := map[string]any{"stage": stage, "passed": passed}
	// Ranging over a nil map is a no-op, so extras needs no nil guard.
	for k, v := range extras {
		body[k] = v
	}
	// Fail here rather than posting an empty body and getting a
	// confusing handler error when extras holds an unmarshalable value.
	buf, err := json.Marshal(body)
	if err != nil {
		t.Fatalf("stage %s: marshal request: %v", stage, err)
	}

	id := strconv.FormatInt(runID, 10)
	req := httptest.NewRequest(http.MethodPost,
		"/api/v1/runs/"+id+"/result",
		bytes.NewReader(buf))
	// The handler is invoked directly, not through a chi router, so the
	// "id" URL parameter has to be injected into the request context.
	rctx := chi.NewRouteContext()
	rctx.URLParams.Add("id", id)
	req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx))
	req.Header.Set("Authorization", "Bearer "+token)
	req.Header.Set("Content-Type", "application/json")

	rr := httptest.NewRecorder()
	a.Result(rr, req)
	if rr.Code != http.StatusOK {
		t.Fatalf("stage %s: status %d body=%q", stage, rr.Code, rr.Body.String())
	}
	var resp struct {
		OK        bool   `json:"ok"`
		NextState string `json:"next_state"`
	}
	if err := json.NewDecoder(rr.Body).Decode(&resp); err != nil {
		t.Fatalf("stage %s: decode resp: %v", stage, err)
	}
	return resp.NextState
}

// TestFullPipelineToCompleted drives a run through every stage with a
// passing result and checks that it finishes in Completed with a report
// on disk and a RunCompleted notification. The host has an empty
// expected-spec, so SpecValidate produces zero critical diffs and the
// orchestrator auto-advances past it.
func TestFullPipelineToCompleted(t *testing.T) {
	ctx := context.Background()

	a, runID, token := fullAgent(t)
	capture := &captureNotifier{name: "capture"}
	a.Notify = newCaptureRegistry(capture)

	// The claim path (Booting → InventoryCheck) is not under test here,
	// so put the run into InventoryCheck directly.
	if err := a.Runs.SetState(ctx, runID, model.StateInventoryCheck); err != nil {
		t.Fatalf("set state: %v", err)
	}

	// Inventory is reported with a concrete payload so SpecValidate has
	// something to compare against; it should hand over to Firmware.
	inv := spec.Inventory{
		CPU:    spec.CPUSpec{Model: "Xeon Gold", LogicalCores: 8},
		Memory: spec.MemorySpec{TotalGiB: 16},
	}
	if got := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv}); got != "Firmware" {
		t.Fatalf("after Inventory, next_state = %q, want Firmware", got)
	}

	// Every remaining stage advances to its successor. SpecValidate never
	// gets its own /result POST: it is resolved inline after Firmware,
	// which is why Firmware reports SMART as the next state.
	remaining := []struct {
		report string
		want   string
	}{
		{"Firmware", "SMART"},
		{"SMART", "CPUStress"},
		{"CPUStress", "Storage"},
		{"Storage", "Network"},
		{"Network", "Burn"},
		{"Burn", "GPU"},
		{"GPU", "PSU"},
		{"PSU", "Completed"}, // PSU → Reporting resolves inline → Completed
	}
	for _, tc := range remaining {
		if got := walkStage(t, a, runID, token, tc.report, true, nil); got != tc.want {
			t.Fatalf("after %s, next_state = %q, want %q", tc.report, got, tc.want)
		}
	}

	run, err := a.Runs.Get(ctx, runID)
	if err != nil {
		t.Fatalf("Get run: %v", err)
	}
	if run.State != model.StateCompleted {
		t.Fatalf("run.State = %q, want Completed", run.State)
	}
	if run.ReportPath == "" {
		t.Fatalf("run.ReportPath not set")
	}

	// Phase 5: an HTML report artifact must exist on disk...
	arts, err := a.Artifacts.ListForRun(ctx, runID)
	if err != nil {
		t.Fatalf("ListForRun: %v", err)
	}
	var htmlPath string
	for i := range arts {
		// No early exit: if several report_html rows exist, the last wins.
		if arts[i].Kind == "report_html" {
			htmlPath = arts[i].Path
		}
	}
	if htmlPath == "" {
		t.Fatalf("no report_html artifact recorded (kinds seen: %v)", artifactKinds(arts))
	}
	data, err := os.ReadFile(htmlPath)
	if err != nil {
		t.Fatalf("read report.html: %v", err)
	}
	html := string(data)
	if !strings.Contains(html, "<html") {
		// Only the first 200 bytes are echoed to keep the failure readable.
		t.Fatalf("report.html missing <html tag: %s", html[:min(200, len(html))])
	}

	// ...and the capture notifier must have seen a RunCompleted event.
	ev := capture.awaitKind(t, notify.KindRunCompleted)
	if ev.HostName != "smoke-host" {
		t.Errorf("RunCompleted host = %q, want smoke-host", ev.HostName)
	}
	// An empty URL cannot contain "/reports/", so one check covers both.
	if !strings.Contains(ev.URL, "/reports/") {
		t.Errorf("RunCompleted URL = %q, want non-empty with /reports/", ev.URL)
	}
}

// artifactKinds returns the Kind of each artifact in arts, in the same
// order. It is used to make failure messages more informative.
func artifactKinds(arts []store.Artifact) []string {
	kinds := make([]string, len(arts))
	for i := range arts {
		kinds[i] = arts[i].Kind
	}
	return kinds
}

// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}

// TestFaultInjectionSMART reports a failing SMART stage and checks that
// the pipeline stops in FailedHolding with the failed stage recorded and
// a critical StageFailed notification emitted.
func TestFaultInjectionSMART(t *testing.T) {
	ctx := context.Background()

	a, runID, token := fullAgent(t)
	capture := &captureNotifier{name: "capture"}
	a.Notify = newCaptureRegistry(capture)
	if err := a.Runs.SetState(ctx, runID, model.StateInventoryCheck); err != nil {
		t.Fatalf("set state: %v", err)
	}

	// Walk the passing stages that precede SMART.
	inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
	got := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv})
	if got != "Firmware" {
		t.Fatalf("after Inventory, next = %q want Firmware", got)
	}
	got = walkStage(t, a, runID, token, "Firmware", true, nil)
	if got != "SMART" {
		t.Fatalf("after Firmware, next = %q want SMART (inline SpecValidate)", got)
	}

	// Inject the fault: SMART reports passed=false.
	got = walkStage(t, a, runID, token, "SMART", false, nil)
	if got != "FailedHolding" {
		t.Fatalf("after SMART fail, next = %q want FailedHolding", got)
	}

	// The persisted run must agree with what the handler answered.
	run, err := a.Runs.Get(ctx, runID)
	if err != nil {
		t.Fatalf("Get run: %v", err)
	}
	if run.State != model.StateFailedHolding {
		t.Fatalf("run.State = %q, want FailedHolding", run.State)
	}
	if run.FailedStage != "SMART" {
		t.Fatalf("run.FailedStage = %q, want SMART", run.FailedStage)
	}

	// Phase 5: the fault must fire a StageFailed notification.
	failed := capture.awaitKind(t, notify.KindStageFailed)
	if !strings.Contains(failed.Title, "SMART") {
		t.Errorf("StageFailed title = %q, want to mention SMART", failed.Title)
	}
	if failed.Severity != notify.SeverityCritical {
		t.Errorf("StageFailed severity = %q, want critical", failed.Severity)
	}
}

// TestFirmwarePersistAndSpecMismatch covers the Phase 4 firmware
// integration end to end: the agent POSTs a Firmware snapshot, the
// server persists it, and the inline SpecValidate diff spots the
// version mismatch and parks the run in FailedHolding with
// FailedStage=SpecValidate.
func TestFirmwarePersistAndSpecMismatch(t *testing.T) {
	ctx := context.Background()

	// The host expects BIOS 3.3; the agent will report 3.2, which should
	// yield one critical firmware diff.
	expectedSpec := "firmware:\n" +
		"  - component: bios\n" +
		"    version: \"3.3\"\n"
	a, runID, token := fullAgentWithSpec(t, expectedSpec)
	a.Notify = newCaptureRegistry(&captureNotifier{name: "capture"})

	if err := a.Runs.SetState(ctx, runID, model.StateInventoryCheck); err != nil {
		t.Fatalf("set state: %v", err)
	}

	inv := spec.Inventory{Memory: spec.MemorySpec{TotalGiB: 16}}
	got := walkStage(t, a, runID, token, "Inventory", true, map[string]any{"inventory": inv})
	if got != "Firmware" {
		t.Fatalf("after Inventory, next = %q want Firmware", got)
	}

	// Firmware stage: the agent reports the actual BIOS version, 3.2.
	// SpecValidate runs inline, so the mismatch must already show up in
	// this response without a separate SpecValidate POST.
	reported := []map[string]any{
		{"component": "bios", "identifier": "system", "version": "3.2", "vendor": "AMI"},
	}
	got = walkStage(t, a, runID, token, "Firmware", true, map[string]any{"firmware": reported})
	if got != "FailedHolding" {
		t.Fatalf("after Firmware mismatch, next = %q want FailedHolding", got)
	}

	run, err := a.Runs.Get(ctx, runID)
	if err != nil {
		t.Fatalf("get run: %v", err)
	}
	if run.State != model.StateFailedHolding {
		t.Fatalf("run.State = %q, want FailedHolding", run.State)
	}
	if run.FailedStage != "SpecValidate" {
		t.Fatalf("run.FailedStage = %q, want SpecValidate", run.FailedStage)
	}

	// Persistence: exactly one firmware snapshot row, matching the POST.
	snaps, err := a.Firmware.ListForRun(ctx, runID)
	if err != nil {
		t.Fatalf("ListForRun firmware: %v", err)
	}
	if len(snaps) != 1 {
		t.Fatalf("firmware rows = %d, want 1: %+v", len(snaps), snaps)
	}
	if snap := snaps[0]; snap.Component != "bios" || snap.Version != "3.2" {
		t.Errorf("persisted snapshot = %+v", snap)
	}

	// Diffs: at least one firmware-specific entry must exist (not just
	// CPU/memory/disk rows), and every firmware entry must be critical.
	diffs, err := a.SpecDiffs.ListForRun(ctx, runID)
	if err != nil {
		t.Fatalf("ListForRun specdiffs: %v", err)
	}
	sawFirmwareDiff := false
	for _, d := range diffs {
		if !strings.HasPrefix(d.Field, "firmware[") {
			continue
		}
		sawFirmwareDiff = true
		if d.Severity != "critical" {
			t.Errorf("firmware diff severity = %q, want critical", d.Severity)
		}
	}
	if !sawFirmwareDiff {
		t.Fatalf("no firmware[...] entry in spec diffs: %+v", diffs)
	}
}