package tests

import (
	"context"
	"encoding/json"
	"fmt"
	"os/exec"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"time"

	"vetting/agent/probes"
)

// BurnConfig is what the agent passes to Burn: the orchestrator's iperf3
// server address and port. Durations + concurrency knobs come from
// Deps.BurnKnobs so they scale with profile.
//
// Both fields only affect the iperf workload: Burn skips it when
// OrchestratorURL is empty, and runBurnIperf substitutes port 5201 when
// IperfPort is zero.
type BurnConfig struct {
	OrchestratorURL string // URL the iperf3 target host is derived from; "" skips the iperf workload
	IperfPort       int    // iperf3 server port; 0 = 5201
}

// Burn is the concurrent soak stage. Unlike CPUStress (serial
// CPU→memory) or Storage (serial per disk) it fans out every workload
// at once: stress-ng hammers CPU + memory, fio drives the allow-listed
// disks, iperf3 pushes sustained NIC traffic, and two sidecars poll
// EDAC + PSU rails for the duration of the window.
//
// This is where PSU rails actually matter: 12V sag under simultaneous
// CPU + disk + NIC load is exactly the failure a thermal/power
// regression produces, and it's invisible to any stage that loads one
// subsystem at a time. The PSU stage that follows Burn in the pipeline
// re-samples rails post-window to confirm they settle back to nominal.
//
// Burn stays inside the stage framework — it doesn't spawn a parallel
// stage runner. The goroutine fan-out is local; the stage converges
// before returning an Outcome so every invariant the orchestrator
// relies on (serial stage order, single in-flight stage per run) still
// holds.
//
// Knob defaults applied here: a non-positive Duration becomes 2 minutes
// and a non-positive IperfParallel becomes 2; CPU workers and the memory
// percentage are normalised by resolveCPUWorkers and clampMemPct.
//
// The returned Outcome always carries the resolved knobs in Extras and
// one sub-step row per workload (including skipped ones). Passed is
// false when at least one non-skipped workload reported a failure.
func Burn(ctx context.Context, d Deps, cfg BurnConfig) Outcome {
	duration := d.BurnKnobs.Duration
	if duration <= 0 {
		duration = 2 * time.Minute
	}
	cpuWorkers := resolveCPUWorkers(d.BurnKnobs.CPUWorkers)
	memPct := clampMemPct(d.BurnKnobs.MemPct)
	iperfParallel := d.BurnKnobs.IperfParallel
	if iperfParallel <= 0 {
		iperfParallel = 2
	}
	d.Info(fmt.Sprintf("Burn: window=%s cpu_workers=%d mem_pct=%d iperf_parallel=%d fio_on_spare=%v",
		duration, cpuWorkers, memPct, iperfParallel, d.BurnKnobs.FioOnSpare))

	// Sidecars run for the lifetime of the window and are cancelled on
	// return so the main stage converges cleanly. EDAC catches DIMM
	// bit-flips that appear only under concurrent load; PSU catches
	// rail sag that only appears when CPU + disk + NIC pull current
	// simultaneously.
	sideCtx, sideCancel := context.WithCancel(ctx)
	defer sideCancel()
	var sideWG sync.WaitGroup
	sideWG.Add(2)
	go runEDACSidecar(sideCtx, &sideWG, d)
	go runPSUSidecar(sideCtx, &sideWG, d)

	// Workloads get the window plus 30s of grace before being cancelled.
	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
	defer cancel()

	// One buffer slot per workload (CPU, memory, fio, iperf): every
	// producer — goroutine or inline skip row — can send without a
	// reader, so results are only drained after everything converged.
	results := make(chan burnSubResult, 4)
	var wg sync.WaitGroup
	launch := func(work func() burnSubResult) {
		wg.Add(1)
		go func() {
			defer wg.Done()
			results <- work()
		}()
	}

	launch(func() burnSubResult { return runBurnCPU(runCtx, d, duration, cpuWorkers) })
	launch(func() burnSubResult { return runBurnMemory(runCtx, d, duration, memPct) })

	// fio runs only when explicitly enabled *and* there are allow-listed
	// disks *and* the run wasn't marked non-destructive. Any of those
	// missing records a Skipped sub-step so the operator sees why.
	if d.BurnKnobs.FioOnSpare && len(d.ExpectedDisks) > 0 && !d.NonDestructive {
		launch(func() burnSubResult { return runBurnFio(runCtx, d, duration) })
	} else {
		results <- burnSubResult{Name: "Burn fio", Skipped: true, Reason: burnFioSkipReason(d)}
	}

	// iperf requires an orchestrator host. Lab hosts run with the
	// bundled iperf3 server; without a base URL we can't derive a
	// target so we skip rather than fail the stage.
	if cfg.OrchestratorURL != "" {
		launch(func() burnSubResult {
			return runBurnIperf(runCtx, d, duration, cfg.OrchestratorURL, cfg.IperfPort, iperfParallel)
		})
	} else {
		results <- burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "no orchestrator host"}
	}

	// Converge: workloads first, then stop and wait for the sidecars, and
	// only then close the channel so the collector's range terminates.
	wg.Wait()
	sideCancel()
	sideWG.Wait()
	close(results)

	subs, samples, failures := collectBurnResults(results)
	if d.Sensor != nil && len(samples) > 0 {
		// Best-effort: a failed sample post must not flip the stage
		// verdict, but it should be visible, same as the PSU sidecar.
		if err := d.Sensor(ctx, samples); err != nil {
			d.Warn("Burn: workload sample post: " + err.Error())
		}
	}

	extras := map[string]any{
		"duration":       duration.String(),
		"cpu_workers":    cpuWorkers,
		"mem_pct":        memPct,
		"iperf_parallel": iperfParallel,
		"fio_on_spare":   d.BurnKnobs.FioOnSpare,
	}
	if len(failures) > 0 {
		msg := "Burn workloads failed: " + strings.Join(failures, ", ")
		d.Error(msg)
		return Outcome{
			Passed:   false,
			Message:  msg,
			Summary:  fmt.Sprintf("Burn failed (%d of %d workloads)", len(failures), len(subs)),
			Extras:   extras,
			SubSteps: subs,
		}
	}
	d.Info(fmt.Sprintf("Burn: %s window passed; %d workloads converged", duration, len(subs)))
	return Outcome{
		Passed:   true,
		Summary:  fmt.Sprintf("Burn %s passed (%d workloads)", duration, len(subs)),
		Extras:   extras,
		SubSteps: subs,
	}
}

// burnSubResult is the per-workload return type used by the fan-out
// goroutines. Sample slice is merged into the stage's final /sensor
// batch; SubStep becomes a row on the /result sub-steps list.
//
// A result is in one of three states: skipped (Skipped set, Reason says
// why), passed (Passed set), or failed (neither set, Err says why).
type burnSubResult struct {
	Name    string        // workload label, e.g. "Burn CPU"; prefixes failure messages and names skipped rows
	Passed  bool          // true when the workload completed successfully; ignored when Skipped is set
	Skipped bool          // true when the workload did not run; the collector then synthesizes the row
	Reason  string        // why a workload was skipped
	Err     string        // why a workload failed
	Samples []Sample      // sensor samples produced by the workload, if any
	SubStep SubStepReport // row reported for a workload that actually ran
}

// collectBurnResults drains ch until it is closed and folds the
// per-workload results into the three pieces the stage reports:
//
//   - one SubStepReport row per result, in receive order;
//   - the concatenated samples of all non-skipped results;
//   - one "<name>: <reason>" string per failed (non-skipped, !Passed)
//     result, with "unknown" standing in for an empty Err.
//
// Rows are synthesized in two cases so the /result shape stays stable:
// skipped results (which never carry a SubStep) and results whose
// SubStep was left empty — the workload helpers return that way when
// they bail out before running anything (e.g. the tool binary is
// missing). Without the second case such a failure would show up as a
// blank, unnamed row.
//
// The caller must close ch, otherwise this function blocks forever.
func collectBurnResults(ch <-chan burnSubResult) ([]SubStepReport, []Sample, []string) {
	var subs []SubStepReport
	var samples []Sample
	var failures []string
	for r := range ch {
		if r.Skipped {
			// Zero-length row: start == end == now.
			stamp := time.Now().UTC()
			subs = append(subs, SubStepReport{
				Name:        r.Name,
				Skipped:     true,
				StartedAt:   stamp,
				CompletedAt: stamp,
				SummaryJSON: mustJSON(map[string]any{"skipped": true, "reason": r.Reason}),
			})
			continue
		}

		reason := r.Err
		if !r.Passed && reason == "" {
			reason = "unknown"
		}

		step := r.SubStep
		if step.Name == "" {
			// Every populated SubStep carries a name, so an empty one
			// means the workload returned early without building a row.
			stamp := time.Now().UTC()
			summary := map[string]any{}
			if !r.Passed {
				summary["error"] = reason
			}
			step = SubStepReport{
				Name:        r.Name,
				Passed:      r.Passed,
				StartedAt:   stamp,
				CompletedAt: stamp,
				SummaryJSON: mustJSON(summary),
			}
		}

		subs = append(subs, step)
		samples = append(samples, r.Samples...)
		if !r.Passed {
			failures = append(failures, r.Name+": "+reason)
		}
	}
	return subs, samples, failures
}

// burnFioSkipReason explains, for the skipped "Burn fio" row, which
// precondition kept fio from running. Conditions are checked in a fixed
// order and the first one that applies wins: knob off, non-destructive
// run, empty disk allow-list. "disabled" is the catch-all when none of
// them applies.
func burnFioSkipReason(d Deps) string {
	switch {
	case !d.BurnKnobs.FioOnSpare:
		return "fio_on_spare knob disabled"
	case d.NonDestructive:
		return "non-destructive run"
	case len(d.ExpectedDisks) == 0:
		return "no allowlisted disks"
	default:
		return "disabled"
	}
}

// runBurnCPU is the CPU leg of the Burn window: it runs stress-ng with
// `workers` CPU stressors cycling through all CPU methods, with result
// verification enabled, for `duration`. The pass is labelled "Burn CPU"
// so its sub-step row is distinguishable from the CPUStress stage's.
//
// If stress-ng is not on PATH the workload is reported as failed
// ("stress-ng missing") without running anything; otherwise the verdict
// and error text are taken from runStressPass.
func runBurnCPU(ctx context.Context, d Deps, duration time.Duration, workers int) burnSubResult {
	const label = "Burn CPU"

	_, lookErr := exec.LookPath("stress-ng")
	if lookErr != nil {
		return burnSubResult{Name: label, Err: "stress-ng missing"}
	}

	stressArgs := []string{
		"--cpu", strconv.Itoa(workers),
		"--cpu-method", "all",
		"--timeout", durationSeconds(duration),
		"--metrics-brief",
		"--verify",
	}
	cmdline := strings.Join(stressArgs, " ")
	d.Info(fmt.Sprintf("Burn: stress-ng %s", cmdline))

	pass := runStressPass(ctx, d, label, duration, stressArgs)

	res := burnSubResult{Name: label}
	res.Passed = pass.Passed
	res.Err = pass.Err
	res.SubStep = subStepFromPass(label, pass)
	return res
}

// runBurnMemory is the memory leg of the Burn window. It runs a single
// stress-ng --vm worker (with --vm-keep and --verify) for `duration`,
// sized from the machine's MemAvailable at call time:
//
//	size = MemAvailable * memPct / 100 - memHeadroomBytes
//
// Outcomes before anything is run:
//   - stress-ng not on PATH, or MemAvailable unreadable → failed result;
//   - computed size below memFloorBytes → skipped result. Note the skip
//     reason prints the pre-headroom budget, not the final size.
//
// Otherwise the verdict comes from runStressPass and the sub-step name
// records the size that was used.
func runBurnMemory(ctx context.Context, d Deps, duration time.Duration, memPct int) burnSubResult {
	const label = "Burn memory"

	if _, lookErr := exec.LookPath("stress-ng"); lookErr != nil {
		return burnSubResult{Name: label, Err: "stress-ng missing"}
	}

	avail, readErr := memAvailableBytes()
	if readErr != nil {
		return burnSubResult{Name: label, Err: "read MemAvailable: " + readErr.Error()}
	}

	// Share of live memory requested by the knob, then the standard
	// headroom comes off the top.
	budget := int64(float64(avail) * float64(memPct) / 100.0)
	vmBytes := budget - memHeadroomBytes
	if vmBytes < memFloorBytes {
		// Too little left for the pass to be meaningful on this box.
		why := fmt.Sprintf("budget %s below floor %s after headroom", humanBytes(budget), humanBytes(memFloorBytes))
		return burnSubResult{Name: label, Skipped: true, Reason: why}
	}

	stressArgs := []string{
		"--vm", "1",
		"--vm-bytes", strconv.FormatInt(vmBytes, 10),
		"--vm-keep",
		"--timeout", durationSeconds(duration),
		"--metrics-brief",
		"--verify",
	}
	d.Info(fmt.Sprintf("Burn: stress-ng memory cap=%s (%d%% of MemAvailable)", humanBytes(vmBytes), memPct))

	pass := runStressPass(ctx, d, label, duration, stressArgs)
	rowName := fmt.Sprintf("Burn memory (cap %s)", humanBytes(vmBytes))
	return burnSubResult{
		Name:    label,
		Passed:  pass.Passed,
		Err:     pass.Err,
		SubStep: subStepFromPass(rowName, pass),
	}
}

// runBurnFio is the disk leg of the Burn window. It resolves the
// allow-listed disks and runs one fio job, via runFioVerify, against the
// first resolved target only: mode "fio_sample", 512MiB, 4k random
// read/write with md5 verification, with the window as its runtime.
//
// Outcomes:
//   - fio not on PATH → failed result ("fio missing");
//   - no resolved targets → skipped result;
//   - otherwise pass/fail follows whether runFioVerify reported an
//     error. On success, read/write IOPS samples are attached, plus p99
//     latency samples for whichever direction reported a positive value.
func runBurnFio(ctx context.Context, d Deps, duration time.Duration) burnSubResult {
	const label = "Burn fio"

	if _, lookErr := exec.LookPath("fio"); lookErr != nil {
		return burnSubResult{Name: label, Err: "fio missing"}
	}

	targets := resolveTargets(d.ExpectedDisks)
	if len(targets) == 0 {
		return burnSubResult{Name: label, Skipped: true, Reason: "no allow-listed disks present"}
	}
	dev := targets[0].Device

	opts := fioOpts{
		Mode:    "fio_sample",
		Size:    "512MiB",
		Runtime: duration,
		BS:      "4k",
		RW:      "randrw",
		Verify:  "md5",
	}

	began := time.Now()
	d.Info(fmt.Sprintf("Burn: fio %s on %s (%s window)", opts.Mode, dev, duration))
	fr := runFioVerify(ctx, dev, opts)
	finished := time.Now()

	ok := fr.Error == ""
	res := burnSubResult{
		Name:   label,
		Passed: ok,
		Err:    fr.Error,
		SubStep: SubStepReport{
			Name:        "Burn fio " + dev,
			Passed:      ok,
			StartedAt:   began,
			CompletedAt: finished,
			SummaryJSON: mustJSON(fr),
		},
	}
	if !ok {
		// Failed runs report the row but contribute no samples.
		return res
	}

	samples := []Sample{
		{Kind: "fio", Key: dev + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
		{Kind: "fio", Key: dev + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
	}
	if fr.ReadP99Us > 0 {
		samples = append(samples, Sample{Kind: "fio_p99_us", Key: dev + "/read", Value: fr.ReadP99Us, Unit: "us"})
	}
	if fr.WriteP99Us > 0 {
		samples = append(samples, Sample{Kind: "fio_p99_us", Key: dev + "/write", Value: fr.WriteP99Us, Unit: "us"})
	}
	res.Samples = samples
	return res
}

// runBurnIperf drives iperf3 -P N for the window. Reuses parseIperfJSON
// so the same (mbps, retrans, bytesSent) extraction the Network stage
// uses applies here too. Samples emitted as Burn-scoped keys so the
// dashboard can tell at-a-glance which window they came from.
//
// Parameters:
//   - orchestratorURL: URL the iperf3 server host is derived from.
//   - port: server port; 0 selects 5201.
//   - parallel: number of parallel streams; values below 1 become 1.
//
// Outcomes:
//   - iperf3 not on PATH → failed result;
//   - host cannot be derived → skipped result;
//   - client exits with an error or emits unparsable JSON → failed
//     result with the error in the sub-step summary;
//   - otherwise the workload passes iff measured throughput is > 0.
//
// The window is passed to iperf3 in whole seconds and is never allowed
// to drop below 1: truncating a sub-second window would yield `-t 0`,
// which iperf3 does not interpret as a short run, so the client would
// only stop when the context timeout killed it.
func runBurnIperf(ctx context.Context, d Deps, duration time.Duration, orchestratorURL string, port, parallel int) burnSubResult {
	if _, err := exec.LookPath("iperf3"); err != nil {
		return burnSubResult{Name: "Burn iperf", Err: "iperf3 missing"}
	}
	host, err := deriveHost(orchestratorURL)
	if err != nil || host == "" {
		return burnSubResult{Name: "Burn iperf", Skipped: true, Reason: "can't derive orchestrator host"}
	}
	if port == 0 {
		port = 5201
	}
	if parallel < 1 {
		parallel = 1
	}
	secs := int(duration.Seconds())
	if secs < 1 {
		secs = 1
	}
	args := []string{
		"-c", host,
		"-p", strconv.Itoa(port),
		"-t", strconv.Itoa(secs),
		"-P", strconv.Itoa(parallel),
		"-J",
	}
	// Window plus 30s of grace before the client is killed.
	runCtx, cancel := context.WithTimeout(ctx, duration+30*time.Second)
	defer cancel()
	start := time.Now()
	out, err := exec.CommandContext(runCtx, "iperf3", args...).Output()
	end := time.Now()
	if err != nil {
		// NOTE(review): `out` is the command's stdout (Output does not
		// return stderr), so "stderr_tail" actually holds the tail of
		// stdout. The key is kept as-is for existing consumers.
		return burnSubResult{
			Name: "Burn iperf",
			Err:  "iperf3 client error: " + err.Error(),
			SubStep: SubStepReport{
				Name:        "Burn iperf",
				StartedAt:   start,
				CompletedAt: end,
				SummaryJSON: mustJSON(map[string]any{"error": err.Error(), "stderr_tail": tailLines(string(out), 20)}),
			},
		}
	}
	mbps, retrans, bytesSent, _, perr := parseIperfJSON(out)
	if perr != nil {
		return burnSubResult{
			Name: "Burn iperf",
			Err:  "parse iperf3 json: " + perr.Error(),
			SubStep: SubStepReport{
				Name:        "Burn iperf",
				StartedAt:   start,
				CompletedAt: end,
				SummaryJSON: mustJSON(map[string]any{"error": perr.Error()}),
			},
		}
	}

	samples := []Sample{{Kind: "iperf", Key: "burn/throughput_mbps", Value: mbps, Unit: "Mbps"}}
	if bytesSent > 0 {
		// Retransmit rate is estimated per packet, approximating the
		// packet count as bytes / 1460 (presumably the TCP payload of a
		// 1500-byte MTU frame — confirm against the Network stage).
		packets := float64(bytesSent) / 1460.0
		samples = append(samples, Sample{
			Kind: "nic_retrans", Key: "burn/rate",
			Value: float64(retrans) / packets, Unit: "rate",
		})
	}
	passed := mbps > 0
	errMsg := ""
	if !passed {
		errMsg = "zero throughput from iperf3"
	}
	return burnSubResult{
		Name:    "Burn iperf",
		Passed:  passed,
		Err:     errMsg,
		Samples: samples,
		SubStep: SubStepReport{
			Name:        fmt.Sprintf("Burn iperf (P=%d)", parallel),
			Passed:      passed,
			StartedAt:   start,
			CompletedAt: end,
			SummaryJSON: mustJSON(map[string]any{
				"throughput_mbps": mbps,
				"retransmits":     retrans,
				"bytes_sent":      bytesSent,
				"parallel":        parallel,
			}),
		},
	}
}

// runPSUSidecar samples the PSU rails on a 5-second ticker until ctx is
// cancelled, posting each non-empty scan through d.Sensor as a batch of
// "psu_volt" samples (one per rail, keyed by the rail label, unit "V").
//
// Details worth knowing:
//   - wg.Done is always called on exit, including the early return taken
//     when d.Sensor is nil.
//   - The first scan happens one tick after start, not immediately.
//   - Each post gets its own 5-second timeout derived from ctx; a failed
//     post is logged as a warning and polling continues.
//
// Per the stage design, posting mid-window lets threshold evaluation
// react to rail sag during Burn instead of waiting for the PSU stage
// that follows — that evaluation happens outside this function.
func runPSUSidecar(ctx context.Context, wg *sync.WaitGroup, d Deps) {
	defer wg.Done()
	if d.Sensor == nil {
		return
	}

	const (
		pollEvery   = 5 * time.Second
		postTimeout = 5 * time.Second
	)
	ticker := time.NewTicker(pollEvery)
	defer ticker.Stop()

	for {
		// Block until either the window ends or the next tick fires.
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}

		rails := scanPSURails()
		if len(rails) == 0 {
			continue
		}

		batch := make([]Sample, 0, len(rails))
		for _, rail := range rails {
			batch = append(batch, Sample{Kind: "psu_volt", Key: rail.Label, Value: rail.Volts, Unit: "V"})
		}

		sendCtx, cancel := context.WithTimeout(ctx, postTimeout)
		postErr := d.Sensor(sendCtx, batch)
		cancel()
		if postErr != nil {
			d.Warn("Burn: PSU sample post: " + postErr.Error())
		}
	}
}

// resolveCPUWorkers turns the CPU-workers knob into a worker count.
// An empty string or "all" (any letter case) means one worker per
// logical CPU as reported by runtime.NumCPU. A positive decimal integer
// is used as-is. Anything else — zero, negatives, non-numeric text —
// falls back to the per-CPU default as well.
func resolveCPUWorkers(raw string) int {
	allCores := runtime.NumCPU()

	switch {
	case raw == "", strings.EqualFold(raw, "all"):
		return allCores
	}

	n, err := strconv.Atoi(raw)
	if err != nil || n <= 0 {
		return allCores
	}
	return n
}

// clampMemPct normalises the memory-percentage knob. Zero or a negative
// value means "unset" and yields the default of 50. Any other value is
// clamped into the band [10, 90]: the upper bound leaves room for the
// kernel, the agent and the other Burn workloads.
func clampMemPct(pct int) int {
	const (
		fallback = 50
		lowest   = 10
		highest  = 90
	)
	switch {
	case pct <= 0:
		return fallback
	case pct < lowest:
		return lowest
	case pct > highest:
		return highest
	default:
		return pct
	}
}

// mustJSON marshals v for embedding in a sub-step summary and never
// fails: if v cannot be marshalled, it returns a small JSON object of
// the form {"marshal_error":"<message>"} instead.
//
// The fallback is produced with json.Marshal rather than by string
// concatenation so that quotes, backslashes or control characters in
// the error message are escaped and the result is always valid JSON.
func mustJSON(v any) json.RawMessage {
	b, err := json.Marshal(v)
	if err == nil {
		return b
	}
	fallback, ferr := json.Marshal(map[string]string{"marshal_error": err.Error()})
	if ferr != nil {
		// Not expected for a map of plain strings; keep the shape anyway.
		return json.RawMessage(`{"marshal_error":"unknown"}`)
	}
	return fallback
}

// Keep the probes import referenced from this file. Go rejects unused
// imports, and nothing else visible in this file names the package
// directly; per the original note, the Burn sidecars rely on
// probes.EDAC and on the PSU rail scanner defined in psu.go, neither of
// which pulls the import in here on its own.
var _ = probes.EDAC