// Vetting/agent/tests/storage.go

package tests

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"os/exec"
	"strconv"
	"strings"
	"time"
)

// Storage is the destructive stage. Phase 2 replaced the old
// badblocks + 128 MiB fio combo with a single fio run per disk that
// writes, verifies md5 of what it wrote, and reports p99 latency.
// Modes:
//
//   - fio_sample (quick): bounded 1 GiB write per disk, ~3 min runtime.
//   - full_disk (deep/soak): writes the whole device, time-bounded by
//     the fio_time knob (2 h deep, 6 h soak).
//
// Pre-gates, evaluated in this order:
//
//  1. Device allowlist: only act on /dev/<X> where the kernel-reported
//     serial matches one of Deps.ExpectedDisks. USB sticks and unexpected
//     drives are excluded.
//  2. Presence: every expected disk that carries a serial must resolve
//     to a device. A vanished drive fails the stage before anything is
//     probed or written, in destructive and non-destructive runs alike.
//  3. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
//     signature, partition table, or LVM metadata → fail with
//     UnexpectedData unless Deps.OverrideWipe is set.
//
// After fio, the stage captures a SMART diff (start snapshot taken
// before any writes; end snapshot after all writes finish) and posts
// deltas on attributes like Reallocated_Sector_Ct and Current_Pending_Sector.
// The threshold evaluator isn't seeded to gate smart_delta out of the
// box — those samples are diagnostic for the report. Fio's p99 latency
// posts as fio_p99_us so the per-stage Storage warning threshold can
// fire on a latency cliff.
//
// The returned Outcome has Passed=false when no expected disk resolves,
// when any expected disk is missing, when the wipe probe halts the
// stage, or when fio fails on at least one disk. fio is still attempted
// on every target after a failure so the report covers all disks.
func Storage(ctx context.Context, d Deps) Outcome {
	if len(d.ExpectedDisks) == 0 {
		d.Info("Storage: no expected disks in spec — skipping stage")
		return Outcome{
			Passed:  true,
			Summary: "skipped (no expected disks)",
			Extras:  map[string]any{"skipped": true, "reason": "no_expected_disks"},
		}
	}

	targets := resolveTargets(d.ExpectedDisks)
	if len(targets) == 0 {
		d.Error("Storage: none of the expected disks are present on this host")
		return Outcome{
			Passed:  false,
			Message: "device allowlist matched zero disks",
			Summary: "no allowed disks present",
			Extras:  map[string]any{"expected": d.ExpectedDisks},
		}
	}

	// Presence gate: resolveTargets silently drops expected disks it
	// cannot find, so a partial match would otherwise pass with fewer
	// disks than the spec lists. Serials are compared case-insensitively
	// and empty serials are ignored, mirroring resolveTargets.
	resolved := make(map[string]struct{}, len(targets))
	for _, t := range targets {
		resolved[strings.ToLower(t.Serial)] = struct{}{}
	}
	var missing []string
	for _, e := range d.ExpectedDisks {
		if e.Serial == "" {
			continue
		}
		if _, ok := resolved[strings.ToLower(e.Serial)]; !ok {
			missing = append(missing, e.Serial)
		}
	}
	if len(missing) > 0 {
		list := strings.Join(missing, ", ")
		d.Error("Storage: expected disk(s) not present on this host: " + list)
		return Outcome{
			Passed:  false,
			Message: "expected disks missing: " + list,
			Summary: fmt.Sprintf("%d expected disk(s) missing", len(missing)),
			Extras:  map[string]any{"expected": d.ExpectedDisks, "missing_serials": missing},
		}
	}

	// Non-destructive runs skip wipe-probe (nothing to refuse), fio
	// writes, and SMART delta (nothing changed so no delta to report).
	// The presence gate above has already asserted that every expected
	// disk is there, so a vanished drive still fails the stage.
	if d.NonDestructive {
		perDisk := map[string]any{}
		for _, t := range targets {
			perDisk[t.Device] = map[string]any{"mode": "non_destructive", "serial": t.Serial}
		}
		d.Info(fmt.Sprintf("Storage: non-destructive — verified %d disk(s) present", len(targets)))
		return Outcome{
			Passed:  true,
			Summary: fmt.Sprintf("non-destructive: read-only checks only (%d disks)", len(targets)),
			Extras:  map[string]any{"per_disk": perDisk, "non_destructive": true},
		}
	}

	// Wipe probe on every target. A single dirty disk halts the stage
	// unless the operator has set OverrideWipe via the UI.
	probes := map[string]wipeProbeResult{}
	var dirty []string
	for _, t := range targets {
		probe := probeWipe(ctx, t.Device)
		probes[t.Device] = probe
		if probe.HasData {
			dirty = append(dirty, t.Device)
		}
	}
	if len(dirty) > 0 {
		dirtyList := strings.Join(dirty, ", ")
		if !d.OverrideWipe {
			d.Error("Storage: wipe probe found existing data on: " + dirtyList)
			return Outcome{
				Passed:  false,
				Message: "UnexpectedData: " + dirtyList + " (operator override required)",
				Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
				Extras: map[string]any{
					"wipe_probe":    probes,
					"override_hint": "click 'Override wipe & retry' in the held tile",
					"dirty_devices": dirty,
				},
			}
		}
		d.Warn("Storage: operator override engaged — proceeding despite data on " + dirtyList)
	}

	// Capture start-of-stage SMART attributes before we write anything
	// so the delta is attributable to *this* stage's writes and not the
	// host's prior history. Per-disk failures are tolerated (e.g. the
	// device doesn't expose SMART); we just can't emit a delta for it.
	startSMART := captureSMARTAttrs(ctx, targets)

	fioOpts := resolveFioOpts(d.StorageKnobs)
	d.Info(fmt.Sprintf("Storage: fio mode=%s size=%s runtime=%s bs=%s rw=%s verify=%s",
		fioOpts.Mode, fioOpts.Size, fioOpts.Runtime, fioOpts.BS, fioOpts.RW, fioOpts.Verify))

	var samples []Sample
	var subs []SubStepReport
	perDisk := map[string]any{}
	// failed collects every device whose fio run errored, in target
	// order, so the outcome names all bad disks rather than only the
	// first one.
	var failed []string
	for _, t := range targets {
		d.Info(fmt.Sprintf("Storage: running fio %s on %s", fioOpts.Mode, t.Device))
		fioStart := time.Now()
		fr := runFioVerify(ctx, t.Device, fioOpts)
		fioEnd := time.Now()
		// fr is a flat struct of strings and float64s filled from parsed
		// JSON, so Marshal is not expected to fail; the error is ignored.
		fioSummary, _ := json.Marshal(fr)
		subs = append(subs, SubStepReport{
			Name:        fmt.Sprintf("fio %s %s", fioOpts.Mode, t.Device),
			Passed:      fr.Error == "",
			StartedAt:   fioStart,
			CompletedAt: fioEnd,
			SummaryJSON: fioSummary,
		})
		perDisk[t.Device] = map[string]any{"fio": fr}

		if fr.Error != "" {
			failed = append(failed, t.Device)
			continue
		}
		samples = append(samples,
			Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
			Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
		)
		if fr.ReadP99Us > 0 {
			samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
		}
		if fr.WriteP99Us > 0 {
			samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
		}
	}

	// End-of-stage SMART snapshot + diff. We capture whether or not fio
	// succeeded — a mid-run failure still produces attributable deltas,
	// which is often more interesting than the stage outcome itself.
	endSMART := captureSMARTAttrs(ctx, targets)
	deltas := diffSMARTAttrs(startSMART, endSMART)
	for dev, attrs := range deltas {
		for attr, delta := range attrs {
			samples = append(samples, Sample{Kind: "smart_delta", Key: dev + "/" + attr, Value: delta, Unit: "count"})
		}
	}
	if d.Sensor != nil && len(samples) > 0 {
		// The return value is deliberately discarded: a failure to post
		// samples does not change the stage outcome.
		_ = d.Sensor(ctx, samples)
	}

	extras := map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts}
	if len(failed) > 0 {
		failedList := strings.Join(failed, ", ")
		return Outcome{
			Passed:   false,
			Message:  "fio verify failed on " + failedList,
			Summary:  "fio failed on " + failedList,
			Extras:   extras,
			SubSteps: subs,
		}
	}

	d.Info(fmt.Sprintf("Storage: %d disk(s) passed fio --verify", len(targets)))
	return Outcome{
		Passed:   true,
		Summary:  fmt.Sprintf("%d disks passed (%s)", len(targets), fioOpts.Mode),
		Extras:   extras,
		SubSteps: subs,
	}
}

// diskTarget pairs one expected disk with the block device it resolved
// to on this host. Values are produced by resolveTargets.
type diskTarget struct {
	// Serial is the serial exactly as written in the expected-disk
	// spec (resolveTargets copies it through without changing case).
	Serial string
	// Device is the device path as returned by listBlockDisks
	// (presumably of the form /dev/<name>; resolveTargets strips a
	// "/dev/" prefix when it needs the bare name).
	Device string
}

// resolveTargets turns the expected-disk list into concrete device
// targets. It enumerates the host's block disks, looks up each disk's
// serial, and returns one diskTarget per expected entry whose serial
// (compared case-insensitively) belongs to a present disk.
//
// Expected entries with an empty serial, and entries whose serial is
// not found, are silently dropped. The result is nil when disk
// enumeration fails or nothing matches. Output order follows the
// order of expected.
func resolveTargets(expected []ExpectedDisk) []diskTarget {
	devices, err := listBlockDisks()
	if err != nil {
		return nil
	}

	// Index present disks by lower-cased serial. Disks that report no
	// serial cannot be matched and are left out of the index.
	bySerial := make(map[string]string, len(devices))
	for _, devPath := range devices {
		serial := diskSerialFromSys(strings.TrimPrefix(devPath, "/dev/"))
		if serial == "" {
			continue
		}
		bySerial[strings.ToLower(serial)] = devPath
	}

	var matched []diskTarget
	for _, want := range expected {
		if want.Serial == "" {
			continue
		}
		devPath, found := bySerial[strings.ToLower(want.Serial)]
		if !found {
			continue
		}
		matched = append(matched, diskTarget{Serial: want.Serial, Device: devPath})
	}
	return matched
}

// diskSerialFromSys returns the serial number of the block device
// called name (e.g. "sda"), or "" when none can be determined.
//
// It is a reduced copy of probes.diskSerial: importing internal/probes
// here would create an import cycle, so the short lookup is duplicated.
// If the two ever drift apart, Storage fails because the serial no
// longer matches — which is the correct behavior.
//
// Lookup order: the sysfs files device/serial and serial under
// /sys/block/<name>, then the ID_SERIAL_SHORT property from udevadm.
func diskSerialFromSys(name string) string {
	sysDir := "/sys/block/" + name
	candidates := [...]string{
		sysDir + "/device/serial",
		sysDir + "/serial",
	}
	for i := 0; i < len(candidates); i++ {
		data, err := readFileBytes(candidates[i])
		if err != nil {
			continue
		}
		if serial := strings.TrimSpace(string(data)); serial != "" {
			return serial
		}
	}

	// Fall back to udevadm — ID_SERIAL_SHORT is more reliable on SCSI.
	props, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output()
	if err != nil {
		return ""
	}
	const serialKey = "ID_SERIAL_SHORT="
	for _, prop := range strings.Split(string(props), "\n") {
		if strings.HasPrefix(prop, serialKey) {
			// The first matching property wins, even if its value is empty.
			return strings.TrimSpace(strings.TrimPrefix(prop, serialKey))
		}
	}
	return ""
}

// readFileBytes returns the contents of the file at p. It is a plain
// pass-through to the package's readFile helper (defined outside this
// section) and adds no processing of its own.
func readFileBytes(p string) ([]byte, error) {
	return readFile(p)
}

// ---------- wipe probe ----------

// wipeProbeResult is the per-device outcome of probeWipe. Storage
// places these in Outcome.Extras under "wipe_probe", keyed by device.
type wipeProbeResult struct {
	// Device is the device path that was probed.
	Device   string   `json:"device"`
	// HasData is true when the probe recorded at least one finding;
	// Storage refuses to write to such a device without an override.
	HasData  bool     `json:"has_data"`
	// Findings holds one human-readable line per thing the probe
	// reported, each prefixed with the tool that produced it
	// ("blkid: " or "wipefs: ").
	Findings []string `json:"findings,omitempty"`
}

// probeWipe runs blkid + wipefs --no-act against device and reports
// whether it is safe to overwrite. Any signature reported by either
// tool is a "has data" signal. This is deliberately conservative: we'd
// rather halt on a bare ext4 signature than hand fio a disk with real
// bytes on it.
//
// The probe fails closed. If a tool cannot be run or exits abnormally
// (binary missing, permission denied, context cancelled, ...), the
// failure is recorded as a finding and HasData is set, because a probe
// that could not look at the disk proves nothing about its contents.
// The one exception is blkid's exit status 2, which blkid(8) documents
// as "nothing identified"; that is the normal result on a blank disk.
//
// ctx bounds both external commands. The returned result always has
// Device set; Findings lists each signal with a "blkid: " or
// "wipefs: " prefix.
func probeWipe(ctx context.Context, device string) wipeProbeResult {
	out := wipeProbeResult{Device: device}
	// flag records a finding and marks the device as not safe to write.
	flag := func(finding string) {
		out.Findings = append(out.Findings, finding)
		out.HasData = true
	}

	b, err := exec.CommandContext(ctx, "blkid", "-o", "full", device).Output()
	var exitErr *exec.ExitError
	switch {
	case err == nil:
		if s := strings.TrimSpace(string(b)); s != "" {
			flag("blkid: " + s)
		}
	case errors.As(err, &exitErr) && exitErr.ExitCode() == 2:
		// blkid found nothing it could identify. wipefs below still
		// has to succeed before the device counts as clean.
	default:
		flag("blkid: probe failed: " + err.Error())
	}

	b, err = exec.CommandContext(ctx, "wipefs", "--no-act", device).Output()
	if err != nil {
		flag("wipefs: probe failed: " + err.Error())
		return out
	}
	// wipefs prints a header line along with its signature table; keep
	// only lines with actual signature data.
	for _, line := range strings.Split(strings.TrimSpace(string(b)), "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "DEVICE") || strings.HasPrefix(line, "offset") {
			continue
		}
		flag("wipefs: " + line)
	}
	return out
}

// ---------- fio ----------

// fioOpts is the resolved fio configuration: the concrete flag values
// runFioVerify passes to fio, produced from the stage knobs by
// resolveFioOpts. Defaults match the quick profile's fio_sample shape so
// callers with zero knobs still run something bounded.
//
// Storage also places this struct in Outcome.Extras under "fio_opts".
// If it is serialized with encoding/json, Runtime is emitted as an
// integer count of nanoseconds (time.Duration's underlying int64).
type fioOpts struct {
	Mode    string        `json:"mode"`     // "fio_sample" | "full_disk"; anything else is run as fio_sample
	Size    string        `json:"size"`     // "1GiB"; only used for fio_sample
	Runtime time.Duration `json:"runtime"`  // bounding time; passed to fio in whole seconds
	BS      string        `json:"bs"`       // "4k"
	RW      string        `json:"rw"`       // "randrw"
	Verify  string        `json:"verify"`   // "md5" | ""; empty disables the verify flags
}

// resolveFioOpts converts the stage knobs into a complete, runnable
// fio configuration. Every empty string knob is replaced by its quick-
// profile default (fio_sample / 1GiB / 4k / randrw / md5), and a zero
// or negative FioTime becomes three minutes, so a stage that is
// missing its knobs still behaves coherently (safer than refusing).
func resolveFioOpts(k StorageKnobs) fioOpts {
	limit := k.FioTime
	if limit <= 0 {
		limit = 3 * time.Minute
	}

	var resolved fioOpts
	resolved.Mode = firstNonEmpty(k.Mode, "fio_sample")
	resolved.Size = firstNonEmpty(k.FioSize, "1GiB")
	resolved.Runtime = limit
	resolved.BS = firstNonEmpty(k.FioBS, "4k")
	resolved.RW = firstNonEmpty(k.FioRW, "randrw")
	resolved.Verify = firstNonEmpty(k.Verify, "md5")
	return resolved
}

// firstNonEmpty returns the first argument that is not the empty
// string, scanning left to right. It returns "" when called with no
// arguments or when every argument is empty.
func firstNonEmpty(vs ...string) string {
	for i := 0; i < len(vs); i++ {
		if candidate := vs[i]; len(candidate) > 0 {
			return candidate
		}
	}
	return ""
}

// fioResult is the per-disk result of one fio run, as built by
// runFioVerify (which fills the numeric fields from parseFioJSON).
// Storage treats a run as passed exactly when Error is empty.
type fioResult struct {
	Mode        string  `json:"mode"`                   // fio mode the run used (copied from fioOpts.Mode)
	ReadIOPS    float64 `json:"read_iops"`              // fio's read "iops" for the first job
	WriteIOPS   float64 `json:"write_iops"`             // fio's write "iops" for the first job
	ReadBWKBps  float64 `json:"read_bw_kbps"`           // fio's read "bw" value, copied unconverted
	WriteBWKBps float64 `json:"write_bw_kbps"`          // fio's write "bw" value, copied unconverted
	ReadP99Us   float64 `json:"read_p99_us,omitempty"`  // read p99 completion latency in µs; 0 when fio reported none
	WriteP99Us  float64 `json:"write_p99_us,omitempty"` // write p99 completion latency in µs; 0 when fio reported none
	Error       string  `json:"error,omitempty"`        // non-empty when fio failed or its JSON could not be parsed
	OutputTail  string  `json:"output_tail,omitempty"`  // tail of fio's output, kept for diagnostics
}

// runFioVerify invokes fio with md5-verify semantics against device and
// returns the parsed result. fio_sample mode caps the IO at opts.Size;
// full_disk drives the whole device bounded by runtime. Both use direct
// IO to bypass the page cache — we want real disk latency, not Linux'
// cheerful buffer.
//
// The run is bounded by ctx plus a timeout of opts.Runtime + 30s. The
// returned fioResult always carries Mode and OutputTail. On failure
// Error is non-empty and the numeric fields are left at zero:
//
//   - If the process was stopped because the context expired or was
//     cancelled, Error names that reason in addition to the raw process
//     error (which on its own reads only "signal: killed").
//   - Whatever fio wrote to stderr is appended to OutputTail, since that
//     is where its diagnostics go while stdout carries the JSON report.
//   - If fio exits cleanly but its JSON cannot be parsed, Error starts
//     with "parse fio json: ".
func runFioVerify(ctx context.Context, device string, opts fioOpts) fioResult {
	// 30s grace over runtime so fio has time to flush + close cleanly.
	runCtx, cancel := context.WithTimeout(ctx, opts.Runtime+30*time.Second)
	defer cancel()

	args := []string{
		"--name=verify-" + strings.TrimPrefix(device, "/dev/"),
		"--filename=" + device,
		"--rw=" + opts.RW,
		"--bs=" + opts.BS,
		"--numjobs=1",
		"--direct=1",
		"--group_reporting",
		"--output-format=json",
		"--runtime=" + strconv.Itoa(int(opts.Runtime.Seconds())),
	}
	if opts.Verify != "" {
		// NOTE(review): fio documents verify_pattern as a literal fill
		// pattern (hex, decimal or quoted string) — confirm the fio
		// version in use accepts "random" here.
		args = append(args,
			"--verify="+opts.Verify,
			"--verify_pattern=random",
			"--do_verify=1",
		)
	}
	switch opts.Mode {
	case "full_disk":
		// Time-bounded across the full device — fio uses the device's
		// full size when --size is omitted on a block device.
		args = append(args, "--time_based=1")
	default:
		// fio_sample: bounded write. Setting --size= limits the IO
		// volume regardless of runtime.
		args = append(args, "--size="+opts.Size, "--time_based=0")
	}

	// Output leaves Cmd.Stderr unset, so on a non-zero exit the captured
	// stderr is available on the returned *exec.ExitError.
	out, err := exec.CommandContext(runCtx, "fio", args...).Output()
	r := fioResult{Mode: opts.Mode, OutputTail: tailLines(string(out), 20)}
	if err != nil {
		r.Error = err.Error()
		// Name the context reason unless err already is that error
		// (which happens when the context ended before fio started).
		if ctxErr := runCtx.Err(); ctxErr != nil && !errors.Is(err, ctxErr) {
			r.Error += " (" + ctxErr.Error() + ")"
		}
		var exitErr *exec.ExitError
		if errors.As(err, &exitErr) && len(exitErr.Stderr) > 0 {
			stderrTail := tailLines(string(exitErr.Stderr), 20)
			if r.OutputTail == "" {
				r.OutputTail = stderrTail
			} else {
				r.OutputTail += "\n--- stderr ---\n" + stderrTail
			}
		}
		return r
	}

	parsed, perr := parseFioJSON(out)
	if perr != nil {
		r.Error = "parse fio json: " + perr.Error()
		return r
	}
	r.ReadIOPS = parsed.ReadIOPS
	r.WriteIOPS = parsed.WriteIOPS
	r.ReadBWKBps = parsed.ReadBWKBps
	r.WriteBWKBps = parsed.WriteBWKBps
	r.ReadP99Us = parsed.ReadP99Us
	r.WriteP99Us = parsed.WriteP99Us
	return r
}

// parseFioJSON decodes fio's --output-format=json report and returns
// the throughput and latency figures of the first job.
//
// IOPS and bandwidth are copied as reported. The p99 completion
// latency is read from clat_ns.percentile["99.000000"] for each
// direction and converted from nanoseconds to microseconds for the
// fio_p99_us sample; a missing or non-positive percentile leaves the
// field at zero.
//
// An error is returned when out is not valid JSON for this shape or
// when the report contains no jobs; the fioResult is zero in both cases.
func parseFioJSON(out []byte) (fioResult, error) {
	// ioStats is the slice of a per-direction section that we consume.
	type ioStats struct {
		IOPS float64 `json:"iops"`
		BW   float64 `json:"bw"`
		CLat struct {
			Percentile map[string]float64 `json:"percentile"`
		} `json:"clat_ns"`
	}
	var report struct {
		Jobs []struct {
			Read  ioStats `json:"read"`
			Write ioStats `json:"write"`
		} `json:"jobs"`
	}

	if err := json.Unmarshal(out, &report); err != nil {
		return fioResult{}, err
	}
	if len(report.Jobs) < 1 {
		return fioResult{}, fmt.Errorf("no jobs in fio output")
	}

	// p99Us returns the 99th percentile in microseconds, or 0 when fio
	// did not report a positive value for it.
	p99Us := func(s ioStats) float64 {
		ns := s.CLat.Percentile["99.000000"]
		if ns > 0 {
			return ns / 1000.0
		}
		return 0
	}

	first := report.Jobs[0]
	return fioResult{
		ReadIOPS:    first.Read.IOPS,
		WriteIOPS:   first.Write.IOPS,
		ReadBWKBps:  first.Read.BW,
		WriteBWKBps: first.Write.BW,
		ReadP99Us:   p99Us(first.Read),
		WriteP99Us:  p99Us(first.Write),
	}, nil
}

// ---------- SMART delta ----------

// smartAttrMap is one SMART snapshot: device → attribute → raw counter
// value. ATA drives populate named attributes (Reallocated_Sector_Ct
// etc); NVMe drives populate a flatter nvme-specific map. Only the
// curated whitelist of wear indicators (smartAttributeWhitelist) is
// tracked — anything else is diagnostic and drops to the raw report
// output. Devices for which no whitelisted attribute could be read have
// no entry at all.
type smartAttrMap map[string]map[string]float64

// captureSMARTAttrs takes a SMART snapshot of every target by running
// smartctl on its device and keeping the whitelisted attributes.
//
// Per-device failures (virtio, permission issues) degrade silently: a
// device whose smartctl call errors, or which yields no whitelisted
// attribute, is simply left out of the returned map, so the later
// delta step shows no data for it. The result is never nil.
func captureSMARTAttrs(ctx context.Context, targets []diskTarget) smartAttrMap {
	snapshot := make(smartAttrMap)
	for i := range targets {
		device := targets[i].Device
		report, err := runSmartctl(ctx, device)
		if err != nil {
			continue
		}
		if attrs := extractSMARTAttrs(report); len(attrs) != 0 {
			snapshot[device] = attrs
		}
	}
	return snapshot
}

// smartAttributeWhitelist is the set of attributes we diff across a
// stage. They're intended to be the ones that reflect *this stage's* IO
// damage, not cumulative drive history. Adding attributes is cheap: an
// attribute a drive does not report is simply absent from the snapshot
// (extractSMARTAttrs only records what it finds), and diffSMARTAttrs
// skips anything not present in both snapshots.
//
// ATA and NVMe names share this one set; extractSMARTAttrs consults it
// for both the ATA table rows and the NVMe health-log keys.
var smartAttributeWhitelist = map[string]bool{
	// ATA SMART attribute names (smartctl normalizes to these)
	"Reallocated_Sector_Ct":   true,
	"Current_Pending_Sector":  true,
	"Offline_Uncorrectable":   true,
	"UDMA_CRC_Error_Count":    true,
	"Reported_Uncorrect":      true,
	"Raw_Read_Error_Rate":     true,
	// NVMe log fields (flat keys at top of nvme_smart_health_information_log)
	"media_errors":            true,
	"num_err_log_entries":     true,
	"percentage_used":         true,
}

// extractSMARTAttrs pulls the whitelisted attribute values out of a
// decoded smartctl JSON document. Both layouts are handled in one pass:
//
//   - ATA: ata_smart_attributes.table[] rows of the form
//     {"name": "Reallocated_Sector_Ct", "raw": {"value": N}}.
//   - NVMe: flat numeric keys under nvme_smart_health_information_log.
//
// The result is keyed by attribute name and is never nil. Sections,
// rows or values that are missing or of an unexpected type are skipped
// without error, as is every attribute not in smartAttributeWhitelist.
func extractSMARTAttrs(raw map[string]any) map[string]float64 {
	result := make(map[string]float64)

	// Failed type assertions yield nil maps/slices, which read and
	// range as empty, so the lookups can be chained without nesting.
	ataSection, _ := raw["ata_smart_attributes"].(map[string]any)
	rows, _ := ataSection["table"].([]any)
	for _, entry := range rows {
		fields, isMap := entry.(map[string]any)
		if !isMap {
			continue
		}
		attrName, _ := fields["name"].(string)
		if !smartAttributeWhitelist[attrName] {
			continue
		}
		rawField, _ := fields["raw"].(map[string]any)
		if value, isNum := rawField["value"].(float64); isNum {
			result[attrName] = value
		}
	}

	nvmeLog, _ := raw["nvme_smart_health_information_log"].(map[string]any)
	for key, value := range nvmeLog {
		if num, isNum := value.(float64); isNum && smartAttributeWhitelist[key] {
			result[key] = num
		}
	}
	return result
}

// diffSMARTAttrs computes end − start for every (device, attribute)
// pair that appears in both snapshots.
//
// A device or attribute present on only one side produces no delta
// (a zero-to-present change can't be attributed safely), and a device
// with no common attributes is omitted altogether. Negative deltas are
// kept so a drive that resets a counter stays visible. The result is
// never nil.
func diffSMARTAttrs(start, end smartAttrMap) map[string]map[string]float64 {
	deltas := make(map[string]map[string]float64)
	for device, after := range end {
		before, seen := start[device]
		if !seen {
			continue
		}
		perAttr := make(map[string]float64)
		for name, afterVal := range after {
			if beforeVal, inBoth := before[name]; inBoth {
				perAttr[name] = afterVal - beforeVal
			}
		}
		if len(perAttr) == 0 {
			continue
		}
		deltas[device] = perAttr
	}
	return deltas
}