// Vetting/internal/orchestrator/thresholds.go

package orchestrator

import (
	"fmt"
	"strings"
)

// ThresholdOp names the comparison a threshold applies to an observed
// value. All operators compare against the threshold's Value;
// within_pct additionally needs a "nominal" reference for the key,
// which is how PSU rails are expressed ("+12V within 5% of 12.0").
type ThresholdOp string

// Supported operators. Each one describes the condition under which a
// sample passes.
const (
	// OpLT passes when observed < value.
	OpLT ThresholdOp = "lt"

	// OpLTE passes when observed <= value.
	OpLTE ThresholdOp = "lte"

	// OpGT passes when observed > value.
	OpGT ThresholdOp = "gt"

	// OpGTE passes when observed >= value.
	OpGTE ThresholdOp = "gte"

	// OpWithinPct passes when observed lies within value percent of
	// the nominal.
	OpWithinPct ThresholdOp = "within_pct"
)

// ThresholdSeverity says how seriously a breach should be taken:
// either "fail the run" or "just surface a warning in the report".
// It travels with each evaluation result (on the threshold) next to
// the Passed flag, leaving the decision to transition the run to the
// caller.
type ThresholdSeverity string

// Supported severities.
const (
	// SeverityCritical marks breaches that should fail the run.
	SeverityCritical ThresholdSeverity = "critical"

	// SeverityWarning marks breaches that are only reported.
	SeverityWarning ThresholdSeverity = "warning"
)

// Threshold is a stored threshold row as the evaluator sees it: a
// flat, already-parsed value object. The evaluator never touches the
// DB, and the store never touches the evaluator.
type Threshold struct {
	// ID identifies the row; the evaluator does not consult it.
	ID int64

	// Stage selects the stage the rule applies to. "*" (or an empty
	// string) matches any stage.
	Stage string

	// Kind must equal the sample's Kind exactly.
	Kind string

	// Key is a glob-ish selector for the sample key:
	// "*" / "prefix*" / "*suffix" / exact.
	Key string

	// Op is the comparison applied to the observed value.
	Op ThresholdOp

	// Value is the comparison operand; for within_pct it is the
	// allowed percentage.
	Value float64

	// Nominal is the reference value (nominal voltage/frequency) and
	// is only used by within_pct.
	Nominal float64

	// Severity decides whether a breach is critical or a warning.
	Severity ThresholdSeverity
}

// Sample is one observation to be checked against the thresholds that
// match it. Stage may be left empty when the agent cannot tell which
// stage produced the value (e.g. the thermal sidecar running across
// stages); such samples only match thresholds whose stage selector is
// the "*" wildcard.
type Sample struct {
	// Stage is the stage that posted the sample, or "" if unknown.
	Stage string

	// Kind is compared literally against the threshold's Kind.
	Kind string

	// Key is the name tested against the threshold's key pattern.
	Key string

	// Value is the observed number.
	Value float64
}

// EvalResult records the outcome of testing one sample against one
// threshold: the threshold consulted, whether the sample passed, and
// (through the threshold) the severity, so the caller can fast-fail
// on critical breaches.
type EvalResult struct {
	// Threshold is the rule that was evaluated.
	Threshold Threshold

	// Passed is true when the sample satisfied the rule.
	Passed bool

	// Observed is the sample value that was tested.
	Observed float64
}

// Breached reports whether the sample violated the threshold; it is
// the negation of Passed.
func (r EvalResult) Breached() bool {
	return !r.Passed
}

// CriticalBreach reports whether this result is a breach of a
// critical-severity threshold, i.e. the "fail the run right now"
// case. Passing results and warning-level breaches yield false.
func (r EvalResult) CriticalBreach() bool {
	if r.Passed {
		return false
	}
	return r.Threshold.Severity == SeverityCritical
}

// Evaluate tests one sample against every threshold that applies to
// it and returns one EvalResult per applicable threshold, in the
// order the thresholds were given. More than one threshold may apply
// (a generic "*" rule plus a stage-specific override); each gets its
// own result so both can be persisted.
//
// Thresholds that do not match the sample, or whose operator is not
// recognised, contribute nothing. The returned slice is never nil.
func Evaluate(sample Sample, thresholds []Threshold) []EvalResult {
	results := make([]EvalResult, 0, 1)
	for i := range thresholds {
		rule := thresholds[i]
		if !thresholdMatchesSample(rule, sample) {
			continue
		}
		ok, err := evaluateOp(rule.Op, sample.Value, rule.Value, rule.Nominal)
		// A non-nil error means an unknown operator. Such rules are
		// dropped on purpose: validation belongs at insert time, and
		// returning an error here would force every Sensor write to
		// 500.
		if err == nil {
			results = append(results, EvalResult{
				Threshold: rule,
				Passed:    ok,
				Observed:  sample.Value,
			})
		}
	}
	return results
}

// thresholdMatchesSample reports whether threshold t applies to
// sample s. All three filters must agree: kind, stage and key. Kind
// is compared literally (there is no "any kind" threshold; if one is
// ever needed a `kind: *` escape hatch can be added), while stage and
// key go through their glob-ish matchers.
func thresholdMatchesSample(t Threshold, s Sample) bool {
	return t.Kind == s.Kind &&
		stageMatches(t.Stage, s.Stage) &&
		keyMatches(t.Key, s.Key)
}

// stageMatches reports whether a threshold's stage selector covers
// the given sample stage. The wildcard "*" covers every stage, and so
// does an empty selector, so that a threshold declared without a
// stage is not accidentally inert. Any other selector must equal the
// sample stage exactly, which means a sample with no stage is only
// covered by the wildcard forms; we don't guess.
func stageMatches(selector, sampleStage string) bool {
	switch selector {
	case "", "*":
		return true
	default:
		return selector == sampleStage
	}
}

// keyMatches reports whether key satisfies the glob-ish pattern.
// Supported forms:
//
//	"" or "*"   matches every key
//	"prefix*"   key starts with prefix
//	"*suffix"   key ends with suffix
//	"*infix*"   key contains infix
//	anything else is compared for exact equality
//
// Only a single leading and/or trailing "*" is special. We avoid
// filepath.Match on purpose so Windows `\`-vs-`/` rules don't leak
// into the sample namespace (key "eth0/rx_errors" is not a path).
func keyMatches(pattern, key string) bool {
	if pattern == "" || pattern == "*" {
		return true
	}

	wildHead := strings.HasPrefix(pattern, "*")
	wildTail := strings.HasSuffix(pattern, "*")

	if wildHead && wildTail {
		needle := strings.TrimSuffix(pattern, "*")
		needle = strings.TrimPrefix(needle, "*")
		return strings.Contains(key, needle)
	}
	if wildTail {
		return strings.HasPrefix(key, strings.TrimSuffix(pattern, "*"))
	}
	if wildHead {
		return strings.HasSuffix(key, strings.TrimPrefix(pattern, "*"))
	}
	return key == pattern
}

// evaluateOp performs the numeric comparison for op and reports
// whether the observed value passes.
//
// lt, lte, gt and gte compare observed against threshold directly and
// ignore nominal. within_pct is the oddball: threshold is read as a
// percentage and the test is
//
//	|observed - nominal| <= |(threshold / 100) * nominal|
//
// An unrecognised operator yields (false, error) so the caller can
// log + drop.
func evaluateOp(op ThresholdOp, observed, threshold, nominal float64) (bool, error) {
	if op == OpWithinPct {
		// A percentage band around a 0 nominal is meaningless. Count
		// it as a pass so a misconfigured rule doesn't spuriously
		// fail.
		if nominal == 0 {
			return true, nil
		}
		band := (threshold / 100.0) * nominal
		if band < 0 {
			band = -band
		}
		delta := observed - nominal
		if delta < 0 {
			delta = -delta
		}
		return delta <= band, nil
	}

	switch op {
	case OpLT:
		return observed < threshold, nil
	case OpLTE:
		return observed <= threshold, nil
	case OpGT:
		return observed > threshold, nil
	case OpGTE:
		return observed >= threshold, nil
	}
	return false, fmt.Errorf("unknown op %q", op)
}