Remove benchmark evaluation system, use training capabilities directly

Model quality for market segments and product lines is now derived directly from the capabilities of deployed models (coding, reasoning, agents, etc.) instead of requiring a separate manual benchmark evaluation step. This eliminates the unbounded benchmarkResults[] array, which was scanned five times per tick, and removes ~480 lines of dead-weight UI, types, and engine code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-26 19:28:59 -04:00
parent db034687d6
commit bbb69a315c
10 changed files with 57 additions and 535 deletions
@@ -182,45 +182,6 @@ export interface QuantizationConfig {
  variantName: string;
 }

-export type BenchmarkCategory = 'reasoning' | 'coding' | 'math' | 'knowledge' | 'safety' | 'chat' | 'multimodal' | 'agents';
-
-export interface BenchmarkDefinition {
-  id: string;
-  name: string;
-  category: BenchmarkCategory;
-  description: string;
-  primaryCapability: keyof ModelCapabilities;
-  secondaryCapability?: keyof ModelCapabilities;
-  computeCost: number;
-  ticksToRun: number;
-  unlockedAtEra: Era;
-  marketRelevance: {
-    consumer: number;
-    enterprise: number;
-    developer: number;
-    research: number;
-  };
-}
-
-export interface BenchmarkResult {
-  benchmarkId: string;
-  modelId: string;
-  score: number;
-  ranAtTick: number;
-  rank?: number;
-}
-
-export interface EvalJob {
-  id: string;
-  modelId: string;
-  benchmarkIds: string[];
-  progressTicks: number;
-  totalTicks: number;
-  computeAllocated: number;
-  status: 'active' | 'completed';
-  results: BenchmarkResult[];
-}
-
 export type ProductLineType = 'text-api' | 'chat-product' | 'chat-free' | 'chat-enterprise' | 'code-api' | 'image' | 'agents-api';

 export interface ProductPricing {
@@ -246,11 +207,10 @@ export interface ModelsState {
  baseModels: BaseModel[];
  activeTrainingPipelines: TrainingPipeline[];
  variantJobs: VariantCreationJob[];
-  evalJobs: EvalJob[];
-  benchmarkResults: BenchmarkResult[];
  productLines: ProductLine[];
  bestDeployedModelScore: number;
  bestDeployedSafetyScore: number;
+  bestDeployedCapabilities: ModelCapabilities;
 }

 export const DEFAULT_DATA_MIX: DataMixAllocation = {
@@ -271,8 +231,6 @@ export const INITIAL_MODELS: ModelsState = {
  baseModels: [],
  activeTrainingPipelines: [],
  variantJobs: [],
-  evalJobs: [],
-  benchmarkResults: [],
  productLines: [
    {
      id: 'text-api',
@@ -307,4 +265,5 @@ export const INITIAL_MODELS: ModelsState = {
  ],
  bestDeployedModelScore: 0,
  bestDeployedSafetyScore: 0,
+  bestDeployedCapabilities: { reasoning: 0, coding: 0, creative: 0, math: 0, knowledge: 0, multimodal: 0, agents: 0, speed: 0, contextUtilization: 0 },
 };