Remove benchmark evaluation system, use training capabilities directly

Model quality for market segments and product lines now derives from deployed model capabilities (coding, reasoning, agents, etc.) instead of requiring a separate manual benchmark evaluation step. This eliminates an unbounded benchmarkResults[] array that was scanned 5x per tick and removes ~480 lines of dead-weight UI, types, and engine code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-26 19:28:59 -04:00
parent db034687d6
commit bbb69a315c
10 changed files with 57 additions and 535 deletions
@@ -1,7 +1,6 @@
-import type { GameState, MarketState, BenchmarkResult } from '@ai-tycoon/shared';
+import type { GameState, MarketState, ModelCapabilities } from '@ai-tycoon/shared';
 import { CONSUMER_TOKENS_PER_SUBSCRIBER, API_TOKENS_PER_DEVELOPER_PER_TICK, BATCH_API_DEMAND_PER_DEV, makeInitialServingMetrics } from '@ai-tycoon/shared';
 import type { TrafficPriority, TierServingMetrics } from '@ai-tycoon/shared';
-import { BENCHMARKS } from '../../data/benchmarks';
 import { computeSeasonal } from './seasonalSystem';
 import { updateObsolescence } from './obsolescenceSystem';
 import { buildPlayerProfile, buildCompetitorProfile, computeMarketShares, updateTAMGrowth } from './tamSystem';
@@ -21,31 +20,30 @@ export interface MarketTickResult {
  totalTokenDemand: number;
 }

+const SEGMENT_CAPABILITY_WEIGHTS: Record<string, Partial<Record<keyof ModelCapabilities, number>>> = {
+  consumer:   { creative: 0.35, knowledge: 0.25, reasoning: 0.15, multimodal: 0.15, coding: 0.05, agents: 0.05 },
+  enterprise: { reasoning: 0.25, coding: 0.20, agents: 0.20, knowledge: 0.15, math: 0.10, multimodal: 0.10 },
+  developer:  { coding: 0.35, reasoning: 0.20, agents: 0.20, math: 0.15, knowledge: 0.10 },
+  research:   { reasoning: 0.30, math: 0.30, knowledge: 0.20, coding: 0.10, agents: 0.10 },
+};
+
 function getSegmentQuality(
  segment: 'consumer' | 'enterprise' | 'developer' | 'research',
-  benchmarkResults: BenchmarkResult[],
+  capabilities: ModelCapabilities,
  fallbackScore: number,
 ): number {
-  if (benchmarkResults.length === 0) return fallbackScore / 100;
-
-  const bestByBenchmark = new Map<string, number>();
-  for (const r of benchmarkResults) {
-    const prev = bestByBenchmark.get(r.benchmarkId) ?? 0;
-    if (r.score > prev) bestByBenchmark.set(r.benchmarkId, r.score);
-  }
-
+  const weights = SEGMENT_CAPABILITY_WEIGHTS[segment];
+  if (!weights) return fallbackScore / 100;
  let weightedSum = 0;
  let totalWeight = 0;
-  for (const bench of BENCHMARKS) {
-    const score = bestByBenchmark.get(bench.id);
-    if (score == null) continue;
-    const weight = bench.marketRelevance[segment];
-    weightedSum += (score / 100) * weight;
-    totalWeight += weight;
+  for (const [cap, weight] of Object.entries(weights)) {
+    const score = capabilities[cap as keyof ModelCapabilities] ?? 0;
+    if (score > 0) {
+      weightedSum += (score / 100) * weight;
+      totalWeight += weight;
+    }
  }
-
-  if (totalWeight === 0) return fallbackScore / 100;
-  return weightedSum / totalWeight;
+  return totalWeight > 0 ? weightedSum / totalWeight : fallbackScore / 100;
 }

 export function processMarketV2(
@@ -54,9 +52,11 @@ export function processMarketV2(
  effectiveInferenceFlops?: number,
  researchBonuses?: ResearchBonuses,
 ): MarketTickResult {
-  const consumerQuality = getSegmentQuality('consumer', state.models.benchmarkResults, state.models.bestDeployedModelScore);
-  const enterpriseQuality = getSegmentQuality('enterprise', state.models.benchmarkResults, state.models.bestDeployedModelScore);
-  const modelQuality = state.models.benchmarkResults.length > 0
+  const caps = state.models.bestDeployedCapabilities;
+  const hasDeployed = state.models.bestDeployedModelScore > 0;
+  const consumerQuality = getSegmentQuality('consumer', caps, state.models.bestDeployedModelScore);
+  const enterpriseQuality = getSegmentQuality('enterprise', caps, state.models.bestDeployedModelScore);
+  const modelQuality = hasDeployed
    ? (consumerQuality + enterpriseQuality) / 2
    : state.models.bestDeployedModelScore / 100;

@@ -115,7 +115,7 @@ export function processMarketV2(
  const productResult = processProductLines(
    state.market.codeAssistant,
    state.market.agentsPlatform,
-    state.models.benchmarkResults,
+    caps,
    playerDevCustomers,
    playerEntCustomers,
    seasonal.multipliers.consumer,
@@ -1,4 +1,4 @@
-import type { CodeAssistantState, AgentsPlatformState, BenchmarkResult } from '@ai-tycoon/shared';
+import type { CodeAssistantState, AgentsPlatformState, ModelCapabilities } from '@ai-tycoon/shared';
 import {
  CODE_ASSISTANT_MIN_CODING_SCORE,
  CODE_ASSISTANT_BASE_ADOPTION_RATE,
@@ -7,27 +7,6 @@ import {
  AGENTS_PLATFORM_BASE_ADOPTION_RATE,
  AGENTS_PLATFORM_CHURN_RATE,
 } from '@ai-tycoon/shared';
-import { BENCHMARKS } from '../../data/benchmarks';
-
-function getBenchmarkScore(benchmarkId: string, results: BenchmarkResult[]): number {
-  let best = 0;
-  for (const r of results) {
-    if (r.benchmarkId === benchmarkId && r.score > best) best = r.score;
-  }
-  return best;
-}
-
-function getCodingScore(results: BenchmarkResult[]): number {
-  const codeBench = BENCHMARKS.find(b => b.id === 'codeforce');
-  if (!codeBench) return 0;
-  return getBenchmarkScore(codeBench.id, results);
-}
-
-function getAgentsScore(results: BenchmarkResult[]): number {
-  const agentBench = BENCHMARKS.find(b => b.id === 'agentarena');
-  if (!agentBench) return 0;
-  return getBenchmarkScore(agentBench.id, results);
-}

 export interface ProductLineResult {
  codeAssistant: CodeAssistantState;
@@ -41,7 +20,7 @@ export interface ProductLineResult {
 export function processProductLines(
  ca: CodeAssistantState,
  ap: AgentsPlatformState,
-  benchmarkResults: BenchmarkResult[],
+  capabilities: ModelCapabilities,
  playerDevCustomers: number,
  playerEntCustomers: number,
  seasonalConsumerMult: number,
@@ -53,7 +32,7 @@ export function processProductLines(
  let apRevenue = 0;

  // --- Code Assistant ---
-  updatedCA.qualityScore = getCodingScore(benchmarkResults);
+  updatedCA.qualityScore = capabilities.coding;
  if (updatedCA.isUnlocked && updatedCA.isActive && updatedCA.qualityScore >= CODE_ASSISTANT_MIN_CODING_SCORE) {
    const qualityFactor = updatedCA.qualityScore / 100;
    const priceAttr = Math.max(0.1, 1 - updatedCA.pricePerSeat / 50);
@@ -70,7 +49,7 @@ export function processProductLines(
  }

  // --- Agents Platform ---
-  updatedAP.qualityScore = getAgentsScore(benchmarkResults);
+  updatedAP.qualityScore = capabilities.agents;
  if (updatedAP.isUnlocked && updatedAP.isActive && updatedAP.qualityScore >= AGENTS_PLATFORM_MIN_AGENTS_SCORE) {
    const qualityFactor = updatedAP.qualityScore / 100;
    const priceAttr = Math.max(0.1, 1 - updatedAP.pricePerSeat / 250);
@@ -1,10 +1,8 @@
 import type {
  GameState, ModelsState, BaseModel, ModelCapabilities, SafetyProfile,
  TrainingPipeline, TrainingEvent, TrainingEventType,
-  ModelVariant, VariantCreationJob, EvalJob, BenchmarkResult,
-  BenchmarkDefinition,
+  ModelVariant, VariantCreationJob,
 } from '@ai-tycoon/shared';
-import { BENCHMARKS } from '../data/benchmarks';
 import {
  uuid, VRAM_REQUIREMENTS_BY_GENERATION,
  MOE_CAPABILITY_MULTIPLIER, MOE_SPEED_MULTIPLIER,
@@ -154,14 +152,21 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse
    });
  }

-  const updatedEvalJobs = processEvalJobs(state);
-
+  const bestDeployedCapabilities: ModelCapabilities = {
+    reasoning: 0, coding: 0, creative: 0, math: 0,
+    knowledge: 0, multimodal: 0, agents: 0, speed: 0, contextUtilization: 0,
+  };
  let bestDeployedModelScore = 0;
  let bestDeployedSafetyScore = 0;
  for (const m of baseModels) {
    if (!m.isDeployed) continue;
    if (m.rawCapability > bestDeployedModelScore) bestDeployedModelScore = m.rawCapability;
    if (m.safetyProfile.overallSafety > bestDeployedSafetyScore) bestDeployedSafetyScore = m.safetyProfile.overallSafety;
+    for (const key of Object.keys(bestDeployedCapabilities) as (keyof ModelCapabilities)[]) {
+      if ((m.capabilities[key] ?? 0) > bestDeployedCapabilities[key]) {
+        bestDeployedCapabilities[key] = m.capabilities[key];
+      }
+    }
  }
  for (const f of families) {
    for (const v of f.variants) {
@@ -169,6 +174,11 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse
      const score = computeVariantScore(v);
      if (score > bestDeployedModelScore) bestDeployedModelScore = score;
      if (v.safetyProfile.overallSafety > bestDeployedSafetyScore) bestDeployedSafetyScore = v.safetyProfile.overallSafety;
+      for (const key of Object.keys(bestDeployedCapabilities) as (keyof ModelCapabilities)[]) {
+        if ((v.capabilities[key] ?? 0) > bestDeployedCapabilities[key]) {
+          bestDeployedCapabilities[key] = v.capabilities[key];
+        }
+      }
    }
  }

@@ -179,10 +189,9 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse
      families,
      activeTrainingPipelines: updatedPipelines,
      variantJobs: updatedVariantJobs.jobs,
-      evalJobs: updatedEvalJobs.jobs,
-      benchmarkResults: [...state.models.benchmarkResults, ...updatedEvalJobs.newResults],
      bestDeployedModelScore,
      bestDeployedSafetyScore,
+      bestDeployedCapabilities,
    },
    completedModels,
    notifications,
@@ -490,47 +499,6 @@ function createVariant(job: VariantCreationJob, base: BaseModel): ModelVariant {
  };
 }

-function processEvalJobs(state: GameState): { jobs: EvalJob[]; newResults: BenchmarkResult[] } {
-  const newResults: BenchmarkResult[] = [];
-  const allModels: (BaseModel | ModelVariant)[] = [
-    ...state.models.baseModels,
-    ...state.models.families.flatMap(f => f.variants),
-  ];
-
-  const jobs = state.models.evalJobs.map(job => {
-    if (job.status !== 'active') return job;
-    const newProgress = job.progressTicks + 1;
-    if (newProgress >= job.totalTicks) {
-      const model = allModels.find(m => m.id === job.modelId);
-      if (model) {
-        const results = computeBenchmarkScores(model, job.benchmarkIds, state.meta.tickCount);
-        newResults.push(...results);
-        return { ...job, status: 'completed' as const, progressTicks: job.totalTicks, results };
-      }
-      return { ...job, status: 'completed' as const, progressTicks: job.totalTicks };
-    }
-    return { ...job, progressTicks: newProgress };
-  });
-  return { jobs, newResults };
-}
-
-function computeBenchmarkScores(
-  model: BaseModel | ModelVariant,
-  benchmarkIds: string[],
-  tick: number,
-): BenchmarkResult[] {
-  const benchmarkMap = new Map(BENCHMARKS.map(b => [b.id, b]));
-  return benchmarkIds.map(id => {
-    const bench = benchmarkMap.get(id);
-    if (!bench) return { benchmarkId: id, modelId: model.id, score: 0, ranAtTick: tick };
-    const primary = model.capabilities[bench.primaryCapability] ?? 0;
-    const secondary = bench.secondaryCapability ? (model.capabilities[bench.secondaryCapability] ?? 0) : 0;
-    const noise = (Math.random() - 0.5) * 6;
-    const score = clamp(primary * 0.7 + secondary * 0.3 + noise);
-    return { benchmarkId: id, modelId: model.id, score, ranAtTick: tick };
-  });
-}
-
 function computeVariantScore(variant: ModelVariant): number {
  const c = variant.capabilities;
  return (c.reasoning * 0.25 + c.coding * 0.2 + c.creative * 0.15 + c.math * 0.15 + c.knowledge * 0.15 + c.agents * 0.1);