Remove benchmark evaluation system, use training capabilities directly

Model quality for market segments and product lines now derives from deployed model capabilities (coding, reasoning, agents, etc.) instead of requiring a separate manual benchmark evaluation step. This eliminates an unbounded benchmarkResults[] array that was scanned 5x per tick and removes ~480 lines of dead-weight UI, types, and engine code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-26 19:28:59 -04:00
parent db034687d6
commit bbb69a315c
10 changed files with 57 additions and 535 deletions
@@ -1,5 +1,5 @@
 import { useState } from 'react';
-import { Play, Rocket, Globe, ChevronDown, ChevronUp, Beaker, Shield, Zap, BarChart3 } from 'lucide-react';
+import { Play, Rocket, Globe, ChevronDown, ChevronUp, Beaker, Shield, Zap } from 'lucide-react';
 import { TutorialHint } from '@/components/game/TutorialHint';
 import { ConfirmModal } from '@/components/common/ConfirmModal';
 import { useGameStore } from '@/store';
@@ -16,10 +16,9 @@ import {
 } from '@ai-tycoon/shared';
 import type {
  ModelArchitecture, DataMixAllocation, SFTSpecialization, AlignmentMethod,
-  DataDomain, QuantizationLevel, BaseModel, ModelVariant, BenchmarkResult,
+  DataDomain, QuantizationLevel, BaseModel, ModelVariant,
  SizeTier, ModelFamily,
 } from '@ai-tycoon/shared';
-import { BENCHMARKS } from '@ai-tycoon/game-engine';

 const DATA_MIX_PRESETS: Record<string, { label: string; mix: DataMixAllocation }> = {
  balanced: { label: 'Balanced', mix: DEFAULT_DATA_MIX },
@@ -52,8 +51,6 @@ export function ModelsPage() {
  const families = useGameStore((s) => s.models.families);
  const pipelines = useGameStore((s) => s.models.activeTrainingPipelines);
  const variantJobs = useGameStore((s) => s.models.variantJobs);
-  const evalJobs = useGameStore((s) => s.models.evalJobs);
-  const benchmarkResults = useGameStore((s) => s.models.benchmarkResults);
  const productLines = useGameStore((s) => s.models.productLines);
  const totalFlops = useGameStore((s) => s.compute.totalFlops);
  const totalVramGB = useGameStore((s) => s.compute.totalVramGB);
@@ -64,7 +61,6 @@ export function ModelsPage() {
  const deployModel = useGameStore((s) => s.deployModel);
  const deployVariant = useGameStore((s) => s.deployVariant);
  const createQuantization = useGameStore((s) => s.createQuantization);
-  const startEvaluation = useGameStore((s) => s.startEvaluation);
  const setTrainingAllocation = useGameStore((s) => s.setTrainingAllocation);
  const openSourceModel = useGameStore((s) => s.openSourceModel);
  const openSourcedModels = useGameStore((s) => s.market.openSourcedModels);
@@ -96,15 +92,12 @@ export function ModelsPage() {

  const activePipelines = pipelines.filter(p => p.status === 'active' || p.status === 'stalled');
  const activeVariantJobs = variantJobs.filter(j => j.status === 'active');
-  const activeEvalJobs = evalJobs.filter(j => j.status === 'active');
  const undeployedCount = baseModels.filter(m => !m.isDeployed).length;
-  const hasActiveJobs = activePipelines.length > 0 || activeVariantJobs.length > 0 || activeEvalJobs.length > 0;
+  const hasActiveJobs = activePipelines.length > 0 || activeVariantJobs.length > 0;
  const noModelDeployed = baseModels.length > 0 && !baseModels.some(m => m.isDeployed);

  const eraOrder = ['startup', 'scaleup', 'bigtech', 'agi'] as const;
  const currentEraIdx = eraOrder.indexOf(currentEra);
-  const availableBenchmarks = BENCHMARKS.filter(b => eraOrder.indexOf(b.unlockedAtEra) <= currentEraIdx);
-
  const hasAlignmentResearch = completedResearch.some(r =>
    r === 'alignment-research' || r === 'interpretability' || r === 'constitutional-ai',
  );
@@ -186,7 +179,6 @@ export function ModelsPage() {
          { id: 'overview' as const, label: 'Overview' },
          { id: 'train' as const, label: 'Train New' },
          { id: 'models' as const, label: `Families${families.length > 0 ? ` (${families.length})` : ''}` },
-          { id: 'benchmarks' as const, label: 'Benchmarks' },
          { id: 'products' as const, label: 'Products' },
        ]).map(tab => (
          <button
@@ -347,28 +339,6 @@ export function ModelsPage() {
        </div>
      )}

-      {/* Active Eval Jobs */}
-      {modelsTab === 'overview' && activeEvalJobs.length > 0 && (
-        <div className="space-y-3">
-          <h3 className="font-semibold">Running Evaluations</h3>
-          {activeEvalJobs.map(job => {
-            const model = baseModels.find(m => m.id === job.modelId) ?? families.flatMap(f => f.variants).find(v => v.id === job.modelId);
-            const progress = job.progressTicks / job.totalTicks;
-            return (
-              <div key={job.id} className="bg-surface-900 border border-surface-700 rounded-xl p-3">
-                <div className="flex items-center justify-between mb-1">
-                  <span className="text-sm">{model?.name ?? 'Unknown'} — {job.benchmarkIds.length} benchmarks</span>
-                  <span className="text-xs text-surface-400">{formatPercent(progress)}</span>
-                </div>
-                <div className="h-1.5 bg-surface-800 rounded-full overflow-hidden">
-                  <div className="h-full bg-blue-500 rounded-full transition-all" style={{ width: `${progress * 100}%` }} />
-                </div>
-              </div>
-            );
-          })}
-        </div>
-      )}
-
      {/* Train New Model */}
      {modelsTab === 'train' && <div className="bg-surface-900 border border-surface-700 rounded-xl p-4 space-y-4">
        <h3 className="font-semibold">Train New Model</h3>
@@ -716,9 +686,8 @@ export function ModelsPage() {
                    {familyModels.map(model => (
                      <div key={model.id} className="space-y-3">
                        <h5 className="text-sm font-medium text-surface-300">{model.name}</h5>
-                        <ModelDetails model={model} benchmarkResults={benchmarkResults} />
+                        <ModelDetails model={model} />
                        <QuantizationCreator model={model} completedResearch={completedResearch} onQuantize={createQuantization} />
-                        <BenchmarkEvaluator modelId={model.id} modelName={model.name} availableBenchmarks={availableBenchmarks} benchmarkResults={benchmarkResults} evalJobs={evalJobs} onStartEval={startEvaluation} />
                      </div>
                    ))}

@@ -730,11 +699,7 @@ export function ModelsPage() {
                            key={variant.id}
                            variant={variant}
                            familyId={family.id}
-                            benchmarkResults={benchmarkResults}
-                            availableBenchmarks={availableBenchmarks}
-                            evalJobs={evalJobs}
                            onDeploy={() => deployVariant(family.id, variant.id)}
-                            onStartEval={startEvaluation}
                          />
                        ))}
                      </div>
@@ -747,21 +712,6 @@ export function ModelsPage() {
        </div>
      )}

-      {/* Benchmark Leaderboard */}
-      {modelsTab === 'benchmarks' && benchmarkResults.length > 0 && (
-        <BenchmarkLeaderboard
-          benchmarkResults={benchmarkResults}
-          baseModels={baseModels}
-          families={families}
-          availableBenchmarks={availableBenchmarks}
-        />
-      )}
-      {modelsTab === 'benchmarks' && benchmarkResults.length === 0 && (
-        <div className="bg-surface-900 border border-surface-700 rounded-xl p-8 text-center text-surface-500 text-sm">
-          No benchmark results yet. Run evaluations from the Models tab.
-        </div>
-      )}
-
      {/* Product Lines */}
      {modelsTab === 'products' && <div className="space-y-3">
        <h3 className="font-semibold">Product Lines</h3>
@@ -865,9 +815,7 @@ function ModelActions({ model, isOpenSourced, onDeploy, onOpenSource }: {
  );
 }

-function ModelDetails({ model, benchmarkResults }: { model: BaseModel; benchmarkResults: BenchmarkResult[] }) {
-  const modelResults = benchmarkResults.filter(r => r.modelId === model.id);
-
+function ModelDetails({ model }: { model: BaseModel }) {
  return (
    <div className="space-y-3">
      <div className="grid grid-cols-3 gap-3 text-xs">
@@ -907,22 +855,6 @@ function ModelDetails({ model, benchmarkResults }: { model: BaseModel; benchmark
        </div>
      </div>

-      {modelResults.length > 0 && (
-        <div>
-          <span className="text-xs font-medium text-surface-300">Benchmark Scores</span>
-          <div className="grid grid-cols-3 gap-2 mt-1">
-            {modelResults.map(r => {
-              const bench = BENCHMARKS.find(b => b.id === r.benchmarkId);
-              return (
-                <div key={r.benchmarkId} className="bg-surface-800 rounded-lg p-2 text-xs">
-                  <span className="text-surface-400">{bench?.name ?? r.benchmarkId}</span>
-                  <div className="font-mono mt-0.5 text-accent-light">{r.score.toFixed(1)}</div>
-                </div>
-              );
-            })}
-          </div>
-        </div>
-      )}
    </div>
  );
 }
@@ -981,91 +913,12 @@ function QuantizationCreator({ model, completedResearch, onQuantize }: {
  );
 }

-function BenchmarkEvaluator({ modelId, modelName, availableBenchmarks, benchmarkResults, evalJobs, onStartEval }: {
-  modelId: string;
-  modelName: string;
-  availableBenchmarks: typeof BENCHMARKS;
-  benchmarkResults: BenchmarkResult[];
-  evalJobs: { id: string; modelId: string; status: string }[];
-  onStartEval: (modelId: string, benchmarkIds: string[]) => void;
-}) {
-  const [showEval, setShowEval] = useState(false);
-  const [selectedBenchmarks, setSelectedBenchmarks] = useState<string[]>([]);
-
-  const existingResults = benchmarkResults.filter(r => r.modelId === modelId);
-  const evaluatedIds = new Set(existingResults.map(r => r.benchmarkId));
-  const isEvaluating = evalJobs.some(j => j.modelId === modelId && j.status === 'active');
-  const unevaluated = availableBenchmarks.filter(b => !evaluatedIds.has(b.id));
-
-  if (unevaluated.length === 0 && !showEval) {
-    return null;
-  }
-
-  if (!showEval) {
-    return (
-      <button onClick={() => { setShowEval(true); setSelectedBenchmarks(unevaluated.map(b => b.id)); }}
-        disabled={isEvaluating}
-        className="flex items-center gap-1 text-xs text-blue-400 hover:text-blue-300 disabled:opacity-50">
-        <BarChart3 size={12} /> Run Benchmarks ({unevaluated.length} available)
-      </button>
-    );
-  }
-
-  return (
-    <div className="bg-surface-800/50 rounded-lg p-3 space-y-2">
-      <div className="flex items-center justify-between">
-        <span className="text-xs font-medium text-surface-300">Run Evaluation</span>
-        <button onClick={() => setShowEval(false)} className="text-xs text-surface-500 hover:text-surface-300">Close</button>
-      </div>
-      <div className="flex flex-wrap gap-1">
-        {availableBenchmarks.map(bench => {
-          const alreadyDone = evaluatedIds.has(bench.id);
-          const selected = selectedBenchmarks.includes(bench.id);
-          return (
-            <button key={bench.id}
-              disabled={alreadyDone}
-              onClick={() => setSelectedBenchmarks(prev =>
-                prev.includes(bench.id) ? prev.filter(id => id !== bench.id) : [...prev, bench.id]
-              )}
-              className={`px-2 py-0.5 rounded text-[10px] border ${
-                alreadyDone ? 'bg-success/10 border-success/30 text-success cursor-default' :
-                selected ? 'bg-blue-500/20 border-blue-500 text-blue-300' :
-                'bg-surface-800 border-surface-600 text-surface-400'
-              }`}
-              title={bench.description}
-            >
-              {bench.name} {alreadyDone ? `(${existingResults.find(r => r.benchmarkId === bench.id)?.score.toFixed(0)})` : ''}
-            </button>
-          );
-        })}
-      </div>
-      {selectedBenchmarks.length > 0 && (
-        <div className="flex items-center justify-between">
-          <span className="text-[10px] text-surface-500">
-            {selectedBenchmarks.length} benchmark{selectedBenchmarks.length > 1 ? 's' : ''} · ~{availableBenchmarks.filter(b => selectedBenchmarks.includes(b.id)).reduce((s, b) => s + b.ticksToRun, 0)} ticks
-          </span>
-          <button onClick={() => { onStartEval(modelId, selectedBenchmarks); setShowEval(false); }}
-            disabled={isEvaluating}
-            className="bg-blue-600 hover:bg-blue-700 text-white rounded px-3 py-1 text-xs disabled:opacity-50">
-            Evaluate
-          </button>
-        </div>
-      )}
-    </div>
-  );
-}
-
-function VariantCard({ variant, familyId, benchmarkResults, availableBenchmarks, evalJobs, onDeploy, onStartEval }: {
+function VariantCard({ variant, familyId, onDeploy }: {
  variant: ModelVariant;
  familyId: string;
-  benchmarkResults: BenchmarkResult[];
-  availableBenchmarks: typeof BENCHMARKS;
-  evalJobs: { id: string; modelId: string; status: string }[];
  onDeploy: () => void;
-  onStartEval: (modelId: string, benchmarkIds: string[]) => void;
 }) {
  const [isExpanded, setIsExpanded] = useState(false);
-  const variantResults = benchmarkResults.filter(r => r.modelId === variant.id);

  return (
    <div className="bg-surface-800/50 rounded-lg p-3 ml-4 border-l-2 border-surface-600">
@@ -1106,108 +959,12 @@ function VariantCard({ variant, familyId, benchmarkResults, availableBenchmarks,
              </div>
            ))}
          </div>
-
-          {variantResults.length > 0 && (
-            <div className="grid grid-cols-3 gap-2">
-              {variantResults.map(r => {
-                const bench = BENCHMARKS.find(b => b.id === r.benchmarkId);
-                return (
-                  <div key={r.benchmarkId} className="bg-surface-800 rounded p-1.5 text-xs">
-                    <span className="text-surface-400 text-[10px]">{bench?.name ?? r.benchmarkId}</span>
-                    <div className="font-mono text-accent-light text-[11px]">{r.score.toFixed(1)}</div>
-                  </div>
-                );
-              })}
-            </div>
-          )}
-
-          <BenchmarkEvaluator
-            modelId={variant.id}
-            modelName={variant.name}
-            availableBenchmarks={availableBenchmarks}
-            benchmarkResults={benchmarkResults}
-            evalJobs={evalJobs}
-            onStartEval={onStartEval}
-          />
        </div>
      )}
    </div>
  );
 }

-function BenchmarkLeaderboard({ benchmarkResults, baseModels, families, availableBenchmarks }: {
-  benchmarkResults: BenchmarkResult[];
-  baseModels: BaseModel[];
-  families: { id: string; name: string; variants: ModelVariant[] }[];
-  availableBenchmarks: typeof BENCHMARKS;
-}) {
-  const allModels: (BaseModel | ModelVariant)[] = [
-    ...baseModels,
-    ...families.flatMap(f => f.variants),
-  ];
-
-  const modelNames = new Map(allModels.map(m => [m.id, m.name]));
-  const benchmarksWithResults = availableBenchmarks.filter(b =>
-    benchmarkResults.some(r => r.benchmarkId === b.id),
-  );
-
-  if (benchmarksWithResults.length === 0) return null;
-
-  const modelIds = [...new Set(benchmarkResults.map(r => r.modelId))];
-
-  return (
-    <div className="bg-surface-900 border border-surface-700 rounded-xl p-4">
-      <h3 className="font-semibold mb-3 flex items-center gap-2">
-        <BarChart3 size={16} /> Benchmark Leaderboard
-      </h3>
-      <div className="overflow-x-auto">
-        <table className="w-full text-xs">
-          <thead>
-            <tr className="border-b border-surface-700">
-              <th className="text-left py-1.5 pr-3 text-surface-400 font-medium">Model</th>
-              {benchmarksWithResults.map(b => (
-                <th key={b.id} className="text-center py-1.5 px-2 text-surface-400 font-medium">{b.name}</th>
-              ))}
-              <th className="text-center py-1.5 px-2 text-surface-400 font-medium">Avg</th>
-            </tr>
-          </thead>
-          <tbody>
-            {modelIds.map(modelId => {
-              const results = benchmarkResults.filter(r => r.modelId === modelId);
-              const scores = benchmarksWithResults.map(b => {
-                const r = results.find(r => r.benchmarkId === b.id);
-                return r?.score ?? null;
-              });
-              const validScores = scores.filter((s): s is number => s !== null);
-              const avg = validScores.length > 0 ? validScores.reduce((a, b) => a + b, 0) / validScores.length : 0;
-
-              return (
-                <tr key={modelId} className="border-b border-surface-800">
-                  <td className="py-1.5 pr-3 font-medium">{modelNames.get(modelId) ?? 'Unknown'}</td>
-                  {scores.map((score, i) => (
-                    <td key={i} className="text-center py-1.5 px-2 font-mono">
-                      {score !== null ? (
-                        <span className={score >= 80 ? 'text-success' : score >= 50 ? 'text-accent-light' : 'text-surface-400'}>
-                          {score.toFixed(1)}
-                        </span>
-                      ) : (
-                        <span className="text-surface-600">—</span>
-                      )}
-                    </td>
-                  ))}
-                  <td className="text-center py-1.5 px-2 font-mono font-medium text-accent-light">
-                    {avg > 0 ? avg.toFixed(1) : '—'}
-                  </td>
-                </tr>
-              );
-            })}
-          </tbody>
-        </table>
-      </div>
-    </div>
-  );
-}
-
 function StageBar({ label, active, complete, progress }: {
  label: string; active: boolean; complete: boolean; progress: number;
 }) {
@@ -15,7 +15,6 @@ import type {
  TrainingPipeline, ModelFamily, DataMixAllocation,
  ModelArchitecture, AlignmentMethod, SizeTier,
  SFTSpecialization, QuantizationLevel, VariantCreationJob,
-  EvalJob,
  ConsumerTierId, ApiTierId,
 } from '@ai-tycoon/shared';
 import {
@@ -43,7 +42,7 @@ import {
 } from '@ai-tycoon/shared';
 import {
  emptyDCNetworkSummary, emptyCampusNetworkSummary, emptyClusterNetworkSummary,
-  BENCHMARKS, TECH_TREE, onModelDeployed,
+  TECH_TREE, onModelDeployed,
 } from '@ai-tycoon/game-engine';
 import { INITIAL_RIVALS } from '@ai-tycoon/game-engine';

@@ -59,7 +58,7 @@ export interface InfraNav {
  datacenterId?: string;
 }

-type ModelsTab = 'overview' | 'train' | 'models' | 'benchmarks' | 'products';
+type ModelsTab = 'overview' | 'train' | 'models' | 'products';

 interface UIState {
  activePage: ActivePage;
@@ -132,7 +131,6 @@ interface Actions {
  }) => void;
  startPointRelease: (baseModelId: string) => void;
  createQuantization: (baseModelId: string, level: QuantizationLevel, variantName: string) => void;
-  startEvaluation: (modelId: string, benchmarkIds: string[]) => void;
  deployModel: (modelId: string) => void;
  deployVariant: (familyId: string, variantId: string) => void;
  setProductPricing: (productLineId: string, field: string, value: number) => void;
@@ -1076,32 +1074,6 @@ export const useGameStore = create<Store>()(
        }
      },

-      startEvaluation: (modelId, benchmarkIds) => {
-        let created = false;
-        set((s) => {
-          const benchmarks = BENCHMARKS.filter(b => benchmarkIds.includes(b.id));
-          if (benchmarks.length === 0) return s;
-          created = true;
-          const totalTicks = benchmarks.reduce((sum, b) => sum + b.ticksToRun, 0);
-          const computeCost = benchmarks.reduce((sum, b) => sum + b.computeCost, 0);
-          const job: EvalJob = {
-            id: uuid(),
-            modelId,
-            benchmarkIds,
-            progressTicks: 0,
-            totalTicks,
-            computeAllocated: computeCost,
-            status: 'active',
-            results: [],
-          };
-          return { models: { ...s.models, evalJobs: [...s.models.evalJobs, job] } };
-        });
-        if (created) {
-          get().addNotification({ title: 'Evaluation Started', message: `${benchmarkIds.length} benchmark${benchmarkIds.length > 1 ? 's' : ''} queued.`, type: 'info', tick: get().meta.tickCount });
-          set({ modelsTab: 'overview' as ModelsTab });
-        }
-      },
-
      deployModel: (modelId) => {
        const modelName = get().models.baseModels.find(m => m.id === modelId)?.name ?? 'Model';
        set((s) => ({
@@ -171,7 +171,6 @@ export function createTestBaseModel(overrides?: Partial<BaseModel>): BaseModel {
    sizeTier: 'small',
    isPointRelease: false,
    sourceModelId: null,
-    benchmarkResults: {},
    dataMix: { web: 0.4, code: 0.2, books: 0.15, academic: 0.1, conversational: 0.1, specialized: 0.05 },
  };
  return overrides ? { ...base, ...overrides } : base;
@@ -181,9 +180,10 @@ export function createTestModelFamily(overrides?: Partial<ModelFamily>): ModelFa
  const base: ModelFamily = {
    id: uuid(),
    name: 'Test Family',
-    baseModels: [],
+    generation: 1,
+    baseModelIds: [],
    variants: [],
-    activeEvals: [],
+    createdAtTick: 0,
  };
  return overrides ? { ...base, ...overrides } : base;
 }
@@ -1,111 +0,0 @@
-import type { BenchmarkDefinition } from '@ai-tycoon/shared';
-
-export const BENCHMARKS: BenchmarkDefinition[] = [
-  {
-    id: 'arc-challenge',
-    name: 'ARC Challenge',
-    category: 'reasoning',
-    description: 'Advanced reasoning and comprehension tasks requiring multi-step inference.',
-    primaryCapability: 'reasoning',
-    secondaryCapability: 'knowledge',
-    computeCost: 0.001,
-    ticksToRun: 8,
-    unlockedAtEra: 'startup',
-    marketRelevance: { consumer: 0.3, enterprise: 0.5, developer: 0.4, research: 0.8 },
-  },
-  {
-    id: 'codeforce',
-    name: 'CodeForce',
-    category: 'coding',
-    description: 'Competitive programming and software engineering benchmarks.',
-    primaryCapability: 'coding',
-    secondaryCapability: 'reasoning',
-    computeCost: 0.001,
-    ticksToRun: 8,
-    unlockedAtEra: 'startup',
-    marketRelevance: { consumer: 0.2, enterprise: 0.7, developer: 0.9, research: 0.5 },
-  },
-  {
-    id: 'mathquest',
-    name: 'MathQuest',
-    category: 'math',
-    description: 'Mathematical problem-solving from algebra to graduate-level proofs.',
-    primaryCapability: 'math',
-    secondaryCapability: 'reasoning',
-    computeCost: 0.001,
-    ticksToRun: 8,
-    unlockedAtEra: 'startup',
-    marketRelevance: { consumer: 0.1, enterprise: 0.6, developer: 0.5, research: 0.9 },
-  },
-  {
-    id: 'worldfacts',
-    name: 'WorldFacts',
-    category: 'knowledge',
-    description: 'Broad factual knowledge across science, history, culture, and current events.',
-    primaryCapability: 'knowledge',
-    secondaryCapability: 'reasoning',
-    computeCost: 0.001,
-    ticksToRun: 6,
-    unlockedAtEra: 'startup',
-    marketRelevance: { consumer: 0.5, enterprise: 0.4, developer: 0.3, research: 0.6 },
-  },
-  {
-    id: 'chatrank',
-    name: 'ChatRank',
-    category: 'chat',
-    description: 'Human preference evaluation of conversational quality, helpfulness, and creativity.',
-    primaryCapability: 'creative',
-    secondaryCapability: 'knowledge',
-    computeCost: 0.002,
-    ticksToRun: 10,
-    unlockedAtEra: 'startup',
-    marketRelevance: { consumer: 0.9, enterprise: 0.3, developer: 0.2, research: 0.2 },
-  },
-  {
-    id: 'harmguard',
-    name: 'HarmGuard',
-    category: 'safety',
-    description: 'Safety evaluation measuring harm avoidance, truthfulness, and responsible behavior.',
-    primaryCapability: 'reasoning',
-    computeCost: 0.001,
-    ticksToRun: 8,
-    unlockedAtEra: 'startup',
-    marketRelevance: { consumer: 0.4, enterprise: 0.9, developer: 0.3, research: 0.7 },
-  },
-  {
-    id: 'visionbench',
-    name: 'VisionBench',
-    category: 'multimodal',
-    description: 'Image understanding, visual reasoning, and multimodal comprehension.',
-    primaryCapability: 'multimodal',
-    secondaryCapability: 'reasoning',
-    computeCost: 0.003,
-    ticksToRun: 12,
-    unlockedAtEra: 'scaleup',
-    marketRelevance: { consumer: 0.5, enterprise: 0.6, developer: 0.6, research: 0.7 },
-  },
-  {
-    id: 'agentarena',
-    name: 'AgentArena',
-    category: 'agents',
-    description: 'Autonomous agent tasks: tool use, multi-step planning, and environment interaction.',
-    primaryCapability: 'agents',
-    secondaryCapability: 'coding',
-    computeCost: 0.005,
-    ticksToRun: 15,
-    unlockedAtEra: 'bigtech',
-    marketRelevance: { consumer: 0.3, enterprise: 0.8, developer: 0.7, research: 0.6 },
-  },
-  {
-    id: 'frontier-eval',
-    name: 'Frontier Eval',
-    category: 'reasoning',
-    description: 'Cutting-edge capability evaluation at the frontier of AI research.',
-    primaryCapability: 'reasoning',
-    secondaryCapability: 'math',
-    computeCost: 0.01,
-    ticksToRun: 20,
-    unlockedAtEra: 'agi',
-    marketRelevance: { consumer: 0.2, enterprise: 0.5, developer: 0.5, research: 1.0 },
-  },
-];
@@ -11,4 +11,3 @@ export { TECH_TREE } from './data/techTree';
 export { INITIAL_RIVALS } from './data/competitors';
 export { KEY_HIRE_POOL } from './data/keyHires';
 export { ACHIEVEMENT_DEFINITIONS } from './data/achievements';
-export { BENCHMARKS } from './data/benchmarks';
@@ -1,7 +1,6 @@
-import type { GameState, MarketState, BenchmarkResult } from '@ai-tycoon/shared';
+import type { GameState, MarketState, ModelCapabilities } from '@ai-tycoon/shared';
 import { CONSUMER_TOKENS_PER_SUBSCRIBER, API_TOKENS_PER_DEVELOPER_PER_TICK, BATCH_API_DEMAND_PER_DEV, makeInitialServingMetrics } from '@ai-tycoon/shared';
 import type { TrafficPriority, TierServingMetrics } from '@ai-tycoon/shared';
-import { BENCHMARKS } from '../../data/benchmarks';
 import { computeSeasonal } from './seasonalSystem';
 import { updateObsolescence } from './obsolescenceSystem';
 import { buildPlayerProfile, buildCompetitorProfile, computeMarketShares, updateTAMGrowth } from './tamSystem';
@@ -21,31 +20,30 @@ export interface MarketTickResult {
  totalTokenDemand: number;
 }

+const SEGMENT_CAPABILITY_WEIGHTS: Record<string, Partial<Record<keyof ModelCapabilities, number>>> = {
+  consumer:   { creative: 0.35, knowledge: 0.25, reasoning: 0.15, multimodal: 0.15, coding: 0.05, agents: 0.05 },
+  enterprise: { reasoning: 0.25, coding: 0.20, agents: 0.20, knowledge: 0.15, math: 0.10, multimodal: 0.10 },
+  developer:  { coding: 0.35, reasoning: 0.20, agents: 0.20, math: 0.15, knowledge: 0.10 },
+  research:   { reasoning: 0.30, math: 0.30, knowledge: 0.20, coding: 0.10, agents: 0.10 },
+};
+
 function getSegmentQuality(
  segment: 'consumer' | 'enterprise' | 'developer' | 'research',
-  benchmarkResults: BenchmarkResult[],
+  capabilities: ModelCapabilities,
  fallbackScore: number,
 ): number {
-  if (benchmarkResults.length === 0) return fallbackScore / 100;
-
-  const bestByBenchmark = new Map<string, number>();
-  for (const r of benchmarkResults) {
-    const prev = bestByBenchmark.get(r.benchmarkId) ?? 0;
-    if (r.score > prev) bestByBenchmark.set(r.benchmarkId, r.score);
-  }
-
+  const weights = SEGMENT_CAPABILITY_WEIGHTS[segment];
+  if (!weights) return fallbackScore / 100;
  let weightedSum = 0;
  let totalWeight = 0;
-  for (const bench of BENCHMARKS) {
-    const score = bestByBenchmark.get(bench.id);
-    if (score == null) continue;
-    const weight = bench.marketRelevance[segment];
-    weightedSum += (score / 100) * weight;
-    totalWeight += weight;
+  for (const [cap, weight] of Object.entries(weights)) {
+    const score = capabilities[cap as keyof ModelCapabilities] ?? 0;
+    if (score > 0) {
+      weightedSum += (score / 100) * weight;
+      totalWeight += weight;
+    }
  }
-
-  if (totalWeight === 0) return fallbackScore / 100;
-  return weightedSum / totalWeight;
+  return totalWeight > 0 ? weightedSum / totalWeight : fallbackScore / 100;
 }

 export function processMarketV2(
@@ -54,9 +52,11 @@ export function processMarketV2(
  effectiveInferenceFlops?: number,
  researchBonuses?: ResearchBonuses,
 ): MarketTickResult {
-  const consumerQuality = getSegmentQuality('consumer', state.models.benchmarkResults, state.models.bestDeployedModelScore);
-  const enterpriseQuality = getSegmentQuality('enterprise', state.models.benchmarkResults, state.models.bestDeployedModelScore);
-  const modelQuality = state.models.benchmarkResults.length > 0
+  const caps = state.models.bestDeployedCapabilities;
+  const hasDeployed = state.models.bestDeployedModelScore > 0;
+  const consumerQuality = getSegmentQuality('consumer', caps, state.models.bestDeployedModelScore);
+  const enterpriseQuality = getSegmentQuality('enterprise', caps, state.models.bestDeployedModelScore);
+  const modelQuality = hasDeployed
    ? (consumerQuality + enterpriseQuality) / 2
    : state.models.bestDeployedModelScore / 100;

@@ -115,7 +115,7 @@ export function processMarketV2(
  const productResult = processProductLines(
    state.market.codeAssistant,
    state.market.agentsPlatform,
-    state.models.benchmarkResults,
+    caps,
    playerDevCustomers,
    playerEntCustomers,
    seasonal.multipliers.consumer,
@@ -1,4 +1,4 @@
-import type { CodeAssistantState, AgentsPlatformState, BenchmarkResult } from '@ai-tycoon/shared';
+import type { CodeAssistantState, AgentsPlatformState, ModelCapabilities } from '@ai-tycoon/shared';
 import {
  CODE_ASSISTANT_MIN_CODING_SCORE,
  CODE_ASSISTANT_BASE_ADOPTION_RATE,
@@ -7,27 +7,6 @@ import {
  AGENTS_PLATFORM_BASE_ADOPTION_RATE,
  AGENTS_PLATFORM_CHURN_RATE,
 } from '@ai-tycoon/shared';
-import { BENCHMARKS } from '../../data/benchmarks';
-
-function getBenchmarkScore(benchmarkId: string, results: BenchmarkResult[]): number {
-  let best = 0;
-  for (const r of results) {
-    if (r.benchmarkId === benchmarkId && r.score > best) best = r.score;
-  }
-  return best;
-}
-
-function getCodingScore(results: BenchmarkResult[]): number {
-  const codeBench = BENCHMARKS.find(b => b.id === 'codeforce');
-  if (!codeBench) return 0;
-  return getBenchmarkScore(codeBench.id, results);
-}
-
-function getAgentsScore(results: BenchmarkResult[]): number {
-  const agentBench = BENCHMARKS.find(b => b.id === 'agentarena');
-  if (!agentBench) return 0;
-  return getBenchmarkScore(agentBench.id, results);
-}

 export interface ProductLineResult {
  codeAssistant: CodeAssistantState;
@@ -41,7 +20,7 @@ export interface ProductLineResult {
 export function processProductLines(
  ca: CodeAssistantState,
  ap: AgentsPlatformState,
-  benchmarkResults: BenchmarkResult[],
+  capabilities: ModelCapabilities,
  playerDevCustomers: number,
  playerEntCustomers: number,
  seasonalConsumerMult: number,
@@ -53,7 +32,7 @@ export function processProductLines(
  let apRevenue = 0;

  // --- Code Assistant ---
-  updatedCA.qualityScore = getCodingScore(benchmarkResults);
+  updatedCA.qualityScore = capabilities.coding;
  if (updatedCA.isUnlocked && updatedCA.isActive && updatedCA.qualityScore >= CODE_ASSISTANT_MIN_CODING_SCORE) {
    const qualityFactor = updatedCA.qualityScore / 100;
    const priceAttr = Math.max(0.1, 1 - updatedCA.pricePerSeat / 50);
@@ -70,7 +49,7 @@ export function processProductLines(
  }

  // --- Agents Platform ---
-  updatedAP.qualityScore = getAgentsScore(benchmarkResults);
+  updatedAP.qualityScore = capabilities.agents;
  if (updatedAP.isUnlocked && updatedAP.isActive && updatedAP.qualityScore >= AGENTS_PLATFORM_MIN_AGENTS_SCORE) {
    const qualityFactor = updatedAP.qualityScore / 100;
    const priceAttr = Math.max(0.1, 1 - updatedAP.pricePerSeat / 250);
@@ -1,10 +1,8 @@
 import type {
  GameState, ModelsState, BaseModel, ModelCapabilities, SafetyProfile,
  TrainingPipeline, TrainingEvent, TrainingEventType,
-  ModelVariant, VariantCreationJob, EvalJob, BenchmarkResult,
-  BenchmarkDefinition,
+  ModelVariant, VariantCreationJob,
 } from '@ai-tycoon/shared';
-import { BENCHMARKS } from '../data/benchmarks';
 import {
  uuid, VRAM_REQUIREMENTS_BY_GENERATION,
  MOE_CAPABILITY_MULTIPLIER, MOE_SPEED_MULTIPLIER,
@@ -154,14 +152,21 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse
    });
  }

-  const updatedEvalJobs = processEvalJobs(state);
-
+  const bestDeployedCapabilities: ModelCapabilities = {
+    reasoning: 0, coding: 0, creative: 0, math: 0,
+    knowledge: 0, multimodal: 0, agents: 0, speed: 0, contextUtilization: 0,
+  };
  let bestDeployedModelScore = 0;
  let bestDeployedSafetyScore = 0;
  for (const m of baseModels) {
    if (!m.isDeployed) continue;
    if (m.rawCapability > bestDeployedModelScore) bestDeployedModelScore = m.rawCapability;
    if (m.safetyProfile.overallSafety > bestDeployedSafetyScore) bestDeployedSafetyScore = m.safetyProfile.overallSafety;
+    for (const key of Object.keys(bestDeployedCapabilities) as (keyof ModelCapabilities)[]) {
+      if ((m.capabilities[key] ?? 0) > bestDeployedCapabilities[key]) {
+        bestDeployedCapabilities[key] = m.capabilities[key];
+      }
+    }
  }
  for (const f of families) {
    for (const v of f.variants) {
@@ -169,6 +174,11 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse
      const score = computeVariantScore(v);
      if (score > bestDeployedModelScore) bestDeployedModelScore = score;
      if (v.safetyProfile.overallSafety > bestDeployedSafetyScore) bestDeployedSafetyScore = v.safetyProfile.overallSafety;
+      for (const key of Object.keys(bestDeployedCapabilities) as (keyof ModelCapabilities)[]) {
+        if ((v.capabilities[key] ?? 0) > bestDeployedCapabilities[key]) {
+          bestDeployedCapabilities[key] = v.capabilities[key];
+        }
+      }
    }
  }

@@ -179,10 +189,9 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse
      families,
      activeTrainingPipelines: updatedPipelines,
      variantJobs: updatedVariantJobs.jobs,
-      evalJobs: updatedEvalJobs.jobs,
-      benchmarkResults: [...state.models.benchmarkResults, ...updatedEvalJobs.newResults],
      bestDeployedModelScore,
      bestDeployedSafetyScore,
+      bestDeployedCapabilities,
    },
    completedModels,
    notifications,
@@ -490,47 +499,6 @@ function createVariant(job: VariantCreationJob, base: BaseModel): ModelVariant {
  };
 }

-function processEvalJobs(state: GameState): { jobs: EvalJob[]; newResults: BenchmarkResult[] } {
-  const newResults: BenchmarkResult[] = [];
-  const allModels: (BaseModel | ModelVariant)[] = [
-    ...state.models.baseModels,
-    ...state.models.families.flatMap(f => f.variants),
-  ];
-
-  const jobs = state.models.evalJobs.map(job => {
-    if (job.status !== 'active') return job;
-    const newProgress = job.progressTicks + 1;
-    if (newProgress >= job.totalTicks) {
-      const model = allModels.find(m => m.id === job.modelId);
-      if (model) {
-        const results = computeBenchmarkScores(model, job.benchmarkIds, state.meta.tickCount);
-        newResults.push(...results);
-        return { ...job, status: 'completed' as const, progressTicks: job.totalTicks, results };
-      }
-      return { ...job, status: 'completed' as const, progressTicks: job.totalTicks };
-    }
-    return { ...job, progressTicks: newProgress };
-  });
-  return { jobs, newResults };
-}
-
-function computeBenchmarkScores(
-  model: BaseModel | ModelVariant,
-  benchmarkIds: string[],
-  tick: number,
-): BenchmarkResult[] {
-  const benchmarkMap = new Map(BENCHMARKS.map(b => [b.id, b]));
-  return benchmarkIds.map(id => {
-    const bench = benchmarkMap.get(id);
-    if (!bench) return { benchmarkId: id, modelId: model.id, score: 0, ranAtTick: tick };
-    const primary = model.capabilities[bench.primaryCapability] ?? 0;
-    const secondary = bench.secondaryCapability ? (model.capabilities[bench.secondaryCapability] ?? 0) : 0;
-    const noise = (Math.random() - 0.5) * 6;
-    const score = clamp(primary * 0.7 + secondary * 0.3 + noise);
-    return { benchmarkId: id, modelId: model.id, score, ranAtTick: tick };
-  });
-}
-
 function computeVariantScore(variant: ModelVariant): number {
  const c = variant.capabilities;
  return (c.reasoning * 0.25 + c.coding * 0.2 + c.creative * 0.15 + c.math * 0.15 + c.knowledge * 0.15 + c.agents * 0.1);
@@ -66,7 +66,6 @@ describe('processTick', () => {
      isDeployed: true, trainedAtTick: 0, trainingCostTotal: 0, trainingStagesCompleted: ['pretraining' as const],
      sizeTier: 'small' as const, version: 1.0, sftSpecializations: ['general' as const], alignmentMethod: 'rlhf' as const,
      dataMix: { web: 0.4, code: 0.2, books: 0.15, academic: 0.1, conversational: 0.1, specialized: 0.05 },
-      benchmarkResults: {},
    };
    const state = createTestState({
      meta: { currentEra: 'startup' },
@@ -182,45 +182,6 @@ export interface QuantizationConfig {
  variantName: string;
 }

-export type BenchmarkCategory = 'reasoning' | 'coding' | 'math' | 'knowledge' | 'safety' | 'chat' | 'multimodal' | 'agents';
-
-export interface BenchmarkDefinition {
-  id: string;
-  name: string;
-  category: BenchmarkCategory;
-  description: string;
-  primaryCapability: keyof ModelCapabilities;
-  secondaryCapability?: keyof ModelCapabilities;
-  computeCost: number;
-  ticksToRun: number;
-  unlockedAtEra: Era;
-  marketRelevance: {
-    consumer: number;
-    enterprise: number;
-    developer: number;
-    research: number;
-  };
-}
-
-export interface BenchmarkResult {
-  benchmarkId: string;
-  modelId: string;
-  score: number;
-  ranAtTick: number;
-  rank?: number;
-}
-
-export interface EvalJob {
-  id: string;
-  modelId: string;
-  benchmarkIds: string[];
-  progressTicks: number;
-  totalTicks: number;
-  computeAllocated: number;
-  status: 'active' | 'completed';
-  results: BenchmarkResult[];
-}
-
 export type ProductLineType = 'text-api' | 'chat-product' | 'chat-free' | 'chat-enterprise' | 'code-api' | 'image' | 'agents-api';

 export interface ProductPricing {
@@ -246,11 +207,10 @@ export interface ModelsState {
  baseModels: BaseModel[];
  activeTrainingPipelines: TrainingPipeline[];
  variantJobs: VariantCreationJob[];
-  evalJobs: EvalJob[];
-  benchmarkResults: BenchmarkResult[];
  productLines: ProductLine[];
  bestDeployedModelScore: number;
  bestDeployedSafetyScore: number;
+  bestDeployedCapabilities: ModelCapabilities;
 }

 export const DEFAULT_DATA_MIX: DataMixAllocation = {
@@ -271,8 +231,6 @@ export const INITIAL_MODELS: ModelsState = {
  baseModels: [],
  activeTrainingPipelines: [],
  variantJobs: [],
-  evalJobs: [],
-  benchmarkResults: [],
  productLines: [
    {
      id: 'text-api',
@@ -307,4 +265,5 @@ export const INITIAL_MODELS: ModelsState = {
  ],
  bestDeployedModelScore: 0,
  bestDeployedSafetyScore: 0,
+  bestDeployedCapabilities: { reasoning: 0, coding: 0, creative: 0, math: 0, knowledge: 0, multimodal: 0, agents: 0, speed: 0, contextUtilization: 0 },
 };