Remove benchmark evaluation system, use training capabilities directly

Model quality for market segments and product lines is now derived from deployed model capabilities (coding, reasoning, agents, etc.) instead of requiring a separate manual benchmark evaluation step. This eliminates an unbounded benchmarkResults[] array that was scanned 5x per tick and removes ~480 lines of dead-weight UI, types, and engine code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-26 19:28:59 -04:00
parent db034687d6
commit bbb69a315c
10 changed files with 57 additions and 535 deletions
@@ -1,5 +1,5 @@
 import { useState } from 'react';
-import { Play, Rocket, Globe, ChevronDown, ChevronUp, Beaker, Shield, Zap, BarChart3 } from 'lucide-react';
+import { Play, Rocket, Globe, ChevronDown, ChevronUp, Beaker, Shield, Zap } from 'lucide-react';
 import { TutorialHint } from '@/components/game/TutorialHint';
 import { ConfirmModal } from '@/components/common/ConfirmModal';
 import { useGameStore } from '@/store';
@@ -16,10 +16,9 @@ import {
 } from '@ai-tycoon/shared';
 import type {
  ModelArchitecture, DataMixAllocation, SFTSpecialization, AlignmentMethod,
-  DataDomain, QuantizationLevel, BaseModel, ModelVariant, BenchmarkResult,
+  DataDomain, QuantizationLevel, BaseModel, ModelVariant,
  SizeTier, ModelFamily,
 } from '@ai-tycoon/shared';
-import { BENCHMARKS } from '@ai-tycoon/game-engine';

 const DATA_MIX_PRESETS: Record<string, { label: string; mix: DataMixAllocation }> = {
  balanced: { label: 'Balanced', mix: DEFAULT_DATA_MIX },
@@ -52,8 +51,6 @@ export function ModelsPage() {
  const families = useGameStore((s) => s.models.families);
  const pipelines = useGameStore((s) => s.models.activeTrainingPipelines);
  const variantJobs = useGameStore((s) => s.models.variantJobs);
-  const evalJobs = useGameStore((s) => s.models.evalJobs);
-  const benchmarkResults = useGameStore((s) => s.models.benchmarkResults);
  const productLines = useGameStore((s) => s.models.productLines);
  const totalFlops = useGameStore((s) => s.compute.totalFlops);
  const totalVramGB = useGameStore((s) => s.compute.totalVramGB);
@@ -64,7 +61,6 @@ export function ModelsPage() {
  const deployModel = useGameStore((s) => s.deployModel);
  const deployVariant = useGameStore((s) => s.deployVariant);
  const createQuantization = useGameStore((s) => s.createQuantization);
-  const startEvaluation = useGameStore((s) => s.startEvaluation);
  const setTrainingAllocation = useGameStore((s) => s.setTrainingAllocation);
  const openSourceModel = useGameStore((s) => s.openSourceModel);
  const openSourcedModels = useGameStore((s) => s.market.openSourcedModels);
@@ -96,15 +92,12 @@ export function ModelsPage() {

  const activePipelines = pipelines.filter(p => p.status === 'active' || p.status === 'stalled');
  const activeVariantJobs = variantJobs.filter(j => j.status === 'active');
-  const activeEvalJobs = evalJobs.filter(j => j.status === 'active');
  const undeployedCount = baseModels.filter(m => !m.isDeployed).length;
-  const hasActiveJobs = activePipelines.length > 0 || activeVariantJobs.length > 0 || activeEvalJobs.length > 0;
+  const hasActiveJobs = activePipelines.length > 0 || activeVariantJobs.length > 0;
  const noModelDeployed = baseModels.length > 0 && !baseModels.some(m => m.isDeployed);

  const eraOrder = ['startup', 'scaleup', 'bigtech', 'agi'] as const;
  const currentEraIdx = eraOrder.indexOf(currentEra);
-  const availableBenchmarks = BENCHMARKS.filter(b => eraOrder.indexOf(b.unlockedAtEra) <= currentEraIdx);
-
  const hasAlignmentResearch = completedResearch.some(r =>
    r === 'alignment-research' || r === 'interpretability' || r === 'constitutional-ai',
  );
@@ -186,7 +179,6 @@ export function ModelsPage() {
          { id: 'overview' as const, label: 'Overview' },
          { id: 'train' as const, label: 'Train New' },
          { id: 'models' as const, label: `Families${families.length > 0 ? ` (${families.length})` : ''}` },
-          { id: 'benchmarks' as const, label: 'Benchmarks' },
          { id: 'products' as const, label: 'Products' },
        ]).map(tab => (
          <button
@@ -347,28 +339,6 @@ export function ModelsPage() {
        </div>
      )}

-      {/* Active Eval Jobs */}
-      {modelsTab === 'overview' && activeEvalJobs.length > 0 && (
-        <div className="space-y-3">
-          <h3 className="font-semibold">Running Evaluations</h3>
-          {activeEvalJobs.map(job => {
-            const model = baseModels.find(m => m.id === job.modelId) ?? families.flatMap(f => f.variants).find(v => v.id === job.modelId);
-            const progress = job.progressTicks / job.totalTicks;
-            return (
-              <div key={job.id} className="bg-surface-900 border border-surface-700 rounded-xl p-3">
-                <div className="flex items-center justify-between mb-1">
-                  <span className="text-sm">{model?.name ?? 'Unknown'} — {job.benchmarkIds.length} benchmarks</span>
-                  <span className="text-xs text-surface-400">{formatPercent(progress)}</span>
-                </div>
-                <div className="h-1.5 bg-surface-800 rounded-full overflow-hidden">
-                  <div className="h-full bg-blue-500 rounded-full transition-all" style={{ width: `${progress * 100}%` }} />
-                </div>
-              </div>
-            );
-          })}
-        </div>
-      )}
-
      {/* Train New Model */}
      {modelsTab === 'train' && <div className="bg-surface-900 border border-surface-700 rounded-xl p-4 space-y-4">
        <h3 className="font-semibold">Train New Model</h3>
@@ -716,9 +686,8 @@ export function ModelsPage() {
                    {familyModels.map(model => (
                      <div key={model.id} className="space-y-3">
                        <h5 className="text-sm font-medium text-surface-300">{model.name}</h5>
-                        <ModelDetails model={model} benchmarkResults={benchmarkResults} />
+                        <ModelDetails model={model} />
                        <QuantizationCreator model={model} completedResearch={completedResearch} onQuantize={createQuantization} />
-                        <BenchmarkEvaluator modelId={model.id} modelName={model.name} availableBenchmarks={availableBenchmarks} benchmarkResults={benchmarkResults} evalJobs={evalJobs} onStartEval={startEvaluation} />
                      </div>
                    ))}

@@ -730,11 +699,7 @@ export function ModelsPage() {
                            key={variant.id}
                            variant={variant}
                            familyId={family.id}
-                            benchmarkResults={benchmarkResults}
-                            availableBenchmarks={availableBenchmarks}
-                            evalJobs={evalJobs}
                            onDeploy={() => deployVariant(family.id, variant.id)}
-                            onStartEval={startEvaluation}
                          />
                        ))}
                      </div>
@@ -747,21 +712,6 @@ export function ModelsPage() {
        </div>
      )}

-      {/* Benchmark Leaderboard */}
-      {modelsTab === 'benchmarks' && benchmarkResults.length > 0 && (
-        <BenchmarkLeaderboard
-          benchmarkResults={benchmarkResults}
-          baseModels={baseModels}
-          families={families}
-          availableBenchmarks={availableBenchmarks}
-        />
-      )}
-      {modelsTab === 'benchmarks' && benchmarkResults.length === 0 && (
-        <div className="bg-surface-900 border border-surface-700 rounded-xl p-8 text-center text-surface-500 text-sm">
-          No benchmark results yet. Run evaluations from the Models tab.
-        </div>
-      )}
-
      {/* Product Lines */}
      {modelsTab === 'products' && <div className="space-y-3">
        <h3 className="font-semibold">Product Lines</h3>
@@ -865,9 +815,7 @@ function ModelActions({ model, isOpenSourced, onDeploy, onOpenSource }: {
  );
 }

-function ModelDetails({ model, benchmarkResults }: { model: BaseModel; benchmarkResults: BenchmarkResult[] }) {
-  const modelResults = benchmarkResults.filter(r => r.modelId === model.id);
-
+function ModelDetails({ model }: { model: BaseModel }) {
  return (
    <div className="space-y-3">
      <div className="grid grid-cols-3 gap-3 text-xs">
@@ -907,22 +855,6 @@ function ModelDetails({ model, benchmarkResults }: { model: BaseModel; benchmark
        </div>
      </div>

-      {modelResults.length > 0 && (
-        <div>
-          <span className="text-xs font-medium text-surface-300">Benchmark Scores</span>
-          <div className="grid grid-cols-3 gap-2 mt-1">
-            {modelResults.map(r => {
-              const bench = BENCHMARKS.find(b => b.id === r.benchmarkId);
-              return (
-                <div key={r.benchmarkId} className="bg-surface-800 rounded-lg p-2 text-xs">
-                  <span className="text-surface-400">{bench?.name ?? r.benchmarkId}</span>
-                  <div className="font-mono mt-0.5 text-accent-light">{r.score.toFixed(1)}</div>
-                </div>
-              );
-            })}
-          </div>
-        </div>
-      )}
    </div>
  );
 }
@@ -981,91 +913,12 @@ function QuantizationCreator({ model, completedResearch, onQuantize }: {
  );
 }

-function BenchmarkEvaluator({ modelId, modelName, availableBenchmarks, benchmarkResults, evalJobs, onStartEval }: {
-  modelId: string;
-  modelName: string;
-  availableBenchmarks: typeof BENCHMARKS;
-  benchmarkResults: BenchmarkResult[];
-  evalJobs: { id: string; modelId: string; status: string }[];
-  onStartEval: (modelId: string, benchmarkIds: string[]) => void;
-}) {
-  const [showEval, setShowEval] = useState(false);
-  const [selectedBenchmarks, setSelectedBenchmarks] = useState<string[]>([]);
-
-  const existingResults = benchmarkResults.filter(r => r.modelId === modelId);
-  const evaluatedIds = new Set(existingResults.map(r => r.benchmarkId));
-  const isEvaluating = evalJobs.some(j => j.modelId === modelId && j.status === 'active');
-  const unevaluated = availableBenchmarks.filter(b => !evaluatedIds.has(b.id));
-
-  if (unevaluated.length === 0 && !showEval) {
-    return null;
-  }
-
-  if (!showEval) {
-    return (
-      <button onClick={() => { setShowEval(true); setSelectedBenchmarks(unevaluated.map(b => b.id)); }}
-        disabled={isEvaluating}
-        className="flex items-center gap-1 text-xs text-blue-400 hover:text-blue-300 disabled:opacity-50">
-        <BarChart3 size={12} /> Run Benchmarks ({unevaluated.length} available)
-      </button>
-    );
-  }
-
-  return (
-    <div className="bg-surface-800/50 rounded-lg p-3 space-y-2">
-      <div className="flex items-center justify-between">
-        <span className="text-xs font-medium text-surface-300">Run Evaluation</span>
-        <button onClick={() => setShowEval(false)} className="text-xs text-surface-500 hover:text-surface-300">Close</button>
-      </div>
-      <div className="flex flex-wrap gap-1">
-        {availableBenchmarks.map(bench => {
-          const alreadyDone = evaluatedIds.has(bench.id);
-          const selected = selectedBenchmarks.includes(bench.id);
-          return (
-            <button key={bench.id}
-              disabled={alreadyDone}
-              onClick={() => setSelectedBenchmarks(prev =>
-                prev.includes(bench.id) ? prev.filter(id => id !== bench.id) : [...prev, bench.id]
-              )}
-              className={`px-2 py-0.5 rounded text-[10px] border ${
-                alreadyDone ? 'bg-success/10 border-success/30 text-success cursor-default' :
-                selected ? 'bg-blue-500/20 border-blue-500 text-blue-300' :
-                'bg-surface-800 border-surface-600 text-surface-400'
-              }`}
-              title={bench.description}
-            >
-              {bench.name} {alreadyDone ? `(${existingResults.find(r => r.benchmarkId === bench.id)?.score.toFixed(0)})` : ''}
-            </button>
-          );
-        })}
-      </div>
-      {selectedBenchmarks.length > 0 && (
-        <div className="flex items-center justify-between">
-          <span className="text-[10px] text-surface-500">
-            {selectedBenchmarks.length} benchmark{selectedBenchmarks.length > 1 ? 's' : ''} · ~{availableBenchmarks.filter(b => selectedBenchmarks.includes(b.id)).reduce((s, b) => s + b.ticksToRun, 0)} ticks
-          </span>
-          <button onClick={() => { onStartEval(modelId, selectedBenchmarks); setShowEval(false); }}
-            disabled={isEvaluating}
-            className="bg-blue-600 hover:bg-blue-700 text-white rounded px-3 py-1 text-xs disabled:opacity-50">
-            Evaluate
-          </button>
-        </div>
-      )}
-    </div>
-  );
-}
-
-function VariantCard({ variant, familyId, benchmarkResults, availableBenchmarks, evalJobs, onDeploy, onStartEval }: {
+function VariantCard({ variant, familyId, onDeploy }: {
  variant: ModelVariant;
  familyId: string;
-  benchmarkResults: BenchmarkResult[];
-  availableBenchmarks: typeof BENCHMARKS;
-  evalJobs: { id: string; modelId: string; status: string }[];
  onDeploy: () => void;
-  onStartEval: (modelId: string, benchmarkIds: string[]) => void;
 }) {
  const [isExpanded, setIsExpanded] = useState(false);
-  const variantResults = benchmarkResults.filter(r => r.modelId === variant.id);

  return (
    <div className="bg-surface-800/50 rounded-lg p-3 ml-4 border-l-2 border-surface-600">
@@ -1106,108 +959,12 @@ function VariantCard({ variant, familyId, benchmarkResults, availableBenchmarks,
              </div>
            ))}
          </div>
-
-          {variantResults.length > 0 && (
-            <div className="grid grid-cols-3 gap-2">
-              {variantResults.map(r => {
-                const bench = BENCHMARKS.find(b => b.id === r.benchmarkId);
-                return (
-                  <div key={r.benchmarkId} className="bg-surface-800 rounded p-1.5 text-xs">
-                    <span className="text-surface-400 text-[10px]">{bench?.name ?? r.benchmarkId}</span>
-                    <div className="font-mono text-accent-light text-[11px]">{r.score.toFixed(1)}</div>
-                  </div>
-                );
-              })}
-            </div>
-          )}
-
-          <BenchmarkEvaluator
-            modelId={variant.id}
-            modelName={variant.name}
-            availableBenchmarks={availableBenchmarks}
-            benchmarkResults={benchmarkResults}
-            evalJobs={evalJobs}
-            onStartEval={onStartEval}
-          />
        </div>
      )}
    </div>
  );
 }

-function BenchmarkLeaderboard({ benchmarkResults, baseModels, families, availableBenchmarks }: {
-  benchmarkResults: BenchmarkResult[];
-  baseModels: BaseModel[];
-  families: { id: string; name: string; variants: ModelVariant[] }[];
-  availableBenchmarks: typeof BENCHMARKS;
-}) {
-  const allModels: (BaseModel | ModelVariant)[] = [
-    ...baseModels,
-    ...families.flatMap(f => f.variants),
-  ];
-
-  const modelNames = new Map(allModels.map(m => [m.id, m.name]));
-  const benchmarksWithResults = availableBenchmarks.filter(b =>
-    benchmarkResults.some(r => r.benchmarkId === b.id),
-  );
-
-  if (benchmarksWithResults.length === 0) return null;
-
-  const modelIds = [...new Set(benchmarkResults.map(r => r.modelId))];
-
-  return (
-    <div className="bg-surface-900 border border-surface-700 rounded-xl p-4">
-      <h3 className="font-semibold mb-3 flex items-center gap-2">
-        <BarChart3 size={16} /> Benchmark Leaderboard
-      </h3>
-      <div className="overflow-x-auto">
-        <table className="w-full text-xs">
-          <thead>
-            <tr className="border-b border-surface-700">
-              <th className="text-left py-1.5 pr-3 text-surface-400 font-medium">Model</th>
-              {benchmarksWithResults.map(b => (
-                <th key={b.id} className="text-center py-1.5 px-2 text-surface-400 font-medium">{b.name}</th>
-              ))}
-              <th className="text-center py-1.5 px-2 text-surface-400 font-medium">Avg</th>
-            </tr>
-          </thead>
-          <tbody>
-            {modelIds.map(modelId => {
-              const results = benchmarkResults.filter(r => r.modelId === modelId);
-              const scores = benchmarksWithResults.map(b => {
-                const r = results.find(r => r.benchmarkId === b.id);
-                return r?.score ?? null;
-              });
-              const validScores = scores.filter((s): s is number => s !== null);
-              const avg = validScores.length > 0 ? validScores.reduce((a, b) => a + b, 0) / validScores.length : 0;
-
-              return (
-                <tr key={modelId} className="border-b border-surface-800">
-                  <td className="py-1.5 pr-3 font-medium">{modelNames.get(modelId) ?? 'Unknown'}</td>
-                  {scores.map((score, i) => (
-                    <td key={i} className="text-center py-1.5 px-2 font-mono">
-                      {score !== null ? (
-                        <span className={score >= 80 ? 'text-success' : score >= 50 ? 'text-accent-light' : 'text-surface-400'}>
-                          {score.toFixed(1)}
-                        </span>
-                      ) : (
-                        <span className="text-surface-600">—</span>
-                      )}
-                    </td>
-                  ))}
-                  <td className="text-center py-1.5 px-2 font-mono font-medium text-accent-light">
-                    {avg > 0 ? avg.toFixed(1) : '—'}
-                  </td>
-                </tr>
-              );
-            })}
-          </tbody>
-        </table>
-      </div>
-    </div>
-  );
-}
-
 function StageBar({ label, active, complete, progress }: {
  label: string; active: boolean; complete: boolean; progress: number;
 }) {