Remove benchmark evaluation system, use training capabilities directly

Model quality for market segments and product lines now derives from deployed
model capabilities (coding, reasoning, agents, etc.) instead of requiring a
separate manual benchmark evaluation step. This eliminates an unbounded
benchmarkResults[] array that was scanned 5x per tick and removes ~480 lines
of dead-weight UI, types, and engine code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-26 19:28:59 -04:00
parent db034687d6
commit bbb69a315c
10 changed files with 57 additions and 535 deletions
+6 -249
View File
@@ -1,5 +1,5 @@
import { useState } from 'react';
import { Play, Rocket, Globe, ChevronDown, ChevronUp, Beaker, Shield, Zap, BarChart3 } from 'lucide-react';
import { Play, Rocket, Globe, ChevronDown, ChevronUp, Beaker, Shield, Zap } from 'lucide-react';
import { TutorialHint } from '@/components/game/TutorialHint';
import { ConfirmModal } from '@/components/common/ConfirmModal';
import { useGameStore } from '@/store';
@@ -16,10 +16,9 @@ import {
} from '@ai-tycoon/shared';
import type {
ModelArchitecture, DataMixAllocation, SFTSpecialization, AlignmentMethod,
DataDomain, QuantizationLevel, BaseModel, ModelVariant, BenchmarkResult,
DataDomain, QuantizationLevel, BaseModel, ModelVariant,
SizeTier, ModelFamily,
} from '@ai-tycoon/shared';
import { BENCHMARKS } from '@ai-tycoon/game-engine';
const DATA_MIX_PRESETS: Record<string, { label: string; mix: DataMixAllocation }> = {
balanced: { label: 'Balanced', mix: DEFAULT_DATA_MIX },
@@ -52,8 +51,6 @@ export function ModelsPage() {
const families = useGameStore((s) => s.models.families);
const pipelines = useGameStore((s) => s.models.activeTrainingPipelines);
const variantJobs = useGameStore((s) => s.models.variantJobs);
const evalJobs = useGameStore((s) => s.models.evalJobs);
const benchmarkResults = useGameStore((s) => s.models.benchmarkResults);
const productLines = useGameStore((s) => s.models.productLines);
const totalFlops = useGameStore((s) => s.compute.totalFlops);
const totalVramGB = useGameStore((s) => s.compute.totalVramGB);
@@ -64,7 +61,6 @@ export function ModelsPage() {
const deployModel = useGameStore((s) => s.deployModel);
const deployVariant = useGameStore((s) => s.deployVariant);
const createQuantization = useGameStore((s) => s.createQuantization);
const startEvaluation = useGameStore((s) => s.startEvaluation);
const setTrainingAllocation = useGameStore((s) => s.setTrainingAllocation);
const openSourceModel = useGameStore((s) => s.openSourceModel);
const openSourcedModels = useGameStore((s) => s.market.openSourcedModels);
@@ -96,15 +92,12 @@ export function ModelsPage() {
const activePipelines = pipelines.filter(p => p.status === 'active' || p.status === 'stalled');
const activeVariantJobs = variantJobs.filter(j => j.status === 'active');
const activeEvalJobs = evalJobs.filter(j => j.status === 'active');
const undeployedCount = baseModels.filter(m => !m.isDeployed).length;
const hasActiveJobs = activePipelines.length > 0 || activeVariantJobs.length > 0 || activeEvalJobs.length > 0;
const hasActiveJobs = activePipelines.length > 0 || activeVariantJobs.length > 0;
const noModelDeployed = baseModels.length > 0 && !baseModels.some(m => m.isDeployed);
const eraOrder = ['startup', 'scaleup', 'bigtech', 'agi'] as const;
const currentEraIdx = eraOrder.indexOf(currentEra);
const availableBenchmarks = BENCHMARKS.filter(b => eraOrder.indexOf(b.unlockedAtEra) <= currentEraIdx);
const hasAlignmentResearch = completedResearch.some(r =>
r === 'alignment-research' || r === 'interpretability' || r === 'constitutional-ai',
);
@@ -186,7 +179,6 @@ export function ModelsPage() {
{ id: 'overview' as const, label: 'Overview' },
{ id: 'train' as const, label: 'Train New' },
{ id: 'models' as const, label: `Families${families.length > 0 ? ` (${families.length})` : ''}` },
{ id: 'benchmarks' as const, label: 'Benchmarks' },
{ id: 'products' as const, label: 'Products' },
]).map(tab => (
<button
@@ -347,28 +339,6 @@ export function ModelsPage() {
</div>
)}
{/* Active Eval Jobs */}
{modelsTab === 'overview' && activeEvalJobs.length > 0 && (
<div className="space-y-3">
<h3 className="font-semibold">Running Evaluations</h3>
{activeEvalJobs.map(job => {
const model = baseModels.find(m => m.id === job.modelId) ?? families.flatMap(f => f.variants).find(v => v.id === job.modelId);
const progress = job.progressTicks / job.totalTicks;
return (
<div key={job.id} className="bg-surface-900 border border-surface-700 rounded-xl p-3">
<div className="flex items-center justify-between mb-1">
<span className="text-sm">{model?.name ?? 'Unknown'} {job.benchmarkIds.length} benchmarks</span>
<span className="text-xs text-surface-400">{formatPercent(progress)}</span>
</div>
<div className="h-1.5 bg-surface-800 rounded-full overflow-hidden">
<div className="h-full bg-blue-500 rounded-full transition-all" style={{ width: `${progress * 100}%` }} />
</div>
</div>
);
})}
</div>
)}
{/* Train New Model */}
{modelsTab === 'train' && <div className="bg-surface-900 border border-surface-700 rounded-xl p-4 space-y-4">
<h3 className="font-semibold">Train New Model</h3>
@@ -716,9 +686,8 @@ export function ModelsPage() {
{familyModels.map(model => (
<div key={model.id} className="space-y-3">
<h5 className="text-sm font-medium text-surface-300">{model.name}</h5>
<ModelDetails model={model} benchmarkResults={benchmarkResults} />
<ModelDetails model={model} />
<QuantizationCreator model={model} completedResearch={completedResearch} onQuantize={createQuantization} />
<BenchmarkEvaluator modelId={model.id} modelName={model.name} availableBenchmarks={availableBenchmarks} benchmarkResults={benchmarkResults} evalJobs={evalJobs} onStartEval={startEvaluation} />
</div>
))}
@@ -730,11 +699,7 @@ export function ModelsPage() {
key={variant.id}
variant={variant}
familyId={family.id}
benchmarkResults={benchmarkResults}
availableBenchmarks={availableBenchmarks}
evalJobs={evalJobs}
onDeploy={() => deployVariant(family.id, variant.id)}
onStartEval={startEvaluation}
/>
))}
</div>
@@ -747,21 +712,6 @@ export function ModelsPage() {
</div>
)}
{/* Benchmark Leaderboard */}
{modelsTab === 'benchmarks' && benchmarkResults.length > 0 && (
<BenchmarkLeaderboard
benchmarkResults={benchmarkResults}
baseModels={baseModels}
families={families}
availableBenchmarks={availableBenchmarks}
/>
)}
{modelsTab === 'benchmarks' && benchmarkResults.length === 0 && (
<div className="bg-surface-900 border border-surface-700 rounded-xl p-8 text-center text-surface-500 text-sm">
No benchmark results yet. Run evaluations from the Models tab.
</div>
)}
{/* Product Lines */}
{modelsTab === 'products' && <div className="space-y-3">
<h3 className="font-semibold">Product Lines</h3>
@@ -865,9 +815,7 @@ function ModelActions({ model, isOpenSourced, onDeploy, onOpenSource }: {
);
}
function ModelDetails({ model, benchmarkResults }: { model: BaseModel; benchmarkResults: BenchmarkResult[] }) {
const modelResults = benchmarkResults.filter(r => r.modelId === model.id);
function ModelDetails({ model }: { model: BaseModel }) {
return (
<div className="space-y-3">
<div className="grid grid-cols-3 gap-3 text-xs">
@@ -907,22 +855,6 @@ function ModelDetails({ model, benchmarkResults }: { model: BaseModel; benchmark
</div>
</div>
{modelResults.length > 0 && (
<div>
<span className="text-xs font-medium text-surface-300">Benchmark Scores</span>
<div className="grid grid-cols-3 gap-2 mt-1">
{modelResults.map(r => {
const bench = BENCHMARKS.find(b => b.id === r.benchmarkId);
return (
<div key={r.benchmarkId} className="bg-surface-800 rounded-lg p-2 text-xs">
<span className="text-surface-400">{bench?.name ?? r.benchmarkId}</span>
<div className="font-mono mt-0.5 text-accent-light">{r.score.toFixed(1)}</div>
</div>
);
})}
</div>
</div>
)}
</div>
);
}
@@ -981,91 +913,12 @@ function QuantizationCreator({ model, completedResearch, onQuantize }: {
);
}
/**
 * Per-model benchmark launcher.
 *
 * Collapsed, it shows a "Run Benchmarks" link, or nothing at all once every
 * available benchmark already has a result for this model. Expanded, it shows
 * one toggle chip per available benchmark plus an "Evaluate" button that hands
 * the current selection to `onStartEval`.
 *
 * `modelName` is accepted as a prop but is not read by this component.
 */
function BenchmarkEvaluator({ modelId, modelName, availableBenchmarks, benchmarkResults, evalJobs, onStartEval }: {
  modelId: string;
  modelName: string;
  availableBenchmarks: typeof BENCHMARKS;
  benchmarkResults: BenchmarkResult[];
  evalJobs: { id: string; modelId: string; status: string }[];
  onStartEval: (modelId: string, benchmarkIds: string[]) => void;
}) {
  // Hooks are declared before any early return so their order is stable across renders.
  const [showEval, setShowEval] = useState(false);
  const [selectedBenchmarks, setSelectedBenchmarks] = useState<string[]>([]);

  // Results already recorded for this model, and the benchmark ids they cover.
  const existingResults = benchmarkResults.filter(result => result.modelId === modelId);
  const evaluatedIds = new Set(existingResults.map(result => result.benchmarkId));
  const unevaluated = availableBenchmarks.filter(bench => !evaluatedIds.has(bench.id));

  // Both buttons are disabled while this model has an active evaluation job.
  const isEvaluating = evalJobs.some(job => job.status === 'active' && job.modelId === modelId);

  if (!showEval) {
    // Nothing left to run: render no launcher at all.
    if (unevaluated.length === 0) {
      return null;
    }

    // Opening the panel pre-selects every benchmark that has no result yet.
    const openPanel = () => {
      setShowEval(true);
      setSelectedBenchmarks(unevaluated.map(bench => bench.id));
    };

    return (
      <button
        onClick={openPanel}
        disabled={isEvaluating}
        className="flex items-center gap-1 text-xs text-blue-400 hover:text-blue-300 disabled:opacity-50"
      >
        <BarChart3 size={12} /> Run Benchmarks ({unevaluated.length} available)
      </button>
    );
  }

  // Adds the id to the selection when absent, removes it when present.
  const toggleBenchmark = (benchmarkId: string) => {
    setSelectedBenchmarks(prev =>
      prev.includes(benchmarkId) ? prev.filter(id => id !== benchmarkId) : [...prev, benchmarkId]
    );
  };

  const submitSelection = () => {
    onStartEval(modelId, selectedBenchmarks);
    setShowEval(false);
  };

  const selectedCount = selectedBenchmarks.length;
  const pluralSuffix = selectedCount > 1 ? 's' : '';
  // Sum of ticksToRun over the selected benchmarks that are currently available.
  const estimatedTicks = availableBenchmarks.reduce(
    (total, bench) => (selectedBenchmarks.includes(bench.id) ? total + bench.ticksToRun : total),
    0,
  );

  return (
    <div className="bg-surface-800/50 rounded-lg p-3 space-y-2">
      <div className="flex items-center justify-between">
        <span className="text-xs font-medium text-surface-300">Run Evaluation</span>
        <button onClick={() => setShowEval(false)} className="text-xs text-surface-500 hover:text-surface-300">Close</button>
      </div>
      <div className="flex flex-wrap gap-1">
        {availableBenchmarks.map(bench => {
          const alreadyDone = evaluatedIds.has(bench.id);

          // Chip colour: finished > selected > idle.
          let stateClasses: string;
          if (alreadyDone) {
            stateClasses = 'bg-success/10 border-success/30 text-success cursor-default';
          } else if (selectedBenchmarks.includes(bench.id)) {
            stateClasses = 'bg-blue-500/20 border-blue-500 text-blue-300';
          } else {
            stateClasses = 'bg-surface-800 border-surface-600 text-surface-400';
          }

          // Finished benchmarks show their rounded score next to the name.
          const scoreSuffix = alreadyDone
            ? `(${existingResults.find(result => result.benchmarkId === bench.id)?.score.toFixed(0)})`
            : '';

          return (
            <button
              key={bench.id}
              disabled={alreadyDone}
              onClick={() => toggleBenchmark(bench.id)}
              className={`px-2 py-0.5 rounded text-[10px] border ${stateClasses}`}
              title={bench.description}
            >
              {bench.name} {scoreSuffix}
            </button>
          );
        })}
      </div>
      {selectedCount > 0 && (
        <div className="flex items-center justify-between">
          <span className="text-[10px] text-surface-500">
            {selectedCount} benchmark{pluralSuffix} · ~{estimatedTicks} ticks
          </span>
          <button
            onClick={submitSelection}
            disabled={isEvaluating}
            className="bg-blue-600 hover:bg-blue-700 text-white rounded px-3 py-1 text-xs disabled:opacity-50"
          >
            Evaluate
          </button>
        </div>
      )}
    </div>
  );
}
function VariantCard({ variant, familyId, benchmarkResults, availableBenchmarks, evalJobs, onDeploy, onStartEval }: {
function VariantCard({ variant, familyId, onDeploy }: {
variant: ModelVariant;
familyId: string;
benchmarkResults: BenchmarkResult[];
availableBenchmarks: typeof BENCHMARKS;
evalJobs: { id: string; modelId: string; status: string }[];
onDeploy: () => void;
onStartEval: (modelId: string, benchmarkIds: string[]) => void;
}) {
const [isExpanded, setIsExpanded] = useState(false);
const variantResults = benchmarkResults.filter(r => r.modelId === variant.id);
return (
<div className="bg-surface-800/50 rounded-lg p-3 ml-4 border-l-2 border-surface-600">
@@ -1106,108 +959,12 @@ function VariantCard({ variant, familyId, benchmarkResults, availableBenchmarks,
</div>
))}
</div>
{variantResults.length > 0 && (
<div className="grid grid-cols-3 gap-2">
{variantResults.map(r => {
const bench = BENCHMARKS.find(b => b.id === r.benchmarkId);
return (
<div key={r.benchmarkId} className="bg-surface-800 rounded p-1.5 text-xs">
<span className="text-surface-400 text-[10px]">{bench?.name ?? r.benchmarkId}</span>
<div className="font-mono text-accent-light text-[11px]">{r.score.toFixed(1)}</div>
</div>
);
})}
</div>
)}
<BenchmarkEvaluator
modelId={variant.id}
modelName={variant.name}
availableBenchmarks={availableBenchmarks}
benchmarkResults={benchmarkResults}
evalJobs={evalJobs}
onStartEval={onStartEval}
/>
</div>
)}
</div>
);
}
/**
 * Cross-model score table: one row per model that has at least one benchmark
 * result, one column per available benchmark that has at least one result,
 * plus a trailing per-row average.
 *
 * Returns `null` when none of the available benchmarks has a result.
 *
 * @param benchmarkResults  All recorded results. When a (model, benchmark) pair
 *                          appears more than once, the first occurrence is shown.
 * @param baseModels        Base models, used to resolve model ids to names.
 * @param families          Model families; their variants are also used for name lookup.
 * @param availableBenchmarks  Benchmarks eligible to appear as columns.
 */
function BenchmarkLeaderboard({ benchmarkResults, baseModels, families, availableBenchmarks }: {
  benchmarkResults: BenchmarkResult[];
  baseModels: BaseModel[];
  families: { id: string; name: string; variants: ModelVariant[] }[];
  availableBenchmarks: typeof BENCHMARKS;
}) {
  const allModels: (BaseModel | ModelVariant)[] = [
    ...baseModels,
    ...families.flatMap(f => f.variants),
  ];
  const modelNames = new Map(allModels.map(m => [m.id, m.name]));

  // Index the results in a single pass instead of rescanning the whole array
  // for every model row and every benchmark column. Map insertion order keeps
  // rows in order of each model's first appearance in benchmarkResults.
  const scoresByModel = new Map<string, Map<string, number>>();
  const benchmarkIdsWithResults = new Set<string>();
  for (const result of benchmarkResults) {
    benchmarkIdsWithResults.add(result.benchmarkId);
    let modelScores = scoresByModel.get(result.modelId);
    if (modelScores === undefined) {
      modelScores = new Map<string, number>();
      scoresByModel.set(result.modelId, modelScores);
    }
    // First result wins for a duplicated (model, benchmark) pair.
    if (!modelScores.has(result.benchmarkId)) {
      modelScores.set(result.benchmarkId, result.score);
    }
  }

  const benchmarksWithResults = availableBenchmarks.filter(b => benchmarkIdsWithResults.has(b.id));
  if (benchmarksWithResults.length === 0) return null;

  return (
    <div className="bg-surface-900 border border-surface-700 rounded-xl p-4">
      <h3 className="font-semibold mb-3 flex items-center gap-2">
        <BarChart3 size={16} /> Benchmark Leaderboard
      </h3>
      <div className="overflow-x-auto">
        <table className="w-full text-xs">
          <thead>
            <tr className="border-b border-surface-700">
              <th className="text-left py-1.5 pr-3 text-surface-400 font-medium">Model</th>
              {benchmarksWithResults.map(b => (
                <th key={b.id} className="text-center py-1.5 px-2 text-surface-400 font-medium">{b.name}</th>
              ))}
              <th className="text-center py-1.5 px-2 text-surface-400 font-medium">Avg</th>
            </tr>
          </thead>
          <tbody>
            {[...scoresByModel].map(([modelId, modelScores]) => {
              const cells = benchmarksWithResults.map(b => ({
                benchmarkId: b.id,
                score: modelScores.get(b.id) ?? null,
              }));
              const validScores = cells.flatMap(cell => (cell.score === null ? [] : [cell.score]));
              // null means "no scores in the displayed columns"; a real average
              // of 0 is still a value and must be shown as 0.0.
              const avg = validScores.length > 0
                ? validScores.reduce((a, b) => a + b, 0) / validScores.length
                : null;
              return (
                <tr key={modelId} className="border-b border-surface-800">
                  <td className="py-1.5 pr-3 font-medium">{modelNames.get(modelId) ?? 'Unknown'}</td>
                  {cells.map(({ benchmarkId, score }) => (
                    // Keyed by benchmark id so cells stay stable if the column set changes.
                    <td key={benchmarkId} className="text-center py-1.5 px-2 font-mono">
                      {score !== null ? (
                        <span className={score >= 80 ? 'text-success' : score >= 50 ? 'text-accent-light' : 'text-surface-400'}>
                          {score.toFixed(1)}
                        </span>
                      ) : (
                        // Same placeholder as the Avg column, so a missing score is not a blank cell.
                        <span className="text-surface-600">{'—'}</span>
                      )}
                    </td>
                  ))}
                  <td className="text-center py-1.5 px-2 font-mono font-medium text-accent-light">
                    {avg !== null ? avg.toFixed(1) : '—'}
                  </td>
                </tr>
              );
            })}
          </tbody>
        </table>
      </div>
    </div>
  );
}
function StageBar({ label, active, complete, progress }: {
label: string; active: boolean; complete: boolean; progress: number;
}) {