Remove benchmark evaluation system, use training capabilities directly
Model quality for market segments and product lines now derives from deployed model capabilities (coding, reasoning, agents, etc.) instead of requiring a separate manual benchmark evaluation step. This eliminates the unbounded benchmarkResults[] array, which was scanned five times per tick, and removes ~480 lines of dead-weight UI, types, and engine code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
import { useState } from 'react';
|
||||
import { Play, Rocket, Globe, ChevronDown, ChevronUp, Beaker, Shield, Zap, BarChart3 } from 'lucide-react';
|
||||
import { Play, Rocket, Globe, ChevronDown, ChevronUp, Beaker, Shield, Zap } from 'lucide-react';
|
||||
import { TutorialHint } from '@/components/game/TutorialHint';
|
||||
import { ConfirmModal } from '@/components/common/ConfirmModal';
|
||||
import { useGameStore } from '@/store';
|
||||
@@ -16,10 +16,9 @@ import {
|
||||
} from '@ai-tycoon/shared';
|
||||
import type {
|
||||
ModelArchitecture, DataMixAllocation, SFTSpecialization, AlignmentMethod,
|
||||
DataDomain, QuantizationLevel, BaseModel, ModelVariant, BenchmarkResult,
|
||||
DataDomain, QuantizationLevel, BaseModel, ModelVariant,
|
||||
SizeTier, ModelFamily,
|
||||
} from '@ai-tycoon/shared';
|
||||
import { BENCHMARKS } from '@ai-tycoon/game-engine';
|
||||
|
||||
const DATA_MIX_PRESETS: Record<string, { label: string; mix: DataMixAllocation }> = {
|
||||
balanced: { label: 'Balanced', mix: DEFAULT_DATA_MIX },
|
||||
@@ -52,8 +51,6 @@ export function ModelsPage() {
|
||||
const families = useGameStore((s) => s.models.families);
|
||||
const pipelines = useGameStore((s) => s.models.activeTrainingPipelines);
|
||||
const variantJobs = useGameStore((s) => s.models.variantJobs);
|
||||
const evalJobs = useGameStore((s) => s.models.evalJobs);
|
||||
const benchmarkResults = useGameStore((s) => s.models.benchmarkResults);
|
||||
const productLines = useGameStore((s) => s.models.productLines);
|
||||
const totalFlops = useGameStore((s) => s.compute.totalFlops);
|
||||
const totalVramGB = useGameStore((s) => s.compute.totalVramGB);
|
||||
@@ -64,7 +61,6 @@ export function ModelsPage() {
|
||||
const deployModel = useGameStore((s) => s.deployModel);
|
||||
const deployVariant = useGameStore((s) => s.deployVariant);
|
||||
const createQuantization = useGameStore((s) => s.createQuantization);
|
||||
const startEvaluation = useGameStore((s) => s.startEvaluation);
|
||||
const setTrainingAllocation = useGameStore((s) => s.setTrainingAllocation);
|
||||
const openSourceModel = useGameStore((s) => s.openSourceModel);
|
||||
const openSourcedModels = useGameStore((s) => s.market.openSourcedModels);
|
||||
@@ -96,15 +92,12 @@ export function ModelsPage() {
|
||||
|
||||
const activePipelines = pipelines.filter(p => p.status === 'active' || p.status === 'stalled');
|
||||
const activeVariantJobs = variantJobs.filter(j => j.status === 'active');
|
||||
const activeEvalJobs = evalJobs.filter(j => j.status === 'active');
|
||||
const undeployedCount = baseModels.filter(m => !m.isDeployed).length;
|
||||
const hasActiveJobs = activePipelines.length > 0 || activeVariantJobs.length > 0 || activeEvalJobs.length > 0;
|
||||
const hasActiveJobs = activePipelines.length > 0 || activeVariantJobs.length > 0;
|
||||
const noModelDeployed = baseModels.length > 0 && !baseModels.some(m => m.isDeployed);
|
||||
|
||||
const eraOrder = ['startup', 'scaleup', 'bigtech', 'agi'] as const;
|
||||
const currentEraIdx = eraOrder.indexOf(currentEra);
|
||||
const availableBenchmarks = BENCHMARKS.filter(b => eraOrder.indexOf(b.unlockedAtEra) <= currentEraIdx);
|
||||
|
||||
const hasAlignmentResearch = completedResearch.some(r =>
|
||||
r === 'alignment-research' || r === 'interpretability' || r === 'constitutional-ai',
|
||||
);
|
||||
@@ -186,7 +179,6 @@ export function ModelsPage() {
|
||||
{ id: 'overview' as const, label: 'Overview' },
|
||||
{ id: 'train' as const, label: 'Train New' },
|
||||
{ id: 'models' as const, label: `Families${families.length > 0 ? ` (${families.length})` : ''}` },
|
||||
{ id: 'benchmarks' as const, label: 'Benchmarks' },
|
||||
{ id: 'products' as const, label: 'Products' },
|
||||
]).map(tab => (
|
||||
<button
|
||||
@@ -347,28 +339,6 @@ export function ModelsPage() {
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Active Eval Jobs */}
|
||||
{modelsTab === 'overview' && activeEvalJobs.length > 0 && (
|
||||
<div className="space-y-3">
|
||||
<h3 className="font-semibold">Running Evaluations</h3>
|
||||
{activeEvalJobs.map(job => {
|
||||
const model = baseModels.find(m => m.id === job.modelId) ?? families.flatMap(f => f.variants).find(v => v.id === job.modelId);
|
||||
const progress = job.progressTicks / job.totalTicks;
|
||||
return (
|
||||
<div key={job.id} className="bg-surface-900 border border-surface-700 rounded-xl p-3">
|
||||
<div className="flex items-center justify-between mb-1">
|
||||
<span className="text-sm">{model?.name ?? 'Unknown'} — {job.benchmarkIds.length} benchmarks</span>
|
||||
<span className="text-xs text-surface-400">{formatPercent(progress)}</span>
|
||||
</div>
|
||||
<div className="h-1.5 bg-surface-800 rounded-full overflow-hidden">
|
||||
<div className="h-full bg-blue-500 rounded-full transition-all" style={{ width: `${progress * 100}%` }} />
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Train New Model */}
|
||||
{modelsTab === 'train' && <div className="bg-surface-900 border border-surface-700 rounded-xl p-4 space-y-4">
|
||||
<h3 className="font-semibold">Train New Model</h3>
|
||||
@@ -716,9 +686,8 @@ export function ModelsPage() {
|
||||
{familyModels.map(model => (
|
||||
<div key={model.id} className="space-y-3">
|
||||
<h5 className="text-sm font-medium text-surface-300">{model.name}</h5>
|
||||
<ModelDetails model={model} benchmarkResults={benchmarkResults} />
|
||||
<ModelDetails model={model} />
|
||||
<QuantizationCreator model={model} completedResearch={completedResearch} onQuantize={createQuantization} />
|
||||
<BenchmarkEvaluator modelId={model.id} modelName={model.name} availableBenchmarks={availableBenchmarks} benchmarkResults={benchmarkResults} evalJobs={evalJobs} onStartEval={startEvaluation} />
|
||||
</div>
|
||||
))}
|
||||
|
||||
@@ -730,11 +699,7 @@ export function ModelsPage() {
|
||||
key={variant.id}
|
||||
variant={variant}
|
||||
familyId={family.id}
|
||||
benchmarkResults={benchmarkResults}
|
||||
availableBenchmarks={availableBenchmarks}
|
||||
evalJobs={evalJobs}
|
||||
onDeploy={() => deployVariant(family.id, variant.id)}
|
||||
onStartEval={startEvaluation}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
@@ -747,21 +712,6 @@ export function ModelsPage() {
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Benchmark Leaderboard */}
|
||||
{modelsTab === 'benchmarks' && benchmarkResults.length > 0 && (
|
||||
<BenchmarkLeaderboard
|
||||
benchmarkResults={benchmarkResults}
|
||||
baseModels={baseModels}
|
||||
families={families}
|
||||
availableBenchmarks={availableBenchmarks}
|
||||
/>
|
||||
)}
|
||||
{modelsTab === 'benchmarks' && benchmarkResults.length === 0 && (
|
||||
<div className="bg-surface-900 border border-surface-700 rounded-xl p-8 text-center text-surface-500 text-sm">
|
||||
No benchmark results yet. Run evaluations from the Models tab.
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Product Lines */}
|
||||
{modelsTab === 'products' && <div className="space-y-3">
|
||||
<h3 className="font-semibold">Product Lines</h3>
|
||||
@@ -865,9 +815,7 @@ function ModelActions({ model, isOpenSourced, onDeploy, onOpenSource }: {
|
||||
);
|
||||
}
|
||||
|
||||
function ModelDetails({ model, benchmarkResults }: { model: BaseModel; benchmarkResults: BenchmarkResult[] }) {
|
||||
const modelResults = benchmarkResults.filter(r => r.modelId === model.id);
|
||||
|
||||
function ModelDetails({ model }: { model: BaseModel }) {
|
||||
return (
|
||||
<div className="space-y-3">
|
||||
<div className="grid grid-cols-3 gap-3 text-xs">
|
||||
@@ -907,22 +855,6 @@ function ModelDetails({ model, benchmarkResults }: { model: BaseModel; benchmark
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{modelResults.length > 0 && (
|
||||
<div>
|
||||
<span className="text-xs font-medium text-surface-300">Benchmark Scores</span>
|
||||
<div className="grid grid-cols-3 gap-2 mt-1">
|
||||
{modelResults.map(r => {
|
||||
const bench = BENCHMARKS.find(b => b.id === r.benchmarkId);
|
||||
return (
|
||||
<div key={r.benchmarkId} className="bg-surface-800 rounded-lg p-2 text-xs">
|
||||
<span className="text-surface-400">{bench?.name ?? r.benchmarkId}</span>
|
||||
<div className="font-mono mt-0.5 text-accent-light">{r.score.toFixed(1)}</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
@@ -981,91 +913,12 @@ function QuantizationCreator({ model, completedResearch, onQuantize }: {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Collapsible control for queuing benchmark evaluations on one model.
 *
 * Collapsed, it renders a "Run Benchmarks" button, or nothing when every
 * available benchmark already has a result for this model. Expanded, it renders
 * one toggle chip per available benchmark (chips for already-evaluated
 * benchmarks are disabled and show the recorded score) plus an "Evaluate"
 * button that forwards the current selection to `onStartEval` and collapses
 * the panel again.
 *
 * Both buttons are disabled while an active eval job exists for this model.
 */
function BenchmarkEvaluator({ modelId, modelName, availableBenchmarks, benchmarkResults, evalJobs, onStartEval }: {
  modelId: string;
  modelName: string;
  availableBenchmarks: typeof BENCHMARKS;
  benchmarkResults: BenchmarkResult[];
  evalJobs: { id: string; modelId: string; status: string }[];
  onStartEval: (modelId: string, benchmarkIds: string[]) => void;
}) {
  const [showEval, setShowEval] = useState(false);
  const [selectedBenchmarks, setSelectedBenchmarks] = useState<string[]>([]);

  // Results recorded for this model and the benchmark ids they cover.
  const ownResults = benchmarkResults.filter(result => result.modelId === modelId);
  const doneIds = new Set(ownResults.map(result => result.benchmarkId));
  const pending = availableBenchmarks.filter(bench => !doneIds.has(bench.id));
  const busy = evalJobs.some(job => job.status === 'active' && job.modelId === modelId);

  // Collapsed state: either nothing left to run, or the opener button.
  if (!showEval) {
    if (pending.length === 0) {
      return null;
    }

    const openPanel = () => {
      setShowEval(true);
      // Pre-select everything that has not been evaluated yet.
      setSelectedBenchmarks(pending.map(bench => bench.id));
    };

    return (
      <button onClick={openPanel}
        disabled={busy}
        className="flex items-center gap-1 text-xs text-blue-400 hover:text-blue-300 disabled:opacity-50">
        <BarChart3 size={12} /> Run Benchmarks ({pending.length} available)
      </button>
    );
  }

  // Add the id when absent, remove it when present.
  const toggleBenchmark = (benchId: string) => {
    setSelectedBenchmarks(prev =>
      prev.includes(benchId) ? prev.filter(id => id !== benchId) : [...prev, benchId]
    );
  };

  const submit = () => {
    onStartEval(modelId, selectedBenchmarks);
    setShowEval(false);
  };

  // Sum of ticksToRun over the selected benchmarks, shown as the estimate.
  const estimatedTicks = availableBenchmarks
    .filter(bench => selectedBenchmarks.includes(bench.id))
    .reduce((total, bench) => total + bench.ticksToRun, 0);

  return (
    <div className="bg-surface-800/50 rounded-lg p-3 space-y-2">
      <div className="flex items-center justify-between">
        <span className="text-xs font-medium text-surface-300">Run Evaluation</span>
        <button onClick={() => setShowEval(false)} className="text-xs text-surface-500 hover:text-surface-300">Close</button>
      </div>
      <div className="flex flex-wrap gap-1">
        {availableBenchmarks.map(bench => {
          const isDone = doneIds.has(bench.id);
          const isPicked = selectedBenchmarks.includes(bench.id);
          return (
            <button key={bench.id}
              disabled={isDone}
              onClick={() => toggleBenchmark(bench.id)}
              className={`px-2 py-0.5 rounded text-[10px] border ${
                isDone ? 'bg-success/10 border-success/30 text-success cursor-default' :
                isPicked ? 'bg-blue-500/20 border-blue-500 text-blue-300' :
                'bg-surface-800 border-surface-600 text-surface-400'
              }`}
              title={bench.description}
            >
              {bench.name} {isDone ? `(${ownResults.find(result => result.benchmarkId === bench.id)?.score.toFixed(0)})` : ''}
            </button>
          );
        })}
      </div>
      {selectedBenchmarks.length > 0 && (
        <div className="flex items-center justify-between">
          <span className="text-[10px] text-surface-500">
            {selectedBenchmarks.length} benchmark{selectedBenchmarks.length > 1 ? 's' : ''} · ~{estimatedTicks} ticks
          </span>
          <button onClick={submit}
            disabled={busy}
            className="bg-blue-600 hover:bg-blue-700 text-white rounded px-3 py-1 text-xs disabled:opacity-50">
            Evaluate
          </button>
        </div>
      )}
    </div>
  );
}
|
||||
|
||||
function VariantCard({ variant, familyId, benchmarkResults, availableBenchmarks, evalJobs, onDeploy, onStartEval }: {
|
||||
function VariantCard({ variant, familyId, onDeploy }: {
|
||||
variant: ModelVariant;
|
||||
familyId: string;
|
||||
benchmarkResults: BenchmarkResult[];
|
||||
availableBenchmarks: typeof BENCHMARKS;
|
||||
evalJobs: { id: string; modelId: string; status: string }[];
|
||||
onDeploy: () => void;
|
||||
onStartEval: (modelId: string, benchmarkIds: string[]) => void;
|
||||
}) {
|
||||
const [isExpanded, setIsExpanded] = useState(false);
|
||||
const variantResults = benchmarkResults.filter(r => r.modelId === variant.id);
|
||||
|
||||
return (
|
||||
<div className="bg-surface-800/50 rounded-lg p-3 ml-4 border-l-2 border-surface-600">
|
||||
@@ -1106,108 +959,12 @@ function VariantCard({ variant, familyId, benchmarkResults, availableBenchmarks,
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
|
||||
{variantResults.length > 0 && (
|
||||
<div className="grid grid-cols-3 gap-2">
|
||||
{variantResults.map(r => {
|
||||
const bench = BENCHMARKS.find(b => b.id === r.benchmarkId);
|
||||
return (
|
||||
<div key={r.benchmarkId} className="bg-surface-800 rounded p-1.5 text-xs">
|
||||
<span className="text-surface-400 text-[10px]">{bench?.name ?? r.benchmarkId}</span>
|
||||
<div className="font-mono text-accent-light text-[11px]">{r.score.toFixed(1)}</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
)}
|
||||
|
||||
<BenchmarkEvaluator
|
||||
modelId={variant.id}
|
||||
modelName={variant.name}
|
||||
availableBenchmarks={availableBenchmarks}
|
||||
benchmarkResults={benchmarkResults}
|
||||
evalJobs={evalJobs}
|
||||
onStartEval={onStartEval}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Table of benchmark scores: one row per model that has at least one result,
 * one column per available benchmark that has at least one result, plus an
 * average column.
 *
 * Returns null when none of the available benchmarks has a result.
 *
 * @param benchmarkResults  All recorded results; rows appear in order of each
 *                          model's first appearance in this array.
 * @param baseModels        Base models, used only to resolve display names.
 * @param families          Families whose variants are also used for names.
 * @param availableBenchmarks  Benchmarks eligible to appear as columns.
 */
function BenchmarkLeaderboard({ benchmarkResults, baseModels, families, availableBenchmarks }: {
  benchmarkResults: BenchmarkResult[];
  baseModels: BaseModel[];
  families: { id: string; name: string; variants: ModelVariant[] }[];
  availableBenchmarks: typeof BENCHMARKS;
}) {
  // Only benchmarks that actually have a result become columns.
  const scoredBenchmarkIds = new Set(benchmarkResults.map(r => r.benchmarkId));
  const benchmarksWithResults = availableBenchmarks.filter(b => scoredBenchmarkIds.has(b.id));

  if (benchmarksWithResults.length === 0) return null;

  const allModels: (BaseModel | ModelVariant)[] = [
    ...baseModels,
    ...families.flatMap(f => f.variants),
  ];
  const modelNames = new Map(allModels.map(m => [m.id, m.name]));

  // Group scores by model in a single pass instead of re-filtering the whole
  // result list for every row. Map insertion order keeps rows in order of
  // first appearance, and only the first result per (model, benchmark) is kept.
  const scoresByModel = new Map<string, Map<string, number>>();
  for (const r of benchmarkResults) {
    let perBenchmark = scoresByModel.get(r.modelId);
    if (!perBenchmark) {
      perBenchmark = new Map<string, number>();
      scoresByModel.set(r.modelId, perBenchmark);
    }
    if (!perBenchmark.has(r.benchmarkId)) {
      perBenchmark.set(r.benchmarkId, r.score);
    }
  }

  return (
    <div className="bg-surface-900 border border-surface-700 rounded-xl p-4">
      <h3 className="font-semibold mb-3 flex items-center gap-2">
        <BarChart3 size={16} /> Benchmark Leaderboard
      </h3>
      <div className="overflow-x-auto">
        <table className="w-full text-xs">
          <thead>
            <tr className="border-b border-surface-700">
              <th className="text-left py-1.5 pr-3 text-surface-400 font-medium">Model</th>
              {benchmarksWithResults.map(b => (
                <th key={b.id} className="text-center py-1.5 px-2 text-surface-400 font-medium">{b.name}</th>
              ))}
              <th className="text-center py-1.5 px-2 text-surface-400 font-medium">Avg</th>
            </tr>
          </thead>
          <tbody>
            {[...scoresByModel].map(([modelId, perBenchmark]) => {
              const cells = benchmarksWithResults.map(b => ({
                benchmarkId: b.id,
                score: perBenchmark.get(b.id) ?? null,
              }));
              const validScores = cells
                .map(cell => cell.score)
                .filter((s): s is number => s !== null);
              // null means "no displayed score at all"; a real average of 0 is
              // a value and must still be rendered.
              const avg = validScores.length > 0
                ? validScores.reduce((a, b) => a + b, 0) / validScores.length
                : null;

              return (
                <tr key={modelId} className="border-b border-surface-800">
                  <td className="py-1.5 pr-3 font-medium">{modelNames.get(modelId) ?? 'Unknown'}</td>
                  {cells.map(({ benchmarkId, score }) => (
                    <td key={benchmarkId} className="text-center py-1.5 px-2 font-mono">
                      {score !== null ? (
                        <span className={score >= 80 ? 'text-success' : score >= 50 ? 'text-accent-light' : 'text-surface-400'}>
                          {score.toFixed(1)}
                        </span>
                      ) : (
                        <span className="text-surface-600">—</span>
                      )}
                    </td>
                  ))}
                  <td className="text-center py-1.5 px-2 font-mono font-medium text-accent-light">
                    {avg !== null ? avg.toFixed(1) : '—'}
                  </td>
                </tr>
              );
            })}
          </tbody>
        </table>
      </div>
    </div>
  );
}
|
||||
|
||||
function StageBar({ label, active, complete, progress }: {
|
||||
label: string; active: boolean; complete: boolean; progress: number;
|
||||
}) {
|
||||
|
||||
@@ -15,7 +15,6 @@ import type {
|
||||
TrainingPipeline, ModelFamily, DataMixAllocation,
|
||||
ModelArchitecture, AlignmentMethod, SizeTier,
|
||||
SFTSpecialization, QuantizationLevel, VariantCreationJob,
|
||||
EvalJob,
|
||||
ConsumerTierId, ApiTierId,
|
||||
} from '@ai-tycoon/shared';
|
||||
import {
|
||||
@@ -43,7 +42,7 @@ import {
|
||||
} from '@ai-tycoon/shared';
|
||||
import {
|
||||
emptyDCNetworkSummary, emptyCampusNetworkSummary, emptyClusterNetworkSummary,
|
||||
BENCHMARKS, TECH_TREE, onModelDeployed,
|
||||
TECH_TREE, onModelDeployed,
|
||||
} from '@ai-tycoon/game-engine';
|
||||
import { INITIAL_RIVALS } from '@ai-tycoon/game-engine';
|
||||
|
||||
@@ -59,7 +58,7 @@ export interface InfraNav {
|
||||
datacenterId?: string;
|
||||
}
|
||||
|
||||
type ModelsTab = 'overview' | 'train' | 'models' | 'benchmarks' | 'products';
|
||||
type ModelsTab = 'overview' | 'train' | 'models' | 'products';
|
||||
|
||||
interface UIState {
|
||||
activePage: ActivePage;
|
||||
@@ -132,7 +131,6 @@ interface Actions {
|
||||
}) => void;
|
||||
startPointRelease: (baseModelId: string) => void;
|
||||
createQuantization: (baseModelId: string, level: QuantizationLevel, variantName: string) => void;
|
||||
startEvaluation: (modelId: string, benchmarkIds: string[]) => void;
|
||||
deployModel: (modelId: string) => void;
|
||||
deployVariant: (familyId: string, variantId: string) => void;
|
||||
setProductPricing: (productLineId: string, field: string, value: number) => void;
|
||||
@@ -1076,32 +1074,6 @@ export const useGameStore = create<Store>()(
|
||||
}
|
||||
},
|
||||
|
||||
// Queue an evaluation job for `modelId` covering the requested benchmarks.
//
// Ids that do not match an entry in BENCHMARKS are dropped. If nothing matches,
// the call is a no-op: no job, no notification, no tab change. Otherwise one
// active EvalJob is appended to models.evalJobs, an info notification is
// raised, and the models page switches to the overview tab.
startEvaluation: (modelId, benchmarkIds) => {
  const benchmarks = BENCHMARKS.filter(b => benchmarkIds.includes(b.id));
  if (benchmarks.length === 0) return;

  const job: EvalJob = {
    id: uuid(),
    modelId,
    // Store only the ids that were matched, so the job's benchmark list agrees
    // with the totalTicks / computeAllocated figures derived from them.
    benchmarkIds: benchmarks.map(b => b.id),
    progressTicks: 0,
    totalTicks: benchmarks.reduce((sum, b) => sum + b.ticksToRun, 0),
    computeAllocated: benchmarks.reduce((sum, b) => sum + b.computeCost, 0),
    status: 'active',
    results: [],
  };

  set((s) => ({ models: { ...s.models, evalJobs: [...s.models.evalJobs, job] } }));

  // Report the number of benchmarks actually queued, not the number requested.
  get().addNotification({ title: 'Evaluation Started', message: `${benchmarks.length} benchmark${benchmarks.length > 1 ? 's' : ''} queued.`, type: 'info', tick: get().meta.tickCount });
  set({ modelsTab: 'overview' as ModelsTab });
},
|
||||
|
||||
deployModel: (modelId) => {
|
||||
const modelName = get().models.baseModels.find(m => m.id === modelId)?.name ?? 'Model';
|
||||
set((s) => ({
|
||||
|
||||
Reference in New Issue
Block a user