Remove benchmark evaluation system, use training capabilities directly

Model quality for market segments and product lines now derives from deployed
model capabilities (coding, reasoning, agents, etc.) instead of requiring a
separate manual benchmark evaluation step. This eliminates an unbounded
benchmarkResults[] array that was scanned 5x per tick and removes ~480 lines
of dead-weight UI, types, and engine code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-26 19:28:59 -04:00
parent db034687d6
commit bbb69a315c
10 changed files with 57 additions and 535 deletions
+6 -249
View File
@@ -1,5 +1,5 @@
import { useState } from 'react';
import { Play, Rocket, Globe, ChevronDown, ChevronUp, Beaker, Shield, Zap, BarChart3 } from 'lucide-react';
import { Play, Rocket, Globe, ChevronDown, ChevronUp, Beaker, Shield, Zap } from 'lucide-react';
import { TutorialHint } from '@/components/game/TutorialHint';
import { ConfirmModal } from '@/components/common/ConfirmModal';
import { useGameStore } from '@/store';
@@ -16,10 +16,9 @@ import {
} from '@ai-tycoon/shared';
import type {
ModelArchitecture, DataMixAllocation, SFTSpecialization, AlignmentMethod,
DataDomain, QuantizationLevel, BaseModel, ModelVariant, BenchmarkResult,
DataDomain, QuantizationLevel, BaseModel, ModelVariant,
SizeTier, ModelFamily,
} from '@ai-tycoon/shared';
import { BENCHMARKS } from '@ai-tycoon/game-engine';
const DATA_MIX_PRESETS: Record<string, { label: string; mix: DataMixAllocation }> = {
balanced: { label: 'Balanced', mix: DEFAULT_DATA_MIX },
@@ -52,8 +51,6 @@ export function ModelsPage() {
const families = useGameStore((s) => s.models.families);
const pipelines = useGameStore((s) => s.models.activeTrainingPipelines);
const variantJobs = useGameStore((s) => s.models.variantJobs);
const evalJobs = useGameStore((s) => s.models.evalJobs);
const benchmarkResults = useGameStore((s) => s.models.benchmarkResults);
const productLines = useGameStore((s) => s.models.productLines);
const totalFlops = useGameStore((s) => s.compute.totalFlops);
const totalVramGB = useGameStore((s) => s.compute.totalVramGB);
@@ -64,7 +61,6 @@ export function ModelsPage() {
const deployModel = useGameStore((s) => s.deployModel);
const deployVariant = useGameStore((s) => s.deployVariant);
const createQuantization = useGameStore((s) => s.createQuantization);
const startEvaluation = useGameStore((s) => s.startEvaluation);
const setTrainingAllocation = useGameStore((s) => s.setTrainingAllocation);
const openSourceModel = useGameStore((s) => s.openSourceModel);
const openSourcedModels = useGameStore((s) => s.market.openSourcedModels);
@@ -96,15 +92,12 @@ export function ModelsPage() {
const activePipelines = pipelines.filter(p => p.status === 'active' || p.status === 'stalled');
const activeVariantJobs = variantJobs.filter(j => j.status === 'active');
const activeEvalJobs = evalJobs.filter(j => j.status === 'active');
const undeployedCount = baseModels.filter(m => !m.isDeployed).length;
const hasActiveJobs = activePipelines.length > 0 || activeVariantJobs.length > 0 || activeEvalJobs.length > 0;
const hasActiveJobs = activePipelines.length > 0 || activeVariantJobs.length > 0;
const noModelDeployed = baseModels.length > 0 && !baseModels.some(m => m.isDeployed);
const eraOrder = ['startup', 'scaleup', 'bigtech', 'agi'] as const;
const currentEraIdx = eraOrder.indexOf(currentEra);
const availableBenchmarks = BENCHMARKS.filter(b => eraOrder.indexOf(b.unlockedAtEra) <= currentEraIdx);
const hasAlignmentResearch = completedResearch.some(r =>
r === 'alignment-research' || r === 'interpretability' || r === 'constitutional-ai',
);
@@ -186,7 +179,6 @@ export function ModelsPage() {
{ id: 'overview' as const, label: 'Overview' },
{ id: 'train' as const, label: 'Train New' },
{ id: 'models' as const, label: `Families${families.length > 0 ? ` (${families.length})` : ''}` },
{ id: 'benchmarks' as const, label: 'Benchmarks' },
{ id: 'products' as const, label: 'Products' },
]).map(tab => (
<button
@@ -347,28 +339,6 @@ export function ModelsPage() {
</div>
)}
{/* Active Eval Jobs */}
{modelsTab === 'overview' && activeEvalJobs.length > 0 && (
<div className="space-y-3">
<h3 className="font-semibold">Running Evaluations</h3>
{activeEvalJobs.map(job => {
const model = baseModels.find(m => m.id === job.modelId) ?? families.flatMap(f => f.variants).find(v => v.id === job.modelId);
const progress = job.progressTicks / job.totalTicks;
return (
<div key={job.id} className="bg-surface-900 border border-surface-700 rounded-xl p-3">
<div className="flex items-center justify-between mb-1">
<span className="text-sm">{model?.name ?? 'Unknown'} {job.benchmarkIds.length} benchmarks</span>
<span className="text-xs text-surface-400">{formatPercent(progress)}</span>
</div>
<div className="h-1.5 bg-surface-800 rounded-full overflow-hidden">
<div className="h-full bg-blue-500 rounded-full transition-all" style={{ width: `${progress * 100}%` }} />
</div>
</div>
);
})}
</div>
)}
{/* Train New Model */}
{modelsTab === 'train' && <div className="bg-surface-900 border border-surface-700 rounded-xl p-4 space-y-4">
<h3 className="font-semibold">Train New Model</h3>
@@ -716,9 +686,8 @@ export function ModelsPage() {
{familyModels.map(model => (
<div key={model.id} className="space-y-3">
<h5 className="text-sm font-medium text-surface-300">{model.name}</h5>
<ModelDetails model={model} benchmarkResults={benchmarkResults} />
<ModelDetails model={model} />
<QuantizationCreator model={model} completedResearch={completedResearch} onQuantize={createQuantization} />
<BenchmarkEvaluator modelId={model.id} modelName={model.name} availableBenchmarks={availableBenchmarks} benchmarkResults={benchmarkResults} evalJobs={evalJobs} onStartEval={startEvaluation} />
</div>
))}
@@ -730,11 +699,7 @@ export function ModelsPage() {
key={variant.id}
variant={variant}
familyId={family.id}
benchmarkResults={benchmarkResults}
availableBenchmarks={availableBenchmarks}
evalJobs={evalJobs}
onDeploy={() => deployVariant(family.id, variant.id)}
onStartEval={startEvaluation}
/>
))}
</div>
@@ -747,21 +712,6 @@ export function ModelsPage() {
</div>
)}
{/* Benchmark Leaderboard */}
{modelsTab === 'benchmarks' && benchmarkResults.length > 0 && (
<BenchmarkLeaderboard
benchmarkResults={benchmarkResults}
baseModels={baseModels}
families={families}
availableBenchmarks={availableBenchmarks}
/>
)}
{modelsTab === 'benchmarks' && benchmarkResults.length === 0 && (
<div className="bg-surface-900 border border-surface-700 rounded-xl p-8 text-center text-surface-500 text-sm">
No benchmark results yet. Run evaluations from the Models tab.
</div>
)}
{/* Product Lines */}
{modelsTab === 'products' && <div className="space-y-3">
<h3 className="font-semibold">Product Lines</h3>
@@ -865,9 +815,7 @@ function ModelActions({ model, isOpenSourced, onDeploy, onOpenSource }: {
);
}
function ModelDetails({ model, benchmarkResults }: { model: BaseModel; benchmarkResults: BenchmarkResult[] }) {
const modelResults = benchmarkResults.filter(r => r.modelId === model.id);
function ModelDetails({ model }: { model: BaseModel }) {
return (
<div className="space-y-3">
<div className="grid grid-cols-3 gap-3 text-xs">
@@ -907,22 +855,6 @@ function ModelDetails({ model, benchmarkResults }: { model: BaseModel; benchmark
</div>
</div>
{modelResults.length > 0 && (
<div>
<span className="text-xs font-medium text-surface-300">Benchmark Scores</span>
<div className="grid grid-cols-3 gap-2 mt-1">
{modelResults.map(r => {
const bench = BENCHMARKS.find(b => b.id === r.benchmarkId);
return (
<div key={r.benchmarkId} className="bg-surface-800 rounded-lg p-2 text-xs">
<span className="text-surface-400">{bench?.name ?? r.benchmarkId}</span>
<div className="font-mono mt-0.5 text-accent-light">{r.score.toFixed(1)}</div>
</div>
);
})}
</div>
</div>
)}
</div>
);
}
@@ -981,91 +913,12 @@ function QuantizationCreator({ model, completedResearch, onQuantize }: {
);
}
/**
 * Collapsible control for queueing benchmark evaluations for a single model.
 *
 * Collapsed: renders nothing when every available benchmark already has a
 * result for this model; otherwise renders a "Run Benchmarks" button that is
 * disabled while an active eval job exists for the model. Clicking it expands
 * the panel and pre-selects every not-yet-evaluated benchmark.
 *
 * Expanded: renders one toggle chip per available benchmark (already-evaluated
 * ones are disabled and show their score rounded to 0 decimals), a summary of
 * the selection with the summed `ticksToRun`, and an "Evaluate" button that
 * calls `onStartEval(modelId, selectedBenchmarks)` and collapses the panel.
 *
 * Note: `modelName` is accepted as a prop but is not read anywhere in the body.
 */
function BenchmarkEvaluator({ modelId, modelName, availableBenchmarks, benchmarkResults, evalJobs, onStartEval }: {
modelId: string;
modelName: string;
availableBenchmarks: typeof BENCHMARKS;
benchmarkResults: BenchmarkResult[];
evalJobs: { id: string; modelId: string; status: string }[];
onStartEval: (modelId: string, benchmarkIds: string[]) => void;
}) {
const [showEval, setShowEval] = useState(false);
const [selectedBenchmarks, setSelectedBenchmarks] = useState<string[]>([]);
// Results already recorded for this model, and the benchmark ids they cover.
const existingResults = benchmarkResults.filter(r => r.modelId === modelId);
const evaluatedIds = new Set(existingResults.map(r => r.benchmarkId));
// True while any job for this model is still 'active'; used to disable both buttons.
const isEvaluating = evalJobs.some(j => j.modelId === modelId && j.status === 'active');
const unevaluated = availableBenchmarks.filter(b => !evaluatedIds.has(b.id));
// Nothing left to run and the panel is closed: render nothing at all.
if (unevaluated.length === 0 && !showEval) {
return null;
}
// Collapsed state: a single button that opens the panel with all unevaluated benchmarks pre-selected.
if (!showEval) {
return (
<button onClick={() => { setShowEval(true); setSelectedBenchmarks(unevaluated.map(b => b.id)); }}
disabled={isEvaluating}
className="flex items-center gap-1 text-xs text-blue-400 hover:text-blue-300 disabled:opacity-50">
<BarChart3 size={12} /> Run Benchmarks ({unevaluated.length} available)
</button>
);
}
return (
<div className="bg-surface-800/50 rounded-lg p-3 space-y-2">
<div className="flex items-center justify-between">
<span className="text-xs font-medium text-surface-300">Run Evaluation</span>
<button onClick={() => setShowEval(false)} className="text-xs text-surface-500 hover:text-surface-300">Close</button>
</div>
<div className="flex flex-wrap gap-1">
{availableBenchmarks.map(bench => {
const alreadyDone = evaluatedIds.has(bench.id);
const selected = selectedBenchmarks.includes(bench.id);
return (
<button key={bench.id}
disabled={alreadyDone}
onClick={() => setSelectedBenchmarks(prev =>
prev.includes(bench.id) ? prev.filter(id => id !== bench.id) : [...prev, bench.id]
)}
className={`px-2 py-0.5 rounded text-[10px] border ${
alreadyDone ? 'bg-success/10 border-success/30 text-success cursor-default' :
selected ? 'bg-blue-500/20 border-blue-500 text-blue-300' :
'bg-surface-800 border-surface-600 text-surface-400'
}`}
title={bench.description}
>
{bench.name} {alreadyDone ? `(${existingResults.find(r => r.benchmarkId === bench.id)?.score.toFixed(0)})` : ''}
</button>
);
})}
</div>
{selectedBenchmarks.length > 0 && (
<div className="flex items-center justify-between">
<span className="text-[10px] text-surface-500">
{selectedBenchmarks.length} benchmark{selectedBenchmarks.length > 1 ? 's' : ''} · ~{availableBenchmarks.filter(b => selectedBenchmarks.includes(b.id)).reduce((s, b) => s + b.ticksToRun, 0)} ticks
</span>
<button onClick={() => { onStartEval(modelId, selectedBenchmarks); setShowEval(false); }}
disabled={isEvaluating}
className="bg-blue-600 hover:bg-blue-700 text-white rounded px-3 py-1 text-xs disabled:opacity-50">
Evaluate
</button>
</div>
)}
</div>
);
}
function VariantCard({ variant, familyId, benchmarkResults, availableBenchmarks, evalJobs, onDeploy, onStartEval }: {
function VariantCard({ variant, familyId, onDeploy }: {
variant: ModelVariant;
familyId: string;
benchmarkResults: BenchmarkResult[];
availableBenchmarks: typeof BENCHMARKS;
evalJobs: { id: string; modelId: string; status: string }[];
onDeploy: () => void;
onStartEval: (modelId: string, benchmarkIds: string[]) => void;
}) {
const [isExpanded, setIsExpanded] = useState(false);
const variantResults = benchmarkResults.filter(r => r.modelId === variant.id);
return (
<div className="bg-surface-800/50 rounded-lg p-3 ml-4 border-l-2 border-surface-600">
@@ -1106,108 +959,12 @@ function VariantCard({ variant, familyId, benchmarkResults, availableBenchmarks,
</div>
))}
</div>
{variantResults.length > 0 && (
<div className="grid grid-cols-3 gap-2">
{variantResults.map(r => {
const bench = BENCHMARKS.find(b => b.id === r.benchmarkId);
return (
<div key={r.benchmarkId} className="bg-surface-800 rounded p-1.5 text-xs">
<span className="text-surface-400 text-[10px]">{bench?.name ?? r.benchmarkId}</span>
<div className="font-mono text-accent-light text-[11px]">{r.score.toFixed(1)}</div>
</div>
);
})}
</div>
)}
<BenchmarkEvaluator
modelId={variant.id}
modelName={variant.name}
availableBenchmarks={availableBenchmarks}
benchmarkResults={benchmarkResults}
evalJobs={evalJobs}
onStartEval={onStartEval}
/>
</div>
)}
</div>
);
}
/**
 * Renders a model x benchmark score table.
 *
 * Columns: one per available benchmark that has at least one result, plus a
 * trailing "Avg" column. Rows: one per distinct `modelId` found in
 * `benchmarkResults`, labelled with the model/variant name or 'Unknown' when
 * the id is not found among `baseModels` and the families' variants.
 *
 * Returns `null` when no available benchmark has any result.
 *
 * NOTE(review): the placeholder `<span>` rendered for a missing score is empty
 * in this view; presumably it held a dash character that was lost — confirm.
 */
function BenchmarkLeaderboard({ benchmarkResults, baseModels, families, availableBenchmarks }: {
benchmarkResults: BenchmarkResult[];
baseModels: BaseModel[];
families: { id: string; name: string; variants: ModelVariant[] }[];
availableBenchmarks: typeof BENCHMARKS;
}) {
// Base models and every family variant share one id -> display-name lookup.
const allModels: (BaseModel | ModelVariant)[] = [
...baseModels,
...families.flatMap(f => f.variants),
];
const modelNames = new Map(allModels.map(m => [m.id, m.name]));
// Only benchmarks with at least one recorded result get a column.
const benchmarksWithResults = availableBenchmarks.filter(b =>
benchmarkResults.some(r => r.benchmarkId === b.id),
);
if (benchmarksWithResults.length === 0) return null;
// Distinct model ids, in order of first appearance in benchmarkResults.
const modelIds = [...new Set(benchmarkResults.map(r => r.modelId))];
return (
<div className="bg-surface-900 border border-surface-700 rounded-xl p-4">
<h3 className="font-semibold mb-3 flex items-center gap-2">
<BarChart3 size={16} /> Benchmark Leaderboard
</h3>
<div className="overflow-x-auto">
<table className="w-full text-xs">
<thead>
<tr className="border-b border-surface-700">
<th className="text-left py-1.5 pr-3 text-surface-400 font-medium">Model</th>
{benchmarksWithResults.map(b => (
<th key={b.id} className="text-center py-1.5 px-2 text-surface-400 font-medium">{b.name}</th>
))}
<th className="text-center py-1.5 px-2 text-surface-400 font-medium">Avg</th>
</tr>
</thead>
<tbody>
{modelIds.map(modelId => {
const results = benchmarkResults.filter(r => r.modelId === modelId);
// One cell per column; null marks a benchmark this model has no result for.
// When several results exist for the same benchmark, the first one found is shown.
const scores = benchmarksWithResults.map(b => {
const r = results.find(r => r.benchmarkId === b.id);
return r?.score ?? null;
});
// The average covers only the benchmarks this model was actually scored on.
const validScores = scores.filter((s): s is number => s !== null);
const avg = validScores.length > 0 ? validScores.reduce((a, b) => a + b, 0) / validScores.length : 0;
return (
<tr key={modelId} className="border-b border-surface-800">
<td className="py-1.5 pr-3 font-medium">{modelNames.get(modelId) ?? 'Unknown'}</td>
{scores.map((score, i) => (
<td key={i} className="text-center py-1.5 px-2 font-mono">
{score !== null ? (
<span className={score >= 80 ? 'text-success' : score >= 50 ? 'text-accent-light' : 'text-surface-400'}>
{score.toFixed(1)}
</span>
) : (
<span className="text-surface-600"></span>
)}
</td>
))}
<td className="text-center py-1.5 px-2 font-mono font-medium text-accent-light">
{avg > 0 ? avg.toFixed(1) : '—'}
</td>
</tr>
);
})}
</tbody>
</table>
</div>
</div>
);
}
function StageBar({ label, active, complete, progress }: {
label: string; active: boolean; complete: boolean; progress: number;
}) {
+2 -30
View File
@@ -15,7 +15,6 @@ import type {
TrainingPipeline, ModelFamily, DataMixAllocation,
ModelArchitecture, AlignmentMethod, SizeTier,
SFTSpecialization, QuantizationLevel, VariantCreationJob,
EvalJob,
ConsumerTierId, ApiTierId,
} from '@ai-tycoon/shared';
import {
@@ -43,7 +42,7 @@ import {
} from '@ai-tycoon/shared';
import {
emptyDCNetworkSummary, emptyCampusNetworkSummary, emptyClusterNetworkSummary,
BENCHMARKS, TECH_TREE, onModelDeployed,
TECH_TREE, onModelDeployed,
} from '@ai-tycoon/game-engine';
import { INITIAL_RIVALS } from '@ai-tycoon/game-engine';
@@ -59,7 +58,7 @@ export interface InfraNav {
datacenterId?: string;
}
type ModelsTab = 'overview' | 'train' | 'models' | 'benchmarks' | 'products';
type ModelsTab = 'overview' | 'train' | 'models' | 'products';
interface UIState {
activePage: ActivePage;
@@ -132,7 +131,6 @@ interface Actions {
}) => void;
startPointRelease: (baseModelId: string) => void;
createQuantization: (baseModelId: string, level: QuantizationLevel, variantName: string) => void;
startEvaluation: (modelId: string, benchmarkIds: string[]) => void;
deployModel: (modelId: string) => void;
deployVariant: (familyId: string, variantId: string) => void;
setProductPricing: (productLineId: string, field: string, value: number) => void;
@@ -1076,32 +1074,6 @@ export const useGameStore = create<Store>()(
}
},
/**
 * Queues an evaluation job that runs the requested benchmarks against a model.
 *
 * Only ids that match a known entry in `BENCHMARKS` are queued; unknown and
 * duplicate ids are dropped so the job's `benchmarkIds` always agree with its
 * `totalTicks` / `computeAllocated` (both are sums over the matched
 * benchmarks). If nothing matches, the call is a no-op: no job, no
 * notification, no tab switch.
 *
 * On success a notification is posted and the Models page switches to the
 * 'overview' tab.
 *
 * @param modelId - id of the base model or variant to evaluate.
 * @param benchmarkIds - requested benchmark ids; validated against `BENCHMARKS`.
 */
startEvaluation: (modelId, benchmarkIds) => {
  // Resolve the request up front instead of flipping a flag inside the state
  // updater, so the updater stays free of side effects.
  const requested = new Set(benchmarkIds);
  const benchmarks = BENCHMARKS.filter(b => requested.has(b.id));
  if (benchmarks.length === 0) return;

  // Store only validated ids: an unknown id would otherwise be scored as 0
  // when the job completes, and a duplicate would yield duplicate results.
  const validIds = benchmarks.map(b => b.id);
  const job: EvalJob = {
    id: uuid(),
    modelId,
    benchmarkIds: validIds,
    progressTicks: 0,
    totalTicks: benchmarks.reduce((sum, b) => sum + b.ticksToRun, 0),
    computeAllocated: benchmarks.reduce((sum, b) => sum + b.computeCost, 0),
    status: 'active',
    results: [],
  };
  set((s) => ({ models: { ...s.models, evalJobs: [...s.models.evalJobs, job] } }));

  // Report the number actually queued, not the number requested.
  get().addNotification({ title: 'Evaluation Started', message: `${validIds.length} benchmark${validIds.length > 1 ? 's' : ''} queued.`, type: 'info', tick: get().meta.tickCount });
  set({ modelsTab: 'overview' as ModelsTab });
},
deployModel: (modelId) => {
const modelName = get().models.baseModels.find(m => m.id === modelId)?.name ?? 'Model';
set((s) => ({
@@ -171,7 +171,6 @@ export function createTestBaseModel(overrides?: Partial<BaseModel>): BaseModel {
sizeTier: 'small',
isPointRelease: false,
sourceModelId: null,
benchmarkResults: {},
dataMix: { web: 0.4, code: 0.2, books: 0.15, academic: 0.1, conversational: 0.1, specialized: 0.05 },
};
return overrides ? { ...base, ...overrides } : base;
@@ -181,9 +180,10 @@ export function createTestModelFamily(overrides?: Partial<ModelFamily>): ModelFa
const base: ModelFamily = {
id: uuid(),
name: 'Test Family',
baseModels: [],
generation: 1,
baseModelIds: [],
variants: [],
activeEvals: [],
createdAtTick: 0,
};
return overrides ? { ...base, ...overrides } : base;
}
-111
View File
@@ -1,111 +0,0 @@
import type { BenchmarkDefinition } from '@ai-tycoon/shared';
/**
 * Static catalogue of the benchmarks a model can be evaluated on.
 *
 * Six entries unlock in the 'startup' era; VisionBench unlocks at 'scaleup',
 * AgentArena at 'bigtech' and Frontier Eval at 'agi'. HarmGuard is the only
 * entry without a `secondaryCapability`.
 */
export const BENCHMARKS: BenchmarkDefinition[] = [
{
id: 'arc-challenge',
name: 'ARC Challenge',
category: 'reasoning',
description: 'Advanced reasoning and comprehension tasks requiring multi-step inference.',
primaryCapability: 'reasoning',
secondaryCapability: 'knowledge',
computeCost: 0.001,
ticksToRun: 8,
unlockedAtEra: 'startup',
marketRelevance: { consumer: 0.3, enterprise: 0.5, developer: 0.4, research: 0.8 },
},
{
id: 'codeforce',
name: 'CodeForce',
category: 'coding',
description: 'Competitive programming and software engineering benchmarks.',
primaryCapability: 'coding',
secondaryCapability: 'reasoning',
computeCost: 0.001,
ticksToRun: 8,
unlockedAtEra: 'startup',
marketRelevance: { consumer: 0.2, enterprise: 0.7, developer: 0.9, research: 0.5 },
},
{
id: 'mathquest',
name: 'MathQuest',
category: 'math',
description: 'Mathematical problem-solving from algebra to graduate-level proofs.',
primaryCapability: 'math',
secondaryCapability: 'reasoning',
computeCost: 0.001,
ticksToRun: 8,
unlockedAtEra: 'startup',
marketRelevance: { consumer: 0.1, enterprise: 0.6, developer: 0.5, research: 0.9 },
},
{
id: 'worldfacts',
name: 'WorldFacts',
category: 'knowledge',
description: 'Broad factual knowledge across science, history, culture, and current events.',
primaryCapability: 'knowledge',
secondaryCapability: 'reasoning',
computeCost: 0.001,
ticksToRun: 6,
unlockedAtEra: 'startup',
marketRelevance: { consumer: 0.5, enterprise: 0.4, developer: 0.3, research: 0.6 },
},
{
id: 'chatrank',
name: 'ChatRank',
category: 'chat',
description: 'Human preference evaluation of conversational quality, helpfulness, and creativity.',
primaryCapability: 'creative',
secondaryCapability: 'knowledge',
computeCost: 0.002,
ticksToRun: 10,
unlockedAtEra: 'startup',
marketRelevance: { consumer: 0.9, enterprise: 0.3, developer: 0.2, research: 0.2 },
},
{
id: 'harmguard',
name: 'HarmGuard',
category: 'safety',
description: 'Safety evaluation measuring harm avoidance, truthfulness, and responsible behavior.',
primaryCapability: 'reasoning',
computeCost: 0.001,
ticksToRun: 8,
unlockedAtEra: 'startup',
marketRelevance: { consumer: 0.4, enterprise: 0.9, developer: 0.3, research: 0.7 },
},
{
id: 'visionbench',
name: 'VisionBench',
category: 'multimodal',
description: 'Image understanding, visual reasoning, and multimodal comprehension.',
primaryCapability: 'multimodal',
secondaryCapability: 'reasoning',
computeCost: 0.003,
ticksToRun: 12,
unlockedAtEra: 'scaleup',
marketRelevance: { consumer: 0.5, enterprise: 0.6, developer: 0.6, research: 0.7 },
},
{
id: 'agentarena',
name: 'AgentArena',
category: 'agents',
description: 'Autonomous agent tasks: tool use, multi-step planning, and environment interaction.',
primaryCapability: 'agents',
secondaryCapability: 'coding',
computeCost: 0.005,
ticksToRun: 15,
unlockedAtEra: 'bigtech',
marketRelevance: { consumer: 0.3, enterprise: 0.8, developer: 0.7, research: 0.6 },
},
{
id: 'frontier-eval',
name: 'Frontier Eval',
category: 'reasoning',
description: 'Cutting-edge capability evaluation at the frontier of AI research.',
primaryCapability: 'reasoning',
secondaryCapability: 'math',
computeCost: 0.01,
ticksToRun: 20,
unlockedAtEra: 'agi',
marketRelevance: { consumer: 0.2, enterprise: 0.5, developer: 0.5, research: 1.0 },
},
];
-1
View File
@@ -11,4 +11,3 @@ export { TECH_TREE } from './data/techTree';
export { INITIAL_RIVALS } from './data/competitors';
export { KEY_HIRE_POOL } from './data/keyHires';
export { ACHIEVEMENT_DEFINITIONS } from './data/achievements';
export { BENCHMARKS } from './data/benchmarks';
@@ -1,7 +1,6 @@
import type { GameState, MarketState, BenchmarkResult } from '@ai-tycoon/shared';
import type { GameState, MarketState, ModelCapabilities } from '@ai-tycoon/shared';
import { CONSUMER_TOKENS_PER_SUBSCRIBER, API_TOKENS_PER_DEVELOPER_PER_TICK, BATCH_API_DEMAND_PER_DEV, makeInitialServingMetrics } from '@ai-tycoon/shared';
import type { TrafficPriority, TierServingMetrics } from '@ai-tycoon/shared';
import { BENCHMARKS } from '../../data/benchmarks';
import { computeSeasonal } from './seasonalSystem';
import { updateObsolescence } from './obsolescenceSystem';
import { buildPlayerProfile, buildCompetitorProfile, computeMarketShares, updateTAMGrowth } from './tamSystem';
@@ -21,31 +20,30 @@ export interface MarketTickResult {
totalTokenDemand: number;
}
/**
 * Per-market-segment weighting of model capabilities, keyed by segment name.
 *
 * Each row lists only the capabilities that matter to that segment, and the
 * weights in every row sum to 1.0. Capabilities not listed for a segment
 * (e.g. `math` for consumer, `creative` for developer) contribute nothing.
 */
const SEGMENT_CAPABILITY_WEIGHTS: Record<string, Partial<Record<keyof ModelCapabilities, number>>> = {
consumer: { creative: 0.35, knowledge: 0.25, reasoning: 0.15, multimodal: 0.15, coding: 0.05, agents: 0.05 },
enterprise: { reasoning: 0.25, coding: 0.20, agents: 0.20, knowledge: 0.15, math: 0.10, multimodal: 0.10 },
developer: { coding: 0.35, reasoning: 0.20, agents: 0.20, math: 0.15, knowledge: 0.10 },
research: { reasoning: 0.30, math: 0.30, knowledge: 0.20, coding: 0.10, agents: 0.10 },
};
function getSegmentQuality(
segment: 'consumer' | 'enterprise' | 'developer' | 'research',
benchmarkResults: BenchmarkResult[],
capabilities: ModelCapabilities,
fallbackScore: number,
): number {
if (benchmarkResults.length === 0) return fallbackScore / 100;
const bestByBenchmark = new Map<string, number>();
for (const r of benchmarkResults) {
const prev = bestByBenchmark.get(r.benchmarkId) ?? 0;
if (r.score > prev) bestByBenchmark.set(r.benchmarkId, r.score);
}
const weights = SEGMENT_CAPABILITY_WEIGHTS[segment];
if (!weights) return fallbackScore / 100;
let weightedSum = 0;
let totalWeight = 0;
for (const bench of BENCHMARKS) {
const score = bestByBenchmark.get(bench.id);
if (score == null) continue;
const weight = bench.marketRelevance[segment];
weightedSum += (score / 100) * weight;
totalWeight += weight;
for (const [cap, weight] of Object.entries(weights)) {
const score = capabilities[cap as keyof ModelCapabilities] ?? 0;
if (score > 0) {
weightedSum += (score / 100) * weight;
totalWeight += weight;
}
}
if (totalWeight === 0) return fallbackScore / 100;
return weightedSum / totalWeight;
return totalWeight > 0 ? weightedSum / totalWeight : fallbackScore / 100;
}
export function processMarketV2(
@@ -54,9 +52,11 @@ export function processMarketV2(
effectiveInferenceFlops?: number,
researchBonuses?: ResearchBonuses,
): MarketTickResult {
const consumerQuality = getSegmentQuality('consumer', state.models.benchmarkResults, state.models.bestDeployedModelScore);
const enterpriseQuality = getSegmentQuality('enterprise', state.models.benchmarkResults, state.models.bestDeployedModelScore);
const modelQuality = state.models.benchmarkResults.length > 0
const caps = state.models.bestDeployedCapabilities;
const hasDeployed = state.models.bestDeployedModelScore > 0;
const consumerQuality = getSegmentQuality('consumer', caps, state.models.bestDeployedModelScore);
const enterpriseQuality = getSegmentQuality('enterprise', caps, state.models.bestDeployedModelScore);
const modelQuality = hasDeployed
? (consumerQuality + enterpriseQuality) / 2
: state.models.bestDeployedModelScore / 100;
@@ -115,7 +115,7 @@ export function processMarketV2(
const productResult = processProductLines(
state.market.codeAssistant,
state.market.agentsPlatform,
state.models.benchmarkResults,
caps,
playerDevCustomers,
playerEntCustomers,
seasonal.multipliers.consumer,
@@ -1,4 +1,4 @@
import type { CodeAssistantState, AgentsPlatformState, BenchmarkResult } from '@ai-tycoon/shared';
import type { CodeAssistantState, AgentsPlatformState, ModelCapabilities } from '@ai-tycoon/shared';
import {
CODE_ASSISTANT_MIN_CODING_SCORE,
CODE_ASSISTANT_BASE_ADOPTION_RATE,
@@ -7,27 +7,6 @@ import {
AGENTS_PLATFORM_BASE_ADOPTION_RATE,
AGENTS_PLATFORM_CHURN_RATE,
} from '@ai-tycoon/shared';
import { BENCHMARKS } from '../../data/benchmarks';
/**
 * Returns the highest score recorded for `benchmarkId` across all results.
 *
 * The running best starts at 0, so the result is 0 when the benchmark has no
 * results or when none of its scores is greater than 0.
 */
function getBenchmarkScore(benchmarkId: string, results: BenchmarkResult[]): number {
  return results.reduce<number>(
    (highest, entry) =>
      entry.benchmarkId === benchmarkId && entry.score > highest ? entry.score : highest,
    0,
  );
}
/**
 * Best recorded score on the 'codeforce' benchmark, or 0 when that benchmark
 * is not present in `BENCHMARKS`.
 */
function getCodingScore(results: BenchmarkResult[]): number {
  const CODING_BENCHMARK_ID = 'codeforce';
  const isRegistered = BENCHMARKS.some(b => b.id === CODING_BENCHMARK_ID);
  return isRegistered ? getBenchmarkScore(CODING_BENCHMARK_ID, results) : 0;
}
/**
 * Best recorded score on the 'agentarena' benchmark, or 0 when that benchmark
 * is not present in `BENCHMARKS`.
 */
function getAgentsScore(results: BenchmarkResult[]): number {
  const AGENTS_BENCHMARK_ID = 'agentarena';
  const isRegistered = BENCHMARKS.some(b => b.id === AGENTS_BENCHMARK_ID);
  return isRegistered ? getBenchmarkScore(AGENTS_BENCHMARK_ID, results) : 0;
}
export interface ProductLineResult {
codeAssistant: CodeAssistantState;
@@ -41,7 +20,7 @@ export interface ProductLineResult {
export function processProductLines(
ca: CodeAssistantState,
ap: AgentsPlatformState,
benchmarkResults: BenchmarkResult[],
capabilities: ModelCapabilities,
playerDevCustomers: number,
playerEntCustomers: number,
seasonalConsumerMult: number,
@@ -53,7 +32,7 @@ export function processProductLines(
let apRevenue = 0;
// --- Code Assistant ---
updatedCA.qualityScore = getCodingScore(benchmarkResults);
updatedCA.qualityScore = capabilities.coding;
if (updatedCA.isUnlocked && updatedCA.isActive && updatedCA.qualityScore >= CODE_ASSISTANT_MIN_CODING_SCORE) {
const qualityFactor = updatedCA.qualityScore / 100;
const priceAttr = Math.max(0.1, 1 - updatedCA.pricePerSeat / 50);
@@ -70,7 +49,7 @@ export function processProductLines(
}
// --- Agents Platform ---
updatedAP.qualityScore = getAgentsScore(benchmarkResults);
updatedAP.qualityScore = capabilities.agents;
if (updatedAP.isUnlocked && updatedAP.isActive && updatedAP.qualityScore >= AGENTS_PLATFORM_MIN_AGENTS_SCORE) {
const qualityFactor = updatedAP.qualityScore / 100;
const priceAttr = Math.max(0.1, 1 - updatedAP.pricePerSeat / 250);
+16 -48
View File
@@ -1,10 +1,8 @@
import type {
GameState, ModelsState, BaseModel, ModelCapabilities, SafetyProfile,
TrainingPipeline, TrainingEvent, TrainingEventType,
ModelVariant, VariantCreationJob, EvalJob, BenchmarkResult,
BenchmarkDefinition,
ModelVariant, VariantCreationJob,
} from '@ai-tycoon/shared';
import { BENCHMARKS } from '../data/benchmarks';
import {
uuid, VRAM_REQUIREMENTS_BY_GENERATION,
MOE_CAPABILITY_MULTIPLIER, MOE_SPEED_MULTIPLIER,
@@ -154,14 +152,21 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse
});
}
const updatedEvalJobs = processEvalJobs(state);
const bestDeployedCapabilities: ModelCapabilities = {
reasoning: 0, coding: 0, creative: 0, math: 0,
knowledge: 0, multimodal: 0, agents: 0, speed: 0, contextUtilization: 0,
};
let bestDeployedModelScore = 0;
let bestDeployedSafetyScore = 0;
for (const m of baseModels) {
if (!m.isDeployed) continue;
if (m.rawCapability > bestDeployedModelScore) bestDeployedModelScore = m.rawCapability;
if (m.safetyProfile.overallSafety > bestDeployedSafetyScore) bestDeployedSafetyScore = m.safetyProfile.overallSafety;
for (const key of Object.keys(bestDeployedCapabilities) as (keyof ModelCapabilities)[]) {
if ((m.capabilities[key] ?? 0) > bestDeployedCapabilities[key]) {
bestDeployedCapabilities[key] = m.capabilities[key];
}
}
}
for (const f of families) {
for (const v of f.variants) {
@@ -169,6 +174,11 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse
const score = computeVariantScore(v);
if (score > bestDeployedModelScore) bestDeployedModelScore = score;
if (v.safetyProfile.overallSafety > bestDeployedSafetyScore) bestDeployedSafetyScore = v.safetyProfile.overallSafety;
for (const key of Object.keys(bestDeployedCapabilities) as (keyof ModelCapabilities)[]) {
if ((v.capabilities[key] ?? 0) > bestDeployedCapabilities[key]) {
bestDeployedCapabilities[key] = v.capabilities[key];
}
}
}
}
@@ -179,10 +189,9 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse
families,
activeTrainingPipelines: updatedPipelines,
variantJobs: updatedVariantJobs.jobs,
evalJobs: updatedEvalJobs.jobs,
benchmarkResults: [...state.models.benchmarkResults, ...updatedEvalJobs.newResults],
bestDeployedModelScore,
bestDeployedSafetyScore,
bestDeployedCapabilities,
},
completedModels,
notifications,
@@ -490,47 +499,6 @@ function createVariant(job: VariantCreationJob, base: BaseModel): ModelVariant {
};
}
/**
 * Advances every active evaluation job by one tick.
 *
 * Jobs that are not 'active' are passed through untouched. An active job whose
 * progress reaches `totalTicks` is marked 'completed'; if its model can still
 * be found among the base models and family variants, its benchmark scores are
 * computed, attached to the job and also collected into `newResults`. If the
 * model cannot be found, the job is completed without computing new scores.
 *
 * @returns the updated job list (same order as the input) and the benchmark
 *          results produced during this call.
 */
function processEvalJobs(state: GameState): { jobs: EvalJob[]; newResults: BenchmarkResult[] } {
  const candidates: (BaseModel | ModelVariant)[] = [
    ...state.models.baseModels,
    ...state.models.families.flatMap(family => family.variants),
  ];
  const newResults: BenchmarkResult[] = [];
  const jobs: EvalJob[] = [];

  for (const job of state.models.evalJobs) {
    if (job.status !== 'active') {
      jobs.push(job);
      continue;
    }

    const advanced = job.progressTicks + 1;
    if (advanced < job.totalTicks) {
      jobs.push({ ...job, progressTicks: advanced });
      continue;
    }

    // The job finishes on this tick.
    const finished = { ...job, status: 'completed' as const, progressTicks: job.totalTicks };
    const target = candidates.find(candidate => candidate.id === job.modelId);
    if (!target) {
      jobs.push(finished);
      continue;
    }

    const results = computeBenchmarkScores(target, job.benchmarkIds, state.meta.tickCount);
    newResults.push(...results);
    jobs.push({ ...finished, results });
  }

  return { jobs, newResults };
}
/**
 * Produces one benchmark result per requested id for the given model.
 *
 * For a known benchmark the score is
 * `clamp(primary * 0.7 + secondary * 0.3 + noise)`, where `primary` and
 * `secondary` are the model's capability values named by the benchmark
 * (missing values count as 0) and `noise` is uniform in [-3, 3). An id that is
 * not in `BENCHMARKS` yields a result with score 0 and draws no random number.
 *
 * @param model - base model or variant being scored.
 * @param benchmarkIds - benchmark ids to score, in output order.
 * @param tick - tick stamped onto every result as `ranAtTick`.
 */
function computeBenchmarkScores(
  model: BaseModel | ModelVariant,
  benchmarkIds: string[],
  tick: number,
): BenchmarkResult[] {
  const definitions = new Map(BENCHMARKS.map(def => [def.id, def]));
  const scored: BenchmarkResult[] = [];

  for (const benchmarkId of benchmarkIds) {
    const def = definitions.get(benchmarkId);
    if (!def) {
      scored.push({ benchmarkId, modelId: model.id, score: 0, ranAtTick: tick });
      continue;
    }

    const primary = model.capabilities[def.primaryCapability] ?? 0;
    let secondary = 0;
    if (def.secondaryCapability) {
      secondary = model.capabilities[def.secondaryCapability] ?? 0;
    }
    const noise = (Math.random() - 0.5) * 6;
    const score = clamp(primary * 0.7 + secondary * 0.3 + noise);
    scored.push({ benchmarkId, modelId: model.id, score, ranAtTick: tick });
  }

  return scored;
}
function computeVariantScore(variant: ModelVariant): number {
const c = variant.capabilities;
return (c.reasoning * 0.25 + c.coding * 0.2 + c.creative * 0.15 + c.math * 0.15 + c.knowledge * 0.15 + c.agents * 0.1);
-1
View File
@@ -66,7 +66,6 @@ describe('processTick', () => {
isDeployed: true, trainedAtTick: 0, trainingCostTotal: 0, trainingStagesCompleted: ['pretraining' as const],
sizeTier: 'small' as const, version: 1.0, sftSpecializations: ['general' as const], alignmentMethod: 'rlhf' as const,
dataMix: { web: 0.4, code: 0.2, books: 0.15, academic: 0.1, conversational: 0.1, specialized: 0.05 },
benchmarkResults: {},
};
const state = createTestState({
meta: { currentEra: 'startup' },
+2 -43
View File
@@ -182,45 +182,6 @@ export interface QuantizationConfig {
variantName: string;
}
/** Thematic grouping label carried by each benchmark definition. */
export type BenchmarkCategory = 'reasoning' | 'coding' | 'math' | 'knowledge' | 'safety' | 'chat' | 'multimodal' | 'agents';
/** Static description of one benchmark a model can be evaluated on. */
export interface BenchmarkDefinition {
/** Stable identifier that results and eval jobs use to refer to this benchmark. */
id: string;
/** Display name. */
name: string;
category: BenchmarkCategory;
/** Human-readable summary of what the benchmark measures. */
description: string;
/** Capability that contributes 70% of the computed score. */
primaryCapability: keyof ModelCapabilities;
/** Capability that contributes 30% of the computed score; counted as 0 when omitted. */
secondaryCapability?: keyof ModelCapabilities;
/** Compute charged for one run; summed across a job's benchmarks. NOTE(review): units are not visible here — confirm. */
computeCost: number;
/** Ticks one run takes; summed across a job's benchmarks to get the job's total duration. */
ticksToRun: number;
/** Earliest era in which the benchmark is offered to the player. */
unlockedAtEra: Era;
/** Weight of this benchmark's score when computing quality for each market segment. */
marketRelevance: {
consumer: number;
enterprise: number;
developer: number;
research: number;
};
}
/** Score achieved by one model (or variant) on one benchmark. */
export interface BenchmarkResult {
/** Id of the benchmark that was run. */
benchmarkId: string;
/** Id of the base model or variant that was evaluated. */
modelId: string;
/** Achieved score. Presumably on a 0-100 scale (consumers divide by 100) — TODO confirm the clamp range. */
score: number;
/** Game tick at which the result was produced. */
ranAtTick: number;
/** NOTE(review): optional and not assigned anywhere visible in this change — confirm whether it is used. */
rank?: number;
}
/** A queued or finished run of one or more benchmarks against a single model. */
export interface EvalJob {
id: string;
/** Id of the base model or variant being evaluated. */
modelId: string;
/** Benchmarks this job runs. */
benchmarkIds: string[];
/** Ticks elapsed so far; advanced by one per processing tick while the job is active. */
progressTicks: number;
/** Ticks required to finish: the sum of `ticksToRun` over the job's benchmarks. */
totalTicks: number;
/** Sum of `computeCost` over the job's benchmarks. */
computeAllocated: number;
/** 'active' until `progressTicks` reaches `totalTicks`, then 'completed'. */
status: 'active' | 'completed';
/** Empty when the job is created; filled with the computed scores when it completes. */
results: BenchmarkResult[];
}
export type ProductLineType = 'text-api' | 'chat-product' | 'chat-free' | 'chat-enterprise' | 'code-api' | 'image' | 'agents-api';
export interface ProductPricing {
@@ -246,11 +207,10 @@ export interface ModelsState {
baseModels: BaseModel[];
activeTrainingPipelines: TrainingPipeline[];
variantJobs: VariantCreationJob[];
evalJobs: EvalJob[];
benchmarkResults: BenchmarkResult[];
productLines: ProductLine[];
bestDeployedModelScore: number;
bestDeployedSafetyScore: number;
bestDeployedCapabilities: ModelCapabilities;
}
export const DEFAULT_DATA_MIX: DataMixAllocation = {
@@ -271,8 +231,6 @@ export const INITIAL_MODELS: ModelsState = {
baseModels: [],
activeTrainingPipelines: [],
variantJobs: [],
evalJobs: [],
benchmarkResults: [],
productLines: [
{
id: 'text-api',
@@ -307,4 +265,5 @@ export const INITIAL_MODELS: ModelsState = {
],
bestDeployedModelScore: 0,
bestDeployedSafetyScore: 0,
bestDeployedCapabilities: { reasoning: 0, coding: 0, creative: 0, math: 0, knowledge: 0, multimodal: 0, agents: 0, speed: 0, contextUtilization: 0 },
};