From bbb69a315cec5d14b3f2d63280c1baab79a8ab32 Mon Sep 17 00:00:00 2001 From: josh Date: Sun, 26 Apr 2026 19:28:59 -0400 Subject: [PATCH] Remove benchmark evaluation system, use training capabilities directly Model quality for market segments and product lines now derives from deployed model capabilities (coding, reasoning, agents, etc.) instead of requiring a separate manual benchmark evaluation step. This eliminates an unbounded benchmarkResults[] array that was scanned 5x per tick and removes ~480 lines of dead-weight UI, types, and engine code. Co-Authored-By: Claude Opus 4.6 --- apps/web/src/pages/ModelsPage.tsx | 255 +----------------- apps/web/src/store/index.ts | 32 +-- .../src/__test-utils__/builders.ts | 6 +- packages/game-engine/src/data/benchmarks.ts | 111 -------- packages/game-engine/src/index.ts | 1 - .../game-engine/src/systems/market/index.ts | 48 ++-- .../src/systems/market/productLines.ts | 29 +- .../game-engine/src/systems/modelSystem.ts | 64 ++--- packages/game-engine/src/tick.test.ts | 1 - packages/shared/src/types/models.ts | 45 +--- 10 files changed, 57 insertions(+), 535 deletions(-) delete mode 100644 packages/game-engine/src/data/benchmarks.ts diff --git a/apps/web/src/pages/ModelsPage.tsx b/apps/web/src/pages/ModelsPage.tsx index 920e63c..89fc4db 100644 --- a/apps/web/src/pages/ModelsPage.tsx +++ b/apps/web/src/pages/ModelsPage.tsx @@ -1,5 +1,5 @@ import { useState } from 'react'; -import { Play, Rocket, Globe, ChevronDown, ChevronUp, Beaker, Shield, Zap, BarChart3 } from 'lucide-react'; +import { Play, Rocket, Globe, ChevronDown, ChevronUp, Beaker, Shield, Zap } from 'lucide-react'; import { TutorialHint } from '@/components/game/TutorialHint'; import { ConfirmModal } from '@/components/common/ConfirmModal'; import { useGameStore } from '@/store'; @@ -16,10 +16,9 @@ import { } from '@ai-tycoon/shared'; import type { ModelArchitecture, DataMixAllocation, SFTSpecialization, AlignmentMethod, - DataDomain, QuantizationLevel, BaseModel, 
ModelVariant, BenchmarkResult, + DataDomain, QuantizationLevel, BaseModel, ModelVariant, SizeTier, ModelFamily, } from '@ai-tycoon/shared'; -import { BENCHMARKS } from '@ai-tycoon/game-engine'; const DATA_MIX_PRESETS: Record = { balanced: { label: 'Balanced', mix: DEFAULT_DATA_MIX }, @@ -52,8 +51,6 @@ export function ModelsPage() { const families = useGameStore((s) => s.models.families); const pipelines = useGameStore((s) => s.models.activeTrainingPipelines); const variantJobs = useGameStore((s) => s.models.variantJobs); - const evalJobs = useGameStore((s) => s.models.evalJobs); - const benchmarkResults = useGameStore((s) => s.models.benchmarkResults); const productLines = useGameStore((s) => s.models.productLines); const totalFlops = useGameStore((s) => s.compute.totalFlops); const totalVramGB = useGameStore((s) => s.compute.totalVramGB); @@ -64,7 +61,6 @@ export function ModelsPage() { const deployModel = useGameStore((s) => s.deployModel); const deployVariant = useGameStore((s) => s.deployVariant); const createQuantization = useGameStore((s) => s.createQuantization); - const startEvaluation = useGameStore((s) => s.startEvaluation); const setTrainingAllocation = useGameStore((s) => s.setTrainingAllocation); const openSourceModel = useGameStore((s) => s.openSourceModel); const openSourcedModels = useGameStore((s) => s.market.openSourcedModels); @@ -96,15 +92,12 @@ export function ModelsPage() { const activePipelines = pipelines.filter(p => p.status === 'active' || p.status === 'stalled'); const activeVariantJobs = variantJobs.filter(j => j.status === 'active'); - const activeEvalJobs = evalJobs.filter(j => j.status === 'active'); const undeployedCount = baseModels.filter(m => !m.isDeployed).length; - const hasActiveJobs = activePipelines.length > 0 || activeVariantJobs.length > 0 || activeEvalJobs.length > 0; + const hasActiveJobs = activePipelines.length > 0 || activeVariantJobs.length > 0; const noModelDeployed = baseModels.length > 0 && !baseModels.some(m => 
m.isDeployed); const eraOrder = ['startup', 'scaleup', 'bigtech', 'agi'] as const; const currentEraIdx = eraOrder.indexOf(currentEra); - const availableBenchmarks = BENCHMARKS.filter(b => eraOrder.indexOf(b.unlockedAtEra) <= currentEraIdx); - const hasAlignmentResearch = completedResearch.some(r => r === 'alignment-research' || r === 'interpretability' || r === 'constitutional-ai', ); @@ -186,7 +179,6 @@ export function ModelsPage() { { id: 'overview' as const, label: 'Overview' }, { id: 'train' as const, label: 'Train New' }, { id: 'models' as const, label: `Families${families.length > 0 ? ` (${families.length})` : ''}` }, - { id: 'benchmarks' as const, label: 'Benchmarks' }, { id: 'products' as const, label: 'Products' }, ]).map(tab => ( - ); - } - - return ( -
-
- Run Evaluation - -
-
- {availableBenchmarks.map(bench => { - const alreadyDone = evaluatedIds.has(bench.id); - const selected = selectedBenchmarks.includes(bench.id); - return ( - - ); - })} -
- {selectedBenchmarks.length > 0 && ( -
- - {selectedBenchmarks.length} benchmark{selectedBenchmarks.length > 1 ? 's' : ''} · ~{availableBenchmarks.filter(b => selectedBenchmarks.includes(b.id)).reduce((s, b) => s + b.ticksToRun, 0)} ticks - - 
- )} -
- ); -} - -function VariantCard({ variant, familyId, benchmarkResults, availableBenchmarks, evalJobs, onDeploy, onStartEval }: { +function VariantCard({ variant, familyId, onDeploy }: { variant: ModelVariant; familyId: string; - benchmarkResults: BenchmarkResult[]; - availableBenchmarks: typeof BENCHMARKS; - evalJobs: { id: string; modelId: string; status: string }[]; onDeploy: () => void; - onStartEval: (modelId: string, benchmarkIds: string[]) => void; }) { const [isExpanded, setIsExpanded] = useState(false); - const variantResults = benchmarkResults.filter(r => r.modelId === variant.id); return (
@@ -1106,108 +959,12 @@ function VariantCard({ variant, familyId, benchmarkResults, availableBenchmarks,
))} - - {variantResults.length > 0 && ( -
- {variantResults.map(r => { - const bench = BENCHMARKS.find(b => b.id === r.benchmarkId); - return ( -
- {bench?.name ?? r.benchmarkId} -
{r.score.toFixed(1)}
-
- ); - })} -
- )} - - )} ); } -function BenchmarkLeaderboard({ benchmarkResults, baseModels, families, availableBenchmarks }: { - benchmarkResults: BenchmarkResult[]; - baseModels: BaseModel[]; - families: { id: string; name: string; variants: ModelVariant[] }[]; - availableBenchmarks: typeof BENCHMARKS; -}) { - const allModels: (BaseModel | ModelVariant)[] = [ - ...baseModels, - ...families.flatMap(f => f.variants), - ]; - - const modelNames = new Map(allModels.map(m => [m.id, m.name])); - const benchmarksWithResults = availableBenchmarks.filter(b => - benchmarkResults.some(r => r.benchmarkId === b.id), - ); - - if (benchmarksWithResults.length === 0) return null; - - const modelIds = [...new Set(benchmarkResults.map(r => r.modelId))]; - - return ( -
-

- Benchmark Leaderboard -

-
- - - - - {benchmarksWithResults.map(b => ( - - ))} - - - - - {modelIds.map(modelId => { - const results = benchmarkResults.filter(r => r.modelId === modelId); - const scores = benchmarksWithResults.map(b => { - const r = results.find(r => r.benchmarkId === b.id); - return r?.score ?? null; - }); - const validScores = scores.filter((s): s is number => s !== null); - const avg = validScores.length > 0 ? validScores.reduce((a, b) => a + b, 0) / validScores.length : 0; - - return ( - - - {scores.map((score, i) => ( - - ))} - - - ); - })} - -
Model{b.name}Avg
{modelNames.get(modelId) ?? 'Unknown'} - {score !== null ? ( - = 80 ? 'text-success' : score >= 50 ? 'text-accent-light' : 'text-surface-400'}> - {score.toFixed(1)} - - ) : ( - — - )} - - {avg > 0 ? avg.toFixed(1) : '—'} - 
-
-
- ); -} - function StageBar({ label, active, complete, progress }: { label: string; active: boolean; complete: boolean; progress: number; }) { diff --git a/apps/web/src/store/index.ts b/apps/web/src/store/index.ts index a01e8c4..8087c70 100644 --- a/apps/web/src/store/index.ts +++ b/apps/web/src/store/index.ts @@ -15,7 +15,6 @@ import type { TrainingPipeline, ModelFamily, DataMixAllocation, ModelArchitecture, AlignmentMethod, SizeTier, SFTSpecialization, QuantizationLevel, VariantCreationJob, - EvalJob, ConsumerTierId, ApiTierId, } from '@ai-tycoon/shared'; import { @@ -43,7 +42,7 @@ import { } from '@ai-tycoon/shared'; import { emptyDCNetworkSummary, emptyCampusNetworkSummary, emptyClusterNetworkSummary, - BENCHMARKS, TECH_TREE, onModelDeployed, + TECH_TREE, onModelDeployed, } from '@ai-tycoon/game-engine'; import { INITIAL_RIVALS } from '@ai-tycoon/game-engine'; @@ -59,7 +58,7 @@ export interface InfraNav { datacenterId?: string; } -type ModelsTab = 'overview' | 'train' | 'models' | 'benchmarks' | 'products'; +type ModelsTab = 'overview' | 'train' | 'models' | 'products'; interface UIState { activePage: ActivePage; @@ -132,7 +131,6 @@ interface Actions { }) => void; startPointRelease: (baseModelId: string) => void; createQuantization: (baseModelId: string, level: QuantizationLevel, variantName: string) => void; - startEvaluation: (modelId: string, benchmarkIds: string[]) => void; deployModel: (modelId: string) => void; deployVariant: (familyId: string, variantId: string) => void; setProductPricing: (productLineId: string, field: string, value: number) => void; @@ -1076,32 +1074,6 @@ export const useGameStore = create()( } }, - startEvaluation: (modelId, benchmarkIds) => { - let created = false; - set((s) => { - const benchmarks = BENCHMARKS.filter(b => benchmarkIds.includes(b.id)); - if (benchmarks.length === 0) return s; - created = true; - const totalTicks = benchmarks.reduce((sum, b) => sum + b.ticksToRun, 0); - const computeCost = benchmarks.reduce((sum, b) 
=> sum + b.computeCost, 0); - const job: EvalJob = { - id: uuid(), - modelId, - benchmarkIds, - progressTicks: 0, - totalTicks, - computeAllocated: computeCost, - status: 'active', - results: [], - }; - return { models: { ...s.models, evalJobs: [...s.models.evalJobs, job] } }; - }); - if (created) { - get().addNotification({ title: 'Evaluation Started', message: `${benchmarkIds.length} benchmark${benchmarkIds.length > 1 ? 's' : ''} queued.`, type: 'info', tick: get().meta.tickCount }); - set({ modelsTab: 'overview' as ModelsTab }); - } - }, - deployModel: (modelId) => { const modelName = get().models.baseModels.find(m => m.id === modelId)?.name ?? 'Model'; set((s) => ({ diff --git a/packages/game-engine/src/__test-utils__/builders.ts b/packages/game-engine/src/__test-utils__/builders.ts index e36c3b8..6eff4da 100644 --- a/packages/game-engine/src/__test-utils__/builders.ts +++ b/packages/game-engine/src/__test-utils__/builders.ts @@ -171,7 +171,6 @@ export function createTestBaseModel(overrides?: Partial): BaseModel { sizeTier: 'small', isPointRelease: false, sourceModelId: null, - benchmarkResults: {}, dataMix: { web: 0.4, code: 0.2, books: 0.15, academic: 0.1, conversational: 0.1, specialized: 0.05 }, }; return overrides ? { ...base, ...overrides } : base; @@ -181,9 +180,10 @@ export function createTestModelFamily(overrides?: Partial): ModelFa const base: ModelFamily = { id: uuid(), name: 'Test Family', - baseModels: [], + generation: 1, + baseModelIds: [], variants: [], - activeEvals: [], + createdAtTick: 0, }; return overrides ? 
{ ...base, ...overrides } : base; } diff --git a/packages/game-engine/src/data/benchmarks.ts b/packages/game-engine/src/data/benchmarks.ts deleted file mode 100644 index 0ae5adf..0000000 --- a/packages/game-engine/src/data/benchmarks.ts +++ /dev/null @@ -1,111 +0,0 @@ -import type { BenchmarkDefinition } from '@ai-tycoon/shared'; - -export const BENCHMARKS: BenchmarkDefinition[] = [ - { - id: 'arc-challenge', - name: 'ARC Challenge', - category: 'reasoning', - description: 'Advanced reasoning and comprehension tasks requiring multi-step inference.', - primaryCapability: 'reasoning', - secondaryCapability: 'knowledge', - computeCost: 0.001, - ticksToRun: 8, - unlockedAtEra: 'startup', - marketRelevance: { consumer: 0.3, enterprise: 0.5, developer: 0.4, research: 0.8 }, - }, - { - id: 'codeforce', - name: 'CodeForce', - category: 'coding', - description: 'Competitive programming and software engineering benchmarks.', - primaryCapability: 'coding', - secondaryCapability: 'reasoning', - computeCost: 0.001, - ticksToRun: 8, - unlockedAtEra: 'startup', - marketRelevance: { consumer: 0.2, enterprise: 0.7, developer: 0.9, research: 0.5 }, - }, - { - id: 'mathquest', - name: 'MathQuest', - category: 'math', - description: 'Mathematical problem-solving from algebra to graduate-level proofs.', - primaryCapability: 'math', - secondaryCapability: 'reasoning', - computeCost: 0.001, - ticksToRun: 8, - unlockedAtEra: 'startup', - marketRelevance: { consumer: 0.1, enterprise: 0.6, developer: 0.5, research: 0.9 }, - }, - { - id: 'worldfacts', - name: 'WorldFacts', - category: 'knowledge', - description: 'Broad factual knowledge across science, history, culture, and current events.', - primaryCapability: 'knowledge', - secondaryCapability: 'reasoning', - computeCost: 0.001, - ticksToRun: 6, - unlockedAtEra: 'startup', - marketRelevance: { consumer: 0.5, enterprise: 0.4, developer: 0.3, research: 0.6 }, - }, - { - id: 'chatrank', - name: 'ChatRank', - category: 'chat', - description: 
'Human preference evaluation of conversational quality, helpfulness, and creativity.', - primaryCapability: 'creative', - secondaryCapability: 'knowledge', - computeCost: 0.002, - ticksToRun: 10, - unlockedAtEra: 'startup', - marketRelevance: { consumer: 0.9, enterprise: 0.3, developer: 0.2, research: 0.2 }, - }, - { - id: 'harmguard', - name: 'HarmGuard', - category: 'safety', - description: 'Safety evaluation measuring harm avoidance, truthfulness, and responsible behavior.', - primaryCapability: 'reasoning', - computeCost: 0.001, - ticksToRun: 8, - unlockedAtEra: 'startup', - marketRelevance: { consumer: 0.4, enterprise: 0.9, developer: 0.3, research: 0.7 }, - }, - { - id: 'visionbench', - name: 'VisionBench', - category: 'multimodal', - description: 'Image understanding, visual reasoning, and multimodal comprehension.', - primaryCapability: 'multimodal', - secondaryCapability: 'reasoning', - computeCost: 0.003, - ticksToRun: 12, - unlockedAtEra: 'scaleup', - marketRelevance: { consumer: 0.5, enterprise: 0.6, developer: 0.6, research: 0.7 }, - }, - { - id: 'agentarena', - name: 'AgentArena', - category: 'agents', - description: 'Autonomous agent tasks: tool use, multi-step planning, and environment interaction.', - primaryCapability: 'agents', - secondaryCapability: 'coding', - computeCost: 0.005, - ticksToRun: 15, - unlockedAtEra: 'bigtech', - marketRelevance: { consumer: 0.3, enterprise: 0.8, developer: 0.7, research: 0.6 }, - }, - { - id: 'frontier-eval', - name: 'Frontier Eval', - category: 'reasoning', - description: 'Cutting-edge capability evaluation at the frontier of AI research.', - primaryCapability: 'reasoning', - secondaryCapability: 'math', - computeCost: 0.01, - ticksToRun: 20, - unlockedAtEra: 'agi', - marketRelevance: { consumer: 0.2, enterprise: 0.5, developer: 0.5, research: 1.0 }, - }, -]; diff --git a/packages/game-engine/src/index.ts b/packages/game-engine/src/index.ts index 4340480..a32d0ea 100644 --- a/packages/game-engine/src/index.ts 
+++ b/packages/game-engine/src/index.ts @@ -11,4 +11,3 @@ export { TECH_TREE } from './data/techTree'; export { INITIAL_RIVALS } from './data/competitors'; export { KEY_HIRE_POOL } from './data/keyHires'; export { ACHIEVEMENT_DEFINITIONS } from './data/achievements'; -export { BENCHMARKS } from './data/benchmarks'; diff --git a/packages/game-engine/src/systems/market/index.ts b/packages/game-engine/src/systems/market/index.ts index 7606113..2700970 100644 --- a/packages/game-engine/src/systems/market/index.ts +++ b/packages/game-engine/src/systems/market/index.ts @@ -1,7 +1,6 @@ -import type { GameState, MarketState, BenchmarkResult } from '@ai-tycoon/shared'; +import type { GameState, MarketState, ModelCapabilities } from '@ai-tycoon/shared'; import { CONSUMER_TOKENS_PER_SUBSCRIBER, API_TOKENS_PER_DEVELOPER_PER_TICK, BATCH_API_DEMAND_PER_DEV, makeInitialServingMetrics } from '@ai-tycoon/shared'; import type { TrafficPriority, TierServingMetrics } from '@ai-tycoon/shared'; -import { BENCHMARKS } from '../../data/benchmarks'; import { computeSeasonal } from './seasonalSystem'; import { updateObsolescence } from './obsolescenceSystem'; import { buildPlayerProfile, buildCompetitorProfile, computeMarketShares, updateTAMGrowth } from './tamSystem'; @@ -21,31 +20,30 @@ export interface MarketTickResult { totalTokenDemand: number; } +const SEGMENT_CAPABILITY_WEIGHTS: Record>> = { + consumer: { creative: 0.35, knowledge: 0.25, reasoning: 0.15, multimodal: 0.15, coding: 0.05, agents: 0.05 }, + enterprise: { reasoning: 0.25, coding: 0.20, agents: 0.20, knowledge: 0.15, math: 0.10, multimodal: 0.10 }, + developer: { coding: 0.35, reasoning: 0.20, agents: 0.20, math: 0.15, knowledge: 0.10 }, + research: { reasoning: 0.30, math: 0.30, knowledge: 0.20, coding: 0.10, agents: 0.10 }, +}; + function getSegmentQuality( segment: 'consumer' | 'enterprise' | 'developer' | 'research', - benchmarkResults: BenchmarkResult[], + capabilities: ModelCapabilities, fallbackScore: number, ): 
number { - if (benchmarkResults.length === 0) return fallbackScore / 100; - - const bestByBenchmark = new Map(); - for (const r of benchmarkResults) { - const prev = bestByBenchmark.get(r.benchmarkId) ?? 0; - if (r.score > prev) bestByBenchmark.set(r.benchmarkId, r.score); - } - + const weights = SEGMENT_CAPABILITY_WEIGHTS[segment]; + if (!weights) return fallbackScore / 100; let weightedSum = 0; let totalWeight = 0; - for (const bench of BENCHMARKS) { - const score = bestByBenchmark.get(bench.id); - if (score == null) continue; - const weight = bench.marketRelevance[segment]; - weightedSum += (score / 100) * weight; - totalWeight += weight; + for (const [cap, weight] of Object.entries(weights)) { + const score = capabilities[cap as keyof ModelCapabilities] ?? 0; + if (score > 0) { + weightedSum += (score / 100) * weight; + totalWeight += weight; + } } - - if (totalWeight === 0) return fallbackScore / 100; - return weightedSum / totalWeight; + return totalWeight > 0 ? weightedSum / totalWeight : fallbackScore / 100; } export function processMarketV2( @@ -54,9 +52,11 @@ export function processMarketV2( effectiveInferenceFlops?: number, researchBonuses?: ResearchBonuses, ): MarketTickResult { - const consumerQuality = getSegmentQuality('consumer', state.models.benchmarkResults, state.models.bestDeployedModelScore); - const enterpriseQuality = getSegmentQuality('enterprise', state.models.benchmarkResults, state.models.bestDeployedModelScore); - const modelQuality = state.models.benchmarkResults.length > 0 + const caps = state.models.bestDeployedCapabilities; + const hasDeployed = state.models.bestDeployedModelScore > 0; + const consumerQuality = getSegmentQuality('consumer', caps, state.models.bestDeployedModelScore); + const enterpriseQuality = getSegmentQuality('enterprise', caps, state.models.bestDeployedModelScore); + const modelQuality = hasDeployed ? 
(consumerQuality + enterpriseQuality) / 2 : state.models.bestDeployedModelScore / 100; @@ -115,7 +115,7 @@ export function processMarketV2( const productResult = processProductLines( state.market.codeAssistant, state.market.agentsPlatform, - state.models.benchmarkResults, + caps, playerDevCustomers, playerEntCustomers, seasonal.multipliers.consumer, diff --git a/packages/game-engine/src/systems/market/productLines.ts b/packages/game-engine/src/systems/market/productLines.ts index 2286d6d..7161d7b 100644 --- a/packages/game-engine/src/systems/market/productLines.ts +++ b/packages/game-engine/src/systems/market/productLines.ts @@ -1,4 +1,4 @@ -import type { CodeAssistantState, AgentsPlatformState, BenchmarkResult } from '@ai-tycoon/shared'; +import type { CodeAssistantState, AgentsPlatformState, ModelCapabilities } from '@ai-tycoon/shared'; import { CODE_ASSISTANT_MIN_CODING_SCORE, CODE_ASSISTANT_BASE_ADOPTION_RATE, @@ -7,27 +7,6 @@ import { AGENTS_PLATFORM_BASE_ADOPTION_RATE, AGENTS_PLATFORM_CHURN_RATE, } from '@ai-tycoon/shared'; -import { BENCHMARKS } from '../../data/benchmarks'; - -function getBenchmarkScore(benchmarkId: string, results: BenchmarkResult[]): number { - let best = 0; - for (const r of results) { - if (r.benchmarkId === benchmarkId && r.score > best) best = r.score; - } - return best; -} - -function getCodingScore(results: BenchmarkResult[]): number { - const codeBench = BENCHMARKS.find(b => b.id === 'codeforce'); - if (!codeBench) return 0; - return getBenchmarkScore(codeBench.id, results); -} - -function getAgentsScore(results: BenchmarkResult[]): number { - const agentBench = BENCHMARKS.find(b => b.id === 'agentarena'); - if (!agentBench) return 0; - return getBenchmarkScore(agentBench.id, results); -} export interface ProductLineResult { codeAssistant: CodeAssistantState; @@ -41,7 +20,7 @@ export interface ProductLineResult { export function processProductLines( ca: CodeAssistantState, ap: AgentsPlatformState, - benchmarkResults: 
BenchmarkResult[], + capabilities: ModelCapabilities, playerDevCustomers: number, playerEntCustomers: number, seasonalConsumerMult: number, @@ -53,7 +32,7 @@ export function processProductLines( let apRevenue = 0; // --- Code Assistant --- - updatedCA.qualityScore = getCodingScore(benchmarkResults); + updatedCA.qualityScore = capabilities.coding; if (updatedCA.isUnlocked && updatedCA.isActive && updatedCA.qualityScore >= CODE_ASSISTANT_MIN_CODING_SCORE) { const qualityFactor = updatedCA.qualityScore / 100; const priceAttr = Math.max(0.1, 1 - updatedCA.pricePerSeat / 50); @@ -70,7 +49,7 @@ export function processProductLines( } // --- Agents Platform --- - updatedAP.qualityScore = getAgentsScore(benchmarkResults); + updatedAP.qualityScore = capabilities.agents; if (updatedAP.isUnlocked && updatedAP.isActive && updatedAP.qualityScore >= AGENTS_PLATFORM_MIN_AGENTS_SCORE) { const qualityFactor = updatedAP.qualityScore / 100; const priceAttr = Math.max(0.1, 1 - updatedAP.pricePerSeat / 250); diff --git a/packages/game-engine/src/systems/modelSystem.ts b/packages/game-engine/src/systems/modelSystem.ts index 7e45b51..4aaa8e5 100644 --- a/packages/game-engine/src/systems/modelSystem.ts +++ b/packages/game-engine/src/systems/modelSystem.ts @@ -1,10 +1,8 @@ import type { GameState, ModelsState, BaseModel, ModelCapabilities, SafetyProfile, TrainingPipeline, TrainingEvent, TrainingEventType, - ModelVariant, VariantCreationJob, EvalJob, BenchmarkResult, - BenchmarkDefinition, + ModelVariant, VariantCreationJob, } from '@ai-tycoon/shared'; -import { BENCHMARKS } from '../data/benchmarks'; import { uuid, VRAM_REQUIREMENTS_BY_GENERATION, MOE_CAPABILITY_MULTIPLIER, MOE_SPEED_MULTIPLIER, @@ -154,14 +152,21 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse }); } - const updatedEvalJobs = processEvalJobs(state); - + const bestDeployedCapabilities: ModelCapabilities = { + reasoning: 0, coding: 0, creative: 0, math: 0, + knowledge: 0, multimodal: 0, 
agents: 0, speed: 0, contextUtilization: 0, + }; let bestDeployedModelScore = 0; let bestDeployedSafetyScore = 0; for (const m of baseModels) { if (!m.isDeployed) continue; if (m.rawCapability > bestDeployedModelScore) bestDeployedModelScore = m.rawCapability; if (m.safetyProfile.overallSafety > bestDeployedSafetyScore) bestDeployedSafetyScore = m.safetyProfile.overallSafety; + for (const key of Object.keys(bestDeployedCapabilities) as (keyof ModelCapabilities)[]) { + if ((m.capabilities[key] ?? 0) > bestDeployedCapabilities[key]) { + bestDeployedCapabilities[key] = m.capabilities[key]; + } + } } for (const f of families) { for (const v of f.variants) { @@ -169,6 +174,11 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse const score = computeVariantScore(v); if (score > bestDeployedModelScore) bestDeployedModelScore = score; if (v.safetyProfile.overallSafety > bestDeployedSafetyScore) bestDeployedSafetyScore = v.safetyProfile.overallSafety; + for (const key of Object.keys(bestDeployedCapabilities) as (keyof ModelCapabilities)[]) { + if ((v.capabilities[key] ?? 
0) > bestDeployedCapabilities[key]) { + bestDeployedCapabilities[key] = v.capabilities[key]; + } + } } } @@ -179,10 +189,9 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse families, activeTrainingPipelines: updatedPipelines, variantJobs: updatedVariantJobs.jobs, - evalJobs: updatedEvalJobs.jobs, - benchmarkResults: [...state.models.benchmarkResults, ...updatedEvalJobs.newResults], bestDeployedModelScore, bestDeployedSafetyScore, + bestDeployedCapabilities, }, completedModels, notifications, @@ -490,47 +499,6 @@ function createVariant(job: VariantCreationJob, base: BaseModel): ModelVariant { }; } -function processEvalJobs(state: GameState): { jobs: EvalJob[]; newResults: BenchmarkResult[] } { - const newResults: BenchmarkResult[] = []; - const allModels: (BaseModel | ModelVariant)[] = [ - ...state.models.baseModels, - ...state.models.families.flatMap(f => f.variants), - ]; - - const jobs = state.models.evalJobs.map(job => { - if (job.status !== 'active') return job; - const newProgress = job.progressTicks + 1; - if (newProgress >= job.totalTicks) { - const model = allModels.find(m => m.id === job.modelId); - if (model) { - const results = computeBenchmarkScores(model, job.benchmarkIds, state.meta.tickCount); - newResults.push(...results); - return { ...job, status: 'completed' as const, progressTicks: job.totalTicks, results }; - } - return { ...job, status: 'completed' as const, progressTicks: job.totalTicks }; - } - return { ...job, progressTicks: newProgress }; - }); - return { jobs, newResults }; -} - -function computeBenchmarkScores( - model: BaseModel | ModelVariant, - benchmarkIds: string[], - tick: number, -): BenchmarkResult[] { - const benchmarkMap = new Map(BENCHMARKS.map(b => [b.id, b])); - return benchmarkIds.map(id => { - const bench = benchmarkMap.get(id); - if (!bench) return { benchmarkId: id, modelId: model.id, score: 0, ranAtTick: tick }; - const primary = model.capabilities[bench.primaryCapability] ?? 
0; - const secondary = bench.secondaryCapability ? (model.capabilities[bench.secondaryCapability] ?? 0) : 0; - const noise = (Math.random() - 0.5) * 6; - const score = clamp(primary * 0.7 + secondary * 0.3 + noise); - return { benchmarkId: id, modelId: model.id, score, ranAtTick: tick }; - }); -} - function computeVariantScore(variant: ModelVariant): number { const c = variant.capabilities; return (c.reasoning * 0.25 + c.coding * 0.2 + c.creative * 0.15 + c.math * 0.15 + c.knowledge * 0.15 + c.agents * 0.1); diff --git a/packages/game-engine/src/tick.test.ts b/packages/game-engine/src/tick.test.ts index c593783..65136c3 100644 --- a/packages/game-engine/src/tick.test.ts +++ b/packages/game-engine/src/tick.test.ts @@ -66,7 +66,6 @@ describe('processTick', () => { isDeployed: true, trainedAtTick: 0, trainingCostTotal: 0, trainingStagesCompleted: ['pretraining' as const], sizeTier: 'small' as const, version: 1.0, sftSpecializations: ['general' as const], alignmentMethod: 'rlhf' as const, dataMix: { web: 0.4, code: 0.2, books: 0.15, academic: 0.1, conversational: 0.1, specialized: 0.05 }, - benchmarkResults: {}, }; const state = createTestState({ meta: { currentEra: 'startup' }, diff --git a/packages/shared/src/types/models.ts b/packages/shared/src/types/models.ts index afa5ee8..c571249 100644 --- a/packages/shared/src/types/models.ts +++ b/packages/shared/src/types/models.ts @@ -182,45 +182,6 @@ export interface QuantizationConfig { variantName: string; } -export type BenchmarkCategory = 'reasoning' | 'coding' | 'math' | 'knowledge' | 'safety' | 'chat' | 'multimodal' | 'agents'; - -export interface BenchmarkDefinition { - id: string; - name: string; - category: BenchmarkCategory; - description: string; - primaryCapability: keyof ModelCapabilities; - secondaryCapability?: keyof ModelCapabilities; - computeCost: number; - ticksToRun: number; - unlockedAtEra: Era; - marketRelevance: { - consumer: number; - enterprise: number; - developer: number; - research: number; - }; 
-} - -export interface BenchmarkResult { - benchmarkId: string; - modelId: string; - score: number; - ranAtTick: number; - rank?: number; -} - -export interface EvalJob { - id: string; - modelId: string; - benchmarkIds: string[]; - progressTicks: number; - totalTicks: number; - computeAllocated: number; - status: 'active' | 'completed'; - results: BenchmarkResult[]; -} - export type ProductLineType = 'text-api' | 'chat-product' | 'chat-free' | 'chat-enterprise' | 'code-api' | 'image' | 'agents-api'; export interface ProductPricing { @@ -246,11 +207,10 @@ export interface ModelsState { baseModels: BaseModel[]; activeTrainingPipelines: TrainingPipeline[]; variantJobs: VariantCreationJob[]; - evalJobs: EvalJob[]; - benchmarkResults: BenchmarkResult[]; productLines: ProductLine[]; bestDeployedModelScore: number; bestDeployedSafetyScore: number; + bestDeployedCapabilities: ModelCapabilities; } export const DEFAULT_DATA_MIX: DataMixAllocation = { @@ -271,8 +231,6 @@ export const INITIAL_MODELS: ModelsState = { baseModels: [], activeTrainingPipelines: [], variantJobs: [], - evalJobs: [], - benchmarkResults: [], productLines: [ { id: 'text-api', @@ -307,4 +265,5 @@ export const INITIAL_MODELS: ModelsState = { ], bestDeployedModelScore: 0, bestDeployedSafetyScore: 0, + bestDeployedCapabilities: { reasoning: 0, coding: 0, creative: 0, math: 0, knowledge: 0, multimodal: 0, agents: 0, speed: 0, contextUtilization: 0 }, };