Remove benchmark evaluation system, use training capabilities directly
Model quality for market segments and product lines now derives from deployed model capabilities (coding, reasoning, agents, etc.) instead of requiring a separate manual benchmark evaluation step. This eliminates an unbounded benchmarkResults[] array that was scanned 5x per tick and removes ~480 lines of dead-weight UI, types, and engine code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -171,7 +171,6 @@ export function createTestBaseModel(overrides?: Partial<BaseModel>): BaseModel {
|
||||
sizeTier: 'small',
|
||||
isPointRelease: false,
|
||||
sourceModelId: null,
|
||||
benchmarkResults: {},
|
||||
dataMix: { web: 0.4, code: 0.2, books: 0.15, academic: 0.1, conversational: 0.1, specialized: 0.05 },
|
||||
};
|
||||
return overrides ? { ...base, ...overrides } : base;
|
||||
@@ -181,9 +180,10 @@ export function createTestModelFamily(overrides?: Partial<ModelFamily>): ModelFa
|
||||
const base: ModelFamily = {
|
||||
id: uuid(),
|
||||
name: 'Test Family',
|
||||
baseModels: [],
|
||||
generation: 1,
|
||||
baseModelIds: [],
|
||||
variants: [],
|
||||
activeEvals: [],
|
||||
createdAtTick: 0,
|
||||
};
|
||||
return overrides ? { ...base, ...overrides } : base;
|
||||
}
|
||||
|
||||
@@ -1,111 +0,0 @@
|
||||
import type { BenchmarkDefinition } from '@ai-tycoon/shared';
|
||||
|
||||
/**
 * Static catalogue of benchmark definitions.
 *
 * Each entry names the capability it primarily measures (`primaryCapability`),
 * an optional `secondaryCapability` (omitted for `harmguard`), the cost and
 * duration of a run (`computeCost`, `ticksToRun`), the era at which it becomes
 * available (`unlockedAtEra`), and a per-segment `marketRelevance` weight for
 * the consumer, enterprise, developer and research segments.
 */
export const BENCHMARKS: BenchmarkDefinition[] = [
  {
    id: 'arc-challenge',
    name: 'ARC Challenge',
    category: 'reasoning',
    description: 'Advanced reasoning and comprehension tasks requiring multi-step inference.',
    primaryCapability: 'reasoning',
    secondaryCapability: 'knowledge',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.3, enterprise: 0.5, developer: 0.4, research: 0.8 },
  },
  {
    id: 'codeforce',
    name: 'CodeForce',
    category: 'coding',
    description: 'Competitive programming and software engineering benchmarks.',
    primaryCapability: 'coding',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.2, enterprise: 0.7, developer: 0.9, research: 0.5 },
  },
  {
    id: 'mathquest',
    name: 'MathQuest',
    category: 'math',
    description: 'Mathematical problem-solving from algebra to graduate-level proofs.',
    primaryCapability: 'math',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.1, enterprise: 0.6, developer: 0.5, research: 0.9 },
  },
  {
    id: 'worldfacts',
    name: 'WorldFacts',
    category: 'knowledge',
    description: 'Broad factual knowledge across science, history, culture, and current events.',
    primaryCapability: 'knowledge',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 6,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.5, enterprise: 0.4, developer: 0.3, research: 0.6 },
  },
  {
    id: 'chatrank',
    name: 'ChatRank',
    category: 'chat',
    description: 'Human preference evaluation of conversational quality, helpfulness, and creativity.',
    primaryCapability: 'creative',
    secondaryCapability: 'knowledge',
    computeCost: 0.002,
    ticksToRun: 10,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.9, enterprise: 0.3, developer: 0.2, research: 0.2 },
  },
  {
    // Only entry without a secondaryCapability.
    id: 'harmguard',
    name: 'HarmGuard',
    category: 'safety',
    description: 'Safety evaluation measuring harm avoidance, truthfulness, and responsible behavior.',
    primaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.4, enterprise: 0.9, developer: 0.3, research: 0.7 },
  },
  {
    id: 'visionbench',
    name: 'VisionBench',
    category: 'multimodal',
    description: 'Image understanding, visual reasoning, and multimodal comprehension.',
    primaryCapability: 'multimodal',
    secondaryCapability: 'reasoning',
    computeCost: 0.003,
    ticksToRun: 12,
    unlockedAtEra: 'scaleup',
    marketRelevance: { consumer: 0.5, enterprise: 0.6, developer: 0.6, research: 0.7 },
  },
  {
    id: 'agentarena',
    name: 'AgentArena',
    category: 'agents',
    description: 'Autonomous agent tasks: tool use, multi-step planning, and environment interaction.',
    primaryCapability: 'agents',
    secondaryCapability: 'coding',
    computeCost: 0.005,
    ticksToRun: 15,
    unlockedAtEra: 'bigtech',
    marketRelevance: { consumer: 0.3, enterprise: 0.8, developer: 0.7, research: 0.6 },
  },
  {
    id: 'frontier-eval',
    name: 'Frontier Eval',
    category: 'reasoning',
    description: 'Cutting-edge capability evaluation at the frontier of AI research.',
    primaryCapability: 'reasoning',
    secondaryCapability: 'math',
    computeCost: 0.01,
    ticksToRun: 20,
    unlockedAtEra: 'agi',
    marketRelevance: { consumer: 0.2, enterprise: 0.5, developer: 0.5, research: 1.0 },
  },
];
|
||||
@@ -11,4 +11,3 @@ export { TECH_TREE } from './data/techTree';
|
||||
export { INITIAL_RIVALS } from './data/competitors';
|
||||
export { KEY_HIRE_POOL } from './data/keyHires';
|
||||
export { ACHIEVEMENT_DEFINITIONS } from './data/achievements';
|
||||
export { BENCHMARKS } from './data/benchmarks';
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import type { GameState, MarketState, BenchmarkResult } from '@ai-tycoon/shared';
|
||||
import type { GameState, MarketState, ModelCapabilities } from '@ai-tycoon/shared';
|
||||
import { CONSUMER_TOKENS_PER_SUBSCRIBER, API_TOKENS_PER_DEVELOPER_PER_TICK, BATCH_API_DEMAND_PER_DEV, makeInitialServingMetrics } from '@ai-tycoon/shared';
|
||||
import type { TrafficPriority, TierServingMetrics } from '@ai-tycoon/shared';
|
||||
import { BENCHMARKS } from '../../data/benchmarks';
|
||||
import { computeSeasonal } from './seasonalSystem';
|
||||
import { updateObsolescence } from './obsolescenceSystem';
|
||||
import { buildPlayerProfile, buildCompetitorProfile, computeMarketShares, updateTAMGrowth } from './tamSystem';
|
||||
@@ -21,31 +20,30 @@ export interface MarketTickResult {
|
||||
totalTokenDemand: number;
|
||||
}
|
||||
|
||||
/**
 * Per-market-segment weighting of model capabilities.
 *
 * Keys are segment names; each value maps a subset of capability names to the
 * weight that capability carries when scoring a model for that segment.
 * Capabilities not listed for a segment do not contribute to its score.
 * The weights listed for each segment add up to 1.
 */
const SEGMENT_CAPABILITY_WEIGHTS: Record<string, Partial<Record<keyof ModelCapabilities, number>>> = {
  consumer: { creative: 0.35, knowledge: 0.25, reasoning: 0.15, multimodal: 0.15, coding: 0.05, agents: 0.05 },
  enterprise: { reasoning: 0.25, coding: 0.20, agents: 0.20, knowledge: 0.15, math: 0.10, multimodal: 0.10 },
  developer: { coding: 0.35, reasoning: 0.20, agents: 0.20, math: 0.15, knowledge: 0.10 },
  research: { reasoning: 0.30, math: 0.30, knowledge: 0.20, coding: 0.10, agents: 0.10 },
};
|
||||
|
||||
/**
 * Scores a set of model capabilities for one market segment.
 *
 * The result is the weighted mean of `capability / 100` over the capabilities
 * listed for the segment in `SEGMENT_CAPABILITY_WEIGHTS`. Capabilities whose
 * score is missing or not positive are left out of both the numerator and the
 * denominator, so they do not drag the mean down.
 *
 * @param segment - Market segment to score for.
 * @param capabilities - Capability scores of the model being evaluated
 *   (divided by 100 here, so presumably on a 0–100 scale — confirm).
 * @param fallbackScore - Score used (also divided by 100) when the segment has
 *   no weight table or no listed capability has a positive score.
 * @returns The weighted quality, or `fallbackScore / 100` in the fallback cases.
 */
function getSegmentQuality(
  segment: 'consumer' | 'enterprise' | 'developer' | 'research',
  capabilities: ModelCapabilities,
  fallbackScore: number,
): number {
  const weights = SEGMENT_CAPABILITY_WEIGHTS[segment];
  if (!weights) return fallbackScore / 100;
  let weightedSum = 0;
  let totalWeight = 0;
  for (const [cap, weight] of Object.entries(weights)) {
    const score = capabilities[cap as keyof ModelCapabilities] ?? 0;
    // Only capabilities the model actually has count towards the mean.
    if (score > 0) {
      weightedSum += (score / 100) * weight;
      totalWeight += weight;
    }
  }
  return totalWeight > 0 ? weightedSum / totalWeight : fallbackScore / 100;
}
|
||||
|
||||
export function processMarketV2(
|
||||
@@ -54,9 +52,11 @@ export function processMarketV2(
|
||||
effectiveInferenceFlops?: number,
|
||||
researchBonuses?: ResearchBonuses,
|
||||
): MarketTickResult {
|
||||
const consumerQuality = getSegmentQuality('consumer', state.models.benchmarkResults, state.models.bestDeployedModelScore);
|
||||
const enterpriseQuality = getSegmentQuality('enterprise', state.models.benchmarkResults, state.models.bestDeployedModelScore);
|
||||
const modelQuality = state.models.benchmarkResults.length > 0
|
||||
const caps = state.models.bestDeployedCapabilities;
|
||||
const hasDeployed = state.models.bestDeployedModelScore > 0;
|
||||
const consumerQuality = getSegmentQuality('consumer', caps, state.models.bestDeployedModelScore);
|
||||
const enterpriseQuality = getSegmentQuality('enterprise', caps, state.models.bestDeployedModelScore);
|
||||
const modelQuality = hasDeployed
|
||||
? (consumerQuality + enterpriseQuality) / 2
|
||||
: state.models.bestDeployedModelScore / 100;
|
||||
|
||||
@@ -115,7 +115,7 @@ export function processMarketV2(
|
||||
const productResult = processProductLines(
|
||||
state.market.codeAssistant,
|
||||
state.market.agentsPlatform,
|
||||
state.models.benchmarkResults,
|
||||
caps,
|
||||
playerDevCustomers,
|
||||
playerEntCustomers,
|
||||
seasonal.multipliers.consumer,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import type { CodeAssistantState, AgentsPlatformState, BenchmarkResult } from '@ai-tycoon/shared';
|
||||
import type { CodeAssistantState, AgentsPlatformState, ModelCapabilities } from '@ai-tycoon/shared';
|
||||
import {
|
||||
CODE_ASSISTANT_MIN_CODING_SCORE,
|
||||
CODE_ASSISTANT_BASE_ADOPTION_RATE,
|
||||
@@ -7,27 +7,6 @@ import {
|
||||
AGENTS_PLATFORM_BASE_ADOPTION_RATE,
|
||||
AGENTS_PLATFORM_CHURN_RATE,
|
||||
} from '@ai-tycoon/shared';
|
||||
import { BENCHMARKS } from '../../data/benchmarks';
|
||||
|
||||
/**
 * Returns the highest recorded score for a benchmark.
 *
 * @param benchmarkId - Id of the benchmark to look up.
 * @param results - Benchmark results to scan; entries for other benchmarks are ignored.
 * @returns The largest `score` among matching results, or 0 when there is no
 *   matching result with a score above 0.
 */
function getBenchmarkScore(benchmarkId: string, results: BenchmarkResult[]): number {
  let best = 0;
  for (const result of results) {
    if (result.benchmarkId === benchmarkId && result.score > best) best = result.score;
  }
  return best;
}
|
||||
|
||||
/**
 * Best recorded score on the `codeforce` benchmark.
 *
 * @param results - Benchmark results to scan.
 * @returns The best `codeforce` score found by `getBenchmarkScore`, or 0 when
 *   `BENCHMARKS` has no entry with id `codeforce`.
 */
function getCodingScore(results: BenchmarkResult[]): number {
  const codeBench = BENCHMARKS.find(b => b.id === 'codeforce');
  if (!codeBench) return 0;
  return getBenchmarkScore(codeBench.id, results);
}
|
||||
|
||||
/**
 * Best recorded score on the `agentarena` benchmark.
 *
 * @param results - Benchmark results to scan.
 * @returns The best `agentarena` score found by `getBenchmarkScore`, or 0 when
 *   `BENCHMARKS` has no entry with id `agentarena`.
 */
function getAgentsScore(results: BenchmarkResult[]): number {
  const agentBench = BENCHMARKS.find(b => b.id === 'agentarena');
  if (!agentBench) return 0;
  return getBenchmarkScore(agentBench.id, results);
}
|
||||
|
||||
export interface ProductLineResult {
|
||||
codeAssistant: CodeAssistantState;
|
||||
@@ -41,7 +20,7 @@ export interface ProductLineResult {
|
||||
export function processProductLines(
|
||||
ca: CodeAssistantState,
|
||||
ap: AgentsPlatformState,
|
||||
benchmarkResults: BenchmarkResult[],
|
||||
capabilities: ModelCapabilities,
|
||||
playerDevCustomers: number,
|
||||
playerEntCustomers: number,
|
||||
seasonalConsumerMult: number,
|
||||
@@ -53,7 +32,7 @@ export function processProductLines(
|
||||
let apRevenue = 0;
|
||||
|
||||
// --- Code Assistant ---
|
||||
updatedCA.qualityScore = getCodingScore(benchmarkResults);
|
||||
updatedCA.qualityScore = capabilities.coding;
|
||||
if (updatedCA.isUnlocked && updatedCA.isActive && updatedCA.qualityScore >= CODE_ASSISTANT_MIN_CODING_SCORE) {
|
||||
const qualityFactor = updatedCA.qualityScore / 100;
|
||||
const priceAttr = Math.max(0.1, 1 - updatedCA.pricePerSeat / 50);
|
||||
@@ -70,7 +49,7 @@ export function processProductLines(
|
||||
}
|
||||
|
||||
// --- Agents Platform ---
|
||||
updatedAP.qualityScore = getAgentsScore(benchmarkResults);
|
||||
updatedAP.qualityScore = capabilities.agents;
|
||||
if (updatedAP.isUnlocked && updatedAP.isActive && updatedAP.qualityScore >= AGENTS_PLATFORM_MIN_AGENTS_SCORE) {
|
||||
const qualityFactor = updatedAP.qualityScore / 100;
|
||||
const priceAttr = Math.max(0.1, 1 - updatedAP.pricePerSeat / 250);
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
import type {
|
||||
GameState, ModelsState, BaseModel, ModelCapabilities, SafetyProfile,
|
||||
TrainingPipeline, TrainingEvent, TrainingEventType,
|
||||
ModelVariant, VariantCreationJob, EvalJob, BenchmarkResult,
|
||||
BenchmarkDefinition,
|
||||
ModelVariant, VariantCreationJob,
|
||||
} from '@ai-tycoon/shared';
|
||||
import { BENCHMARKS } from '../data/benchmarks';
|
||||
import {
|
||||
uuid, VRAM_REQUIREMENTS_BY_GENERATION,
|
||||
MOE_CAPABILITY_MULTIPLIER, MOE_SPEED_MULTIPLIER,
|
||||
@@ -154,14 +152,21 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse
|
||||
});
|
||||
}
|
||||
|
||||
const updatedEvalJobs = processEvalJobs(state);
|
||||
|
||||
const bestDeployedCapabilities: ModelCapabilities = {
|
||||
reasoning: 0, coding: 0, creative: 0, math: 0,
|
||||
knowledge: 0, multimodal: 0, agents: 0, speed: 0, contextUtilization: 0,
|
||||
};
|
||||
let bestDeployedModelScore = 0;
|
||||
let bestDeployedSafetyScore = 0;
|
||||
for (const m of baseModels) {
|
||||
if (!m.isDeployed) continue;
|
||||
if (m.rawCapability > bestDeployedModelScore) bestDeployedModelScore = m.rawCapability;
|
||||
if (m.safetyProfile.overallSafety > bestDeployedSafetyScore) bestDeployedSafetyScore = m.safetyProfile.overallSafety;
|
||||
for (const key of Object.keys(bestDeployedCapabilities) as (keyof ModelCapabilities)[]) {
|
||||
if ((m.capabilities[key] ?? 0) > bestDeployedCapabilities[key]) {
|
||||
bestDeployedCapabilities[key] = m.capabilities[key];
|
||||
}
|
||||
}
|
||||
}
|
||||
for (const f of families) {
|
||||
for (const v of f.variants) {
|
||||
@@ -169,6 +174,11 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse
|
||||
const score = computeVariantScore(v);
|
||||
if (score > bestDeployedModelScore) bestDeployedModelScore = score;
|
||||
if (v.safetyProfile.overallSafety > bestDeployedSafetyScore) bestDeployedSafetyScore = v.safetyProfile.overallSafety;
|
||||
for (const key of Object.keys(bestDeployedCapabilities) as (keyof ModelCapabilities)[]) {
|
||||
if ((v.capabilities[key] ?? 0) > bestDeployedCapabilities[key]) {
|
||||
bestDeployedCapabilities[key] = v.capabilities[key];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -179,10 +189,9 @@ export function processModels(state: GameState, researchBonuses?: ResearchBonuse
|
||||
families,
|
||||
activeTrainingPipelines: updatedPipelines,
|
||||
variantJobs: updatedVariantJobs.jobs,
|
||||
evalJobs: updatedEvalJobs.jobs,
|
||||
benchmarkResults: [...state.models.benchmarkResults, ...updatedEvalJobs.newResults],
|
||||
bestDeployedModelScore,
|
||||
bestDeployedSafetyScore,
|
||||
bestDeployedCapabilities,
|
||||
},
|
||||
completedModels,
|
||||
notifications,
|
||||
@@ -490,47 +499,6 @@ function createVariant(job: VariantCreationJob, base: BaseModel): ModelVariant {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Advances every active evaluation job by one tick.
 *
 * Jobs whose status is not `'active'` are returned unchanged. An active job
 * that reaches `totalTicks` is marked `'completed'`; if its model can still be
 * found among the base models and family variants, benchmark scores are
 * computed for it, attached to the job as `results`, and also collected in
 * `newResults`. A finished job whose model is no longer present is completed
 * without results.
 *
 * @param state - Current game state; not mutated (new job objects are returned).
 * @returns The updated job list and the benchmark results produced this tick.
 */
function processEvalJobs(state: GameState): { jobs: EvalJob[]; newResults: BenchmarkResult[] } {
  const newResults: BenchmarkResult[] = [];
  const allModels: (BaseModel | ModelVariant)[] = [
    ...state.models.baseModels,
    ...state.models.families.flatMap(f => f.variants),
  ];

  const jobs = state.models.evalJobs.map(job => {
    if (job.status !== 'active') return job;
    const newProgress = job.progressTicks + 1;
    if (newProgress >= job.totalTicks) {
      const model = allModels.find(m => m.id === job.modelId);
      if (model) {
        const results = computeBenchmarkScores(model, job.benchmarkIds, state.meta.tickCount);
        newResults.push(...results);
        return { ...job, status: 'completed' as const, progressTicks: job.totalTicks, results };
      }
      // Model not found: still close the job, just without results.
      return { ...job, status: 'completed' as const, progressTicks: job.totalTicks };
    }
    return { ...job, progressTicks: newProgress };
  });
  return { jobs, newResults };
}
|
||||
|
||||
/**
 * Produces one benchmark result per requested benchmark id for a model.
 *
 * For a known benchmark the score is `0.7 * primary + 0.3 * secondary` of the
 * model's capabilities (secondary counts as 0 when the benchmark defines
 * none), plus random noise drawn uniformly from [-3, 3), passed through
 * `clamp`. Because of the noise, repeated calls give different scores.
 * An id that is not in `BENCHMARKS` yields a result with score 0.
 *
 * @param model - Base model or variant whose capabilities are scored.
 * @param benchmarkIds - Ids of the benchmarks to score, in output order.
 * @param tick - Tick recorded as `ranAtTick` on every result.
 * @returns One result per entry of `benchmarkIds`.
 */
function computeBenchmarkScores(
  model: BaseModel | ModelVariant,
  benchmarkIds: string[],
  tick: number,
): BenchmarkResult[] {
  const benchmarkMap = new Map(BENCHMARKS.map(b => [b.id, b]));
  return benchmarkIds.map(id => {
    const bench = benchmarkMap.get(id);
    if (!bench) return { benchmarkId: id, modelId: model.id, score: 0, ranAtTick: tick };
    const primary = model.capabilities[bench.primaryCapability] ?? 0;
    const secondary = bench.secondaryCapability ? (model.capabilities[bench.secondaryCapability] ?? 0) : 0;
    const noise = (Math.random() - 0.5) * 6;
    // NOTE(review): `clamp` is defined elsewhere; presumably it bounds the score to the valid range — confirm.
    const score = clamp(primary * 0.7 + secondary * 0.3 + noise);
    return { benchmarkId: id, modelId: model.id, score, ranAtTick: tick };
  });
}
|
||||
|
||||
function computeVariantScore(variant: ModelVariant): number {
|
||||
const c = variant.capabilities;
|
||||
return (c.reasoning * 0.25 + c.coding * 0.2 + c.creative * 0.15 + c.math * 0.15 + c.knowledge * 0.15 + c.agents * 0.1);
|
||||
|
||||
@@ -66,7 +66,6 @@ describe('processTick', () => {
|
||||
isDeployed: true, trainedAtTick: 0, trainingCostTotal: 0, trainingStagesCompleted: ['pretraining' as const],
|
||||
sizeTier: 'small' as const, version: 1.0, sftSpecializations: ['general' as const], alignmentMethod: 'rlhf' as const,
|
||||
dataMix: { web: 0.4, code: 0.2, books: 0.15, academic: 0.1, conversational: 0.1, specialized: 0.05 },
|
||||
benchmarkResults: {},
|
||||
};
|
||||
const state = createTestState({
|
||||
meta: { currentEra: 'startup' },
|
||||
|
||||
Reference in New Issue
Block a user