Remove benchmark evaluation system, use training capabilities directly

Model quality for market segments and product lines now derives from deployed
model capabilities (coding, reasoning, agents, etc.) instead of requiring a
separate manual benchmark evaluation step. This eliminates an unbounded
benchmarkResults[] array that was scanned 5x per tick and removes ~480 lines
of dead-weight UI, types, and engine code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-26 19:28:59 -04:00
parent db034687d6
commit bbb69a315c
10 changed files with 57 additions and 535 deletions
@@ -1,7 +1,6 @@
import type { GameState, MarketState, BenchmarkResult } from '@ai-tycoon/shared';
import type { GameState, MarketState, ModelCapabilities } from '@ai-tycoon/shared';
import { CONSUMER_TOKENS_PER_SUBSCRIBER, API_TOKENS_PER_DEVELOPER_PER_TICK, BATCH_API_DEMAND_PER_DEV, makeInitialServingMetrics } from '@ai-tycoon/shared';
import type { TrafficPriority, TierServingMetrics } from '@ai-tycoon/shared';
import { BENCHMARKS } from '../../data/benchmarks';
import { computeSeasonal } from './seasonalSystem';
import { updateObsolescence } from './obsolescenceSystem';
import { buildPlayerProfile, buildCompetitorProfile, computeMarketShares, updateTAMGrowth } from './tamSystem';
@@ -21,31 +20,30 @@ export interface MarketTickResult {
totalTokenDemand: number;
}
/**
 * Relative importance of each model capability to each market segment.
 *
 * The weights in every row sum to 1.0. A capability that is absent from a row
 * does not contribute to that segment's quality at all.
 */
const SEGMENT_CAPABILITY_WEIGHTS: Record<string, Partial<Record<keyof ModelCapabilities, number>>> = {
  consumer: { creative: 0.35, knowledge: 0.25, reasoning: 0.15, multimodal: 0.15, coding: 0.05, agents: 0.05 },
  enterprise: { reasoning: 0.25, coding: 0.20, agents: 0.20, knowledge: 0.15, math: 0.10, multimodal: 0.10 },
  developer: { coding: 0.35, reasoning: 0.20, agents: 0.20, math: 0.15, knowledge: 0.10 },
  research: { reasoning: 0.30, math: 0.30, knowledge: 0.20, coding: 0.10, agents: 0.10 },
};

/**
 * Computes a segment's perceived model quality as the weighted average of the
 * deployed model's capability scores, using that segment's row of
 * SEGMENT_CAPABILITY_WEIGHTS.
 *
 * Capabilities whose score is missing or not positive are excluded from both
 * the numerator and the denominator, so an untrained capability does not drag
 * the average toward zero; the remaining weights are effectively renormalized.
 *
 * @param segment - Market segment whose weight row is applied.
 * @param capabilities - Per-capability scores of the best deployed model
 *   (divided by 100 here, so presumably on a 0-100 scale).
 * @param fallbackScore - Score on the same scale as the capabilities, used when
 *   no weighted capability has a positive score.
 * @returns The weighted average divided down by 100, or `fallbackScore / 100`
 *   when the segment is unknown, capabilities are missing, or nothing scored.
 */
function getSegmentQuality(
  segment: 'consumer' | 'enterprise' | 'developer' | 'research',
  capabilities: ModelCapabilities,
  fallbackScore: number,
): number {
  const fallbackQuality = fallbackScore / 100;

  const weights = SEGMENT_CAPABILITY_WEIGHTS[segment];
  // Defensive: the table is keyed by plain string, and the capabilities object
  // may be absent at runtime (NOTE(review): e.g. state created before this
  // field existed - confirm against the save/migration path).
  if (!weights || !capabilities) return fallbackQuality;

  let weightedSum = 0;
  let totalWeight = 0;
  for (const [cap, weight] of Object.entries(weights)) {
    const score = capabilities[cap as keyof ModelCapabilities] ?? 0;
    // Only capabilities the model actually has take part in the average.
    if (score > 0) {
      weightedSum += (score / 100) * weight;
      totalWeight += weight;
    }
  }

  return totalWeight > 0 ? weightedSum / totalWeight : fallbackQuality;
}
export function processMarketV2(
@@ -54,9 +52,11 @@ export function processMarketV2(
effectiveInferenceFlops?: number,
researchBonuses?: ResearchBonuses,
): MarketTickResult {
const consumerQuality = getSegmentQuality('consumer', state.models.benchmarkResults, state.models.bestDeployedModelScore);
const enterpriseQuality = getSegmentQuality('enterprise', state.models.benchmarkResults, state.models.bestDeployedModelScore);
const modelQuality = state.models.benchmarkResults.length > 0
const caps = state.models.bestDeployedCapabilities;
const hasDeployed = state.models.bestDeployedModelScore > 0;
const consumerQuality = getSegmentQuality('consumer', caps, state.models.bestDeployedModelScore);
const enterpriseQuality = getSegmentQuality('enterprise', caps, state.models.bestDeployedModelScore);
const modelQuality = hasDeployed
? (consumerQuality + enterpriseQuality) / 2
: state.models.bestDeployedModelScore / 100;
@@ -115,7 +115,7 @@ export function processMarketV2(
const productResult = processProductLines(
state.market.codeAssistant,
state.market.agentsPlatform,
state.models.benchmarkResults,
caps,
playerDevCustomers,
playerEntCustomers,
seasonal.multipliers.consumer,
@@ -1,4 +1,4 @@
import type { CodeAssistantState, AgentsPlatformState, BenchmarkResult } from '@ai-tycoon/shared';
import type { CodeAssistantState, AgentsPlatformState, ModelCapabilities } from '@ai-tycoon/shared';
import {
CODE_ASSISTANT_MIN_CODING_SCORE,
CODE_ASSISTANT_BASE_ADOPTION_RATE,
@@ -7,27 +7,6 @@ import {
AGENTS_PLATFORM_BASE_ADOPTION_RATE,
AGENTS_PLATFORM_CHURN_RATE,
} from '@ai-tycoon/shared';
import { BENCHMARKS } from '../../data/benchmarks';
/**
 * Returns the highest score recorded for one benchmark.
 *
 * @param benchmarkId - Identifier to match against each result's `benchmarkId`.
 * @param results - Benchmark results to search; entries for other benchmarks are ignored.
 * @returns The largest matching score, or 0 when nothing matches or no
 *   matching score is greater than 0.
 */
function getBenchmarkScore(benchmarkId: string, results: BenchmarkResult[]): number {
  return results.reduce(
    (highest, entry) =>
      entry.benchmarkId === benchmarkId && entry.score > highest ? entry.score : highest,
    0,
  );
}
/**
 * Best recorded score on the `codeforce` benchmark.
 *
 * @param results - Benchmark results to search.
 * @returns The value from getBenchmarkScore for that benchmark, or 0 when
 *   BENCHMARKS has no entry with id `codeforce`.
 */
function getCodingScore(results: BenchmarkResult[]): number {
  const codingBenchmark = BENCHMARKS.find((candidate) => candidate.id === 'codeforce');
  return codingBenchmark ? getBenchmarkScore(codingBenchmark.id, results) : 0;
}
/**
 * Best recorded score on the `agentarena` benchmark.
 *
 * @param results - Benchmark results to search.
 * @returns The value from getBenchmarkScore for that benchmark, or 0 when
 *   BENCHMARKS has no entry with id `agentarena`.
 */
function getAgentsScore(results: BenchmarkResult[]): number {
  const agentsBenchmark = BENCHMARKS.find((candidate) => candidate.id === 'agentarena');
  return agentsBenchmark ? getBenchmarkScore(agentsBenchmark.id, results) : 0;
}
export interface ProductLineResult {
codeAssistant: CodeAssistantState;
@@ -41,7 +20,7 @@ export interface ProductLineResult {
export function processProductLines(
ca: CodeAssistantState,
ap: AgentsPlatformState,
benchmarkResults: BenchmarkResult[],
capabilities: ModelCapabilities,
playerDevCustomers: number,
playerEntCustomers: number,
seasonalConsumerMult: number,
@@ -53,7 +32,7 @@ export function processProductLines(
let apRevenue = 0;
// --- Code Assistant ---
updatedCA.qualityScore = getCodingScore(benchmarkResults);
updatedCA.qualityScore = capabilities.coding;
if (updatedCA.isUnlocked && updatedCA.isActive && updatedCA.qualityScore >= CODE_ASSISTANT_MIN_CODING_SCORE) {
const qualityFactor = updatedCA.qualityScore / 100;
const priceAttr = Math.max(0.1, 1 - updatedCA.pricePerSeat / 50);
@@ -70,7 +49,7 @@ export function processProductLines(
}
// --- Agents Platform ---
updatedAP.qualityScore = getAgentsScore(benchmarkResults);
updatedAP.qualityScore = capabilities.agents;
if (updatedAP.isUnlocked && updatedAP.isActive && updatedAP.qualityScore >= AGENTS_PLATFORM_MIN_AGENTS_SCORE) {
const qualityFactor = updatedAP.qualityScore / 100;
const priceAttr = Math.max(0.1, 1 - updatedAP.pricePerSeat / 250);