import type { BenchmarkDefinition } from '@ai-tycoon/shared';

/**
 * Static catalogue of the benchmark definitions declared by this module.
 *
 * Every entry carries:
 * - `id` / `name` / `category` / `description` — identification and display text.
 * - `primaryCapability` and, on most entries, `secondaryCapability`.
 * - `computeCost` and `ticksToRun` — numeric run parameters.
 * - `unlockedAtEra` — one of `'startup'`, `'scaleup'`, `'bigtech'`, `'agi'` in this table.
 * - `marketRelevance` — one number per segment (`consumer`, `enterprise`,
 *   `developer`, `research`); all values in this table lie between 0 and 1.
 *
 * NOTE(review): `harmguard` is the only entry without a `secondaryCapability`;
 * presumably the field is optional on `BenchmarkDefinition` — confirm against
 * the shared type.
 */
export const BENCHMARKS: BenchmarkDefinition[] = [
  // --- unlockedAtEra: 'startup' ---
  {
    id: 'arc-challenge',
    name: 'ARC Challenge',
    category: 'reasoning',
    description: 'Advanced reasoning and comprehension tasks requiring multi-step inference.',
    primaryCapability: 'reasoning',
    secondaryCapability: 'knowledge',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.3, enterprise: 0.5, developer: 0.4, research: 0.8 },
  },
  {
    id: 'codeforce',
    name: 'CodeForce',
    category: 'coding',
    description: 'Competitive programming and software engineering benchmarks.',
    primaryCapability: 'coding',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.2, enterprise: 0.7, developer: 0.9, research: 0.5 },
  },
  {
    id: 'mathquest',
    name: 'MathQuest',
    category: 'math',
    description: 'Mathematical problem-solving from algebra to graduate-level proofs.',
    primaryCapability: 'math',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.1, enterprise: 0.6, developer: 0.5, research: 0.9 },
  },
  {
    id: 'worldfacts',
    name: 'WorldFacts',
    category: 'knowledge',
    description: 'Broad factual knowledge across science, history, culture, and current events.',
    primaryCapability: 'knowledge',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 6,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.5, enterprise: 0.4, developer: 0.3, research: 0.6 },
  },
  {
    id: 'chatrank',
    name: 'ChatRank',
    category: 'chat',
    description: 'Human preference evaluation of conversational quality, helpfulness, and creativity.',
    primaryCapability: 'creative',
    secondaryCapability: 'knowledge',
    computeCost: 0.002,
    ticksToRun: 10,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.9, enterprise: 0.3, developer: 0.2, research: 0.2 },
  },
  {
    id: 'harmguard',
    name: 'HarmGuard',
    category: 'safety',
    // Kept on a single line: a quoted string literal cannot span a line break.
    description: 'Safety evaluation measuring harm avoidance, truthfulness, and responsible behavior.',
    primaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.4, enterprise: 0.9, developer: 0.3, research: 0.7 },
  },

  // --- unlockedAtEra: 'scaleup' ---
  {
    id: 'visionbench',
    name: 'VisionBench',
    category: 'multimodal',
    description: 'Image understanding, visual reasoning, and multimodal comprehension.',
    primaryCapability: 'multimodal',
    secondaryCapability: 'reasoning',
    computeCost: 0.003,
    ticksToRun: 12,
    unlockedAtEra: 'scaleup',
    marketRelevance: { consumer: 0.5, enterprise: 0.6, developer: 0.6, research: 0.7 },
  },

  // --- unlockedAtEra: 'bigtech' ---
  {
    id: 'agentarena',
    name: 'AgentArena',
    category: 'agents',
    description: 'Autonomous agent tasks: tool use, multi-step planning, and environment interaction.',
    primaryCapability: 'agents',
    secondaryCapability: 'coding',
    computeCost: 0.005,
    ticksToRun: 15,
    unlockedAtEra: 'bigtech',
    marketRelevance: { consumer: 0.3, enterprise: 0.8, developer: 0.7, research: 0.6 },
  },

  // --- unlockedAtEra: 'agi' ---
  {
    id: 'frontier-eval',
    name: 'Frontier Eval',
    category: 'reasoning',
    description: 'Cutting-edge capability evaluation at the frontier of AI research.',
    primaryCapability: 'reasoning',
    secondaryCapability: 'math',
    computeCost: 0.01,
    ticksToRun: 20,
    unlockedAtEra: 'agi',
    marketRelevance: { consumer: 0.2, enterprise: 0.5, developer: 0.5, research: 1.0 },
  },
];