// AIHostingTycoon/packages/game-engine/src/data/benchmarks.ts

import type { BenchmarkDefinition } from '@ai-tycoon/shared';

/**
 * Static catalogue of the benchmark definitions known to the game engine.
 *
 * Every entry supplies an `id`, a display `name`, a `category`, a
 * human-readable `description`, a `primaryCapability`, a `computeCost`,
 * a `ticksToRun` value, the era named in `unlockedAtEra`, and a
 * `marketRelevance` weight for each of the four market segments
 * (consumer, enterprise, developer, research). All but one entry also
 * name a `secondaryCapability`.
 *
 * NOTE(review): the unit of `computeCost` and the exact meaning of a "tick"
 * are defined by consumers of this table, not here — confirm against callers.
 * Every `marketRelevance` weight in this table lies between 0 and 1.
 */
export const BENCHMARKS: BenchmarkDefinition[] = [
  // ---- unlockedAtEra: 'startup' -------------------------------------------
  {
    id: 'arc-challenge',
    name: 'ARC Challenge',
    category: 'reasoning',
    description: 'Advanced reasoning and comprehension tasks requiring multi-step inference.',
    primaryCapability: 'reasoning',
    secondaryCapability: 'knowledge',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: {
      consumer: 0.3,
      enterprise: 0.5,
      developer: 0.4,
      research: 0.8,
    },
  },
  {
    id: 'codeforce',
    name: 'CodeForce',
    category: 'coding',
    description: 'Competitive programming and software engineering benchmarks.',
    primaryCapability: 'coding',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: {
      consumer: 0.2,
      enterprise: 0.7,
      developer: 0.9,
      research: 0.5,
    },
  },
  {
    id: 'mathquest',
    name: 'MathQuest',
    category: 'math',
    description: 'Mathematical problem-solving from algebra to graduate-level proofs.',
    primaryCapability: 'math',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: {
      consumer: 0.1,
      enterprise: 0.6,
      developer: 0.5,
      research: 0.9,
    },
  },
  {
    id: 'worldfacts',
    name: 'WorldFacts',
    category: 'knowledge',
    description: 'Broad factual knowledge across science, history, culture, and current events.',
    primaryCapability: 'knowledge',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 6,
    unlockedAtEra: 'startup',
    marketRelevance: {
      consumer: 0.5,
      enterprise: 0.4,
      developer: 0.3,
      research: 0.6,
    },
  },
  {
    id: 'chatrank',
    name: 'ChatRank',
    category: 'chat',
    description: 'Human preference evaluation of conversational quality, helpfulness, and creativity.',
    primaryCapability: 'creative',
    secondaryCapability: 'knowledge',
    computeCost: 0.002,
    ticksToRun: 10,
    unlockedAtEra: 'startup',
    marketRelevance: {
      consumer: 0.9,
      enterprise: 0.3,
      developer: 0.2,
      research: 0.2,
    },
  },
  {
    // No secondaryCapability is declared for this entry.
    // NOTE(review): presumably the field is optional on BenchmarkDefinition — confirm.
    id: 'harmguard',
    name: 'HarmGuard',
    category: 'safety',
    description: 'Safety evaluation measuring harm avoidance, truthfulness, and responsible behavior.',
    primaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: {
      consumer: 0.4,
      enterprise: 0.9,
      developer: 0.3,
      research: 0.7,
    },
  },

  // ---- unlockedAtEra: 'scaleup' -------------------------------------------
  {
    id: 'visionbench',
    name: 'VisionBench',
    category: 'multimodal',
    description: 'Image understanding, visual reasoning, and multimodal comprehension.',
    primaryCapability: 'multimodal',
    secondaryCapability: 'reasoning',
    computeCost: 0.003,
    ticksToRun: 12,
    unlockedAtEra: 'scaleup',
    marketRelevance: {
      consumer: 0.5,
      enterprise: 0.6,
      developer: 0.6,
      research: 0.7,
    },
  },

  // ---- unlockedAtEra: 'bigtech' -------------------------------------------
  {
    id: 'agentarena',
    name: 'AgentArena',
    category: 'agents',
    description: 'Autonomous agent tasks: tool use, multi-step planning, and environment interaction.',
    primaryCapability: 'agents',
    secondaryCapability: 'coding',
    computeCost: 0.005,
    ticksToRun: 15,
    unlockedAtEra: 'bigtech',
    marketRelevance: {
      consumer: 0.3,
      enterprise: 0.8,
      developer: 0.7,
      research: 0.6,
    },
  },

  // ---- unlockedAtEra: 'agi' -----------------------------------------------
  {
    id: 'frontier-eval',
    name: 'Frontier Eval',
    category: 'reasoning',
    description: 'Cutting-edge capability evaluation at the frontier of AI research.',
    primaryCapability: 'reasoning',
    secondaryCapability: 'math',
    computeCost: 0.01,
    ticksToRun: 20,
    unlockedAtEra: 'agi',
    marketRelevance: {
      consumer: 0.2,
      enterprise: 0.5,
      developer: 0.5,
      research: 1.0,
    },
  },
];