Files
AIHostingTycoon/packages/game-engine/src/data/benchmarks.ts
T
josh 4c1c0e9ff2
CI / build-and-push (push) Successful in 32s
Overhaul model system with multi-stage training, variants, benchmarks, and eval
Replace the single-stage training + flat capability score with a realistic AI
development pipeline: pre-training with Chinchilla scaling laws, SFT with
specializations, alignment with safety/capability tradeoffs (RLHF/DPO/Constitutional),
model families with distillation/fine-tuning/quantization variants, named benchmark
suite with compute-costing eval jobs, and segment-specific market quality.

Phases 1-6 of the model rework plan: new types, engine rewrite, save migration,
training events/risk system, concurrent training, variant creation, benchmark
evaluation with leaderboard, and market integration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-25 07:36:34 -04:00

112 lines
3.7 KiB
TypeScript

import type { BenchmarkDefinition } from '@ai-tycoon/shared';
/**
 * Static catalogue of the benchmark definitions shipped with the game.
 *
 * Every entry carries:
 * - `id` / `name` / `category` / `description` — identity and display text.
 * - `primaryCapability` and, for all entries except `harmguard`,
 *   `secondaryCapability` — the capability keys the benchmark is tied to.
 * - `computeCost` and `ticksToRun` — numeric cost/duration values; their
 *   units are defined by the consumer of this table, not here.
 * - `unlockedAtEra` — one of `'startup'`, `'scaleup'`, `'bigtech'`, `'agi'`
 *   in this table.
 * - `marketRelevance` — one weight per segment (`consumer`, `enterprise`,
 *   `developer`, `research`); all values in this table lie in [0, 1].
 *
 * Array order is part of the data: entries are grouped by `unlockedAtEra`
 * in the order listed above.
 */
export const BENCHMARKS: BenchmarkDefinition[] = [
  // ---- unlockedAtEra: 'startup' -------------------------------------------
  {
    id: 'arc-challenge',
    name: 'ARC Challenge',
    category: 'reasoning',
    description: 'Advanced reasoning and comprehension tasks requiring multi-step inference.',
    primaryCapability: 'reasoning',
    secondaryCapability: 'knowledge',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: {
      consumer: 0.3,
      enterprise: 0.5,
      developer: 0.4,
      research: 0.8,
    },
  },
  {
    id: 'codeforce',
    name: 'CodeForce',
    category: 'coding',
    description: 'Competitive programming and software engineering benchmarks.',
    primaryCapability: 'coding',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: {
      consumer: 0.2,
      enterprise: 0.7,
      developer: 0.9,
      research: 0.5,
    },
  },
  {
    id: 'mathquest',
    name: 'MathQuest',
    category: 'math',
    description: 'Mathematical problem-solving from algebra to graduate-level proofs.',
    primaryCapability: 'math',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: {
      consumer: 0.1,
      enterprise: 0.6,
      developer: 0.5,
      research: 0.9,
    },
  },
  {
    id: 'worldfacts',
    name: 'WorldFacts',
    category: 'knowledge',
    description: 'Broad factual knowledge across science, history, culture, and current events.',
    primaryCapability: 'knowledge',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 6,
    unlockedAtEra: 'startup',
    marketRelevance: {
      consumer: 0.5,
      enterprise: 0.4,
      developer: 0.3,
      research: 0.6,
    },
  },
  {
    id: 'chatrank',
    name: 'ChatRank',
    category: 'chat',
    description: 'Human preference evaluation of conversational quality, helpfulness, and creativity.',
    primaryCapability: 'creative',
    secondaryCapability: 'knowledge',
    computeCost: 0.002,
    ticksToRun: 10,
    unlockedAtEra: 'startup',
    marketRelevance: {
      consumer: 0.9,
      enterprise: 0.3,
      developer: 0.2,
      research: 0.2,
    },
  },
  {
    // The only entry that declares no secondaryCapability.
    id: 'harmguard',
    name: 'HarmGuard',
    category: 'safety',
    description: 'Safety evaluation measuring harm avoidance, truthfulness, and responsible behavior.',
    primaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: {
      consumer: 0.4,
      enterprise: 0.9,
      developer: 0.3,
      research: 0.7,
    },
  },

  // ---- unlockedAtEra: 'scaleup' -------------------------------------------
  {
    id: 'visionbench',
    name: 'VisionBench',
    category: 'multimodal',
    description: 'Image understanding, visual reasoning, and multimodal comprehension.',
    primaryCapability: 'multimodal',
    secondaryCapability: 'reasoning',
    computeCost: 0.003,
    ticksToRun: 12,
    unlockedAtEra: 'scaleup',
    marketRelevance: {
      consumer: 0.5,
      enterprise: 0.6,
      developer: 0.6,
      research: 0.7,
    },
  },

  // ---- unlockedAtEra: 'bigtech' -------------------------------------------
  {
    id: 'agentarena',
    name: 'AgentArena',
    category: 'agents',
    description: 'Autonomous agent tasks: tool use, multi-step planning, and environment interaction.',
    primaryCapability: 'agents',
    secondaryCapability: 'coding',
    computeCost: 0.005,
    ticksToRun: 15,
    unlockedAtEra: 'bigtech',
    marketRelevance: {
      consumer: 0.3,
      enterprise: 0.8,
      developer: 0.7,
      research: 0.6,
    },
  },

  // ---- unlockedAtEra: 'agi' -----------------------------------------------
  {
    id: 'frontier-eval',
    name: 'Frontier Eval',
    category: 'reasoning',
    description: 'Cutting-edge capability evaluation at the frontier of AI research.',
    primaryCapability: 'reasoning',
    secondaryCapability: 'math',
    computeCost: 0.01,
    ticksToRun: 20,
    unlockedAtEra: 'agi',
    marketRelevance: {
      consumer: 0.2,
      enterprise: 0.5,
      developer: 0.5,
      research: 1.0,
    },
  },
];