4c1c0e9ff2
CI / build-and-push (push) Successful in 32s
Replace the single-stage training + flat capability score with a realistic AI development pipeline: pre-training with Chinchilla scaling laws, SFT with specializations, alignment with safety/capability tradeoffs (RLHF/DPO/Constitutional), model families with distillation/fine-tuning/quantization variants, named benchmark suite with compute-costing eval jobs, and segment-specific market quality. Phases 1-6 of the model rework plan: new types, engine rewrite, save migration, training events/risk system, concurrent training, variant creation, benchmark evaluation with leaderboard, and market integration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
112 lines
3.7 KiB
TypeScript
112 lines
3.7 KiB
TypeScript
import type { BenchmarkDefinition } from '@ai-tycoon/shared';
|
|
|
|
/**
 * Static catalogue of the named benchmarks.
 *
 * Every entry carries:
 * - `id` / `name` / `description`: identifier and display text.
 * - `category`: grouping label (several entries may share one, e.g. two
 *   entries below use `'reasoning'`).
 * - `primaryCapability` and, where present, `secondaryCapability`: the
 *   capability keys associated with the benchmark. `harmguard` declares only
 *   a primary capability.
 * - `computeCost` and `ticksToRun`: numeric cost and duration of a run.
 *   NOTE(review): units are not visible in this file — confirm against the
 *   code that consumes these fields.
 * - `unlockedAtEra`: era key at which the benchmark becomes available
 *   (`'startup'`, `'scaleup'`, `'bigtech'`, or `'agi'` in the entries below).
 * - `marketRelevance`: one weight per market segment (`consumer`,
 *   `enterprise`, `developer`, `research`); all values below lie in 0.1–1.0.
 *
 * Entry order is preserved as authored; presumably consumers may rely on it
 * for display — verify before reordering.
 */
export const BENCHMARKS: BenchmarkDefinition[] = [
  {
    id: 'arc-challenge',
    name: 'ARC Challenge',
    category: 'reasoning',
    description: 'Advanced reasoning and comprehension tasks requiring multi-step inference.',
    primaryCapability: 'reasoning',
    secondaryCapability: 'knowledge',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.3, enterprise: 0.5, developer: 0.4, research: 0.8 },
  },
  {
    id: 'codeforce',
    name: 'CodeForce',
    category: 'coding',
    description: 'Competitive programming and software engineering benchmarks.',
    primaryCapability: 'coding',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.2, enterprise: 0.7, developer: 0.9, research: 0.5 },
  },
  {
    id: 'mathquest',
    name: 'MathQuest',
    category: 'math',
    description: 'Mathematical problem-solving from algebra to graduate-level proofs.',
    primaryCapability: 'math',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.1, enterprise: 0.6, developer: 0.5, research: 0.9 },
  },
  {
    id: 'worldfacts',
    name: 'WorldFacts',
    category: 'knowledge',
    description: 'Broad factual knowledge across science, history, culture, and current events.',
    primaryCapability: 'knowledge',
    secondaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 6,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.5, enterprise: 0.4, developer: 0.3, research: 0.6 },
  },
  {
    id: 'chatrank',
    name: 'ChatRank',
    category: 'chat',
    description: 'Human preference evaluation of conversational quality, helpfulness, and creativity.',
    primaryCapability: 'creative',
    secondaryCapability: 'knowledge',
    computeCost: 0.002,
    ticksToRun: 10,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.9, enterprise: 0.3, developer: 0.2, research: 0.2 },
  },
  {
    // No secondaryCapability on this entry; only a primary one is declared.
    id: 'harmguard',
    name: 'HarmGuard',
    category: 'safety',
    description: 'Safety evaluation measuring harm avoidance, truthfulness, and responsible behavior.',
    primaryCapability: 'reasoning',
    computeCost: 0.001,
    ticksToRun: 8,
    unlockedAtEra: 'startup',
    marketRelevance: { consumer: 0.4, enterprise: 0.9, developer: 0.3, research: 0.7 },
  },
  {
    id: 'visionbench',
    name: 'VisionBench',
    category: 'multimodal',
    description: 'Image understanding, visual reasoning, and multimodal comprehension.',
    primaryCapability: 'multimodal',
    secondaryCapability: 'reasoning',
    computeCost: 0.003,
    ticksToRun: 12,
    unlockedAtEra: 'scaleup',
    marketRelevance: { consumer: 0.5, enterprise: 0.6, developer: 0.6, research: 0.7 },
  },
  {
    id: 'agentarena',
    name: 'AgentArena',
    category: 'agents',
    description: 'Autonomous agent tasks: tool use, multi-step planning, and environment interaction.',
    primaryCapability: 'agents',
    secondaryCapability: 'coding',
    computeCost: 0.005,
    ticksToRun: 15,
    unlockedAtEra: 'bigtech',
    marketRelevance: { consumer: 0.3, enterprise: 0.8, developer: 0.7, research: 0.6 },
  },
  {
    id: 'frontier-eval',
    name: 'Frontier Eval',
    category: 'reasoning',
    description: 'Cutting-edge capability evaluation at the frontier of AI research.',
    primaryCapability: 'reasoning',
    secondaryCapability: 'math',
    computeCost: 0.01,
    ticksToRun: 20,
    unlockedAtEra: 'agi',
    marketRelevance: { consumer: 0.2, enterprise: 0.5, developer: 0.5, research: 1.0 },
  },
];
|