Remove benchmark evaluation system, use training capabilities directly

Model quality for market segments and product lines is now derived from deployed
model capabilities (coding, reasoning, agents, etc.) instead of requiring a
separate manual benchmark evaluation step. This eliminates the unbounded
benchmarkResults[] array, which was scanned five times per tick, and removes
~480 lines of dead-weight UI, types, and engine code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-26 19:28:59 -04:00
parent db034687d6
commit bbb69a315c
10 changed files with 57 additions and 535 deletions
+2 -43
View File
@@ -182,45 +182,6 @@ export interface QuantizationConfig {
variantName: string;
}
/**
 * Closed set of category labels for a benchmark; used as the type of
 * `BenchmarkDefinition.category`.
 * NOTE(review): several labels share names with `ModelCapabilities` keys, but
 * 'safety' and 'chat' may not — do not assume a 1:1 mapping without checking.
 */
export type BenchmarkCategory = 'reasoning' | 'coding' | 'math' | 'knowledge' | 'safety' | 'chat' | 'multimodal' | 'agents';
/**
 * Describes a single benchmark: identity and display text, its category, the
 * `ModelCapabilities` keys it references, cost/duration fields, the era that
 * gates it, and per-market-segment relevance numbers.
 * NOTE(review): how these fields are consumed is not visible in this chunk;
 * the per-field notes below are assumptions to confirm against the callers.
 */
export interface BenchmarkDefinition {
id: string;
name: string;
category: BenchmarkCategory;
description: string;
// Key into ModelCapabilities; presumably the main input to the score — confirm.
primaryCapability: keyof ModelCapabilities;
// Optional second ModelCapabilities key; weighting relative to the primary is not shown here.
secondaryCapability?: keyof ModelCapabilities;
// Units are not visible here — TODO confirm what "compute" is measured in.
computeCost: number;
// Presumably the number of game ticks one run takes — confirm.
ticksToRun: number;
// Presumably the benchmark is unavailable before this era — confirm.
unlockedAtEra: Era;
// One number per market segment; presumably weights, range/scale not shown — confirm.
marketRelevance: {
consumer: number;
enterprise: number;
developer: number;
research: number;
};
}
/**
 * Records a score for one (benchmark, model) pair together with the tick at
 * which it was recorded.
 * NOTE(review): presumably `benchmarkId` matches a `BenchmarkDefinition.id`
 * and `modelId` matches a model record defined elsewhere — confirm.
 */
export interface BenchmarkResult {
benchmarkId: string;
modelId: string;
// Scale/range of the score is not visible in this chunk.
score: number;
// Presumably the game tick when the benchmark run finished — confirm.
ranAtTick: number;
// Optional; what the rank is relative to is not shown here.
rank?: number;
}
/**
 * An evaluation job: one model, a list of benchmark ids, tick-based progress
 * counters, an allocated-compute figure, a two-state status, and the results
 * collected for it.
 * NOTE(review): the code that advances and completes jobs is not visible in
 * this chunk; field semantics below are assumptions to confirm.
 */
export interface EvalJob {
id: string;
modelId: string;
// Presumably each entry matches a BenchmarkDefinition.id — confirm.
benchmarkIds: string[];
// Presumably ticks elapsed so far, compared against totalTicks — confirm.
progressTicks: number;
totalTicks: number;
// Units are not visible here — TODO confirm.
computeAllocated: number;
status: 'active' | 'completed';
// Whether this is filled incrementally or only on completion is not shown here.
results: BenchmarkResult[];
}
/**
 * Closed set of string identifiers for the kinds of product line.
 * NOTE(review): presumably used to tag `ProductLine` entries — the
 * `ProductLine` declaration is not visible in this chunk; confirm.
 */
export type ProductLineType = 'text-api' | 'chat-product' | 'chat-free' | 'chat-enterprise' | 'code-api' | 'image' | 'agents-api';
export interface ProductPricing {
@@ -246,11 +207,10 @@ export interface ModelsState {
baseModels: BaseModel[];
activeTrainingPipelines: TrainingPipeline[];
variantJobs: VariantCreationJob[];
evalJobs: EvalJob[];
benchmarkResults: BenchmarkResult[];
productLines: ProductLine[];
bestDeployedModelScore: number;
bestDeployedSafetyScore: number;
bestDeployedCapabilities: ModelCapabilities;
}
export const DEFAULT_DATA_MIX: DataMixAllocation = {
@@ -271,8 +231,6 @@ export const INITIAL_MODELS: ModelsState = {
baseModels: [],
activeTrainingPipelines: [],
variantJobs: [],
evalJobs: [],
benchmarkResults: [],
productLines: [
{
id: 'text-api',
@@ -307,4 +265,5 @@ export const INITIAL_MODELS: ModelsState = {
],
bestDeployedModelScore: 0,
bestDeployedSafetyScore: 0,
bestDeployedCapabilities: { reasoning: 0, coding: 0, creative: 0, math: 0, knowledge: 0, multimodal: 0, agents: 0, speed: 0, contextUtilization: 0 },
};