Remove benchmark evaluation system; use training capabilities directly
Model quality for market segments and product lines is now derived from deployed model capabilities (coding, reasoning, agents, etc.) instead of requiring a separate, manual benchmark evaluation step. This eliminates an unbounded benchmarkResults[] array that was scanned five times per tick, and removes ~480 lines of dead-weight UI, types, and engine code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -182,45 +182,6 @@ export interface QuantizationConfig {
|
||||
variantName: string;
|
||||
}
|
||||
|
||||
export type BenchmarkCategory = 'reasoning' | 'coding' | 'math' | 'knowledge' | 'safety' | 'chat' | 'multimodal' | 'agents';
|
||||
|
||||
export interface BenchmarkDefinition {
|
||||
id: string;
|
||||
name: string;
|
||||
category: BenchmarkCategory;
|
||||
description: string;
|
||||
primaryCapability: keyof ModelCapabilities;
|
||||
secondaryCapability?: keyof ModelCapabilities;
|
||||
computeCost: number;
|
||||
ticksToRun: number;
|
||||
unlockedAtEra: Era;
|
||||
marketRelevance: {
|
||||
consumer: number;
|
||||
enterprise: number;
|
||||
developer: number;
|
||||
research: number;
|
||||
};
|
||||
}
|
||||
|
||||
export interface BenchmarkResult {
|
||||
benchmarkId: string;
|
||||
modelId: string;
|
||||
score: number;
|
||||
ranAtTick: number;
|
||||
rank?: number;
|
||||
}
|
||||
|
||||
export interface EvalJob {
|
||||
id: string;
|
||||
modelId: string;
|
||||
benchmarkIds: string[];
|
||||
progressTicks: number;
|
||||
totalTicks: number;
|
||||
computeAllocated: number;
|
||||
status: 'active' | 'completed';
|
||||
results: BenchmarkResult[];
|
||||
}
|
||||
|
||||
export type ProductLineType = 'text-api' | 'chat-product' | 'chat-free' | 'chat-enterprise' | 'code-api' | 'image' | 'agents-api';
|
||||
|
||||
export interface ProductPricing {
|
||||
@@ -246,11 +207,10 @@ export interface ModelsState {
|
||||
baseModels: BaseModel[];
|
||||
activeTrainingPipelines: TrainingPipeline[];
|
||||
variantJobs: VariantCreationJob[];
|
||||
evalJobs: EvalJob[];
|
||||
benchmarkResults: BenchmarkResult[];
|
||||
productLines: ProductLine[];
|
||||
bestDeployedModelScore: number;
|
||||
bestDeployedSafetyScore: number;
|
||||
bestDeployedCapabilities: ModelCapabilities;
|
||||
}
|
||||
|
||||
export const DEFAULT_DATA_MIX: DataMixAllocation = {
|
||||
@@ -271,8 +231,6 @@ export const INITIAL_MODELS: ModelsState = {
|
||||
baseModels: [],
|
||||
activeTrainingPipelines: [],
|
||||
variantJobs: [],
|
||||
evalJobs: [],
|
||||
benchmarkResults: [],
|
||||
productLines: [
|
||||
{
|
||||
id: 'text-api',
|
||||
@@ -307,4 +265,5 @@ export const INITIAL_MODELS: ModelsState = {
|
||||
],
|
||||
bestDeployedModelScore: 0,
|
||||
bestDeployedSafetyScore: 0,
|
||||
bestDeployedCapabilities: { reasoning: 0, coding: 0, creative: 0, math: 0, knowledge: 0, multimodal: 0, agents: 0, speed: 0, contextUtilization: 0 },
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user