Replace decorative overload policy with real serving pipeline and dedicated Serving page
CI / build-and-push (push) Successful in 28s
CI / build-and-push (push) Successful in 28s
The old overload policy had dead controls (maxQueueDepth and rateLimitPerCustomer were never read) and trivial flat penalties. This commit replaces it with a full serving pipeline where deployed models form a fleet, requests route through priority/degradation logic, and policy choices create meaningful strategic tradeoffs.

New serving pipeline: fleet building from deployed models (size/quant/MoE multipliers), demand categorization by 5 priority tiers, enterprise capacity reservation, priority-ordered serving with overflow behaviors (queue/reject/degrade), auto-degradation to faster models under load, and a Batch API to fill idle capacity at discounted rates.

Four new research nodes gate features progressively: Intelligent Request Routing, Priority Queue System, Request Batching, and Auto-Scaling.

New dedicated Serving page with pipeline metrics, model fleet utilization, and research-gated policy controls.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,462 @@
|
||||
import type {
|
||||
OverloadPolicy,
|
||||
TrafficPriority,
|
||||
TierServingMetrics,
|
||||
ServingMetrics,
|
||||
ModelUtilizationEntry,
|
||||
BatchApiState,
|
||||
} from '@ai-tycoon/shared';
|
||||
import type { BaseModel, ModelVariant, ModelFamily, ModelsState, SizeTier } from '@ai-tycoon/shared';
|
||||
import {
|
||||
MODEL_SIZE_THROUGHPUT_SCALER,
|
||||
MOE_SPEED_MULTIPLIER,
|
||||
FLOPS_TO_TOKENS_MULTIPLIER,
|
||||
QUANTIZATION_CONFIGS,
|
||||
REJECTION_SATISFACTION_PENALTY,
|
||||
QUEUE_SATISFACTION_PENALTY,
|
||||
DEGRADATION_SATISFACTION_PENALTY,
|
||||
BASE_LATENCY_MS,
|
||||
QUEUE_LATENCY_MS_PER_PERCENT,
|
||||
BATCH_API_MAX_PENDING,
|
||||
} from '@ai-tycoon/shared';
|
||||
import { makeInitialServingMetrics } from '@ai-tycoon/shared';
|
||||
|
||||
/**
 * One deployed model (base model or variant) as the serving pipeline sees it.
 * Slots are produced by `buildModelFleet`.
 */
export interface ModelServingSlot {
  /** Id of the base model or variant backing this slot. */
  modelId: string;
  /** Name of the base model or variant backing this slot. */
  modelName: string;
  /** Size tier; a variant slot carries the tier of its base model. */
  sizeTier: SizeTier;
  /** True when the slot was built from a variant rather than a base model. */
  isVariant: boolean;
  /** Quantization key of the variant, or null when there is none. */
  quantization: string | null;
  /** Quality value used when ordering slots and deciding what counts as degraded. */
  qualityScore: number;
  /** Combined MoE and quantization speed factor applied to this slot. */
  speedMultiplier: number;
  /** Token capacity the slot starts each pipeline run with. */
  throughputCapacity: number;
  /** Whether the backing architecture type is 'moe'. */
  isMoE: boolean;
}
|
||||
|
||||
/** Token demand for one pipeline run, keyed by traffic tier. */
export interface DemandByTier {
  /** Demand from enterprise traffic. */
  enterprise: number;
  /** Demand from paid API traffic. */
  'api-paid': number;
  /** Demand from paid consumer traffic. */
  'consumer-paid': number;
  /** Demand from free API traffic. */
  'api-free': number;
  /** Demand from free consumer traffic. */
  'consumer-free': number;
}
|
||||
|
||||
/** Everything `processServingPipeline` needs for a single run. */
export interface ServingPipelineInput {
  /** Base models and families; deployed entries become fleet slots. */
  modelsState: ModelsState;
  /** Inference compute that is split evenly across the deployed models. */
  effectiveInferenceFlops: number;
  /** Player-configured routing, overflow, reservation and batch settings. */
  overloadPolicy: OverloadPolicy;
  /** Token demand per traffic tier. */
  demandByTier: DemandByTier;
  /** Batch API state carried in from the previous run. */
  batchApi: BatchApiState;
  /** Quality reported as the effective quality when no tokens were served. */
  modelQuality: number;
  /** Research-gated switches for the optional pipeline features. */
  researchUnlocks: {
    /** Enables the configured routing strategy and per-tier rate limits. */
    servingRoutingUnlocked: boolean;
    /** Enables the configured priority order instead of the built-in one. */
    priorityQueuesUnlocked: boolean;
    /** Required, together with the policy flag, for the batch API to run. */
    batchApiUnlocked: boolean;
    /** Fraction of rejected tokens that is converted back into served tokens. */
    autoScalingBonus: number;
  };
}
|
||||
|
||||
/** Output of one `processServingPipeline` run. */
export interface ServingPipelineResult {
  /** Per-tier and aggregate serving figures for the run. */
  servingMetrics: ServingMetrics;
  /** Batch API state after the run. */
  batchApi: BatchApiState;
  /** Revenue earned from batch tokens served during the run. */
  batchRevenue: number;
}
|
||||
|
||||
/**
 * Builds the serving fleet: one slot per deployed base model followed by one
 * slot per deployed variant.
 *
 * The inference budget is divided evenly between all deployed entries; each
 * slot's throughput is then scaled by its size tier, MoE factor and (for
 * variants) quantization speed factor.
 *
 * @param modelsState Source of base models and family variants.
 * @param effectiveInferenceFlops Total inference compute to distribute.
 * @returns The slots, or an empty array when nothing is deployed or there is
 *          no positive compute budget.
 */
function buildModelFleet(
  modelsState: ModelsState,
  effectiveInferenceFlops: number,
): ModelServingSlot[] {
  const activeBases = modelsState.baseModels.filter(candidate => candidate.isDeployed);

  // Pair every deployed variant with its base model. A variant whose base
  // model cannot be found is left out of the fleet.
  const activeVariants: { variant: ModelVariant; baseModel: BaseModel }[] = [];
  for (const family of modelsState.families) {
    for (const variant of family.variants) {
      if (variant.isDeployed) {
        const parent = modelsState.baseModels.find(candidate => candidate.id === variant.baseModelId);
        if (parent) {
          activeVariants.push({ variant, baseModel: parent });
        }
      }
    }
  }

  const deployedCount = activeBases.length + activeVariants.length;
  if (deployedCount === 0 || effectiveInferenceFlops <= 0) {
    return [];
  }

  // Every deployed entry receives the same share of compute.
  const flopsShare = effectiveInferenceFlops / deployedCount;
  const tokensShare = flopsShare * FLOPS_TO_TOKENS_MULTIPLIER;

  const baseSlots = activeBases.map((base): ModelServingSlot => {
    const usesMoE = base.architecture.type === 'moe';
    const moeSpeed = usesMoE ? MOE_SPEED_MULTIPLIER : 1.0;
    const sizeSpeed = MODEL_SIZE_THROUGHPUT_SCALER[base.sizeTier] ?? 1.0;

    return {
      modelId: base.id,
      modelName: base.name,
      sizeTier: base.sizeTier,
      isVariant: false,
      quantization: null,
      qualityScore: base.rawCapability / 100,
      speedMultiplier: moeSpeed,
      throughputCapacity: tokensShare * sizeSpeed * moeSpeed,
      isMoE: usesMoE,
    };
  });

  const variantSlots = activeVariants.map(({ variant, baseModel }): ModelServingSlot => {
    const usesMoE = variant.architecture.type === 'moe';
    const moeSpeed = usesMoE ? MOE_SPEED_MULTIPLIER : 1.0;
    // Size tier and raw capability come from the base model.
    const sizeSpeed = MODEL_SIZE_THROUGHPUT_SCALER[baseModel.sizeTier] ?? 1.0;
    const quant = variant.quantization ? QUANTIZATION_CONFIGS[variant.quantization] : null;
    const quantSpeed = quant?.speedMultiplier ?? 1.0;
    const retainedQuality = quant?.qualityRetention ?? 1.0;

    return {
      modelId: variant.id,
      modelName: variant.name,
      sizeTier: baseModel.sizeTier,
      isVariant: true,
      quantization: variant.quantization ?? null,
      qualityScore: (baseModel.rawCapability / 100) * retainedQuality,
      speedMultiplier: moeSpeed * quantSpeed,
      throughputCapacity: tokensShare * sizeSpeed * moeSpeed * quantSpeed,
      isMoE: usesMoE,
    };
  });

  return [...baseSlots, ...variantSlots];
}
|
||||
|
||||
/**
 * Returns a sorted copy of the fleet; the input array is left untouched.
 *
 * - 'quality-first': highest quality score first.
 * - 'speed-first': highest throughput capacity first.
 * - anything else (including 'balanced'): throughput first while overall
 *   utilization is above 0.8, quality first otherwise.
 *
 * @param fleet Slots to order.
 * @param strategy Routing strategy name.
 * @param overallUtilization Total demand divided by total fleet capacity.
 */
function sortFleetByStrategy(
  fleet: ModelServingSlot[],
  strategy: string,
  overallUtilization: number,
): ModelServingSlot[] {
  const bestQualityFirst = (left: ModelServingSlot, right: ModelServingSlot): number =>
    right.qualityScore - left.qualityScore;
  const fastestFirst = (left: ModelServingSlot, right: ModelServingSlot): number =>
    right.throughputCapacity - left.throughputCapacity;

  let comparator: (left: ModelServingSlot, right: ModelServingSlot) => number;
  if (strategy === 'quality-first') {
    comparator = bestQualityFirst;
  } else if (strategy === 'speed-first') {
    comparator = fastestFirst;
  } else {
    // Balanced and unrecognised strategies switch to throughput under load.
    comparator = overallUtilization > 0.8 ? fastestFirst : bestQualityFirst;
  }

  return [...fleet].sort(comparator);
}
|
||||
|
||||
/** Mutable per-run capacity bookkeeping, keyed by slot `modelId`. */
interface FleetState {
  /** Tokens each slot can still serve. */
  remaining: Map<string, number>;
  /** Tokens each slot has served so far. */
  used: Map<string, number>;
}
|
||||
|
||||
/**
 * Serves one tier's demand from the fleet, visiting slots in the given order
 * and updating `fleetState` in place.
 *
 * First pass: a slot whose quality is below 95% of the best quality in the
 * fleet counts as degraded. Degraded slots are only used when auto-degradation
 * is enabled and overall utilization exceeds its trigger threshold, and only
 * if the slot's quality is not below the configured quality floor.
 *
 * Overflow: demand left after the first pass follows the tier's overflow
 * behavior. 'queue' records it as queued, 'reject' as rejected, and 'degrade'
 * draws from any slot with capacity left (the quality floor is not applied
 * here) and rejects what still remains. A missing or unrecognised behavior is
 * treated as 'reject' so that served + queued + rejected always equals demand.
 *
 * @param demand Tokens requested by the tier; non-positive demand yields empty metrics.
 * @param fleet Slots in the order they should be tried.
 * @param fleetState Remaining/used capacity per slot; mutated by this call.
 * @param policy Supplies auto-degradation settings and per-tier overflow behavior.
 * @param tier Tier being served; selects the overflow behavior.
 * @param overallUtilization Compared against the auto-degradation trigger threshold.
 * @returns Metrics for this tier.
 */
function serveFromFleet(
  demand: number,
  fleet: ModelServingSlot[],
  fleetState: FleetState,
  policy: OverloadPolicy,
  tier: TrafficPriority,
  overallUtilization: number,
): TierServingMetrics {
  if (demand <= 0) {
    return { demandTokens: 0, servedTokens: 0, queuedTokens: 0, rejectedTokens: 0, degradedTokens: 0, avgQualityDelivered: 1 };
  }

  let unmet = demand;
  let served = 0;
  let degraded = 0;
  let qualityWeightedSum = 0;

  const bestQuality = fleet.length > 0
    ? fleet.reduce((best, slot) => Math.max(best, slot.qualityScore), -Infinity)
    : 1;
  const degradationActive =
    policy.autoDegradation.enabled && overallUtilization > policy.autoDegradation.triggerThreshold;

  // Takes as much of the unmet demand as the slot can still supply.
  const drawFrom = (slot: ModelServingSlot, countAsDegraded: boolean): void => {
    const available = fleetState.remaining.get(slot.modelId) ?? 0;
    if (available <= 0) return;

    const amount = Math.min(unmet, available);
    fleetState.remaining.set(slot.modelId, available - amount);
    fleetState.used.set(slot.modelId, (fleetState.used.get(slot.modelId) ?? 0) + amount);

    served += amount;
    if (countAsDegraded) degraded += amount;
    qualityWeightedSum += amount * slot.qualityScore;
    unmet -= amount;
  };

  for (const slot of fleet) {
    if (unmet <= 0) break;

    const isDegraded = slot.qualityScore < bestQuality * 0.95;
    if (isDegraded) {
      if (!degradationActive) continue;
      if (slot.qualityScore < policy.autoDegradation.minQualityFloor) continue;
    }

    drawFrom(slot, isDegraded);
  }

  let queued = 0;
  let rejected = 0;

  if (unmet > 0) {
    switch (policy.overflowBehavior[tier]) {
      case 'queue':
        queued = unmet;
        break;
      case 'degrade':
        // Everything served in this pass is reported as degraded.
        for (const slot of fleet) {
          if (unmet <= 0) break;
          drawFrom(slot, true);
        }
        rejected = unmet;
        break;
      case 'reject':
      default:
        // Unknown or missing behavior must not drop tokens silently.
        rejected = unmet;
        break;
    }
  }

  return {
    demandTokens: demand,
    servedTokens: served,
    queuedTokens: queued,
    rejectedTokens: rejected,
    degradedTokens: degraded,
    avgQualityDelivered: served > 0 ? qualityWeightedSum / served : bestQuality,
  };
}
|
||||
|
||||
/** Metrics for a tier that had no demand in this run. */
function idleTierMetrics(): TierServingMetrics {
  return { demandTokens: 0, servedTokens: 0, queuedTokens: 0, rejectedTokens: 0, degradedTokens: 0, avgQualityDelivered: 1 };
}

/**
 * Caps demand at `limit * 100`. A missing or non-finite limit leaves the
 * demand uncapped so that NaN never reaches the fleet bookkeeping.
 */
function capDemandByRateLimit(demand: number, limit: number | undefined): number {
  if (typeof limit !== 'number' || !Number.isFinite(limit)) return demand;
  // NOTE(review): the x100 scale factor is taken from the original code; its unit is not documented here.
  return Math.min(demand, limit * 100);
}

/**
 * Runs one pass of the serving pipeline.
 *
 * Steps:
 * 1. Build the fleet from deployed models. With no fleet or no capacity, all
 *    demand is rejected and the batch API serves nothing.
 * 2. Order the fleet by the routing strategy ('balanced' until routing is
 *    unlocked).
 * 3. Serve enterprise first from a private view of the fleet capped at the
 *    reserved capacity (the whole fleet when no reservation is configured).
 *    Capacity enterprise does not consume stays available to other tiers.
 * 4. Serve the remaining tiers in priority order (built-in order until
 *    priority queues are unlocked), applying rate limits when routing is
 *    unlocked.
 * 5. Fill idle capacity with batch API work when enabled and unlocked.
 * 6. Convert a fraction of rejected tokens into served tokens (auto-scaling),
 *    then compute totals, effective quality, latency and per-model utilization
 *    from the final per-tier figures.
 *
 * @param input Fleet source, policy, demand, batch state and research unlocks.
 * @returns Serving metrics, the updated batch API state and the batch revenue.
 */
export function processServingPipeline(input: ServingPipelineInput): ServingPipelineResult {
  const { modelsState, effectiveInferenceFlops, overloadPolicy, demandByTier, batchApi, modelQuality, researchUnlocks } = input;

  const fleet = buildModelFleet(modelsState, effectiveInferenceFlops);
  const totalFleetCapacity = fleet.reduce((sum, s) => sum + s.throughputCapacity, 0);

  // Nothing can be served: reject every tier's demand outright.
  if (fleet.length === 0 || totalFleetCapacity <= 0) {
    const metrics = makeInitialServingMetrics();
    for (const tier of Object.keys(demandByTier) as TrafficPriority[]) {
      const demand = demandByTier[tier] ?? 0;
      if (demand > 0) {
        metrics.tierMetrics[tier] = {
          demandTokens: demand,
          servedTokens: 0,
          queuedTokens: 0,
          rejectedTokens: demand,
          degradedTokens: 0,
          avgQualityDelivered: 0,
        };
        metrics.totalRejected += demand;
      }
    }
    return {
      servingMetrics: metrics,
      batchApi: { ...batchApi, servedLastTick: 0, revenue: 0 },
      batchRevenue: 0,
    };
  }

  const totalDemand = Object.values(demandByTier).reduce((s, v) => s + v, 0);
  const overallUtilization = totalFleetCapacity > 0 ? totalDemand / totalFleetCapacity : 0;

  const effectiveStrategy = researchUnlocks.servingRoutingUnlocked
    ? overloadPolicy.routingStrategy
    : 'balanced';
  const sortedFleet = sortFleetByStrategy(fleet, effectiveStrategy, overallUtilization);

  // Shared pool used by every non-enterprise tier.
  const fleetState: FleetState = {
    remaining: new Map(fleet.map((s): [string, number] => [s.modelId, s.throughputCapacity])),
    used: new Map(fleet.map((s): [string, number] => [s.modelId, 0])),
  };

  const reservedCapacity = totalFleetCapacity * overloadPolicy.enterpriseReservation;
  const enterpriseDemand = demandByTier['enterprise'] ?? 0;

  const effectivePriorityOrder = researchUnlocks.priorityQueuesUnlocked
    ? overloadPolicy.priorityOrder
    : ['enterprise', 'api-paid', 'consumer-paid', 'api-free', 'consumer-free'] as TrafficPriority[];

  const tierResults: Record<TrafficPriority, TierServingMetrics> = {} as Record<TrafficPriority, TierServingMetrics>;
  const nonEnterpriseTiers = effectivePriorityOrder.filter(t => t !== 'enterprise');

  if (enterpriseDemand > 0) {
    // Enterprise sees only its budget: every slot starts at zero and the
    // budget is handed out in routing order, so slots beyond the budget
    // contribute nothing.
    const enterpriseFleetState: FleetState = {
      remaining: new Map(fleet.map((s): [string, number] => [s.modelId, 0])),
      used: new Map(fleet.map((s): [string, number] => [s.modelId, 0])),
    };

    let budgetLeft = reservedCapacity > 0 ? reservedCapacity : totalFleetCapacity;
    for (const slot of sortedFleet) {
      if (budgetLeft <= 0) break;
      const alloc = Math.min(slot.throughputCapacity, budgetLeft);
      enterpriseFleetState.remaining.set(slot.modelId, alloc);
      budgetLeft -= alloc;
    }

    const effectiveEntDemand = researchUnlocks.servingRoutingUnlocked
      ? capDemandByRateLimit(enterpriseDemand, overloadPolicy.rateLimitPerCustomer['enterprise'])
      : enterpriseDemand;

    tierResults['enterprise'] = serveFromFleet(
      effectiveEntDemand, sortedFleet, enterpriseFleetState, overloadPolicy, 'enterprise', overallUtilization,
    );

    // The shared pool is each slot's capacity minus what enterprise actually
    // used; unused reservation is therefore released to the other tiers.
    for (const slot of fleet) {
      const entUsed = enterpriseFleetState.used.get(slot.modelId) ?? 0;
      fleetState.remaining.set(slot.modelId, Math.max(0, slot.throughputCapacity - entUsed));
      fleetState.used.set(slot.modelId, entUsed);
    }
  } else {
    // No enterprise traffic: the shared pool keeps the full, unmodified capacity.
    tierResults['enterprise'] = idleTierMetrics();
  }

  for (const tier of nonEnterpriseTiers) {
    const rawDemand = demandByTier[tier] ?? 0;
    const effectiveDemand = researchUnlocks.servingRoutingUnlocked
      ? capDemandByRateLimit(rawDemand, overloadPolicy.rateLimitPerCustomer[tier])
      : rawDemand;

    tierResults[tier] = serveFromFleet(
      effectiveDemand, sortedFleet, fleetState, overloadPolicy, tier, overallUtilization,
    );
  }

  for (const tier of effectivePriorityOrder) {
    if (!(tier in tierResults)) {
      tierResults[tier] = idleTierMetrics();
    }
  }

  // Batch API: fill whatever capacity the interactive tiers left idle.
  let batchTokensServed = 0;
  let batchRevenue = 0;
  // Per-run fields are reset so a disabled batch API never reports stale values.
  const updatedBatchApi = { ...batchApi, servedLastTick: 0, revenue: 0 };

  if (overloadPolicy.batchApiEnabled && researchUnlocks.batchApiUnlocked) {
    let idleCapacity = 0;
    for (const slot of fleet) {
      idleCapacity += fleetState.remaining.get(slot.modelId) ?? 0;
    }

    const pendingBatch = Math.min(batchApi.pendingQueue + batchApi.totalBatchDemand, BATCH_API_MAX_PENDING);
    batchTokensServed = Math.min(pendingBatch, idleCapacity);

    // Revenue is priced per million tokens, reduced by the batch discount.
    const baseTokenPrice = 3.0;
    batchRevenue = (batchTokensServed / 1_000_000) * baseTokenPrice * (1 - overloadPolicy.batchApiDiscount);

    updatedBatchApi.pendingQueue = Math.max(0, pendingBatch - batchTokensServed);
    updatedBatchApi.servedLastTick = batchTokensServed;
    updatedBatchApi.revenue = batchRevenue;
  }

  // Auto-scaling runs before the totals are taken so the aggregate figures
  // agree with the per-tier figures returned alongside them.
  const autoScaleBoost = researchUnlocks.autoScalingBonus;
  if (autoScaleBoost > 0) {
    for (const tier of Object.keys(tierResults) as TrafficPriority[]) {
      const metrics = tierResults[tier];
      if (metrics.rejectedTokens > 0) {
        const recovered = Math.min(metrics.rejectedTokens, metrics.rejectedTokens * autoScaleBoost);
        tierResults[tier] = {
          ...metrics,
          servedTokens: metrics.servedTokens + recovered,
          rejectedTokens: metrics.rejectedTokens - recovered,
        };
      }
    }
  }

  const finalTiers = Object.values(tierResults);
  const totalServed = finalTiers.reduce((s, t) => s + t.servedTokens, 0);
  const totalQueued = finalTiers.reduce((s, t) => s + t.queuedTokens, 0);
  const totalRejected = finalTiers.reduce((s, t) => s + t.rejectedTokens, 0);
  const totalDegraded = finalTiers.reduce((s, t) => s + t.degradedTokens, 0);

  // Served-token-weighted quality; falls back to the supplied model quality.
  let effectiveQuality = modelQuality;
  if (totalServed > 0) {
    let qualitySum = 0;
    for (const t of finalTiers) {
      qualitySum += t.avgQualityDelivered * t.servedTokens;
    }
    effectiveQuality = qualitySum / totalServed;
  }

  const queuedFraction = totalDemand > 0 ? totalQueued / totalDemand : 0;
  const avgLatencyMs = BASE_LATENCY_MS + queuedFraction * 100 * QUEUE_LATENCY_MS_PER_PERCENT;

  const modelUtilization: ModelUtilizationEntry[] = fleet.map(slot => {
    const throughputUsed = fleetState.used.get(slot.modelId) ?? 0;
    return {
      modelId: slot.modelId,
      modelName: slot.modelName,
      quantization: slot.quantization,
      qualityScore: slot.qualityScore,
      throughputCapacity: slot.throughputCapacity,
      throughputUsed,
      utilization: slot.throughputCapacity > 0
        ? Math.min(1, throughputUsed / slot.throughputCapacity)
        : 0,
    };
  });

  return {
    servingMetrics: {
      tierMetrics: tierResults,
      totalServed,
      totalQueued,
      totalRejected,
      totalDegraded,
      effectiveQuality,
      avgLatencyMs,
      modelUtilization,
      batchApiTokensServed: batchTokensServed,
      batchApiRevenue: batchRevenue,
    },
    batchApi: updatedBatchApi,
    batchRevenue,
  };
}
|
||||
|
||||
/**
 * Converts one tier's serving metrics into a satisfaction change.
 *
 * Rejected and queued tokens are measured as a share of demand; degraded
 * tokens are measured as a share of served tokens and weighted by how far the
 * delivered quality falls short of 1. Each share is multiplied by its penalty
 * constant and by 10.
 *
 * @param metrics Serving figures for a single tier.
 * @returns 0 when the tier had no demand, otherwise the negated sum of the
 *          three penalties.
 */
export function computeSatisfactionImpact(
  metrics: TierServingMetrics,
): number {
  const { demandTokens, servedTokens, queuedTokens, rejectedTokens, degradedTokens, avgQualityDelivered } = metrics;

  if (demandTokens <= 0) {
    return 0;
  }

  const shareRejected = rejectedTokens / demandTokens;
  const shareQueued = queuedTokens / demandTokens;
  let shareDegraded = 0;
  if (servedTokens > 0) {
    shareDegraded = degradedTokens / servedTokens;
  }

  const fromRejection = shareRejected * REJECTION_SATISFACTION_PENALTY * 10;
  const fromQueueing = shareQueued * QUEUE_SATISFACTION_PENALTY * 10;
  const fromDegradation = shareDegraded * (1 - avgQualityDelivered) * DEGRADATION_SATISFACTION_PENALTY * 10;

  return -(fromRejection + fromQueueing + fromDegradation);
}
|
||||
Reference in New Issue
Block a user