Replace decorative overload policy with real serving pipeline and dedicated Serving page
CI / build-and-push (push) Successful in 28s

The old overload policy had dead controls (maxQueueDepth, rateLimitPerCustomer never read)
and trivial flat penalties. This replaces it with a full serving pipeline where deployed
models form a fleet, requests route through priority/degradation logic, and policy choices
create meaningful strategic tradeoffs.

New serving pipeline: fleet building from deployed models (size/quant/MoE multipliers),
demand categorization by 5 priority tiers, enterprise capacity reservation, priority-ordered
serving with overflow behaviors (queue/reject/degrade), auto-degradation to faster models
under load, and Batch API to fill idle capacity at discounted rates.

4 new research nodes gate features progressively: Intelligent Request Routing, Priority
Queue System, Request Batching, and Auto-Scaling. New dedicated Serving page with pipeline
metrics, model fleet utilization, and research-gated policy controls.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-25 12:42:09 -04:00
parent d7d77238b9
commit 901db02a6b
17 changed files with 1349 additions and 229 deletions
@@ -0,0 +1,462 @@
import type {
OverloadPolicy,
TrafficPriority,
TierServingMetrics,
ServingMetrics,
ModelUtilizationEntry,
BatchApiState,
} from '@ai-tycoon/shared';
import type { BaseModel, ModelVariant, ModelFamily, ModelsState, SizeTier } from '@ai-tycoon/shared';
import {
MODEL_SIZE_THROUGHPUT_SCALER,
MOE_SPEED_MULTIPLIER,
FLOPS_TO_TOKENS_MULTIPLIER,
QUANTIZATION_CONFIGS,
REJECTION_SATISFACTION_PENALTY,
QUEUE_SATISFACTION_PENALTY,
DEGRADATION_SATISFACTION_PENALTY,
BASE_LATENCY_MS,
QUEUE_LATENCY_MS_PER_PERCENT,
BATCH_API_MAX_PENDING,
} from '@ai-tycoon/shared';
import { makeInitialServingMetrics } from '@ai-tycoon/shared';
/**
 * One deployed model (base model or variant) as seen by the serving pipeline.
 * Instances are produced by `buildModelFleet`; one slot is emitted per deployed
 * base model and per deployed variant whose base model can be found.
 */
export interface ModelServingSlot {
/** `id` of the base model, or of the variant when `isVariant` is true. */
modelId: string;
/** Display name copied from the base model or variant. */
modelName: string;
/** Size tier; variants inherit the tier of their base model. */
sizeTier: SizeTier;
/** True when the slot was built from a variant rather than a base model. */
isVariant: boolean;
/** Quantization key of the variant, or null (always null for base models). */
quantization: string | null;
/** `rawCapability / 100`, additionally scaled by the quantization's quality retention for variants. */
qualityScore: number;
/** Combined MoE and quantization speed factor (size factor is NOT included). */
speedMultiplier: number;
/** Throughput this slot can serve per pipeline run (presumably tokens per tick — confirm against caller). */
throughputCapacity: number;
/** True when the model's (or variant's) architecture type is 'moe'. */
isMoE: boolean;
}
/**
 * Requested demand for each traffic tier, keyed by the same tier names the
 * pipeline uses for priority ordering. Values are reported back as
 * `demandTokens` in the per-tier metrics.
 */
export interface DemandByTier {
enterprise: number;
'api-paid': number;
'consumer-paid': number;
'api-free': number;
'consumer-free': number;
}
/** Everything `processServingPipeline` needs to simulate one serving pass. */
export interface ServingPipelineInput {
/** Source of deployed base models and variants used to build the fleet. */
modelsState: ModelsState;
/** Inference compute that is split evenly across all deployed models. */
effectiveInferenceFlops: number;
/** Player-configured policy (routing, reservation, overflow, batch, degradation). */
overloadPolicy: OverloadPolicy;
/** Demand per traffic tier for this pass. */
demandByTier: DemandByTier;
/** Batch API state carried in from the previous pass. */
batchApi: BatchApiState;
/** Fallback value reported as `effectiveQuality` when nothing was served. */
modelQuality: number;
researchUnlocks: {
/** When false the routing strategy is forced to 'balanced' and rate limits are ignored. */
servingRoutingUnlocked: boolean;
/** When false the built-in tier order is used instead of `overloadPolicy.priorityOrder`. */
priorityQueuesUnlocked: boolean;
/** Batch API only runs when this is true AND the policy enables it. */
batchApiUnlocked: boolean;
/** Fraction of each tier's rejected tokens that is converted back into served tokens (0 disables). */
autoScalingBonus: number;
};
}
/** Output of one `processServingPipeline` pass. */
export interface ServingPipelineResult {
/** Per-tier and aggregate serving metrics for this pass. */
servingMetrics: ServingMetrics;
/** Updated Batch API state to carry into the next pass. */
batchApi: BatchApiState;
/** Revenue earned from batch tokens in this pass (0 when the fleet is empty). */
batchRevenue: number;
}
/**
 * Turns every deployed base model and deployed variant into a serving slot.
 *
 * The inference budget is divided evenly between all deployed models; each
 * slot's throughput is that share scaled by the tokens-per-FLOP multiplier,
 * the size-tier scaler, the MoE multiplier and (variants only) the
 * quantization speed multiplier. Variants whose base model cannot be found are
 * ignored. Base-model slots come first, followed by variant slots.
 *
 * @param modelsState Base models and families (with variants) to inspect.
 * @param effectiveInferenceFlops Total inference compute to share out.
 * @returns The fleet, or an empty array when nothing is deployed or there is
 *   no compute available.
 */
function buildModelFleet(
  modelsState: ModelsState,
  effectiveInferenceFlops: number,
): ModelServingSlot[] {
  const fleet: ModelServingSlot[] = [];

  const liveBases = modelsState.baseModels.filter(m => m.isDeployed);

  const liveVariants: { variant: ModelVariant; baseModel: BaseModel }[] = [];
  for (const family of modelsState.families) {
    for (const variant of family.variants) {
      if (variant.isDeployed) {
        const parent = modelsState.baseModels.find(m => m.id === variant.baseModelId);
        if (parent) {
          liveVariants.push({ variant, baseModel: parent });
        }
      }
    }
  }

  const deployedCount = liveBases.length + liveVariants.length;
  if (deployedCount === 0 || effectiveInferenceFlops <= 0) {
    return fleet;
  }

  // Every deployed model receives the same share of compute.
  const unscaledThroughput = (effectiveInferenceFlops / deployedCount) * FLOPS_TO_TOKENS_MULTIPLIER;

  for (const base of liveBases) {
    const usesMoE = base.architecture.type === 'moe';
    const sizeScale = MODEL_SIZE_THROUGHPUT_SCALER[base.sizeTier] ?? 1.0;
    const moeBoost = usesMoE ? MOE_SPEED_MULTIPLIER : 1.0;
    fleet.push({
      modelId: base.id,
      modelName: base.name,
      sizeTier: base.sizeTier,
      isVariant: false,
      quantization: null,
      qualityScore: base.rawCapability / 100,
      speedMultiplier: moeBoost,
      throughputCapacity: unscaledThroughput * sizeScale * moeBoost,
      isMoE: usesMoE,
    });
  }

  for (const { variant, baseModel } of liveVariants) {
    const usesMoE = variant.architecture.type === 'moe';
    const sizeScale = MODEL_SIZE_THROUGHPUT_SCALER[baseModel.sizeTier] ?? 1.0;
    const moeBoost = usesMoE ? MOE_SPEED_MULTIPLIER : 1.0;
    // Unquantized variants behave like their base model (factor 1 for both).
    const quant = variant.quantization ? QUANTIZATION_CONFIGS[variant.quantization] : null;
    const quantSpeed = quant?.speedMultiplier ?? 1.0;
    const retainedQuality = quant?.qualityRetention ?? 1.0;
    fleet.push({
      modelId: variant.id,
      modelName: variant.name,
      sizeTier: baseModel.sizeTier,
      isVariant: true,
      quantization: variant.quantization ?? null,
      qualityScore: (baseModel.rawCapability / 100) * retainedQuality,
      speedMultiplier: moeBoost * quantSpeed,
      throughputCapacity: unscaledThroughput * sizeScale * moeBoost * quantSpeed,
      isMoE: usesMoE,
    });
  }

  return fleet;
}
/**
 * Returns a sorted copy of the fleet; the input array is left untouched.
 *
 * - 'quality-first': highest `qualityScore` first.
 * - 'speed-first': highest `throughputCapacity` first.
 * - 'balanced' or any other value: throughput order once overall utilization
 *   exceeds 0.8, quality order otherwise.
 *
 * @param fleet Slots to order.
 * @param strategy Routing strategy name.
 * @param overallUtilization Total demand divided by total fleet capacity.
 */
function sortFleetByStrategy(
  fleet: ModelServingSlot[],
  strategy: string,
  overallUtilization: number,
): ModelServingSlot[] {
  const qualityDescending = (left: ModelServingSlot, right: ModelServingSlot): number =>
    right.qualityScore - left.qualityScore;
  const throughputDescending = (left: ModelServingSlot, right: ModelServingSlot): number =>
    right.throughputCapacity - left.throughputCapacity;

  let comparator = qualityDescending;
  if (strategy === 'speed-first') {
    comparator = throughputDescending;
  } else if (strategy !== 'quality-first' && overallUtilization > 0.8) {
    // Balanced routing leans on fast models only when the fleet is under load.
    comparator = throughputDescending;
  }

  return fleet.slice().sort(comparator);
}
/**
 * Mutable capacity bookkeeping shared across tiers while serving.
 * Both maps are keyed by `ModelServingSlot.modelId`.
 */
interface FleetState {
/** Throughput still available on each slot. */
remaining: Map<string, number>;
/** Throughput already consumed on each slot. */
used: Map<string, number>;
}
/**
 * Serves one tier's demand from the fleet, consuming capacity in `fleetState`.
 *
 * First pass: walk the fleet in the given order. A slot counts as "degraded"
 * when its quality is below 95% of the best quality in the fleet; such slots
 * are only used while auto-degradation is active (enabled and utilization
 * above its trigger threshold) and the slot meets the minimum quality floor.
 *
 * Overflow: whatever is left is handled by the tier's overflow behavior —
 * 'queue' queues it, 'reject' rejects it, 'degrade' takes any capacity still
 * free on any slot (all counted as degraded) and rejects the rest.
 *
 * @param demand Tokens requested by the tier.
 * @param fleet Slots in routing order.
 * @param fleetState Remaining/used capacity; mutated in place.
 * @param policy Overload policy supplying degradation and overflow settings.
 * @param tier Tier being served (selects the overflow behavior).
 * @param overallUtilization Total demand divided by total fleet capacity.
 * @returns Metrics for this tier; `avgQualityDelivered` falls back to the best
 *   fleet quality when nothing was served.
 */
function serveFromFleet(
  demand: number,
  fleet: ModelServingSlot[],
  fleetState: FleetState,
  policy: OverloadPolicy,
  tier: TrafficPriority,
  overallUtilization: number,
): TierServingMetrics {
  if (demand <= 0) {
    return { demandTokens: 0, servedTokens: 0, queuedTokens: 0, rejectedTokens: 0, degradedTokens: 0, avgQualityDelivered: 1 };
  }

  let outstanding = demand;
  let servedTotal = 0;
  let degradedTotal = 0;
  let qualityMass = 0;

  // Takes as much of the outstanding demand as the slot still has room for.
  const drawFrom = (slot: ModelServingSlot, countAsDegraded: boolean): void => {
    const free = fleetState.remaining.get(slot.modelId) ?? 0;
    if (free <= 0) return;
    const amount = Math.min(outstanding, free);
    fleetState.remaining.set(slot.modelId, free - amount);
    fleetState.used.set(slot.modelId, (fleetState.used.get(slot.modelId) ?? 0) + amount);
    servedTotal += amount;
    if (countAsDegraded) degradedTotal += amount;
    qualityMass += amount * slot.qualityScore;
    outstanding -= amount;
  };

  const topQuality = fleet.length > 0 ? Math.max(...fleet.map(s => s.qualityScore)) : 1;
  const degradationActive =
    policy.autoDegradation.enabled && overallUtilization > policy.autoDegradation.triggerThreshold;

  for (const slot of fleet) {
    if (outstanding <= 0) break;
    const belowBest = slot.qualityScore < topQuality * 0.95;
    if (belowBest && (!degradationActive || slot.qualityScore < policy.autoDegradation.minQualityFloor)) {
      continue;
    }
    drawFrom(slot, belowBest);
  }

  let queuedTotal = 0;
  let rejectedTotal = 0;
  if (outstanding > 0) {
    const behavior = policy.overflowBehavior[tier];
    if (behavior === 'queue') {
      queuedTotal = outstanding;
    } else if (behavior === 'reject') {
      rejectedTotal = outstanding;
    } else if (behavior === 'degrade') {
      // Last resort: accept any free capacity regardless of quality.
      for (const slot of fleet) {
        if (outstanding <= 0) break;
        drawFrom(slot, true);
      }
      rejectedTotal = outstanding;
    }
  }

  return {
    demandTokens: demand,
    servedTokens: servedTotal,
    queuedTokens: queuedTotal,
    rejectedTokens: rejectedTotal,
    degradedTokens: degradedTotal,
    avgQualityDelivered: servedTotal > 0 ? qualityMass / servedTotal : topQuality,
  };
}
/** Canonical traffic tiers in the order used while priority queues are locked. */
const DEFAULT_TIER_PRIORITY: readonly TrafficPriority[] = [
  'enterprise',
  'api-paid',
  'consumer-paid',
  'api-free',
  'consumer-free',
];

/** Metrics for a tier that had nothing to serve in this pass. */
function idleTierMetrics(): TierServingMetrics {
  return { demandTokens: 0, servedTokens: 0, queuedTokens: 0, rejectedTokens: 0, degradedTokens: 0, avgQualityDelivered: 1 };
}

/**
 * Runs one pass of the serving pipeline.
 *
 * Steps:
 * 1. Build the fleet from deployed models. With no fleet (or no capacity) all
 *    demand is rejected and the Batch API counters are zeroed.
 * 2. Order the fleet by routing strategy ('balanced' until routing research is
 *    unlocked).
 * 3. Serve enterprise first. When `enterpriseReservation` is positive,
 *    enterprise traffic is limited to that share of total capacity, filled in
 *    routing order; otherwise it may use the whole fleet.
 * 4. Serve the remaining tiers in priority order from whatever capacity
 *    enterprise did not use (unused reservation is released to them).
 * 5. Apply the auto-scaling bonus, which converts a fraction of each tier's
 *    rejected tokens into served tokens.
 * 6. Fill idle capacity with Batch API work when enabled and unlocked.
 * 7. Aggregate totals, effective quality, latency and per-model utilization.
 *
 * Rate limits (`rateLimitPerCustomer[tier] * 100`) only apply once routing
 * research is unlocked. Demand above the limit is dropped before serving and is
 * NOT counted as rejected; a missing or non-finite limit means "no limit".
 *
 * @param input Fleet source, policy, demand, batch state and research flags.
 * @returns Serving metrics, the updated batch state and this pass's batch revenue.
 */
export function processServingPipeline(input: ServingPipelineInput): ServingPipelineResult {
  const { modelsState, effectiveInferenceFlops, overloadPolicy, demandByTier, batchApi, modelQuality, researchUnlocks } = input;

  const fleet = buildModelFleet(modelsState, effectiveInferenceFlops);
  const totalFleetCapacity = fleet.reduce((sum, s) => sum + s.throughputCapacity, 0);

  // Nothing can be served: every token of demand is rejected.
  if (fleet.length === 0 || totalFleetCapacity <= 0) {
    const metrics = makeInitialServingMetrics();
    for (const tier of Object.keys(demandByTier) as TrafficPriority[]) {
      const demand = demandByTier[tier] ?? 0;
      if (demand > 0) {
        metrics.tierMetrics[tier] = {
          demandTokens: demand,
          servedTokens: 0,
          queuedTokens: 0,
          rejectedTokens: demand,
          degradedTokens: 0,
          avgQualityDelivered: 0,
        };
        metrics.totalRejected += demand;
      }
    }
    return {
      servingMetrics: metrics,
      batchApi: { ...batchApi, servedLastTick: 0, revenue: 0 },
      batchRevenue: 0,
    };
  }

  const totalDemand = Object.values(demandByTier).reduce((s, v) => s + v, 0);
  const overallUtilization = totalFleetCapacity > 0 ? totalDemand / totalFleetCapacity : 0;

  const effectiveStrategy = researchUnlocks.servingRoutingUnlocked
    ? overloadPolicy.routingStrategy
    : 'balanced';
  const sortedFleet = sortFleetByStrategy(fleet, effectiveStrategy, overallUtilization);

  const effectivePriorityOrder = researchUnlocks.priorityQueuesUnlocked
    ? overloadPolicy.priorityOrder
    : DEFAULT_TIER_PRIORITY;
  const nonEnterpriseTiers = effectivePriorityOrder.filter(t => t !== 'enterprise');

  // Caps demand at the tier's rate limit; a missing/non-finite limit would
  // otherwise turn the demand into NaN and poison the fleet bookkeeping.
  const limitDemand = (tier: TrafficPriority, rawDemand: number): number => {
    if (!researchUnlocks.servingRoutingUnlocked) return rawDemand;
    const perCustomerLimit = overloadPolicy.rateLimitPerCustomer[tier];
    return Number.isFinite(perCustomerLimit) ? Math.min(rawDemand, perCustomerLimit * 100) : rawDemand;
  };

  // Shared bookkeeping for all non-enterprise tiers; starts at full capacity.
  const fleetState: FleetState = {
    remaining: new Map(fleet.map(s => [s.modelId, s.throughputCapacity])),
    used: new Map(fleet.map(s => [s.modelId, 0])),
  };

  const tierResults: Record<TrafficPriority, TierServingMetrics> = {} as Record<TrafficPriority, TierServingMetrics>;
  const reservedCapacity = totalFleetCapacity * overloadPolicy.enterpriseReservation;
  const enterpriseDemand = demandByTier['enterprise'] ?? 0;

  if (enterpriseDemand > 0) {
    // Enterprise gets its own view of the fleet. With a reservation the budget
    // is handed out in routing order and slots past the budget get nothing
    // (previously they were left at full capacity, so the limit only held for
    // single-model fleets).
    const enterpriseFleetState: FleetState = {
      remaining: new Map<string, number>(),
      used: new Map(fleet.map(s => [s.modelId, 0])),
    };
    let budgetLeft = reservedCapacity > 0 ? reservedCapacity : Infinity;
    for (const slot of sortedFleet) {
      const alloc = Math.min(slot.throughputCapacity, budgetLeft);
      enterpriseFleetState.remaining.set(slot.modelId, alloc);
      budgetLeft -= alloc;
    }

    tierResults['enterprise'] = serveFromFleet(
      limitDemand('enterprise', enterpriseDemand),
      sortedFleet,
      enterpriseFleetState,
      overloadPolicy,
      'enterprise',
      overallUtilization,
    );

    // Everything enterprise did not consume is available to the other tiers.
    // Deriving this from capacity directly avoids the earlier subtract/add-back
    // of the reservation, which could hand slots more than their capacity.
    for (const slot of fleet) {
      const entUsed = enterpriseFleetState.used.get(slot.modelId) ?? 0;
      fleetState.remaining.set(slot.modelId, Math.max(0, slot.throughputCapacity - entUsed));
      fleetState.used.set(slot.modelId, entUsed);
    }
  } else {
    // No enterprise traffic: the full fleet stays available, nothing more.
    tierResults['enterprise'] = idleTierMetrics();
  }

  for (const tier of nonEnterpriseTiers) {
    tierResults[tier] = serveFromFleet(
      limitDemand(tier, demandByTier[tier] ?? 0),
      sortedFleet,
      fleetState,
      overloadPolicy,
      tier,
      overallUtilization,
    );
  }

  // Guarantee an entry for every canonical tier even if the configured
  // priority order omits one, so consumers can index tierMetrics safely.
  for (const tier of DEFAULT_TIER_PRIORITY) {
    if (!(tier in tierResults)) {
      tierResults[tier] = idleTierMetrics();
    }
  }

  // Auto-scaling must run BEFORE the aggregates so totals match tierMetrics.
  const autoScaleBoost = researchUnlocks.autoScalingBonus;
  if (autoScaleBoost > 0) {
    for (const tier of Object.keys(tierResults) as TrafficPriority[]) {
      const metrics = tierResults[tier];
      if (metrics.rejectedTokens > 0) {
        const recovered = Math.min(metrics.rejectedTokens, metrics.rejectedTokens * autoScaleBoost);
        tierResults[tier] = {
          ...metrics,
          servedTokens: metrics.servedTokens + recovered,
          rejectedTokens: metrics.rejectedTokens - recovered,
        };
      }
    }
  }

  // Batch API soaks up idle capacity. Per-pass counters are reset first so a
  // disabled Batch API does not keep reporting the previous pass's numbers.
  let batchTokensServed = 0;
  let batchRevenue = 0;
  const updatedBatchApi: BatchApiState = { ...batchApi, servedLastTick: 0, revenue: 0 };
  if (overloadPolicy.batchApiEnabled && researchUnlocks.batchApiUnlocked) {
    let idleCapacity = 0;
    for (const slot of fleet) {
      idleCapacity += fleetState.remaining.get(slot.modelId) ?? 0;
    }
    const pendingBatch = Math.min(batchApi.pendingQueue + batchApi.totalBatchDemand, BATCH_API_MAX_PENDING);
    batchTokensServed = Math.min(pendingBatch, idleCapacity);
    const baseTokenPrice = 3.0;
    batchRevenue = (batchTokensServed / 1_000_000) * baseTokenPrice * (1 - overloadPolicy.batchApiDiscount);
    updatedBatchApi.pendingQueue = Math.max(0, pendingBatch - batchTokensServed);
    updatedBatchApi.servedLastTick = batchTokensServed;
    updatedBatchApi.revenue = batchRevenue;
  }

  const allTierMetrics = Object.values(tierResults);
  const totalServed = allTierMetrics.reduce((s, t) => s + t.servedTokens, 0);
  const totalQueued = allTierMetrics.reduce((s, t) => s + t.queuedTokens, 0);
  const totalRejected = allTierMetrics.reduce((s, t) => s + t.rejectedTokens, 0);
  const totalDegraded = allTierMetrics.reduce((s, t) => s + t.degradedTokens, 0);

  // Served-token-weighted quality; falls back to the model quality when idle.
  let effectiveQuality = modelQuality;
  if (totalServed > 0) {
    let qualitySum = 0;
    for (const t of allTierMetrics) {
      qualitySum += t.avgQualityDelivered * t.servedTokens;
    }
    effectiveQuality = qualitySum / totalServed;
  }

  const queuedFraction = totalDemand > 0 ? totalQueued / totalDemand : 0;
  const avgLatencyMs = BASE_LATENCY_MS + queuedFraction * 100 * QUEUE_LATENCY_MS_PER_PERCENT;

  const modelUtilization: ModelUtilizationEntry[] = fleet.map(slot => {
    const throughputUsed = fleetState.used.get(slot.modelId) ?? 0;
    return {
      modelId: slot.modelId,
      modelName: slot.modelName,
      quantization: slot.quantization,
      qualityScore: slot.qualityScore,
      throughputCapacity: slot.throughputCapacity,
      throughputUsed,
      utilization: slot.throughputCapacity > 0
        ? Math.min(1, throughputUsed / slot.throughputCapacity)
        : 0,
    };
  });

  return {
    servingMetrics: {
      tierMetrics: tierResults,
      totalServed,
      totalQueued,
      totalRejected,
      totalDegraded,
      effectiveQuality,
      avgLatencyMs,
      modelUtilization,
      batchApiTokensServed: batchTokensServed,
      batchApiRevenue: batchRevenue,
    },
    batchApi: updatedBatchApi,
    batchRevenue,
  };
}
/**
 * Converts one tier's serving metrics into a satisfaction delta.
 *
 * Three penalties are summed and returned as a non-positive number:
 * - rejected share of demand x REJECTION_SATISFACTION_PENALTY x 10
 * - queued share of demand x QUEUE_SATISFACTION_PENALTY x 10
 * - degraded share of served tokens x (1 - average quality delivered)
 *   x DEGRADATION_SATISFACTION_PENALTY x 10
 *
 * @param metrics Serving metrics of a single tier.
 * @returns 0 when the tier had no demand, otherwise the negated penalty sum.
 */
export function computeSatisfactionImpact(
  metrics: TierServingMetrics,
): number {
  const { demandTokens, servedTokens, rejectedTokens, queuedTokens, degradedTokens, avgQualityDelivered } = metrics;

  if (demandTokens <= 0) {
    return 0;
  }

  const shareRejected = rejectedTokens / demandTokens;
  const shareQueued = queuedTokens / demandTokens;
  // Degradation is measured against what was actually served, not demand.
  let shareDegraded = 0;
  if (servedTokens > 0) {
    shareDegraded = degradedTokens / servedTokens;
  }

  const fromRejection = shareRejected * REJECTION_SATISFACTION_PENALTY * 10;
  const fromQueueing = shareQueued * QUEUE_SATISFACTION_PENALTY * 10;
  const fromDegradation = shareDegraded * (1 - avgQualityDelivered) * DEGRADATION_SATISFACTION_PENALTY * 10;

  const totalPenalty = fromRejection + fromQueueing + fromDegradation;
  return -totalPenalty;
}