import type {
  OverloadPolicy,
  TrafficPriority,
  TierServingMetrics,
  ServingMetrics,
  ModelUtilizationEntry,
  BatchApiState,
} from '@ai-tycoon/shared';
import type { BaseModel, ModelsState, SizeTier } from '@ai-tycoon/shared';
import {
  MODEL_SIZE_THROUGHPUT_SCALER,
  MOE_SPEED_MULTIPLIER,
  FLOPS_TO_TOKENS_MULTIPLIER,
  QUANTIZATION_CONFIGS,
  REJECTION_SATISFACTION_PENALTY,
  QUEUE_SATISFACTION_PENALTY,
  DEGRADATION_SATISFACTION_PENALTY,
  BASE_LATENCY_MS,
  QUEUE_LATENCY_MS_PER_PERCENT,
  BATCH_API_MAX_PENDING,
} from '@ai-tycoon/shared';
import { makeInitialServingMetrics } from '@ai-tycoon/shared';

/** One deployed model (base model or variant) together with its capacity for the current tick. */
export interface ModelServingSlot {
  modelId: string;
  modelName: string;
  /** Size tier of the base model (variants inherit their base model's tier). */
  sizeTier: SizeTier;
  isVariant: boolean;
  quantization: string | null;
  /** `rawCapability / 100`, scaled by the quantization's quality retention for variants. */
  qualityScore: number;
  speedMultiplier: number;
  /** Tokens this slot can serve this tick: its share of inference FLOPs times its throughput multiplier. */
  throughputCapacity: number;
  isMoE: boolean;
}

/** Token demand for the current tick, keyed by traffic tier. */
export interface DemandByTier {
  enterprise: number;
  'api-paid': number;
  'consumer-paid': number;
  'api-free': number;
  'consumer-free': number;
}

/** Everything `processServingPipeline` needs to simulate one tick of serving. */
export interface ServingPipelineInput {
  modelsState: ModelsState;
  effectiveInferenceFlops: number;
  overloadPolicy: OverloadPolicy;
  demandByTier: DemandByTier;
  batchApi: BatchApiState;
  /** Fallback for `effectiveQuality` when nothing was served this tick. */
  modelQuality: number;
  researchUnlocks: {
    servingRoutingUnlocked: boolean;
    priorityQueuesUnlocked: boolean;
    batchApiUnlocked: boolean;
    /** Fraction of rejected tokens recovered as served tokens (values >= 1 recover everything). */
    autoScalingBonus: number;
  };
}

/** Outcome of one tick of serving. */
export interface ServingPipelineResult {
  servingMetrics: ServingMetrics;
  batchApi: BatchApiState;
  batchRevenue: number;
}

/**
 * Capacity-independent description of a deployed model. Rebuilt only when
 * `modelsState.deploymentVersion` changes.
 */
interface CachedSlot {
  modelId: string;
  modelName: string;
  sizeTier: SizeTier;
  isVariant: boolean;
  quantization: string | null;
  qualityScore: number;
  speedMultiplier: number;
  /** Multiplied by the per-model FLOPs share to obtain `throughputCapacity`. */
  throughputMultiplier: number;
  isMoE: boolean;
}

/** Priority order used while the priority-queues research is not unlocked. */
const DEFAULT_PRIORITY_ORDER: readonly TrafficPriority[] = [
  'enterprise',
  'api-paid',
  'consumer-paid',
  'api-free',
  'consumer-free',
];

// Module-level scratch state. These containers are reused on every call so the
// pipeline does not allocate fresh arrays/maps each tick.
let cachedDeploymentVersion = -1;
const cachedSlots: CachedSlot[] = [];
const fleetOutput: ModelServingSlot[] = [];
const mainRemaining = new Map<string, number>();
const mainUsed = new Map<string, number>();
const entRemaining = new Map<string, number>();
const entUsed = new Map<string, number>();
const cachedUtilization: ModelUtilizationEntry[] = [];

/** Clears every module-level cache so the next pipeline run rebuilds the fleet from scratch. */
export function resetFleetCache(): void {
  cachedDeploymentVersion = -1;
  cachedSlots.length = 0;
  fleetOutput.length = 0;
  mainRemaining.clear();
  mainUsed.clear();
  entRemaining.clear();
  entUsed.clear();
  cachedUtilization.length = 0;
}

/**
 * Builds the list of deployed models with their per-tick throughput capacity.
 *
 * The static slot descriptions are cached per `deploymentVersion`; only the
 * capacity is recomputed on each call. Inference FLOPs are split evenly across
 * all deployed slots.
 *
 * @returns The shared `fleetOutput` array (reused and mutated across calls);
 *   empty when nothing is deployed or there are no inference FLOPs.
 */
function buildModelFleet(
  modelsState: ModelsState,
  effectiveInferenceFlops: number,
): ModelServingSlot[] {
  const version = modelsState.deploymentVersion;
  if (version !== cachedDeploymentVersion) {
    cachedSlots.length = 0;
    const baseModelById = new Map<string, BaseModel>();

    for (const m of modelsState.baseModels) {
      // Register every base model (deployed or not) so deployed variants can
      // resolve a base that is itself undeployed.
      baseModelById.set(m.id, m);
      if (!m.isDeployed) continue;

      const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[m.sizeTier] ?? 1.0;
      const isMoE = m.architecture.type === 'moe';
      const moeFactor = isMoE ? MOE_SPEED_MULTIPLIER : 1.0;
      cachedSlots.push({
        modelId: m.id,
        modelName: m.name,
        sizeTier: m.sizeTier,
        isVariant: false,
        quantization: null,
        qualityScore: m.rawCapability / 100,
        speedMultiplier: moeFactor,
        throughputMultiplier: FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor,
        isMoE,
      });
    }

    for (const family of modelsState.families) {
      for (const variant of family.variants) {
        if (!variant.isDeployed) continue;
        const base = baseModelById.get(variant.baseModelId);
        if (!base) continue; // orphaned variant: its base model is unknown

        const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[base.sizeTier] ?? 1.0;
        const isMoE = variant.architecture.type === 'moe';
        const moeFactor = isMoE ? MOE_SPEED_MULTIPLIER : 1.0;
        const quantConfig = variant.quantization
          ? QUANTIZATION_CONFIGS[variant.quantization]
          : null;
        const quantSpeedFactor = quantConfig?.speedMultiplier ?? 1.0;
        const qualityRetention = quantConfig?.qualityRetention ?? 1.0;
        cachedSlots.push({
          modelId: variant.id,
          modelName: variant.name,
          sizeTier: base.sizeTier,
          isVariant: true,
          quantization: variant.quantization ?? null,
          qualityScore: (base.rawCapability / 100) * qualityRetention,
          speedMultiplier: moeFactor * quantSpeedFactor,
          throughputMultiplier:
            FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor * quantSpeedFactor,
          isMoE,
        });
      }
    }
    cachedDeploymentVersion = version;
  }

  const totalDeployed = cachedSlots.length;
  if (totalDeployed === 0 || effectiveInferenceFlops <= 0) {
    fleetOutput.length = 0;
    return fleetOutput;
  }

  const flopsPerModel = effectiveInferenceFlops / totalDeployed;
  fleetOutput.length = totalDeployed;
  for (let i = 0; i < totalDeployed; i++) {
    const cs = cachedSlots[i];
    const existing = fleetOutput[i];
    if (existing) {
      // Overwrite in place: a previous call may have re-sorted this array, so
      // every field has to be refreshed, not just the capacity.
      existing.modelId = cs.modelId;
      existing.modelName = cs.modelName;
      existing.sizeTier = cs.sizeTier;
      existing.isVariant = cs.isVariant;
      existing.quantization = cs.quantization;
      existing.qualityScore = cs.qualityScore;
      existing.speedMultiplier = cs.speedMultiplier;
      existing.throughputCapacity = flopsPerModel * cs.throughputMultiplier;
      existing.isMoE = cs.isMoE;
    } else {
      fleetOutput[i] = {
        modelId: cs.modelId,
        modelName: cs.modelName,
        sizeTier: cs.sizeTier,
        isVariant: cs.isVariant,
        quantization: cs.quantization,
        qualityScore: cs.qualityScore,
        speedMultiplier: cs.speedMultiplier,
        throughputCapacity: flopsPerModel * cs.throughputMultiplier,
        isMoE: cs.isMoE,
      };
    }
  }
  return fleetOutput;
}

/**
 * Orders the fleet according to the routing strategy.
 *
 * Sorts `fleet` IN PLACE and returns the same array. `'balanced'` (and any
 * unrecognised strategy) prefers throughput when overall utilization exceeds
 * 0.8 and quality otherwise.
 */
function sortFleetByStrategy(
  fleet: ModelServingSlot[],
  strategy: string,
  overallUtilization: number,
): ModelServingSlot[] {
  switch (strategy) {
    case 'quality-first':
      fleet.sort((a, b) => b.qualityScore - a.qualityScore);
      break;
    case 'speed-first':
      fleet.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
      break;
    case 'balanced':
    default:
      if (overallUtilization > 0.8) {
        fleet.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
      } else {
        fleet.sort((a, b) => b.qualityScore - a.qualityScore);
      }
      break;
  }
  return fleet;
}

/** Per-model capacity bookkeeping, keyed by `modelId`. */
interface FleetState {
  remaining: Map<string, number>;
  used: Map<string, number>;
}

/** Metrics for a tier that had no demand this tick. */
function emptyTierMetrics(): TierServingMetrics {
  return {
    demandTokens: 0,
    servedTokens: 0,
    queuedTokens: 0,
    rejectedTokens: 0,
    degradedTokens: 0,
    avgQualityDelivered: 1,
  };
}

/**
 * Takes up to `wanted` tokens from a slot's remaining capacity, updating both
 * `remaining` and `used`. Returns the amount taken (0 when the slot is exhausted).
 */
function drawFromSlot(slot: ModelServingSlot, fleetState: FleetState, wanted: number): number {
  const available = fleetState.remaining.get(slot.modelId) ?? 0;
  if (available <= 0) return 0;
  const taken = Math.min(wanted, available);
  fleetState.remaining.set(slot.modelId, available - taken);
  fleetState.used.set(slot.modelId, (fleetState.used.get(slot.modelId) ?? 0) + taken);
  return taken;
}

/**
 * Serves `demand` tokens for one tier from the fleet, walking slots in the
 * given order and mutating `fleetState`.
 *
 * A slot counts as "degraded" when its quality is below 95% of the best quality
 * in the fleet. Degraded slots are used in the first pass only when
 * auto-degradation is enabled, utilization exceeds its trigger threshold, and
 * the slot's quality is at least the policy's quality floor. Whatever is left
 * is handled by the tier's overflow behavior: queued, rejected, or ('degrade')
 * served from any slot with capacity left, with the remainder rejected.
 */
function serveFromFleet(
  demand: number,
  fleet: ModelServingSlot[],
  fleetState: FleetState,
  policy: OverloadPolicy,
  tier: TrafficPriority,
  overallUtilization: number,
): TierServingMetrics {
  if (demand <= 0) {
    return emptyTierMetrics();
  }

  let remaining = demand;
  let served = 0;
  let degraded = 0;
  let qualityWeightedSum = 0;

  let bestQuality = 0;
  for (const s of fleet) {
    if (s.qualityScore > bestQuality) bestQuality = s.qualityScore;
  }
  if (bestQuality === 0) bestQuality = 1;

  const degradationActive =
    policy.autoDegradation.enabled &&
    overallUtilization > policy.autoDegradation.triggerThreshold;

  // First pass: preferred slots, plus degraded ones when the policy allows it.
  for (const slot of fleet) {
    if (remaining <= 0) break;
    const isDegraded = slot.qualityScore < bestQuality * 0.95;
    if (isDegraded && !degradationActive) continue;
    if (isDegraded && slot.qualityScore < policy.autoDegradation.minQualityFloor) continue;

    const taken = drawFromSlot(slot, fleetState, remaining);
    if (taken === 0) continue;
    served += taken;
    if (isDegraded) degraded += taken;
    qualityWeightedSum += taken * slot.qualityScore;
    remaining -= taken;
  }

  let queued = 0;
  let rejected = 0;
  if (remaining > 0) {
    const behavior = policy.overflowBehavior[tier];
    switch (behavior) {
      case 'queue':
        queued = remaining;
        break;
      case 'reject':
        rejected = remaining;
        break;
      case 'degrade':
        // Second pass ignores quality thresholds; everything served here is
        // counted as degraded.
        for (const slot of fleet) {
          if (remaining <= 0) break;
          const taken = drawFromSlot(slot, fleetState, remaining);
          if (taken === 0) continue;
          served += taken;
          degraded += taken;
          qualityWeightedSum += taken * slot.qualityScore;
          remaining -= taken;
        }
        rejected = remaining;
        break;
    }
  }

  const avgQuality = served > 0 ? qualityWeightedSum / served : bestQuality;
  return {
    demandTokens: demand,
    servedTokens: served,
    queuedTokens: queued,
    rejectedTokens: rejected,
    degradedTokens: degraded,
    avgQualityDelivered: avgQuality,
  };
}

/**
 * Simulates one tick of request serving across all traffic tiers.
 *
 * Steps: build and sort the fleet, serve enterprise traffic from its own
 * budget, serve the remaining tiers in priority order from what is left, give
 * idle capacity to the batch API, apply auto-scaling recovery, then aggregate.
 *
 * Note: `servingMetrics.modelUtilization` is a module-level array that is
 * reused and overwritten by the next call; callers that keep it across ticks
 * must copy it.
 */
export function processServingPipeline(input: ServingPipelineInput): ServingPipelineResult {
  const {
    modelsState,
    effectiveInferenceFlops,
    overloadPolicy,
    demandByTier,
    batchApi,
    modelQuality,
    researchUnlocks,
  } = input;

  const fleet = buildModelFleet(modelsState, effectiveInferenceFlops);
  let totalFleetCapacity = 0;
  for (const s of fleet) totalFleetCapacity += s.throughputCapacity;

  // No capacity at all: every token of demand is rejected.
  if (fleet.length === 0 || totalFleetCapacity <= 0) {
    const metrics = makeInitialServingMetrics();
    for (const tier of Object.keys(demandByTier) as TrafficPriority[]) {
      const demand = demandByTier[tier] ?? 0;
      if (demand > 0) {
        metrics.tierMetrics[tier] = {
          demandTokens: demand,
          servedTokens: 0,
          queuedTokens: 0,
          rejectedTokens: demand,
          degradedTokens: 0,
          avgQualityDelivered: 0,
        };
        metrics.totalRejected += demand;
      }
    }
    return {
      servingMetrics: metrics,
      batchApi: { ...batchApi, servedLastTick: 0, revenue: 0 },
      batchRevenue: 0,
    };
  }

  const totalDemand =
    demandByTier.enterprise +
    demandByTier['api-paid'] +
    demandByTier['consumer-paid'] +
    demandByTier['api-free'] +
    demandByTier['consumer-free'];
  const overallUtilization = totalFleetCapacity > 0 ? totalDemand / totalFleetCapacity : 0;

  const effectiveStrategy = researchUnlocks.servingRoutingUnlocked
    ? overloadPolicy.routingStrategy
    : 'balanced';
  // Sorts `fleet` in place; `sortedFleet` is the same array.
  const sortedFleet = sortFleetByStrategy(fleet, effectiveStrategy, overallUtilization);

  mainRemaining.clear();
  mainUsed.clear();
  for (const s of fleet) {
    mainRemaining.set(s.modelId, s.throughputCapacity);
    mainUsed.set(s.modelId, 0);
  }
  const fleetState: FleetState = { remaining: mainRemaining, used: mainUsed };

  const reservedCapacity = totalFleetCapacity * overloadPolicy.enterpriseReservation;
  const enterpriseDemand = demandByTier['enterprise'] ?? 0;

  const effectivePriorityOrder: readonly TrafficPriority[] =
    researchUnlocks.priorityQueuesUnlocked
      ? overloadPolicy.priorityOrder
      : DEFAULT_PRIORITY_ORDER;

  const tierResults: Record<TrafficPriority, TierServingMetrics> = {} as Record<
    TrafficPriority,
    TierServingMetrics
  >;
  const nonEnterpriseTiers = effectivePriorityOrder.filter(t => t !== 'enterprise');

  if (enterpriseDemand > 0) {
    entRemaining.clear();
    entUsed.clear();
    for (const s of fleet) {
      entRemaining.set(s.modelId, s.throughputCapacity);
      entUsed.set(s.modelId, 0);
    }
    const enterpriseFleetState: FleetState = { remaining: entRemaining, used: entUsed };

    const reserveLimit = reservedCapacity > 0 ? reservedCapacity : totalFleetCapacity;
    let budgetLeft = reserveLimit;
    // NOTE(review): once the budget is exhausted this loop breaks, so slots
    // after the break keep the full capacity assigned in the loop above rather
    // than 0. Enterprise can therefore draw more than `reserveLimit` in total.
    // Behavior preserved as-is; confirm whether this is intended.
    for (const slot of sortedFleet) {
      const cap = slot.throughputCapacity;
      const alloc = Math.min(cap, budgetLeft);
      enterpriseFleetState.remaining.set(slot.modelId, alloc);
      budgetLeft -= alloc;
      if (budgetLeft <= 0) break;
    }

    const effectiveEntDemand = researchUnlocks.servingRoutingUnlocked
      ? Math.min(enterpriseDemand, overloadPolicy.rateLimitPerCustomer['enterprise'] * 100)
      : enterpriseDemand;

    tierResults['enterprise'] = serveFromFleet(
      effectiveEntDemand,
      sortedFleet,
      enterpriseFleetState,
      overloadPolicy,
      'enterprise',
      overallUtilization,
    );

    // Whatever enterprise did not consume is available to the other tiers.
    // Computed straight from capacity: the previous subtract-then-add-back of
    // the per-model reservation could hand back more than it had withheld
    // (when the per-model reservation exceeded a model's capacity), letting a
    // model serve more than its capacity.
    for (const slot of fleet) {
      const entUsedForModel = enterpriseFleetState.used.get(slot.modelId) ?? 0;
      fleetState.remaining.set(
        slot.modelId,
        Math.max(0, slot.throughputCapacity - entUsedForModel),
      );
      fleetState.used.set(slot.modelId, entUsedForModel);
    }
  } else {
    // No enterprise traffic: nothing was withheld from the main pool, so the
    // full capacity is already available to the other tiers. (Previously the
    // per-model reservation was added here without ever having been
    // subtracted, inflating capacity beyond the fleet's real throughput.)
    tierResults['enterprise'] = emptyTierMetrics();
  }

  for (const tier of nonEnterpriseTiers) {
    const rawDemand = demandByTier[tier] ?? 0;
    const effectiveDemand = researchUnlocks.servingRoutingUnlocked
      ? Math.min(rawDemand, overloadPolicy.rateLimitPerCustomer[tier] * 100)
      : rawDemand;
    tierResults[tier] = serveFromFleet(
      effectiveDemand,
      sortedFleet,
      fleetState,
      overloadPolicy,
      tier,
      overallUtilization,
    );
  }

  for (const tier of effectivePriorityOrder) {
    if (!(tier in tierResults)) {
      tierResults[tier] = emptyTierMetrics();
    }
  }

  // Batch API soaks up capacity left idle after all interactive tiers.
  // NOTE(review): batch tokens are not added to `fleetState.used`, so they do
  // not show up in per-model utilization below.
  let batchTokensServed = 0;
  let batchRevenue = 0;
  const updatedBatchApi = { ...batchApi };
  if (overloadPolicy.batchApiEnabled && researchUnlocks.batchApiUnlocked) {
    let idleCapacity = 0;
    for (const slot of fleet) {
      idleCapacity += fleetState.remaining.get(slot.modelId) ?? 0;
    }
    const pendingBatch = Math.min(
      batchApi.pendingQueue + batchApi.totalBatchDemand,
      BATCH_API_MAX_PENDING,
    );
    batchTokensServed = Math.min(pendingBatch, idleCapacity);
    const baseTokenPrice = 3.0; // presumably price per million tokens - confirm
    batchRevenue =
      (batchTokensServed / 1_000_000) * baseTokenPrice * (1 - overloadPolicy.batchApiDiscount);
    updatedBatchApi.pendingQueue = Math.max(0, pendingBatch - batchTokensServed);
    updatedBatchApi.servedLastTick = batchTokensServed;
    updatedBatchApi.revenue = batchRevenue;
  }

  // Auto-scaling converts a fraction of rejected tokens into served tokens.
  // This runs BEFORE aggregation so that the totals returned below agree with
  // the per-tier metrics (previously the totals were computed first and went
  // stale once the tier metrics were adjusted).
  const autoScaleBoost = researchUnlocks.autoScalingBonus;
  if (autoScaleBoost > 0) {
    for (const tier of Object.keys(tierResults) as TrafficPriority[]) {
      const metrics = tierResults[tier];
      if (metrics.rejectedTokens > 0) {
        const recovered = Math.min(
          metrics.rejectedTokens,
          metrics.rejectedTokens * autoScaleBoost,
        );
        tierResults[tier] = {
          ...metrics,
          servedTokens: metrics.servedTokens + recovered,
          rejectedTokens: metrics.rejectedTokens - recovered,
        };
      }
    }
  }

  let totalServed = 0;
  let totalQueued = 0;
  let totalRejected = 0;
  let totalDegraded = 0;
  let qualitySum = 0;
  for (const tier of effectivePriorityOrder) {
    const t = tierResults[tier];
    if (!t) continue;
    totalServed += t.servedTokens;
    totalQueued += t.queuedTokens;
    totalRejected += t.rejectedTokens;
    totalDegraded += t.degradedTokens;
    qualitySum += t.avgQualityDelivered * t.servedTokens;
  }

  const effectiveQuality = totalServed > 0 ? qualitySum / totalServed : modelQuality;
  const queuedFraction = totalDemand > 0 ? totalQueued / totalDemand : 0;
  const avgLatencyMs = BASE_LATENCY_MS + queuedFraction * 100 * QUEUE_LATENCY_MS_PER_PERCENT;

  // Refresh the reusable utilization array in place.
  cachedUtilization.length = fleet.length;
  for (let i = 0; i < fleet.length; i++) {
    const slot = fleet[i];
    const used = fleetState.used.get(slot.modelId) ?? 0;
    const utilization =
      slot.throughputCapacity > 0 ? Math.min(1, used / slot.throughputCapacity) : 0;
    const existing = cachedUtilization[i];
    if (existing) {
      existing.modelId = slot.modelId;
      existing.modelName = slot.modelName;
      existing.quantization = slot.quantization;
      existing.qualityScore = slot.qualityScore;
      existing.throughputCapacity = slot.throughputCapacity;
      existing.throughputUsed = used;
      existing.utilization = utilization;
    } else {
      cachedUtilization[i] = {
        modelId: slot.modelId,
        modelName: slot.modelName,
        quantization: slot.quantization,
        qualityScore: slot.qualityScore,
        throughputCapacity: slot.throughputCapacity,
        throughputUsed: used,
        utilization,
      };
    }
  }

  return {
    servingMetrics: {
      tierMetrics: tierResults,
      totalServed,
      totalQueued,
      totalRejected,
      totalDegraded,
      effectiveQuality,
      avgLatencyMs,
      modelUtilization: cachedUtilization,
      batchApiTokensServed: batchTokensServed,
      batchApiRevenue: batchRevenue,
    },
    batchApi: updatedBatchApi,
    batchRevenue,
  };
}

/**
 * Converts one tier's serving metrics into a satisfaction delta.
 *
 * @returns 0 when the tier had no demand; otherwise the negated sum of the
 *   rejection, queue and degradation penalties. The degradation penalty is
 *   scaled by `1 - avgQualityDelivered`.
 */
export function computeSatisfactionImpact(
  metrics: TierServingMetrics,
): number {
  if (metrics.demandTokens <= 0) return 0;

  const rejectedFraction = metrics.rejectedTokens / metrics.demandTokens;
  const queuedFraction = metrics.queuedTokens / metrics.demandTokens;
  const degradedFraction =
    metrics.servedTokens > 0 ? metrics.degradedTokens / metrics.servedTokens : 0;

  const rejectionPenalty = rejectedFraction * REJECTION_SATISFACTION_PENALTY * 10;
  const queuePenalty = queuedFraction * QUEUE_SATISFACTION_PENALTY * 10;
  const degradationPenalty =
    degradedFraction *
    (1 - metrics.avgQualityDelivered) *
    DEGRADATION_SATISFACTION_PENALTY *
    10;

  return -(rejectionPenalty + queuePenalty + degradationPenalty);
}