Files
AIHostingTycoon/packages/game-engine/src/systems/market/servingPipeline.ts
T
josh 57a81be769 Cache serving pipeline fleet to eliminate per-tick rebuilds and reduce GC pressure
Fleet template is now rebuilt only when deploymentVersion changes (~68 times per
28,800-tick run instead of every tick). Reuses module-level Maps, arrays, and
utilization objects instead of allocating new ones each tick. Replaces 4x
Object.values().reduce() with single-pass aggregation and sorts fleet in-place.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-26 19:51:13 -04:00

552 lines
18 KiB
TypeScript

import type {
OverloadPolicy,
TrafficPriority,
TierServingMetrics,
ServingMetrics,
ModelUtilizationEntry,
BatchApiState,
} from '@ai-tycoon/shared';
import type { BaseModel, ModelsState, SizeTier } from '@ai-tycoon/shared';
import {
MODEL_SIZE_THROUGHPUT_SCALER,
MOE_SPEED_MULTIPLIER,
FLOPS_TO_TOKENS_MULTIPLIER,
QUANTIZATION_CONFIGS,
REJECTION_SATISFACTION_PENALTY,
QUEUE_SATISFACTION_PENALTY,
DEGRADATION_SATISFACTION_PENALTY,
BASE_LATENCY_MS,
QUEUE_LATENCY_MS_PER_PERCENT,
BATCH_API_MAX_PENDING,
} from '@ai-tycoon/shared';
import { makeInitialServingMetrics } from '@ai-tycoon/shared';
/**
 * One deployed model (base model or variant) as the serving pipeline sees it
 * for a single run: identity, quality, and the throughput it can absorb.
 */
export interface ModelServingSlot {
/** Id of the base model, or of the variant when `isVariant` is true. */
modelId: string;
/** Display name copied from the base model or variant. */
modelName: string;
/** Size tier; a variant takes the tier of its base model. */
sizeTier: SizeTier;
/** True when the slot was built from a family variant rather than a base model. */
isVariant: boolean;
/** Quantization key of the variant, or null (always null for base models). */
quantization: string | null;
/** Base `rawCapability / 100`, scaled by the quantization quality retention for variants. */
qualityScore: number;
/** MoE speed factor, times the quantization speed factor for variants. */
speedMultiplier: number;
/** This model's share of inference FLOPs multiplied by its throughput multiplier. */
throughputCapacity: number;
/** True when the model's (or variant's) architecture type is 'moe'. */
isMoE: boolean;
}
/**
 * Demand per traffic tier for one pipeline run. The five values are summed
 * into the total demand and each is reported back as that tier's `demandTokens`.
 */
export interface DemandByTier {
enterprise: number;
'api-paid': number;
'consumer-paid': number;
'api-free': number;
'consumer-free': number;
}
/** Everything `processServingPipeline` needs for one run. */
export interface ServingPipelineInput {
/** Model catalogue; its `deploymentVersion` keys the cached fleet template. */
modelsState: ModelsState;
/** Inference compute for this run; split evenly across all deployed models. */
effectiveInferenceFlops: number;
/** Player-configured routing, reservation, overflow and batch settings. */
overloadPolicy: OverloadPolicy;
/** Demand per traffic tier. */
demandByTier: DemandByTier;
/** Batch API state carried over from the previous run. */
batchApi: BatchApiState;
/** Reported as `effectiveQuality` when no tokens at all were served. */
modelQuality: number;
researchUnlocks: {
/** Enables the policy's routing strategy and per-tier rate limits; otherwise 'balanced' and no limits. */
servingRoutingUnlocked: boolean;
/** Enables the policy's priority order; otherwise a fixed default order is used. */
priorityQueuesUnlocked: boolean;
/** Must be true (together with the policy flag) for batch work to be served. */
batchApiUnlocked: boolean;
/** Fraction of each tier's rejected tokens converted back into served tokens (0 disables). */
autoScalingBonus: number;
};
}
/** Output of one `processServingPipeline` run. */
export interface ServingPipelineResult {
/** Per-tier and aggregate serving statistics for the run. */
servingMetrics: ServingMetrics;
/** Copy of the input batch state with this run's updates applied. */
batchApi: BatchApiState;
/** Batch revenue earned this run; the same value is stored in `servingMetrics.batchApiRevenue`. */
batchRevenue: number;
}
/**
 * FLOPs-independent template for a `ModelServingSlot`. It mirrors the slot's
 * fields but stores a multiplier instead of an absolute capacity, so it only
 * has to be rebuilt when the deployment version changes.
 */
interface CachedSlot {
modelId: string;
modelName: string;
sizeTier: SizeTier;
isVariant: boolean;
quantization: string | null;
qualityScore: number;
speedMultiplier: number;
/** Multiplied by the per-model FLOPs share to obtain `throughputCapacity`. */
throughputMultiplier: number;
isMoE: boolean;
}
// ---------------------------------------------------------------------------
// Module-level scratch state, reused between pipeline runs to avoid
// re-allocating on every call. Everything below is shared mutable state.
// ---------------------------------------------------------------------------

// `deploymentVersion` the slot template was last built for; -1 is the
// "nothing cached" value written by resetFleetCache().
let cachedDeploymentVersion = -1;
// Slot templates, rebuilt only when the deployment version changes.
let cachedSlots: CachedSlot[] = [];
// The array buildModelFleet() returns. The same instance (and the same slot
// objects) is rewritten on every call and is sorted in place by the pipeline.
const fleetOutput: ModelServingSlot[] = [];
// Per-model remaining / used capacity for the main pool; cleared and
// re-seeded at the start of every pipeline run that has capacity.
const mainRemaining = new Map<string, number>();
const mainUsed = new Map<string, number>();
// Per-model remaining / used capacity for the enterprise pass; cleared and
// re-seeded only on runs that have enterprise demand.
const entRemaining = new Map<string, number>();
const entUsed = new Map<string, number>();
// Utilization entries, rewritten in place each run and returned by reference
// as `servingMetrics.modelUtilization` — callers that keep the result see it
// change on the next run.
let cachedUtilization: ModelUtilizationEntry[] = [];
/**
 * Drops all cached fleet state so the next pipeline run starts from scratch.
 *
 * The pooled arrays are truncated in place (not replaced), so any reference a
 * caller still holds to them observes the truncation.
 */
export function resetFleetCache(): void {
  // -1 is the "nothing cached" marker checked by the fleet builder.
  cachedDeploymentVersion = -1;

  for (const pooled of [cachedSlots, fleetOutput, cachedUtilization]) {
    pooled.length = 0;
  }

  for (const ledger of [mainRemaining, mainUsed, entRemaining, entUsed]) {
    ledger.clear();
  }
}
/**
 * Produces the serving fleet for one run.
 *
 * The per-model template (identity, quality, throughput multiplier) is cached
 * and only rebuilt when `modelsState.deploymentVersion` differs from the
 * version it was last built for. Absolute capacities are recomputed on every
 * call by splitting `effectiveInferenceFlops` evenly across all deployed
 * models.
 *
 * @param modelsState Model catalogue with base models and family variants.
 * @param effectiveInferenceFlops Compute available for inference this run.
 * @returns The shared module-level fleet array. It is rewritten in place on
 *   every call and is empty when nothing is deployed or there is no compute.
 */
function buildModelFleet(
  modelsState: ModelsState,
  effectiveInferenceFlops: number,
): ModelServingSlot[] {
  const version = modelsState.deploymentVersion;
  const templateIsStale = version !== cachedDeploymentVersion;

  if (templateIsStale) {
    cachedSlots.length = 0;

    // Every base model is indexed, deployed or not, so that a deployed
    // variant can still resolve an undeployed base.
    const basesById = new Map<string, BaseModel>();
    for (const baseModel of modelsState.baseModels) {
      basesById.set(baseModel.id, baseModel);
      if (baseModel.isDeployed) {
        const usesMoE = baseModel.architecture.type === 'moe';
        const sizeScale = MODEL_SIZE_THROUGHPUT_SCALER[baseModel.sizeTier] ?? 1.0;
        const moeScale = usesMoE ? MOE_SPEED_MULTIPLIER : 1.0;
        cachedSlots.push({
          modelId: baseModel.id,
          modelName: baseModel.name,
          sizeTier: baseModel.sizeTier,
          isVariant: false,
          quantization: null,
          qualityScore: baseModel.rawCapability / 100,
          speedMultiplier: moeScale,
          throughputMultiplier: FLOPS_TO_TOKENS_MULTIPLIER * sizeScale * moeScale,
          isMoE: usesMoE,
        });
      }
    }

    for (const { variants } of modelsState.families) {
      for (const variant of variants) {
        // Skip undeployed variants and variants whose base model is unknown.
        const parent = variant.isDeployed ? basesById.get(variant.baseModelId) : undefined;
        if (!parent) continue;

        const usesMoE = variant.architecture.type === 'moe';
        const sizeScale = MODEL_SIZE_THROUGHPUT_SCALER[parent.sizeTier] ?? 1.0;
        const moeScale = usesMoE ? MOE_SPEED_MULTIPLIER : 1.0;
        const quant = variant.quantization ? QUANTIZATION_CONFIGS[variant.quantization] : null;
        const quantSpeed = quant?.speedMultiplier ?? 1.0;
        const retention = quant?.qualityRetention ?? 1.0;
        cachedSlots.push({
          modelId: variant.id,
          modelName: variant.name,
          sizeTier: parent.sizeTier,
          isVariant: true,
          quantization: variant.quantization ?? null,
          qualityScore: (parent.rawCapability / 100) * retention,
          speedMultiplier: moeScale * quantSpeed,
          throughputMultiplier: FLOPS_TO_TOKENS_MULTIPLIER * sizeScale * moeScale * quantSpeed,
          isMoE: usesMoE,
        });
      }
    }

    cachedDeploymentVersion = version;
  }

  const slotCount = cachedSlots.length;
  if (slotCount === 0 || effectiveInferenceFlops <= 0) {
    fleetOutput.length = 0;
    return fleetOutput;
  }

  // Compute is shared equally; each model's own multiplier turns its share
  // into a capacity figure.
  const flopsShare = effectiveInferenceFlops / slotCount;
  fleetOutput.length = slotCount;

  let index = 0;
  for (const template of cachedSlots) {
    const capacity = flopsShare * template.throughputMultiplier;
    const target = fleetOutput[index];
    if (!target) {
      fleetOutput[index] = {
        modelId: template.modelId,
        modelName: template.modelName,
        sizeTier: template.sizeTier,
        isVariant: template.isVariant,
        quantization: template.quantization,
        qualityScore: template.qualityScore,
        speedMultiplier: template.speedMultiplier,
        throughputCapacity: capacity,
        isMoE: template.isMoE,
      };
    } else {
      // Reuse the pooled slot object instead of allocating a new one.
      target.modelId = template.modelId;
      target.modelName = template.modelName;
      target.sizeTier = template.sizeTier;
      target.isVariant = template.isVariant;
      target.quantization = template.quantization;
      target.qualityScore = template.qualityScore;
      target.speedMultiplier = template.speedMultiplier;
      target.throughputCapacity = capacity;
      target.isMoE = template.isMoE;
    }
    index += 1;
  }

  return fleetOutput;
}
/**
 * Orders the fleet for routing. The array is sorted IN PLACE and the same
 * instance is returned.
 *
 * - 'quality-first': highest `qualityScore` first.
 * - 'speed-first': highest `throughputCapacity` first.
 * - anything else (including 'balanced'): throughput order when utilization
 *   is above 0.8, quality order otherwise.
 *
 * @param fleet Slots to order (mutated).
 * @param strategy Routing strategy name.
 * @param overallUtilization Total demand divided by total fleet capacity.
 * @returns `fleet`, sorted.
 */
function sortFleetByStrategy(
  fleet: ModelServingSlot[],
  strategy: string,
  overallUtilization: number,
): ModelServingSlot[] {
  const preferThroughput =
    strategy === 'speed-first' ||
    (strategy !== 'quality-first' && overallUtilization > 0.8);

  if (preferThroughput) {
    fleet.sort((lhs, rhs) => rhs.throughputCapacity - lhs.throughputCapacity);
  } else {
    fleet.sort((lhs, rhs) => rhs.qualityScore - lhs.qualityScore);
  }
  return fleet;
}
/**
 * Capacity ledger for one serving pool, keyed by `modelId`. `serveFromFleet`
 * mutates both maps as it allocates demand to models.
 */
interface FleetState {
/** Capacity each model can still absorb in this pool. */
remaining: Map<string, number>;
/** Capacity each model has already served in this pool. */
used: Map<string, number>;
}
/**
 * Takes up to `wanted` tokens from one model's remaining capacity and records
 * them as used. Returns the amount taken (0 when the model has nothing left,
 * in which case the ledger is not touched).
 */
function drawFromSlot(fleetState: FleetState, modelId: string, wanted: number): number {
  const free = fleetState.remaining.get(modelId) ?? 0;
  if (free <= 0) return 0;
  const taken = Math.min(wanted, free);
  fleetState.remaining.set(modelId, free - taken);
  fleetState.used.set(modelId, (fleetState.used.get(modelId) ?? 0) + taken);
  return taken;
}

/**
 * Serves one tier's demand from the fleet, walking slots in the given order.
 *
 * First pass: a slot whose quality is below 95% of the best quality in the
 * fleet counts as "degraded"; such slots are used only while auto-degradation
 * is enabled and utilization exceeds its trigger threshold, and only if the
 * slot is not below the policy's minimum quality floor. Whatever is left is
 * handled by the tier's overflow behaviour: 'queue', 'reject', or 'degrade'
 * (a second pass over every slot with capacity, counting all of it as
 * degraded, then rejecting the rest). Any other behaviour value leaves the
 * remainder unaccounted for.
 *
 * @param demand Tokens requested by the tier.
 * @param fleet Slots in routing order.
 * @param fleetState Capacity ledger to draw from (mutated).
 * @param policy Overload policy (auto-degradation and overflow settings).
 * @param tier Tier being served; selects the overflow behaviour.
 * @param overallUtilization Fleet-wide utilization used for the degradation trigger.
 * @returns Metrics for this tier.
 */
function serveFromFleet(
  demand: number,
  fleet: ModelServingSlot[],
  fleetState: FleetState,
  policy: OverloadPolicy,
  tier: TrafficPriority,
  overallUtilization: number,
): TierServingMetrics {
  if (demand <= 0) {
    return {
      demandTokens: 0,
      servedTokens: 0,
      queuedTokens: 0,
      rejectedTokens: 0,
      degradedTokens: 0,
      avgQualityDelivered: 1,
    };
  }

  // Reference quality: best score anywhere in the fleet (1 when there is none).
  let topQuality = 0;
  for (const candidate of fleet) {
    topQuality = candidate.qualityScore > topQuality ? candidate.qualityScore : topQuality;
  }
  if (topQuality === 0) topQuality = 1;

  const degradationActive =
    policy.autoDegradation.enabled && overallUtilization > policy.autoDegradation.triggerThreshold;

  let unmet = demand;
  let servedTokens = 0;
  let degradedTokens = 0;
  let qualityTimesTokens = 0;

  for (const slot of fleet) {
    if (unmet <= 0) break;
    const belowBest = slot.qualityScore < topQuality * 0.95;
    if (belowBest) {
      if (!degradationActive) continue;
      if (slot.qualityScore < policy.autoDegradation.minQualityFloor) continue;
    }
    const taken = drawFromSlot(fleetState, slot.modelId, unmet);
    if (taken === 0) continue;
    servedTokens += taken;
    if (belowBest) degradedTokens += taken;
    qualityTimesTokens += taken * slot.qualityScore;
    unmet -= taken;
  }

  let queuedTokens = 0;
  let rejectedTokens = 0;
  if (unmet > 0) {
    const behavior = policy.overflowBehavior[tier];
    if (behavior === 'queue') {
      queuedTokens = unmet;
    } else if (behavior === 'reject') {
      rejectedTokens = unmet;
    } else if (behavior === 'degrade') {
      // Last resort: accept any model with spare capacity, regardless of quality.
      for (const slot of fleet) {
        if (unmet <= 0) break;
        const taken = drawFromSlot(fleetState, slot.modelId, unmet);
        if (taken === 0) continue;
        servedTokens += taken;
        degradedTokens += taken;
        qualityTimesTokens += taken * slot.qualityScore;
        unmet -= taken;
      }
      rejectedTokens = unmet;
    }
  }

  return {
    demandTokens: demand,
    servedTokens,
    queuedTokens,
    rejectedTokens,
    degradedTokens,
    avgQualityDelivered: servedTokens > 0 ? qualityTimesTokens / servedTokens : topQuality,
  };
}
/** Builds the all-zero metrics entry used for a tier that had no demand. */
function makeIdleTierMetrics(): TierServingMetrics {
  return {
    demandTokens: 0,
    servedTokens: 0,
    queuedTokens: 0,
    rejectedTokens: 0,
    degradedTokens: 0,
    avgQualityDelivered: 1,
  };
}

/**
 * Runs the serving pipeline once: builds the fleet, serves enterprise demand
 * from its own pool, serves the remaining tiers in priority order from what
 * is left, fills idle capacity with batch work, and aggregates the metrics.
 *
 * Fixes relative to the previous revision:
 * - The main pool is now exactly `capacity - enterpriseUsed` per model. The
 *   old reserve/restore bookkeeping added `reservedCapacity / fleet.length`
 *   back to every model even when less (or nothing) had been subtracted, so
 *   the fleet could serve more than its capacity — most visibly whenever
 *   there was no enterprise demand but a non-zero reservation.
 * - Auto-scaling recovery is applied before the totals are aggregated, so the
 *   `total*` fields and `effectiveQuality` agree with the returned
 *   `tierMetrics`.
 * - `batchApi.servedLastTick` / `revenue` are zeroed when no batch work runs,
 *   matching the no-capacity path, instead of carrying stale values over.
 *
 * @param input Fleet, policy, demand and research state for this run.
 * @returns Serving metrics, the updated batch state and the batch revenue.
 *   `servingMetrics.modelUtilization` is a module-level array that is
 *   rewritten in place on the next call.
 */
export function processServingPipeline(input: ServingPipelineInput): ServingPipelineResult {
  const {
    modelsState,
    effectiveInferenceFlops,
    overloadPolicy,
    demandByTier,
    batchApi,
    modelQuality,
    researchUnlocks,
  } = input;

  const fleet = buildModelFleet(modelsState, effectiveInferenceFlops);
  let totalFleetCapacity = 0;
  for (const s of fleet) totalFleetCapacity += s.throughputCapacity;

  // No usable capacity: every tier's demand is rejected outright.
  if (fleet.length === 0 || totalFleetCapacity <= 0) {
    const metrics = makeInitialServingMetrics();
    for (const tier of Object.keys(demandByTier) as TrafficPriority[]) {
      const demand = demandByTier[tier] ?? 0;
      if (demand > 0) {
        metrics.tierMetrics[tier] = {
          demandTokens: demand,
          servedTokens: 0,
          queuedTokens: 0,
          rejectedTokens: demand,
          degradedTokens: 0,
          avgQualityDelivered: 0,
        };
        metrics.totalRejected += demand;
      }
    }
    return {
      servingMetrics: metrics,
      batchApi: { ...batchApi, servedLastTick: 0, revenue: 0 },
      batchRevenue: 0,
    };
  }

  const totalDemand =
    demandByTier.enterprise +
    demandByTier['api-paid'] +
    demandByTier['consumer-paid'] +
    demandByTier['api-free'] +
    demandByTier['consumer-free'];
  const overallUtilization = totalFleetCapacity > 0 ? totalDemand / totalFleetCapacity : 0;

  const effectiveStrategy = researchUnlocks.servingRoutingUnlocked
    ? overloadPolicy.routingStrategy
    : 'balanced';
  // Sorts `fleet` in place; `sortedFleet` is the same array.
  const sortedFleet = sortFleetByStrategy(fleet, effectiveStrategy, overallUtilization);

  // Main pool starts with every model's full capacity.
  mainRemaining.clear();
  mainUsed.clear();
  for (const s of fleet) {
    mainRemaining.set(s.modelId, s.throughputCapacity);
    mainUsed.set(s.modelId, 0);
  }
  const fleetState: FleetState = { remaining: mainRemaining, used: mainUsed };

  const reservedCapacity = totalFleetCapacity * overloadPolicy.enterpriseReservation;
  const enterpriseDemand = demandByTier['enterprise'] ?? 0;

  const effectivePriorityOrder = researchUnlocks.priorityQueuesUnlocked
    ? overloadPolicy.priorityOrder
    : ['enterprise', 'api-paid', 'consumer-paid', 'api-free', 'consumer-free'] as TrafficPriority[];
  const tierResults: Record<TrafficPriority, TierServingMetrics> = {} as Record<TrafficPriority, TierServingMetrics>;
  const nonEnterpriseTiers = effectivePriorityOrder.filter(t => t !== 'enterprise');

  if (enterpriseDemand > 0) {
    // Enterprise is served first, from its own ledger.
    entRemaining.clear();
    entUsed.clear();
    for (const s of fleet) {
      entRemaining.set(s.modelId, s.throughputCapacity);
      entUsed.set(s.modelId, 0);
    }
    const enterpriseFleetState: FleetState = { remaining: entRemaining, used: entUsed };

    // NOTE(review): the budget is handed out in routing order and the loop
    // stops once it is spent, so slots after that point keep the full
    // capacity seeded above. The reservation therefore only limits the
    // leading slots. Behaviour preserved as-is — confirm whether the
    // reservation is meant to cap enterprise traffic.
    let budgetLeft = reservedCapacity > 0 ? reservedCapacity : totalFleetCapacity;
    for (const slot of sortedFleet) {
      const alloc = Math.min(slot.throughputCapacity, budgetLeft);
      enterpriseFleetState.remaining.set(slot.modelId, alloc);
      budgetLeft -= alloc;
      if (budgetLeft <= 0) break;
    }

    const effectiveEntDemand = researchUnlocks.servingRoutingUnlocked
      ? Math.min(enterpriseDemand, overloadPolicy.rateLimitPerCustomer['enterprise'] * 100)
      : enterpriseDemand;
    tierResults['enterprise'] = serveFromFleet(
      effectiveEntDemand, sortedFleet, enterpriseFleetState, overloadPolicy, 'enterprise', overallUtilization,
    );

    // Whatever enterprise consumed is no longer available to the other tiers;
    // a model can never offer more than its own capacity.
    for (const slot of fleet) {
      const entUsedForModel = enterpriseFleetState.used.get(slot.modelId) ?? 0;
      fleetState.remaining.set(slot.modelId, Math.max(0, slot.throughputCapacity - entUsedForModel));
      fleetState.used.set(slot.modelId, entUsedForModel);
    }
  } else {
    // Nothing was taken by enterprise, so the main pool keeps full capacity.
    tierResults['enterprise'] = makeIdleTierMetrics();
  }

  for (const tier of nonEnterpriseTiers) {
    const rawDemand = demandByTier[tier] ?? 0;
    const effectiveDemand = researchUnlocks.servingRoutingUnlocked
      ? Math.min(rawDemand, overloadPolicy.rateLimitPerCustomer[tier] * 100)
      : rawDemand;
    tierResults[tier] = serveFromFleet(
      effectiveDemand, sortedFleet, fleetState, overloadPolicy, tier, overallUtilization,
    );
  }
  for (const tier of effectivePriorityOrder) {
    if (!(tier in tierResults)) {
      tierResults[tier] = makeIdleTierMetrics();
    }
  }

  // Batch work only consumes capacity left idle by the interactive tiers.
  let batchTokensServed = 0;
  let batchRevenue = 0;
  const updatedBatchApi = { ...batchApi, servedLastTick: 0, revenue: 0 };
  if (overloadPolicy.batchApiEnabled && researchUnlocks.batchApiUnlocked) {
    let idleCapacity = 0;
    for (const slot of fleet) {
      idleCapacity += fleetState.remaining.get(slot.modelId) ?? 0;
    }
    const pendingBatch = Math.min(batchApi.pendingQueue + batchApi.totalBatchDemand, BATCH_API_MAX_PENDING);
    batchTokensServed = Math.min(pendingBatch, idleCapacity);
    const baseTokenPrice = 3.0;
    batchRevenue = (batchTokensServed / 1_000_000) * baseTokenPrice * (1 - overloadPolicy.batchApiDiscount);
    updatedBatchApi.pendingQueue = Math.max(0, pendingBatch - batchTokensServed);
    updatedBatchApi.servedLastTick = batchTokensServed;
    updatedBatchApi.revenue = batchRevenue;
  }

  // Auto-scaling converts a fraction of rejected tokens into served tokens.
  // This must happen before aggregation so the totals match the tier metrics.
  const autoScaleBoost = researchUnlocks.autoScalingBonus;
  if (autoScaleBoost > 0) {
    for (const tier of Object.keys(tierResults) as TrafficPriority[]) {
      const metrics = tierResults[tier];
      if (metrics.rejectedTokens > 0) {
        const recovered = Math.min(metrics.rejectedTokens, metrics.rejectedTokens * autoScaleBoost);
        tierResults[tier] = {
          ...metrics,
          servedTokens: metrics.servedTokens + recovered,
          rejectedTokens: metrics.rejectedTokens - recovered,
        };
      }
    }
  }

  let totalServed = 0;
  let totalQueued = 0;
  let totalRejected = 0;
  let totalDegraded = 0;
  let qualitySum = 0;
  for (const tier of effectivePriorityOrder) {
    const t = tierResults[tier];
    if (!t) continue;
    totalServed += t.servedTokens;
    totalQueued += t.queuedTokens;
    totalRejected += t.rejectedTokens;
    totalDegraded += t.degradedTokens;
    qualitySum += t.avgQualityDelivered * t.servedTokens;
  }
  const effectiveQuality = totalServed > 0 ? qualitySum / totalServed : modelQuality;
  const queuedFraction = totalDemand > 0 ? totalQueued / totalDemand : 0;
  const avgLatencyMs = BASE_LATENCY_MS + queuedFraction * 100 * QUEUE_LATENCY_MS_PER_PERCENT;

  // Rewrite the pooled utilization entries in place to avoid per-run allocation.
  cachedUtilization.length = fleet.length;
  for (let i = 0; i < fleet.length; i++) {
    const slot = fleet[i];
    const used = fleetState.used.get(slot.modelId) ?? 0;
    const utilization = slot.throughputCapacity > 0 ? Math.min(1, used / slot.throughputCapacity) : 0;
    const existing = cachedUtilization[i];
    if (existing) {
      existing.modelId = slot.modelId;
      existing.modelName = slot.modelName;
      existing.quantization = slot.quantization;
      existing.qualityScore = slot.qualityScore;
      existing.throughputCapacity = slot.throughputCapacity;
      existing.throughputUsed = used;
      existing.utilization = utilization;
    } else {
      cachedUtilization[i] = {
        modelId: slot.modelId,
        modelName: slot.modelName,
        quantization: slot.quantization,
        qualityScore: slot.qualityScore,
        throughputCapacity: slot.throughputCapacity,
        throughputUsed: used,
        utilization,
      };
    }
  }

  return {
    servingMetrics: {
      tierMetrics: tierResults,
      totalServed,
      totalQueued,
      totalRejected,
      totalDegraded,
      effectiveQuality,
      avgLatencyMs,
      modelUtilization: cachedUtilization,
      batchApiTokensServed: batchTokensServed,
      batchApiRevenue: batchRevenue,
    },
    batchApi: updatedBatchApi,
    batchRevenue,
  };
}
/**
 * Converts one tier's serving metrics into a satisfaction delta.
 *
 * Three penalties are summed and negated: rejected share of demand, queued
 * share of demand, and degraded share of served tokens weighted by how far
 * the delivered quality fell below 1. Each is scaled by its penalty constant
 * and a factor of 10.
 *
 * @param metrics Serving metrics for a single tier.
 * @returns 0 when the tier had no demand, otherwise a value that is zero or
 *   negative for well-formed metrics.
 */
export function computeSatisfactionImpact(
  metrics: TierServingMetrics,
): number {
  const {
    demandTokens,
    servedTokens,
    queuedTokens,
    rejectedTokens,
    degradedTokens,
    avgQualityDelivered,
  } = metrics;

  if (demandTokens <= 0) return 0;

  const shareRejected = rejectedTokens / demandTokens;
  const shareQueued = queuedTokens / demandTokens;
  // Degradation is measured against what was actually served.
  let shareDegraded = 0;
  if (servedTokens > 0) {
    shareDegraded = degradedTokens / servedTokens;
  }

  const penaltyTotal =
    shareRejected * REJECTION_SATISFACTION_PENALTY * 10 +
    shareQueued * QUEUE_SATISFACTION_PENALTY * 10 +
    shareDegraded * (1 - avgQualityDelivered) * DEGRADATION_SATISFACTION_PENALTY * 10;

  return -penaltyTotal;
}