Cache serving pipeline fleet to eliminate per-tick rebuilds and reduce GC pressure

Fleet template is now rebuilt only when deploymentVersion changes (~68 times per
28,800-tick run instead of every tick). Reuses module-level Maps, arrays, and
utilization objects instead of allocating new ones each tick. Replaces 4x
Object.values().reduce() with single-pass aggregation and sorts fleet in-place.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-26 19:51:13 -04:00
parent bbb69a315c
commit 57a81be769
7 changed files with 190 additions and 99 deletions
@@ -6,7 +6,7 @@ import type {
ModelUtilizationEntry,
BatchApiState,
} from '@ai-tycoon/shared';
import type { BaseModel, ModelVariant, ModelFamily, ModelsState, SizeTier } from '@ai-tycoon/shared';
import type { BaseModel, ModelsState, SizeTier } from '@ai-tycoon/shared';
import {
MODEL_SIZE_THROUGHPUT_SCALER,
MOE_SPEED_MULTIPLIER,
@@ -62,73 +62,133 @@ export interface ServingPipelineResult {
batchRevenue: number;
}
/**
 * Per-model template entry kept between calls to `buildModelFleet`.
 *
 * Holds everything about a serving slot that does not depend on the FLOPs
 * budget of the current call. The per-call `throughputCapacity` of a slot is
 * derived as `flopsPerModel * throughputMultiplier`.
 *
 * Entries are created once when the template is rebuilt and are only read
 * afterwards, so every field is `readonly`.
 */
interface CachedSlot {
  /** Id of the base model or variant this slot serves. */
  readonly modelId: string;
  /** Display name of the base model or variant. */
  readonly modelName: string;
  /** Size tier of the base model (variants use their base model's tier). */
  readonly sizeTier: SizeTier;
  /** `true` when the slot was built from a variant rather than a base model. */
  readonly isVariant: boolean;
  /** Quantization of a variant, or `null` when none applies. */
  readonly quantization: string | null;
  /** `rawCapability / 100`, scaled by the quantization's quality retention for variants. */
  readonly qualityScore: number;
  /** MoE speed factor, multiplied by the quantization speed factor for variants. */
  readonly speedMultiplier: number;
  /** Factor applied to the per-model FLOPs share to obtain `throughputCapacity`. */
  readonly throughputMultiplier: number;
  /** `true` when the model's architecture type is `'moe'`. */
  readonly isMoE: boolean;
}
// Deployment version the slot template was last built for. Starts at -1 and is
// reset to -1 by resetFleetCache().
// NOTE(review): presumably real deployment versions are never -1, so -1 acts as
// a "not built yet" marker — confirm.
let cachedDeploymentVersion = -1;
// Slot template reused between calls. Emptied and refilled in place, never
// reassigned, hence `const`.
const cachedSlots: CachedSlot[] = [];
// Reusable fleet array handed back by buildModelFleet().
const fleetOutput: ModelServingSlot[] = [];
// Reusable per-model remaining/used capacity maps for the main serving pass.
const mainRemaining = new Map<string, number>();
const mainUsed = new Map<string, number>();
// Reusable per-model remaining/used capacity maps for the enterprise pass.
const entRemaining = new Map<string, number>();
const entUsed = new Map<string, number>();
// Reusable utilization entries. Resized and overwritten in place, never
// reassigned, hence `const`.
const cachedUtilization: ModelUtilizationEntry[] = [];
/**
 * Clears all module-level serving-pipeline cache state.
 *
 * The cached deployment version goes back to -1, and the reusable arrays and
 * capacity maps are emptied in place, so the same array and Map instances stay
 * in use afterwards.
 */
export function resetFleetCache(): void {
  cachedDeploymentVersion = -1;

  // Truncate rather than reassign so the existing array instances are kept.
  for (const list of [cachedSlots, fleetOutput, cachedUtilization]) {
    list.length = 0;
  }

  for (const ledger of [mainRemaining, mainUsed, entRemaining, entUsed]) {
    ledger.clear();
  }
}
/**
 * Builds the serving fleet: one slot per deployed base model and one per
 * deployed variant whose base model exists, with `effectiveInferenceFlops`
 * divided evenly across all slots.
 *
 * NOTE(review): this listing appears to interleave two versions of the body
 * (for example `totalDeployed` is declared twice and the function ends with two
 * `return` statements) — presumably removed and added diff lines shown without
 * their +/- markers. Confirm against the real file before editing.
 *
 * In the lines that use `cachedSlots` / `fleetOutput`, the per-model fields are
 * recomputed only when `modelsState.deploymentVersion` differs from
 * `cachedDeploymentVersion`; on every call only `throughputCapacity`
 * (`flopsPerModel * throughputMultiplier`) is recomputed.
 *
 * NOTE(review): this assumes `deploymentVersion` changes whenever a deployed
 * model's size tier, architecture, raw capability or quantization changes —
 * TODO confirm, otherwise the cached values go stale.
 *
 * NOTE(review): the value returned from those lines is the module-level
 * `fleetOutput` array, so every call hands back the same array and overwrites
 * the slot objects from the previous call.
 *
 * @param modelsState - Supplies `deploymentVersion`, `baseModels` and `families`.
 * @param effectiveInferenceFlops - FLOPs budget split evenly across deployed slots.
 * @returns An empty array when nothing is deployed or the budget is `<= 0`,
 *   otherwise one slot per deployed model.
 */
function buildModelFleet(
modelsState: ModelsState,
effectiveInferenceFlops: number,
): ModelServingSlot[] {
const slots: ModelServingSlot[] = [];
const version = modelsState.deploymentVersion;
const deployedBases: BaseModel[] = [];
const baseModelById = new Map<string, BaseModel>();
for (const m of modelsState.baseModels) {
if (m.isDeployed) deployedBases.push(m);
baseModelById.set(m.id, m);
}
// Rebuild the slot template only when the deployment version differs from the cached one.
if (version !== cachedDeploymentVersion) {
cachedSlots.length = 0;
const deployedVariants: { variant: ModelVariant; baseModel: BaseModel }[] = [];
for (const family of modelsState.families) {
for (const variant of family.variants) {
if (!variant.isDeployed) continue;
const base = baseModelById.get(variant.baseModelId);
if (base) deployedVariants.push({ variant, baseModel: base });
const baseModelById = new Map<string, BaseModel>();
for (const m of modelsState.baseModels) {
// Every base model is indexed (deployed or not) so variants can look up their base.
baseModelById.set(m.id, m);
if (!m.isDeployed) continue;
const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[m.sizeTier] ?? 1.0;
const moeFactor = m.architecture.type === 'moe' ? MOE_SPEED_MULTIPLIER : 1.0;
cachedSlots.push({
modelId: m.id,
modelName: m.name,
sizeTier: m.sizeTier,
isVariant: false,
quantization: null,
qualityScore: m.rawCapability / 100,
speedMultiplier: moeFactor,
throughputMultiplier: FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor,
isMoE: m.architecture.type === 'moe',
});
}
for (const family of modelsState.families) {
for (const variant of family.variants) {
if (!variant.isDeployed) continue;
const base = baseModelById.get(variant.baseModelId);
// Variants whose base model is not found are left out of the fleet.
if (!base) continue;
const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[base.sizeTier] ?? 1.0;
const moeFactor = variant.architecture.type === 'moe' ? MOE_SPEED_MULTIPLIER : 1.0;
const quantConfig = variant.quantization ? QUANTIZATION_CONFIGS[variant.quantization] : null;
const quantSpeedFactor = quantConfig?.speedMultiplier ?? 1.0;
const qualityRetention = quantConfig?.qualityRetention ?? 1.0;
cachedSlots.push({
modelId: variant.id,
modelName: variant.name,
sizeTier: base.sizeTier,
isVariant: true,
quantization: variant.quantization ?? null,
qualityScore: (base.rawCapability / 100) * qualityRetention,
speedMultiplier: moeFactor * quantSpeedFactor,
throughputMultiplier: FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor * quantSpeedFactor,
isMoE: variant.architecture.type === 'moe',
});
}
}
// Remember which version the template was built for so later calls with the same version skip the rebuild.
cachedDeploymentVersion = version;
}
const totalDeployed = deployedBases.length + deployedVariants.length;
if (totalDeployed === 0 || effectiveInferenceFlops <= 0) return slots;
const totalDeployed = cachedSlots.length;
if (totalDeployed === 0 || effectiveInferenceFlops <= 0) {
fleetOutput.length = 0;
return fleetOutput;
}
// The FLOPs budget is shared equally by every deployed slot.
const flopsPerModel = effectiveInferenceFlops / totalDeployed;
for (const model of deployedBases) {
const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[model.sizeTier] ?? 1.0;
const moeFactor = model.architecture.type === 'moe' ? MOE_SPEED_MULTIPLIER : 1.0;
const throughput = flopsPerModel * FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor;
slots.push({
modelId: model.id,
modelName: model.name,
sizeTier: model.sizeTier,
isVariant: false,
quantization: null,
qualityScore: model.rawCapability / 100,
speedMultiplier: moeFactor,
throughputCapacity: throughput,
isMoE: model.architecture.type === 'moe',
});
// Resize the reusable output array; slot objects already present are overwritten
// in place and objects are created only for indices that have none yet.
fleetOutput.length = totalDeployed;
for (let i = 0; i < totalDeployed; i++) {
const cs = cachedSlots[i];
const existing = fleetOutput[i];
if (existing) {
existing.modelId = cs.modelId;
existing.modelName = cs.modelName;
existing.sizeTier = cs.sizeTier;
existing.isVariant = cs.isVariant;
existing.quantization = cs.quantization;
existing.qualityScore = cs.qualityScore;
existing.speedMultiplier = cs.speedMultiplier;
existing.throughputCapacity = flopsPerModel * cs.throughputMultiplier;
existing.isMoE = cs.isMoE;
} else {
fleetOutput[i] = {
modelId: cs.modelId,
modelName: cs.modelName,
sizeTier: cs.sizeTier,
isVariant: cs.isVariant,
quantization: cs.quantization,
qualityScore: cs.qualityScore,
speedMultiplier: cs.speedMultiplier,
throughputCapacity: flopsPerModel * cs.throughputMultiplier,
isMoE: cs.isMoE,
};
}
}
for (const { variant, baseModel } of deployedVariants) {
const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[baseModel.sizeTier] ?? 1.0;
const moeFactor = variant.architecture.type === 'moe' ? MOE_SPEED_MULTIPLIER : 1.0;
const quantConfig = variant.quantization ? QUANTIZATION_CONFIGS[variant.quantization] : null;
const quantSpeedFactor = quantConfig?.speedMultiplier ?? 1.0;
const qualityRetention = quantConfig?.qualityRetention ?? 1.0;
const throughput = flopsPerModel * FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor * quantSpeedFactor;
slots.push({
modelId: variant.id,
modelName: variant.name,
sizeTier: baseModel.sizeTier,
isVariant: true,
quantization: variant.quantization ?? null,
qualityScore: (baseModel.rawCapability / 100) * qualityRetention,
speedMultiplier: moeFactor * quantSpeedFactor,
throughputCapacity: throughput,
isMoE: variant.architecture.type === 'moe',
});
}
return slots;
return fleetOutput;
}
/**
 * Orders serving slots according to the routing strategy and returns the
 * ordered array.
 *
 * - `'quality-first'`: descending `qualityScore`.
 * - `'speed-first'`: descending `throughputCapacity`.
 * - `'balanced'` and any other value: descending `throughputCapacity` when
 *   `overallUtilization > 0.8`, otherwise descending `qualityScore`.
 *
 * NOTE(review): part of the signature is hidden behind the hunk header below.
 * The body reads a `fleet` array, presumably the first parameter — confirm.
 *
 * NOTE(review): this listing appears to interleave two versions of the body.
 * The `sorted.sort(...)` lines sort a copy made with `[...fleet]`, while the
 * `fleet.sort(...)` lines sort the argument array in place and return that same
 * array. With the in-place form the caller's array is reordered as a side
 * effect, so the input and the returned value are the same object.
 */
function sortFleetByStrategy(
@@ -136,24 +196,23 @@ function sortFleetByStrategy(
strategy: string,
overallUtilization: number,
): ModelServingSlot[] {
const sorted = [...fleet];
switch (strategy) {
case 'quality-first':
sorted.sort((a, b) => b.qualityScore - a.qualityScore);
fleet.sort((a, b) => b.qualityScore - a.qualityScore);
break;
case 'speed-first':
sorted.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
fleet.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
break;
case 'balanced':
default:
// Above 80% utilization the balanced strategy orders by throughput; otherwise by quality.
if (overallUtilization > 0.8) {
sorted.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
fleet.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
} else {
sorted.sort((a, b) => b.qualityScore - a.qualityScore);
fleet.sort((a, b) => b.qualityScore - a.qualityScore);
}
break;
}
return sorted;
return fleet;
}
interface FleetState {
@@ -250,7 +309,8 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
const { modelsState, effectiveInferenceFlops, overloadPolicy, demandByTier, batchApi, modelQuality, researchUnlocks } = input;
const fleet = buildModelFleet(modelsState, effectiveInferenceFlops);
const totalFleetCapacity = fleet.reduce((sum, s) => sum + s.throughputCapacity, 0);
let totalFleetCapacity = 0;
for (const s of fleet) totalFleetCapacity += s.throughputCapacity;
if (fleet.length === 0 || totalFleetCapacity <= 0) {
const metrics = makeInitialServingMetrics();
@@ -275,7 +335,7 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
};
}
const totalDemand = Object.values(demandByTier).reduce((s, v) => s + v, 0);
const totalDemand = demandByTier.enterprise + demandByTier['api-paid'] + demandByTier['consumer-paid'] + demandByTier['api-free'] + demandByTier['consumer-free'];
const overallUtilization = totalFleetCapacity > 0 ? totalDemand / totalFleetCapacity : 0;
const effectiveStrategy = researchUnlocks.servingRoutingUnlocked
@@ -284,10 +344,13 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
const sortedFleet = sortFleetByStrategy(fleet, effectiveStrategy, overallUtilization);
const fleetState: FleetState = {
remaining: new Map(fleet.map(s => [s.modelId, s.throughputCapacity])),
used: new Map(fleet.map(s => [s.modelId, 0])),
};
mainRemaining.clear();
mainUsed.clear();
for (const s of fleet) {
mainRemaining.set(s.modelId, s.throughputCapacity);
mainUsed.set(s.modelId, 0);
}
const fleetState: FleetState = { remaining: mainRemaining, used: mainUsed };
const reservedCapacity = totalFleetCapacity * overloadPolicy.enterpriseReservation;
const enterpriseDemand = demandByTier['enterprise'] ?? 0;
@@ -310,10 +373,13 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
const nonEnterpriseTiers = effectivePriorityOrder.filter(t => t !== 'enterprise');
if (enterpriseDemand > 0) {
const enterpriseFleetState: FleetState = {
remaining: new Map(fleet.map(s => [s.modelId, s.throughputCapacity])),
used: new Map(fleet.map(s => [s.modelId, 0])),
};
entRemaining.clear();
entUsed.clear();
for (const s of fleet) {
entRemaining.set(s.modelId, s.throughputCapacity);
entUsed.set(s.modelId, 0);
}
const enterpriseFleetState: FleetState = { remaining: entRemaining, used: entUsed };
const reserveLimit = reservedCapacity > 0 ? reservedCapacity : totalFleetCapacity;
let budgetLeft = reserveLimit;
@@ -334,10 +400,10 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
);
for (const slot of fleet) {
const entUsed = enterpriseFleetState.used.get(slot.modelId) ?? 0;
const mainRemaining = fleetState.remaining.get(slot.modelId) ?? 0;
fleetState.remaining.set(slot.modelId, Math.max(0, mainRemaining - entUsed + (reservedCapacity > 0 ? reservedCapacity / fleet.length : 0)));
fleetState.used.set(slot.modelId, entUsed);
const entUsedForModel = enterpriseFleetState.used.get(slot.modelId) ?? 0;
const mainRemainingForModel = fleetState.remaining.get(slot.modelId) ?? 0;
fleetState.remaining.set(slot.modelId, Math.max(0, mainRemainingForModel - entUsedForModel + (reservedCapacity > 0 ? reservedCapacity / fleet.length : 0)));
fleetState.used.set(slot.modelId, entUsedForModel);
}
} else {
tierResults['enterprise'] = { demandTokens: 0, servedTokens: 0, queuedTokens: 0, rejectedTokens: 0, degradedTokens: 0, avgQualityDelivered: 1 };
@@ -390,34 +456,50 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
updatedBatchApi.revenue = batchRevenue;
}
const totalServed = Object.values(tierResults).reduce((s, t) => s + t.servedTokens, 0);
const totalQueued = Object.values(tierResults).reduce((s, t) => s + t.queuedTokens, 0);
const totalRejected = Object.values(tierResults).reduce((s, t) => s + t.rejectedTokens, 0);
const totalDegraded = Object.values(tierResults).reduce((s, t) => s + t.degradedTokens, 0);
let effectiveQuality = modelQuality;
if (totalServed > 0) {
let qualitySum = 0;
for (const t of Object.values(tierResults)) {
qualitySum += t.avgQualityDelivered * t.servedTokens;
}
effectiveQuality = qualitySum / totalServed;
let totalServed = 0;
let totalQueued = 0;
let totalRejected = 0;
let totalDegraded = 0;
let qualitySum = 0;
for (const tier of effectivePriorityOrder) {
const t = tierResults[tier];
if (!t) continue;
totalServed += t.servedTokens;
totalQueued += t.queuedTokens;
totalRejected += t.rejectedTokens;
totalDegraded += t.degradedTokens;
qualitySum += t.avgQualityDelivered * t.servedTokens;
}
const effectiveQuality = totalServed > 0 ? qualitySum / totalServed : modelQuality;
const queuedFraction = totalDemand > 0 ? totalQueued / totalDemand : 0;
const avgLatencyMs = BASE_LATENCY_MS + queuedFraction * 100 * QUEUE_LATENCY_MS_PER_PERCENT;
const modelUtilization: ModelUtilizationEntry[] = fleet.map(slot => ({
modelId: slot.modelId,
modelName: slot.modelName,
quantization: slot.quantization,
qualityScore: slot.qualityScore,
throughputCapacity: slot.throughputCapacity,
throughputUsed: fleetState.used.get(slot.modelId) ?? 0,
utilization: slot.throughputCapacity > 0
? Math.min(1, (fleetState.used.get(slot.modelId) ?? 0) / slot.throughputCapacity)
: 0,
}));
cachedUtilization.length = fleet.length;
for (let i = 0; i < fleet.length; i++) {
const slot = fleet[i];
const used = fleetState.used.get(slot.modelId) ?? 0;
const existing = cachedUtilization[i];
if (existing) {
existing.modelId = slot.modelId;
existing.modelName = slot.modelName;
existing.quantization = slot.quantization;
existing.qualityScore = slot.qualityScore;
existing.throughputCapacity = slot.throughputCapacity;
existing.throughputUsed = used;
existing.utilization = slot.throughputCapacity > 0 ? Math.min(1, used / slot.throughputCapacity) : 0;
} else {
cachedUtilization[i] = {
modelId: slot.modelId,
modelName: slot.modelName,
quantization: slot.quantization,
qualityScore: slot.qualityScore,
throughputCapacity: slot.throughputCapacity,
throughputUsed: used,
utilization: slot.throughputCapacity > 0 ? Math.min(1, used / slot.throughputCapacity) : 0,
};
}
}
const autoScaleBoost = researchUnlocks.autoScalingBonus;
if (autoScaleBoost > 0) {
@@ -443,7 +525,7 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
totalDegraded,
effectiveQuality,
avgLatencyMs,
modelUtilization,
modelUtilization: cachedUtilization,
batchApiTokensServed: batchTokensServed,
batchApiRevenue: batchRevenue,
},