Cache serving pipeline fleet to eliminate per-tick rebuilds and reduce GC pressure

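The fleet template is now rebuilt only when deploymentVersion changes (~68 times per 28,800-tick run instead of every tick). The pipeline reuses module-level Maps, arrays, and utilization objects instead of allocating new ones each tick, replaces 4x Object.values().reduce() with single-pass aggregation, and sorts the fleet in place. Because the returned fleet and modelUtilization arrays are now shared module-level objects that are overwritten (and, for the fleet, re-sorted) on the next call, callers must not hold on to them across ticks.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>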
2026-04-26 19:51:13 -04:00
parent bbb69a315c
commit 57a81be769
7 changed files with 190 additions and 99 deletions
@@ -6,7 +6,7 @@ import type {
  ModelUtilizationEntry,
  BatchApiState,
 } from '@ai-tycoon/shared';
-import type { BaseModel, ModelVariant, ModelFamily, ModelsState, SizeTier } from '@ai-tycoon/shared';
+import type { BaseModel, ModelsState, SizeTier } from '@ai-tycoon/shared';
 import {
  MODEL_SIZE_THROUGHPUT_SCALER,
  MOE_SPEED_MULTIPLIER,
@@ -62,73 +62,133 @@ export interface ServingPipelineResult {
  batchRevenue: number;
 }

+interface CachedSlot {
+  modelId: string;
+  modelName: string;
+  sizeTier: SizeTier;
+  isVariant: boolean;
+  quantization: string | null;
+  qualityScore: number;
+  speedMultiplier: number;
+  throughputMultiplier: number;
+  isMoE: boolean;
+}
+
+let cachedDeploymentVersion = -1;
+let cachedSlots: CachedSlot[] = [];
+const fleetOutput: ModelServingSlot[] = [];
+
+const mainRemaining = new Map<string, number>();
+const mainUsed = new Map<string, number>();
+const entRemaining = new Map<string, number>();
+const entUsed = new Map<string, number>();
+
+let cachedUtilization: ModelUtilizationEntry[] = [];
+
+export function resetFleetCache(): void {
+  cachedDeploymentVersion = -1;
+  cachedSlots.length = 0;
+  fleetOutput.length = 0;
+  mainRemaining.clear();
+  mainUsed.clear();
+  entRemaining.clear();
+  entUsed.clear();
+  cachedUtilization.length = 0;
+}
+
 function buildModelFleet(
  modelsState: ModelsState,
  effectiveInferenceFlops: number,
 ): ModelServingSlot[] {
-  const slots: ModelServingSlot[] = [];
+  const version = modelsState.deploymentVersion;

-  const deployedBases: BaseModel[] = [];
-  const baseModelById = new Map<string, BaseModel>();
-  for (const m of modelsState.baseModels) {
-    if (m.isDeployed) deployedBases.push(m);
-    baseModelById.set(m.id, m);
-  }
+  if (version !== cachedDeploymentVersion) {
+    cachedSlots.length = 0;

-  const deployedVariants: { variant: ModelVariant; baseModel: BaseModel }[] = [];
-  for (const family of modelsState.families) {
-    for (const variant of family.variants) {
-      if (!variant.isDeployed) continue;
-      const base = baseModelById.get(variant.baseModelId);
-      if (base) deployedVariants.push({ variant, baseModel: base });
+    const baseModelById = new Map<string, BaseModel>();
+    for (const m of modelsState.baseModels) {
+      baseModelById.set(m.id, m);
+      if (!m.isDeployed) continue;
+      const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[m.sizeTier] ?? 1.0;
+      const moeFactor = m.architecture.type === 'moe' ? MOE_SPEED_MULTIPLIER : 1.0;
+      cachedSlots.push({
+        modelId: m.id,
+        modelName: m.name,
+        sizeTier: m.sizeTier,
+        isVariant: false,
+        quantization: null,
+        qualityScore: m.rawCapability / 100,
+        speedMultiplier: moeFactor,
+        throughputMultiplier: FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor,
+        isMoE: m.architecture.type === 'moe',
+      });
    }
+
+    for (const family of modelsState.families) {
+      for (const variant of family.variants) {
+        if (!variant.isDeployed) continue;
+        const base = baseModelById.get(variant.baseModelId);
+        if (!base) continue;
+        const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[base.sizeTier] ?? 1.0;
+        const moeFactor = variant.architecture.type === 'moe' ? MOE_SPEED_MULTIPLIER : 1.0;
+        const quantConfig = variant.quantization ? QUANTIZATION_CONFIGS[variant.quantization] : null;
+        const quantSpeedFactor = quantConfig?.speedMultiplier ?? 1.0;
+        const qualityRetention = quantConfig?.qualityRetention ?? 1.0;
+        cachedSlots.push({
+          modelId: variant.id,
+          modelName: variant.name,
+          sizeTier: base.sizeTier,
+          isVariant: true,
+          quantization: variant.quantization ?? null,
+          qualityScore: (base.rawCapability / 100) * qualityRetention,
+          speedMultiplier: moeFactor * quantSpeedFactor,
+          throughputMultiplier: FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor * quantSpeedFactor,
+          isMoE: variant.architecture.type === 'moe',
+        });
+      }
+    }
+
+    cachedDeploymentVersion = version;
  }

-  const totalDeployed = deployedBases.length + deployedVariants.length;
-  if (totalDeployed === 0 || effectiveInferenceFlops <= 0) return slots;
+  const totalDeployed = cachedSlots.length;
+  if (totalDeployed === 0 || effectiveInferenceFlops <= 0) {
+    fleetOutput.length = 0;
+    return fleetOutput;
+  }

  const flopsPerModel = effectiveInferenceFlops / totalDeployed;

-  for (const model of deployedBases) {
-    const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[model.sizeTier] ?? 1.0;
-    const moeFactor = model.architecture.type === 'moe' ? MOE_SPEED_MULTIPLIER : 1.0;
-    const throughput = flopsPerModel * FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor;
-
-    slots.push({
-      modelId: model.id,
-      modelName: model.name,
-      sizeTier: model.sizeTier,
-      isVariant: false,
-      quantization: null,
-      qualityScore: model.rawCapability / 100,
-      speedMultiplier: moeFactor,
-      throughputCapacity: throughput,
-      isMoE: model.architecture.type === 'moe',
-    });
+  fleetOutput.length = totalDeployed;
+  for (let i = 0; i < totalDeployed; i++) {
+    const cs = cachedSlots[i];
+    const existing = fleetOutput[i];
+    if (existing) {
+      existing.modelId = cs.modelId;
+      existing.modelName = cs.modelName;
+      existing.sizeTier = cs.sizeTier;
+      existing.isVariant = cs.isVariant;
+      existing.quantization = cs.quantization;
+      existing.qualityScore = cs.qualityScore;
+      existing.speedMultiplier = cs.speedMultiplier;
+      existing.throughputCapacity = flopsPerModel * cs.throughputMultiplier;
+      existing.isMoE = cs.isMoE;
+    } else {
+      fleetOutput[i] = {
+        modelId: cs.modelId,
+        modelName: cs.modelName,
+        sizeTier: cs.sizeTier,
+        isVariant: cs.isVariant,
+        quantization: cs.quantization,
+        qualityScore: cs.qualityScore,
+        speedMultiplier: cs.speedMultiplier,
+        throughputCapacity: flopsPerModel * cs.throughputMultiplier,
+        isMoE: cs.isMoE,
+      };
+    }
  }

-  for (const { variant, baseModel } of deployedVariants) {
-    const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[baseModel.sizeTier] ?? 1.0;
-    const moeFactor = variant.architecture.type === 'moe' ? MOE_SPEED_MULTIPLIER : 1.0;
-    const quantConfig = variant.quantization ? QUANTIZATION_CONFIGS[variant.quantization] : null;
-    const quantSpeedFactor = quantConfig?.speedMultiplier ?? 1.0;
-    const qualityRetention = quantConfig?.qualityRetention ?? 1.0;
-    const throughput = flopsPerModel * FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor * quantSpeedFactor;
-
-    slots.push({
-      modelId: variant.id,
-      modelName: variant.name,
-      sizeTier: baseModel.sizeTier,
-      isVariant: true,
-      quantization: variant.quantization ?? null,
-      qualityScore: (baseModel.rawCapability / 100) * qualityRetention,
-      speedMultiplier: moeFactor * quantSpeedFactor,
-      throughputCapacity: throughput,
-      isMoE: variant.architecture.type === 'moe',
-    });
-  }
-
-  return slots;
+  return fleetOutput;
 }

 function sortFleetByStrategy(
@@ -136,24 +196,23 @@ function sortFleetByStrategy(
  strategy: string,
  overallUtilization: number,
 ): ModelServingSlot[] {
-  const sorted = [...fleet];
  switch (strategy) {
    case 'quality-first':
-      sorted.sort((a, b) => b.qualityScore - a.qualityScore);
+      fleet.sort((a, b) => b.qualityScore - a.qualityScore);
      break;
    case 'speed-first':
-      sorted.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
+      fleet.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
      break;
    case 'balanced':
    default:
      if (overallUtilization > 0.8) {
-        sorted.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
+        fleet.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
      } else {
-        sorted.sort((a, b) => b.qualityScore - a.qualityScore);
+        fleet.sort((a, b) => b.qualityScore - a.qualityScore);
      }
      break;
  }
-  return sorted;
+  return fleet;
 }

 interface FleetState {
@@ -250,7 +309,8 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
  const { modelsState, effectiveInferenceFlops, overloadPolicy, demandByTier, batchApi, modelQuality, researchUnlocks } = input;

  const fleet = buildModelFleet(modelsState, effectiveInferenceFlops);
-  const totalFleetCapacity = fleet.reduce((sum, s) => sum + s.throughputCapacity, 0);
+  let totalFleetCapacity = 0;
+  for (const s of fleet) totalFleetCapacity += s.throughputCapacity;

  if (fleet.length === 0 || totalFleetCapacity <= 0) {
    const metrics = makeInitialServingMetrics();
@@ -275,7 +335,7 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
    };
  }

-  const totalDemand = Object.values(demandByTier).reduce((s, v) => s + v, 0);
+  const totalDemand = demandByTier.enterprise + demandByTier['api-paid'] + demandByTier['consumer-paid'] + demandByTier['api-free'] + demandByTier['consumer-free'];
  const overallUtilization = totalFleetCapacity > 0 ? totalDemand / totalFleetCapacity : 0;

  const effectiveStrategy = researchUnlocks.servingRoutingUnlocked
@@ -284,10 +344,13 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe

  const sortedFleet = sortFleetByStrategy(fleet, effectiveStrategy, overallUtilization);

-  const fleetState: FleetState = {
-    remaining: new Map(fleet.map(s => [s.modelId, s.throughputCapacity])),
-    used: new Map(fleet.map(s => [s.modelId, 0])),
-  };
+  mainRemaining.clear();
+  mainUsed.clear();
+  for (const s of fleet) {
+    mainRemaining.set(s.modelId, s.throughputCapacity);
+    mainUsed.set(s.modelId, 0);
+  }
+  const fleetState: FleetState = { remaining: mainRemaining, used: mainUsed };

  const reservedCapacity = totalFleetCapacity * overloadPolicy.enterpriseReservation;
  const enterpriseDemand = demandByTier['enterprise'] ?? 0;
@@ -310,10 +373,13 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
  const nonEnterpriseTiers = effectivePriorityOrder.filter(t => t !== 'enterprise');

  if (enterpriseDemand > 0) {
-    const enterpriseFleetState: FleetState = {
-      remaining: new Map(fleet.map(s => [s.modelId, s.throughputCapacity])),
-      used: new Map(fleet.map(s => [s.modelId, 0])),
-    };
+    entRemaining.clear();
+    entUsed.clear();
+    for (const s of fleet) {
+      entRemaining.set(s.modelId, s.throughputCapacity);
+      entUsed.set(s.modelId, 0);
+    }
+    const enterpriseFleetState: FleetState = { remaining: entRemaining, used: entUsed };

    const reserveLimit = reservedCapacity > 0 ? reservedCapacity : totalFleetCapacity;
    let budgetLeft = reserveLimit;
@@ -334,10 +400,10 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
    );

    for (const slot of fleet) {
-      const entUsed = enterpriseFleetState.used.get(slot.modelId) ?? 0;
-      const mainRemaining = fleetState.remaining.get(slot.modelId) ?? 0;
-      fleetState.remaining.set(slot.modelId, Math.max(0, mainRemaining - entUsed + (reservedCapacity > 0 ? reservedCapacity / fleet.length : 0)));
-      fleetState.used.set(slot.modelId, entUsed);
+      const entUsedForModel = enterpriseFleetState.used.get(slot.modelId) ?? 0;
+      const mainRemainingForModel = fleetState.remaining.get(slot.modelId) ?? 0;
+      fleetState.remaining.set(slot.modelId, Math.max(0, mainRemainingForModel - entUsedForModel + (reservedCapacity > 0 ? reservedCapacity / fleet.length : 0)));
+      fleetState.used.set(slot.modelId, entUsedForModel);
    }
  } else {
    tierResults['enterprise'] = { demandTokens: 0, servedTokens: 0, queuedTokens: 0, rejectedTokens: 0, degradedTokens: 0, avgQualityDelivered: 1 };
@@ -390,34 +456,50 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
    updatedBatchApi.revenue = batchRevenue;
  }

-  const totalServed = Object.values(tierResults).reduce((s, t) => s + t.servedTokens, 0);
-  const totalQueued = Object.values(tierResults).reduce((s, t) => s + t.queuedTokens, 0);
-  const totalRejected = Object.values(tierResults).reduce((s, t) => s + t.rejectedTokens, 0);
-  const totalDegraded = Object.values(tierResults).reduce((s, t) => s + t.degradedTokens, 0);
-
-  let effectiveQuality = modelQuality;
-  if (totalServed > 0) {
-    let qualitySum = 0;
-    for (const t of Object.values(tierResults)) {
-      qualitySum += t.avgQualityDelivered * t.servedTokens;
-    }
-    effectiveQuality = qualitySum / totalServed;
+  let totalServed = 0;
+  let totalQueued = 0;
+  let totalRejected = 0;
+  let totalDegraded = 0;
+  let qualitySum = 0;
+  for (const tier of effectivePriorityOrder) {
+    const t = tierResults[tier];
+    if (!t) continue;
+    totalServed += t.servedTokens;
+    totalQueued += t.queuedTokens;
+    totalRejected += t.rejectedTokens;
+    totalDegraded += t.degradedTokens;
+    qualitySum += t.avgQualityDelivered * t.servedTokens;
  }
+  const effectiveQuality = totalServed > 0 ? qualitySum / totalServed : modelQuality;

  const queuedFraction = totalDemand > 0 ? totalQueued / totalDemand : 0;
  const avgLatencyMs = BASE_LATENCY_MS + queuedFraction * 100 * QUEUE_LATENCY_MS_PER_PERCENT;

-  const modelUtilization: ModelUtilizationEntry[] = fleet.map(slot => ({
-    modelId: slot.modelId,
-    modelName: slot.modelName,
-    quantization: slot.quantization,
-    qualityScore: slot.qualityScore,
-    throughputCapacity: slot.throughputCapacity,
-    throughputUsed: fleetState.used.get(slot.modelId) ?? 0,
-    utilization: slot.throughputCapacity > 0
-      ? Math.min(1, (fleetState.used.get(slot.modelId) ?? 0) / slot.throughputCapacity)
-      : 0,
-  }));
+  cachedUtilization.length = fleet.length;
+  for (let i = 0; i < fleet.length; i++) {
+    const slot = fleet[i];
+    const used = fleetState.used.get(slot.modelId) ?? 0;
+    const existing = cachedUtilization[i];
+    if (existing) {
+      existing.modelId = slot.modelId;
+      existing.modelName = slot.modelName;
+      existing.quantization = slot.quantization;
+      existing.qualityScore = slot.qualityScore;
+      existing.throughputCapacity = slot.throughputCapacity;
+      existing.throughputUsed = used;
+      existing.utilization = slot.throughputCapacity > 0 ? Math.min(1, used / slot.throughputCapacity) : 0;
+    } else {
+      cachedUtilization[i] = {
+        modelId: slot.modelId,
+        modelName: slot.modelName,
+        quantization: slot.quantization,
+        qualityScore: slot.qualityScore,
+        throughputCapacity: slot.throughputCapacity,
+        throughputUsed: used,
+        utilization: slot.throughputCapacity > 0 ? Math.min(1, used / slot.throughputCapacity) : 0,
+      };
+    }
+  }

  const autoScaleBoost = researchUnlocks.autoScalingBonus;
  if (autoScaleBoost > 0) {
@@ -443,7 +525,7 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
      totalDegraded,
      effectiveQuality,
      avgLatencyMs,
-      modelUtilization,
+      modelUtilization: cachedUtilization,
      batchApiTokensServed: batchTokensServed,
      batchApiRevenue: batchRevenue,
    },