Cache serving pipeline fleet to eliminate per-tick rebuilds and reduce GC pressure
The fleet template is now rebuilt only when deploymentVersion changes (~68 times per 28,800-tick run instead of every tick). The pipeline reuses module-level Maps, arrays, and utilization objects instead of allocating new ones each tick, replaces the 4x Object.values().reduce() calls with single-pass aggregation, and sorts the fleet in place.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,6 +3,7 @@ export { processTick, setAchievementDefinitions } from './tick';
|
||||
export type { TickNotification } from './tick';
|
||||
export { getAvailableResearch, getResearchNode } from './systems/researchSystem';
|
||||
export { getResearchBonuses, resetResearchBonusCache } from './systems/researchBonuses';
|
||||
export { resetFleetCache } from './systems/market/servingPipeline';
|
||||
export type { ResearchBonuses } from './systems/researchBonuses';
|
||||
export { emptyDCNetworkSummary, emptyCampusNetworkSummary, emptyClusterNetworkSummary } from './systems/infrastructureSystem';
|
||||
export { onModelDeployed } from './systems/market/obsolescenceSystem';
|
||||
|
||||
@@ -6,7 +6,7 @@ import type {
|
||||
ModelUtilizationEntry,
|
||||
BatchApiState,
|
||||
} from '@ai-tycoon/shared';
|
||||
import type { BaseModel, ModelVariant, ModelFamily, ModelsState, SizeTier } from '@ai-tycoon/shared';
|
||||
import type { BaseModel, ModelsState, SizeTier } from '@ai-tycoon/shared';
|
||||
import {
|
||||
MODEL_SIZE_THROUGHPUT_SCALER,
|
||||
MOE_SPEED_MULTIPLIER,
|
||||
@@ -62,73 +62,133 @@ export interface ServingPipelineResult {
|
||||
batchRevenue: number;
|
||||
}
|
||||
|
||||
/**
 * Template for one deployed model (base model or variant) that is kept between
 * ticks. It carries the same descriptive fields that are copied onto a
 * `ModelServingSlot`, but stores a `throughputMultiplier` instead of a final
 * `throughputCapacity`, so the capacity can be recomputed from the current
 * per-model FLOPs share without rebuilding the template.
 */
interface CachedSlot {
|
||||
modelId: string;
|
||||
modelName: string;
|
||||
sizeTier: SizeTier;
|
||||
isVariant: boolean;
|
||||
quantization: string | null;
|
||||
qualityScore: number;
|
||||
speedMultiplier: number;
|
||||
// Multiplied by the per-model FLOPs share in buildModelFleet to obtain throughputCapacity.
throughputMultiplier: number;
|
||||
isMoE: boolean;
|
||||
}
|
||||
|
||||
// Module-level state reused across calls so that no new arrays or Maps are
// allocated per call.

// deploymentVersion that `cachedSlots` was last built for; -1 is the value
// resetFleetCache() restores, i.e. "nothing cached yet".
let cachedDeploymentVersion = -1;
|
||||
// Slot templates rebuilt only when the deployment version changes.
let cachedSlots: CachedSlot[] = [];
|
||||
// Array returned by buildModelFleet(); its entries are mutated in place on every call.
// NOTE(review): every call returns this same array instance — confirm no caller keeps
// a reference to it across ticks.
const fleetOutput: ModelServingSlot[] = [];
|
||||
|
||||
// Remaining/used capacity per modelId for the main serving pass; cleared and refilled per call.
const mainRemaining = new Map<string, number>();
|
||||
const mainUsed = new Map<string, number>();
|
||||
// Remaining/used capacity per modelId for the enterprise pass; cleared and refilled
// only when there is enterprise demand.
const entRemaining = new Map<string, number>();
|
||||
const entUsed = new Map<string, number>();
|
||||
|
||||
// Array handed out as `modelUtilization` in the pipeline result; entries are overwritten in place.
// NOTE(review): because the same array and objects are returned each time, a consumer that
// stores the result would see it change on the next call — confirm consumers copy it.
let cachedUtilization: ModelUtilizationEntry[] = [];
|
||||
|
||||
/**
 * Clears every piece of module-level serving-pipeline cache state: the cached
 * deployment version goes back to -1 (forcing a rebuild on the next
 * buildModelFleet call), and the cached slot list, the shared fleet output
 * array, the four capacity Maps, and the shared utilization array are emptied.
 *
 * The arrays are truncated in place rather than replaced, so any reference
 * previously handed out to a caller is emptied as well.
 */
export function resetFleetCache(): void {
|
||||
cachedDeploymentVersion = -1;
|
||||
cachedSlots.length = 0;
|
||||
fleetOutput.length = 0;
|
||||
mainRemaining.clear();
|
||||
mainUsed.clear();
|
||||
entRemaining.clear();
|
||||
entUsed.clear();
|
||||
cachedUtilization.length = 0;
|
||||
}
|
||||
|
||||
/**
 * Builds the list of serving slots for all deployed base models and deployed
 * variants, giving each an equal share of `effectiveInferenceFlops`.
 *
 * NOTE(review): this span is a rendered diff, so lines of the previous
 * implementation (local `slots`, `deployedBases`, `deployedVariants`,
 * `return slots`) appear interleaved with lines of the new cached
 * implementation (`cachedSlots`, `fleetOutput`, `return fleetOutput`), without
 * +/- markers. The description below covers the cached implementation.
 *
 * Cached implementation, as visible here:
 * - When `modelsState.deploymentVersion` differs from `cachedDeploymentVersion`,
 *   `cachedSlots` is rebuilt: one entry per deployed base model, then one per
 *   deployed variant whose `baseModelId` resolves to a known base model.
 * - If there are no cached slots or `effectiveInferenceFlops <= 0`, the shared
 *   `fleetOutput` array is emptied and returned.
 * - Otherwise `fleetOutput` is resized to the slot count and each entry is
 *   refreshed from its cached template, with
 *   `throughputCapacity = flopsPerModel * throughputMultiplier`.
 *
 * NOTE(review): the only invalidation visible here is the deploymentVersion
 * comparison (plus resetFleetCache()); changes to fields such as `rawCapability`
 * or `name` that do not bump the version would not be picked up — confirm that
 * every such change bumps `deploymentVersion`.
 *
 * @param modelsState Models state providing `deploymentVersion`, `baseModels` and `families`.
 * @param effectiveInferenceFlops FLOPs budget split evenly across all deployed models.
 * @returns The module-level `fleetOutput` array (same instance on every call).
 */
function buildModelFleet(
|
||||
modelsState: ModelsState,
|
||||
effectiveInferenceFlops: number,
|
||||
): ModelServingSlot[] {
|
||||
const slots: ModelServingSlot[] = [];
|
||||
const version = modelsState.deploymentVersion;
|
||||
|
||||
const deployedBases: BaseModel[] = [];
|
||||
const baseModelById = new Map<string, BaseModel>();
|
||||
for (const m of modelsState.baseModels) {
|
||||
if (m.isDeployed) deployedBases.push(m);
|
||||
baseModelById.set(m.id, m);
|
||||
}
|
||||
// Rebuild the cached slot templates only when the deployment version has changed.
if (version !== cachedDeploymentVersion) {
|
||||
cachedSlots.length = 0;
|
||||
|
||||
const deployedVariants: { variant: ModelVariant; baseModel: BaseModel }[] = [];
|
||||
for (const family of modelsState.families) {
|
||||
for (const variant of family.variants) {
|
||||
if (!variant.isDeployed) continue;
|
||||
const base = baseModelById.get(variant.baseModelId);
|
||||
if (base) deployedVariants.push({ variant, baseModel: base });
|
||||
const baseModelById = new Map<string, BaseModel>();
|
||||
for (const m of modelsState.baseModels) {
|
||||
baseModelById.set(m.id, m);
|
||||
if (!m.isDeployed) continue;
|
||||
const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[m.sizeTier] ?? 1.0;
|
||||
const moeFactor = m.architecture.type === 'moe' ? MOE_SPEED_MULTIPLIER : 1.0;
|
||||
cachedSlots.push({
|
||||
modelId: m.id,
|
||||
modelName: m.name,
|
||||
sizeTier: m.sizeTier,
|
||||
isVariant: false,
|
||||
quantization: null,
|
||||
qualityScore: m.rawCapability / 100,
|
||||
speedMultiplier: moeFactor,
|
||||
throughputMultiplier: FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor,
|
||||
isMoE: m.architecture.type === 'moe',
|
||||
});
|
||||
}
|
||||
|
||||
for (const family of modelsState.families) {
|
||||
for (const variant of family.variants) {
|
||||
if (!variant.isDeployed) continue;
|
||||
const base = baseModelById.get(variant.baseModelId);
|
||||
if (!base) continue;
|
||||
const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[base.sizeTier] ?? 1.0;
|
||||
const moeFactor = variant.architecture.type === 'moe' ? MOE_SPEED_MULTIPLIER : 1.0;
|
||||
const quantConfig = variant.quantization ? QUANTIZATION_CONFIGS[variant.quantization] : null;
|
||||
const quantSpeedFactor = quantConfig?.speedMultiplier ?? 1.0;
|
||||
const qualityRetention = quantConfig?.qualityRetention ?? 1.0;
|
||||
cachedSlots.push({
|
||||
modelId: variant.id,
|
||||
modelName: variant.name,
|
||||
sizeTier: base.sizeTier,
|
||||
isVariant: true,
|
||||
quantization: variant.quantization ?? null,
|
||||
qualityScore: (base.rawCapability / 100) * qualityRetention,
|
||||
speedMultiplier: moeFactor * quantSpeedFactor,
|
||||
throughputMultiplier: FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor * quantSpeedFactor,
|
||||
isMoE: variant.architecture.type === 'moe',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
cachedDeploymentVersion = version;
|
||||
}
|
||||
|
||||
const totalDeployed = deployedBases.length + deployedVariants.length;
|
||||
if (totalDeployed === 0 || effectiveInferenceFlops <= 0) return slots;
|
||||
const totalDeployed = cachedSlots.length;
|
||||
if (totalDeployed === 0 || effectiveInferenceFlops <= 0) {
|
||||
fleetOutput.length = 0;
|
||||
return fleetOutput;
|
||||
}
|
||||
|
||||
// Every deployed model receives an equal share of the FLOPs budget.
const flopsPerModel = effectiveInferenceFlops / totalDeployed;
|
||||
|
||||
for (const model of deployedBases) {
|
||||
const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[model.sizeTier] ?? 1.0;
|
||||
const moeFactor = model.architecture.type === 'moe' ? MOE_SPEED_MULTIPLIER : 1.0;
|
||||
const throughput = flopsPerModel * FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor;
|
||||
|
||||
slots.push({
|
||||
modelId: model.id,
|
||||
modelName: model.name,
|
||||
sizeTier: model.sizeTier,
|
||||
isVariant: false,
|
||||
quantization: null,
|
||||
qualityScore: model.rawCapability / 100,
|
||||
speedMultiplier: moeFactor,
|
||||
throughputCapacity: throughput,
|
||||
isMoE: model.architecture.type === 'moe',
|
||||
});
|
||||
// Resize the shared output array, then overwrite existing entries in place and
// create objects only for indices that have none yet.
fleetOutput.length = totalDeployed;
|
||||
for (let i = 0; i < totalDeployed; i++) {
|
||||
const cs = cachedSlots[i];
|
||||
const existing = fleetOutput[i];
|
||||
if (existing) {
|
||||
existing.modelId = cs.modelId;
|
||||
existing.modelName = cs.modelName;
|
||||
existing.sizeTier = cs.sizeTier;
|
||||
existing.isVariant = cs.isVariant;
|
||||
existing.quantization = cs.quantization;
|
||||
existing.qualityScore = cs.qualityScore;
|
||||
existing.speedMultiplier = cs.speedMultiplier;
|
||||
existing.throughputCapacity = flopsPerModel * cs.throughputMultiplier;
|
||||
existing.isMoE = cs.isMoE;
|
||||
} else {
|
||||
fleetOutput[i] = {
|
||||
modelId: cs.modelId,
|
||||
modelName: cs.modelName,
|
||||
sizeTier: cs.sizeTier,
|
||||
isVariant: cs.isVariant,
|
||||
quantization: cs.quantization,
|
||||
qualityScore: cs.qualityScore,
|
||||
speedMultiplier: cs.speedMultiplier,
|
||||
throughputCapacity: flopsPerModel * cs.throughputMultiplier,
|
||||
isMoE: cs.isMoE,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
for (const { variant, baseModel } of deployedVariants) {
|
||||
const sizeFactor = MODEL_SIZE_THROUGHPUT_SCALER[baseModel.sizeTier] ?? 1.0;
|
||||
const moeFactor = variant.architecture.type === 'moe' ? MOE_SPEED_MULTIPLIER : 1.0;
|
||||
const quantConfig = variant.quantization ? QUANTIZATION_CONFIGS[variant.quantization] : null;
|
||||
const quantSpeedFactor = quantConfig?.speedMultiplier ?? 1.0;
|
||||
const qualityRetention = quantConfig?.qualityRetention ?? 1.0;
|
||||
const throughput = flopsPerModel * FLOPS_TO_TOKENS_MULTIPLIER * sizeFactor * moeFactor * quantSpeedFactor;
|
||||
|
||||
slots.push({
|
||||
modelId: variant.id,
|
||||
modelName: variant.name,
|
||||
sizeTier: baseModel.sizeTier,
|
||||
isVariant: true,
|
||||
quantization: variant.quantization ?? null,
|
||||
qualityScore: (baseModel.rawCapability / 100) * qualityRetention,
|
||||
speedMultiplier: moeFactor * quantSpeedFactor,
|
||||
throughputCapacity: throughput,
|
||||
isMoE: variant.architecture.type === 'moe',
|
||||
});
|
||||
}
|
||||
|
||||
return slots;
|
||||
return fleetOutput;
|
||||
}
|
||||
|
||||
function sortFleetByStrategy(
|
||||
@@ -136,24 +196,23 @@ function sortFleetByStrategy(
|
||||
strategy: string,
|
||||
overallUtilization: number,
|
||||
): ModelServingSlot[] {
|
||||
const sorted = [...fleet];
|
||||
switch (strategy) {
|
||||
case 'quality-first':
|
||||
sorted.sort((a, b) => b.qualityScore - a.qualityScore);
|
||||
fleet.sort((a, b) => b.qualityScore - a.qualityScore);
|
||||
break;
|
||||
case 'speed-first':
|
||||
sorted.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
|
||||
fleet.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
|
||||
break;
|
||||
case 'balanced':
|
||||
default:
|
||||
if (overallUtilization > 0.8) {
|
||||
sorted.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
|
||||
fleet.sort((a, b) => b.throughputCapacity - a.throughputCapacity);
|
||||
} else {
|
||||
sorted.sort((a, b) => b.qualityScore - a.qualityScore);
|
||||
fleet.sort((a, b) => b.qualityScore - a.qualityScore);
|
||||
}
|
||||
break;
|
||||
}
|
||||
return sorted;
|
||||
return fleet;
|
||||
}
|
||||
|
||||
interface FleetState {
|
||||
@@ -250,7 +309,8 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
|
||||
const { modelsState, effectiveInferenceFlops, overloadPolicy, demandByTier, batchApi, modelQuality, researchUnlocks } = input;
|
||||
|
||||
const fleet = buildModelFleet(modelsState, effectiveInferenceFlops);
|
||||
const totalFleetCapacity = fleet.reduce((sum, s) => sum + s.throughputCapacity, 0);
|
||||
let totalFleetCapacity = 0;
|
||||
for (const s of fleet) totalFleetCapacity += s.throughputCapacity;
|
||||
|
||||
if (fleet.length === 0 || totalFleetCapacity <= 0) {
|
||||
const metrics = makeInitialServingMetrics();
|
||||
@@ -275,7 +335,7 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
|
||||
};
|
||||
}
|
||||
|
||||
const totalDemand = Object.values(demandByTier).reduce((s, v) => s + v, 0);
|
||||
const totalDemand = demandByTier.enterprise + demandByTier['api-paid'] + demandByTier['consumer-paid'] + demandByTier['api-free'] + demandByTier['consumer-free'];
|
||||
const overallUtilization = totalFleetCapacity > 0 ? totalDemand / totalFleetCapacity : 0;
|
||||
|
||||
const effectiveStrategy = researchUnlocks.servingRoutingUnlocked
|
||||
@@ -284,10 +344,13 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
|
||||
|
||||
const sortedFleet = sortFleetByStrategy(fleet, effectiveStrategy, overallUtilization);
|
||||
|
||||
const fleetState: FleetState = {
|
||||
remaining: new Map(fleet.map(s => [s.modelId, s.throughputCapacity])),
|
||||
used: new Map(fleet.map(s => [s.modelId, 0])),
|
||||
};
|
||||
mainRemaining.clear();
|
||||
mainUsed.clear();
|
||||
for (const s of fleet) {
|
||||
mainRemaining.set(s.modelId, s.throughputCapacity);
|
||||
mainUsed.set(s.modelId, 0);
|
||||
}
|
||||
const fleetState: FleetState = { remaining: mainRemaining, used: mainUsed };
|
||||
|
||||
const reservedCapacity = totalFleetCapacity * overloadPolicy.enterpriseReservation;
|
||||
const enterpriseDemand = demandByTier['enterprise'] ?? 0;
|
||||
@@ -310,10 +373,13 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
|
||||
const nonEnterpriseTiers = effectivePriorityOrder.filter(t => t !== 'enterprise');
|
||||
|
||||
if (enterpriseDemand > 0) {
|
||||
const enterpriseFleetState: FleetState = {
|
||||
remaining: new Map(fleet.map(s => [s.modelId, s.throughputCapacity])),
|
||||
used: new Map(fleet.map(s => [s.modelId, 0])),
|
||||
};
|
||||
entRemaining.clear();
|
||||
entUsed.clear();
|
||||
for (const s of fleet) {
|
||||
entRemaining.set(s.modelId, s.throughputCapacity);
|
||||
entUsed.set(s.modelId, 0);
|
||||
}
|
||||
const enterpriseFleetState: FleetState = { remaining: entRemaining, used: entUsed };
|
||||
|
||||
const reserveLimit = reservedCapacity > 0 ? reservedCapacity : totalFleetCapacity;
|
||||
let budgetLeft = reserveLimit;
|
||||
@@ -334,10 +400,10 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
|
||||
);
|
||||
|
||||
for (const slot of fleet) {
|
||||
const entUsed = enterpriseFleetState.used.get(slot.modelId) ?? 0;
|
||||
const mainRemaining = fleetState.remaining.get(slot.modelId) ?? 0;
|
||||
fleetState.remaining.set(slot.modelId, Math.max(0, mainRemaining - entUsed + (reservedCapacity > 0 ? reservedCapacity / fleet.length : 0)));
|
||||
fleetState.used.set(slot.modelId, entUsed);
|
||||
const entUsedForModel = enterpriseFleetState.used.get(slot.modelId) ?? 0;
|
||||
const mainRemainingForModel = fleetState.remaining.get(slot.modelId) ?? 0;
|
||||
fleetState.remaining.set(slot.modelId, Math.max(0, mainRemainingForModel - entUsedForModel + (reservedCapacity > 0 ? reservedCapacity / fleet.length : 0)));
|
||||
fleetState.used.set(slot.modelId, entUsedForModel);
|
||||
}
|
||||
} else {
|
||||
tierResults['enterprise'] = { demandTokens: 0, servedTokens: 0, queuedTokens: 0, rejectedTokens: 0, degradedTokens: 0, avgQualityDelivered: 1 };
|
||||
@@ -390,34 +456,50 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
|
||||
updatedBatchApi.revenue = batchRevenue;
|
||||
}
|
||||
|
||||
const totalServed = Object.values(tierResults).reduce((s, t) => s + t.servedTokens, 0);
|
||||
const totalQueued = Object.values(tierResults).reduce((s, t) => s + t.queuedTokens, 0);
|
||||
const totalRejected = Object.values(tierResults).reduce((s, t) => s + t.rejectedTokens, 0);
|
||||
const totalDegraded = Object.values(tierResults).reduce((s, t) => s + t.degradedTokens, 0);
|
||||
|
||||
let effectiveQuality = modelQuality;
|
||||
if (totalServed > 0) {
|
||||
let qualitySum = 0;
|
||||
for (const t of Object.values(tierResults)) {
|
||||
qualitySum += t.avgQualityDelivered * t.servedTokens;
|
||||
}
|
||||
effectiveQuality = qualitySum / totalServed;
|
||||
let totalServed = 0;
|
||||
let totalQueued = 0;
|
||||
let totalRejected = 0;
|
||||
let totalDegraded = 0;
|
||||
let qualitySum = 0;
|
||||
for (const tier of effectivePriorityOrder) {
|
||||
const t = tierResults[tier];
|
||||
if (!t) continue;
|
||||
totalServed += t.servedTokens;
|
||||
totalQueued += t.queuedTokens;
|
||||
totalRejected += t.rejectedTokens;
|
||||
totalDegraded += t.degradedTokens;
|
||||
qualitySum += t.avgQualityDelivered * t.servedTokens;
|
||||
}
|
||||
const effectiveQuality = totalServed > 0 ? qualitySum / totalServed : modelQuality;
|
||||
|
||||
const queuedFraction = totalDemand > 0 ? totalQueued / totalDemand : 0;
|
||||
const avgLatencyMs = BASE_LATENCY_MS + queuedFraction * 100 * QUEUE_LATENCY_MS_PER_PERCENT;
|
||||
|
||||
const modelUtilization: ModelUtilizationEntry[] = fleet.map(slot => ({
|
||||
modelId: slot.modelId,
|
||||
modelName: slot.modelName,
|
||||
quantization: slot.quantization,
|
||||
qualityScore: slot.qualityScore,
|
||||
throughputCapacity: slot.throughputCapacity,
|
||||
throughputUsed: fleetState.used.get(slot.modelId) ?? 0,
|
||||
utilization: slot.throughputCapacity > 0
|
||||
? Math.min(1, (fleetState.used.get(slot.modelId) ?? 0) / slot.throughputCapacity)
|
||||
: 0,
|
||||
}));
|
||||
cachedUtilization.length = fleet.length;
|
||||
for (let i = 0; i < fleet.length; i++) {
|
||||
const slot = fleet[i];
|
||||
const used = fleetState.used.get(slot.modelId) ?? 0;
|
||||
const existing = cachedUtilization[i];
|
||||
if (existing) {
|
||||
existing.modelId = slot.modelId;
|
||||
existing.modelName = slot.modelName;
|
||||
existing.quantization = slot.quantization;
|
||||
existing.qualityScore = slot.qualityScore;
|
||||
existing.throughputCapacity = slot.throughputCapacity;
|
||||
existing.throughputUsed = used;
|
||||
existing.utilization = slot.throughputCapacity > 0 ? Math.min(1, used / slot.throughputCapacity) : 0;
|
||||
} else {
|
||||
cachedUtilization[i] = {
|
||||
modelId: slot.modelId,
|
||||
modelName: slot.modelName,
|
||||
quantization: slot.quantization,
|
||||
qualityScore: slot.qualityScore,
|
||||
throughputCapacity: slot.throughputCapacity,
|
||||
throughputUsed: used,
|
||||
utilization: slot.throughputCapacity > 0 ? Math.min(1, used / slot.throughputCapacity) : 0,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
const autoScaleBoost = researchUnlocks.autoScalingBonus;
|
||||
if (autoScaleBoost > 0) {
|
||||
@@ -443,7 +525,7 @@ export function processServingPipeline(input: ServingPipelineInput): ServingPipe
|
||||
totalDegraded,
|
||||
effectiveQuality,
|
||||
avgLatencyMs,
|
||||
modelUtilization,
|
||||
modelUtilization: cachedUtilization,
|
||||
batchApiTokensServed: batchTokensServed,
|
||||
batchApiRevenue: batchRevenue,
|
||||
},
|
||||
|
||||
@@ -3,12 +3,14 @@ import { processTick, setAchievementDefinitions } from './tick';
|
||||
import { createTestState, createSeededRNG } from './__test-utils__';
|
||||
import { ACHIEVEMENT_DEFINITIONS } from './data/achievements';
|
||||
import { resetResearchBonusCache } from './systems/researchBonuses';
|
||||
import { resetFleetCache } from './systems/market/servingPipeline';
|
||||
|
||||
const rng = createSeededRNG(42);
|
||||
|
||||
// Per-test setup: install the seeded RNG, clear the module-level research-bonus
// and serving-fleet caches so cached state from one test cannot carry over into
// the next, and register the achievement definitions.
beforeEach(() => {
|
||||
// presumably swaps in the seeded RNG created above — confirm in __test-utils__
rng.install();
|
||||
resetResearchBonusCache();
|
||||
resetFleetCache();
|
||||
setAchievementDefinitions(ACHIEVEMENT_DEFINITIONS);
|
||||
});
|
||||
// Per-test teardown: undo rng.install().
afterEach(() => rng.uninstall());
|
||||
|
||||
Reference in New Issue
Block a user