From fc1f371c8c984707e31f7cbaca6f1a47e2035b10 Mon Sep 17 00:00:00 2001 From: josh Date: Sat, 25 Apr 2026 02:27:03 -0400 Subject: [PATCH] Overhaul rack system with split FLOPS, VRAM, cooling, interconnect, and multi-vendor SKUs Expand from 10 to 18 rack SKUs across NVIDIA, AMD, and custom ASIC vendors, each with distinct training vs inference FLOPS, VRAM capacity, cooling requirements, and interconnect technology. Adds cooling hierarchy (air/liquid/immersion) that gates rack deployment, VRAM requirements that gate model training by generation, interconnect multipliers for distributed training scaling, and PUE-based energy cost reduction for advanced cooling. Includes save migration from v4 to v5, 6 new research nodes, and UI updates showing split compute stats. Co-Authored-By: Claude Opus 4.6 --- .../src/components/dev/StateInspectionTab.tsx | 12 +- apps/web/src/pages/InfrastructurePage.tsx | 42 +- apps/web/src/pages/ModelsPage.tsx | 17 +- apps/web/src/store/index.ts | 91 +++- packages/game-engine/src/data/techTree.ts | 68 ++- .../game-engine/src/systems/computeSystem.ts | 33 +- .../src/systems/infrastructureSystem.ts | 51 +- .../game-engine/src/systems/modelSystem.ts | 7 +- packages/shared/src/constants/gameBalance.ts | 454 +++++++++++++++--- packages/shared/src/types/compute.ts | 10 + packages/shared/src/types/gameState.ts | 2 +- packages/shared/src/types/infrastructure.ts | 62 ++- 12 files changed, 749 insertions(+), 100 deletions(-) diff --git a/apps/web/src/components/dev/StateInspectionTab.tsx b/apps/web/src/components/dev/StateInspectionTab.tsx index 4901cf6..d9a45e7 100644 --- a/apps/web/src/components/dev/StateInspectionTab.tsx +++ b/apps/web/src/components/dev/StateInspectionTab.tsx @@ -62,9 +62,14 @@ export function StateInspectionTab() {
+ + + + + - - + +
@@ -85,6 +90,9 @@ export function StateInspectionTab() { + + +
diff --git a/apps/web/src/pages/InfrastructurePage.tsx b/apps/web/src/pages/InfrastructurePage.tsx index d390cbb..1ef255d 100644 --- a/apps/web/src/pages/InfrastructurePage.tsx +++ b/apps/web/src/pages/InfrastructurePage.tsx @@ -17,6 +17,7 @@ import { estimateNetworkSlots, maxComputeRacks, SWITCH_TIER_CONFIGS, DC_UPGRADE_COST_FRACTION, DC_UPGRADE_INCREMENT, + skuTotalFlops, } from '@ai-tycoon/shared'; import type { DCTier, RackSkuId, LocationId, PipelineStage, Era, @@ -357,7 +358,7 @@ function ClusterFillAllModal({ cluster, money, era, research, onConfirm, onClose }) { const availableSkus = Object.values(RACK_SKU_CONFIGS).filter(s => { if (ERA_ORDER.indexOf(era) < ERA_ORDER.indexOf(s.era)) return false; - if (s.requiredResearch && !research.includes(s.requiredResearch)) return false; + if (s.requiredResearch.length > 0 && !s.requiredResearch.every(r => research.includes(r))) return false; return true; }); @@ -540,7 +541,7 @@ function ClusterDetailView({ clusterId }: { clusterId: string }) {
FLOPS: { formatNumber(campus.dataCenters.reduce((s, d) => { const sku = d.rackSkuId ? RACK_SKU_CONFIGS[d.rackSkuId] : null; - return s + (sku ? d.effectiveComputeRacks * sku.flopsPerRack : 0); + return s + (sku ? d.effectiveComputeRacks * skuTotalFlops(sku) : 0); }, 0)) }
@@ -644,7 +645,7 @@ function FillAllDCsModal({ campus, money, era, research, onConfirm, onClose }: { }) { const availableSkus = Object.values(RACK_SKU_CONFIGS).filter(s => { if (ERA_ORDER.indexOf(era) < ERA_ORDER.indexOf(s.era)) return false; - if (s.requiredResearch && !research.includes(s.requiredResearch)) return false; + if (s.requiredResearch.length > 0 && !s.requiredResearch.every(r => research.includes(r))) return false; return true; }); @@ -754,7 +755,7 @@ function RetrofitCampusModal({ campus, era, research, onConfirm, onClose }: { const targetSkus = Object.values(RACK_SKU_CONFIGS).filter(s => { if (ERA_ORDER.indexOf(era) < ERA_ORDER.indexOf(s.era)) return false; - if (s.requiredResearch && !research.includes(s.requiredResearch)) return false; + if (s.requiredResearch.length > 0 && !s.requiredResearch.every(r => research.includes(r))) return false; return true; }); @@ -810,7 +811,7 @@ function RetrofitCampusModal({ campus, era, research, onConfirm, onClose }: { }`}>
{s.name}
-
{s.flopsPerRack} FLOPS | {s.powerDrawKW} kW | {formatMoney(s.baseCost)}/rack
+
{s.trainingFlops}T / {s.inferenceFlops}I FLOPS | {s.totalVramGB}GB | {s.powerDrawKW} kW | {formatMoney(s.baseCost)}/rack
{isCurrentOnly && Current} {selectedSku === s.id && } @@ -1140,7 +1141,7 @@ function DataCenterDetailView({ clusterId, campusId, datacenterId }: { const availableSkus = Object.values(RACK_SKU_CONFIGS).filter(s => { if (ERA_ORDER.indexOf(era) < ERA_ORDER.indexOf(s.era)) return false; - if (s.requiredResearch && !research.includes(s.requiredResearch)) return false; + if (s.requiredResearch.length > 0 && !s.requiredResearch.every(r => research.includes(r))) return false; if (dc.rackSkuId && dc.rackSkuId !== s.id) return false; return true; }); @@ -1168,9 +1169,10 @@ function DataCenterDetailView({ clusterId, campusId, datacenterId }: { {/* Stats Grid */} -
+
- + +
@@ -1240,7 +1242,7 @@ function DataCenterDetailView({ clusterId, campusId, datacenterId }: { setSelectedSku(s.id)} className="accent-accent" />
{s.name}
-
{s.flopsPerRack} FLOPS | {s.powerDrawKW} kW | {formatMoney(s.baseCost)}
+
{s.trainingFlops}T / {s.inferenceFlops}I FLOPS | {s.totalVramGB}GB | {s.powerDrawKW} kW | {formatMoney(s.baseCost)}
))} @@ -1311,14 +1313,14 @@ function DataCenterDetailView({ clusterId, campusId, datacenterId }: { {Object.values(RACK_SKU_CONFIGS).filter(s => { if (s.id === dc.rackSkuId) return false; if (ERA_ORDER.indexOf(era) < ERA_ORDER.indexOf(s.era)) return false; - if (s.requiredResearch && !research.includes(s.requiredResearch)) return false; + if (s.requiredResearch.length > 0 && !s.requiredResearch.every(r => research.includes(r))) return false; return true; }).map(s => ( @@ -1332,6 +1334,24 @@ function DataCenterDetailView({ clusterId, campusId, datacenterId }: { {/* Upgrades Tab */} {activeTab === 'upgrades' && (
+ {/* Cooling & Network Fabric */} +
+
+ +
+
Cooling Type
+
{dc.coolingType}
+
+
+
+ +
+
Network Fabric
+
{dc.networkFabric}
+
+
+
+ {(['cooling', 'redundancy'] as const).map(upgrade => { const level = upgrade === 'cooling' ? dc.coolingLevel : dc.redundancyLevel; const cost = tierConfig.baseCost * DC_UPGRADE_COST_FRACTION; diff --git a/apps/web/src/pages/ModelsPage.tsx b/apps/web/src/pages/ModelsPage.tsx index 767a43b..73d50f6 100644 --- a/apps/web/src/pages/ModelsPage.tsx +++ b/apps/web/src/pages/ModelsPage.tsx @@ -2,7 +2,7 @@ import { useState } from 'react'; import { Brain, Play, Rocket, Globe, SlidersHorizontal, ChevronDown, ChevronUp } from 'lucide-react'; import { TutorialHint } from '@/components/game/TutorialHint'; import { useGameStore } from '@/store'; -import { formatNumber, formatPercent, formatDuration } from '@ai-tycoon/shared'; +import { formatNumber, formatPercent, formatDuration, VRAM_REQUIREMENTS_BY_GENERATION } from '@ai-tycoon/shared'; import type { TuningPreset } from '@ai-tycoon/shared'; export function ModelsPage() { @@ -10,6 +10,7 @@ export function ModelsPage() { const activeTraining = useGameStore((s) => s.models.activeTraining); const productLines = useGameStore((s) => s.models.productLines); const totalFlops = useGameStore((s) => s.compute.totalFlops); + const totalVramGB = useGameStore((s) => s.compute.totalVramGB); const trainingAlloc = useGameStore((s) => s.compute.trainingAllocation); const totalData = useGameStore((s) => s.data.totalTrainingTokens); const startTraining = useGameStore((s) => s.startTraining); @@ -89,6 +90,14 @@ export function ModelsPage() {
ETA: {formatDuration(activeTraining.totalTicks - activeTraining.progressTicks)}
+ {(() => { + const reqVram = VRAM_REQUIREMENTS_BY_GENERATION[activeTraining.generation] ?? 0; + return reqVram > 0 && totalVramGB < reqVram ? ( +

+ Training stalled — requires {formatNumber(reqVram)} GB VRAM (have {formatNumber(totalVramGB)} GB). Deploy more GPU racks. +

+ ) : null; + })()}
) : (
@@ -102,11 +111,15 @@ export function ModelsPage() { className="w-full bg-surface-800 border border-surface-600 rounded px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-accent/50" />
-
+
Training Compute
{formatNumber(trainingFlops)} FLOPS
+
+
Available VRAM
+
{formatNumber(totalVramGB)} GB
+
Training Data
{formatNumber(totalData)} tokens
diff --git a/apps/web/src/store/index.ts b/apps/web/src/store/index.ts index 9059efc..1930b30 100644 --- a/apps/web/src/store/index.ts +++ b/apps/web/src/store/index.ts @@ -10,6 +10,7 @@ import type { ActiveResearch, OwnedDataset, LocationId, DeploymentCohort, PipelineStage, CampusRetrofitQueue, + CoolingType, NetworkFabric, } from '@ai-tycoon/shared'; import type { FundingRoundType, OverloadPolicy, TuningPreset, ModelTuning } from '@ai-tycoon/shared'; import { @@ -27,6 +28,7 @@ import { LOCATION_CONFIGS, estimateNetworkSlots, maxComputeRacks, uuid, + COOLING_TYPE_CONFIGS, COOLING_ORDER, NETWORK_FABRIC_CONFIGS, FABRIC_ORDER, } from '@ai-tycoon/shared'; import { emptyDCNetworkSummary, emptyCampusNetworkSummary, emptyClusterNetworkSummary, @@ -93,6 +95,8 @@ interface Actions { startCampusRetrofit: (campusId: string, targetSkuId: RackSkuId, maxConcurrent: number) => void; cancelCampusRetrofit: (campusId: string) => void; upgradeDataCenter: (dataCenterId: string, upgrade: 'cooling' | 'redundancy') => void; + upgradeCoolingType: (dataCenterId: string, targetCooling: CoolingType) => void; + upgradeNetworkFabric: (dataCenterId: string, targetFabric: NetworkFabric) => void; startTraining: (job: Omit) => void; deployModel: (modelId: string) => void; setProductPricing: (productLineId: string, field: string, value: number) => void; @@ -197,6 +201,9 @@ export function computeFillForDC( if (dc.rackSkuId !== null && dc.rackSkuId !== skuId) return { qty: 0, cost: 0 }; const sku = RACK_SKU_CONFIGS[skuId]; + const coolingOk = COOLING_ORDER.indexOf(sku.requiredCooling) <= COOLING_ORDER.indexOf(dc.coolingType); + if (!coolingOk) return { qty: 0, cost: 0 }; + const tierConfig = DC_TIER_CONFIGS[dc.tier]; const maxCompute = maxComputeRacks(tierConfig.rackSlots, dc.tier); const pipelineCount = dc.deploymentCohorts.filter(c => c.stage !== 'decommission').reduce((sum, c) => sum + c.count, 0); @@ -414,6 +421,11 @@ export const useGameStore = create()( retrofitState: null, coolingLevel: 0, redundancyLevel: 0, + coolingType: 'air' as CoolingType, + networkFabric: 'ethernet-100g' as NetworkFabric, + dcTrainingFlops: 0, + dcInferenceFlops: 0, + dcTotalVramGB: 0, }; return { @@ -439,7 +451,10 @@ export const useGameStore = create()( const sku = RACK_SKU_CONFIGS[skuId]; const eraOrder: Era[] = ['startup', 'scaleup', 'bigtech', 'agi']; if (eraOrder.indexOf(s.meta.currentEra) < eraOrder.indexOf(sku.era)) return s; - if (sku.requiredResearch && !s.research.completedResearch.includes(sku.requiredResearch)) return s; + if (sku.requiredResearch.length > 0 && !sku.requiredResearch.every(r => s.research.completedResearch.includes(r))) return s; + + const coolingOk = COOLING_ORDER.indexOf(sku.requiredCooling) <= COOLING_ORDER.indexOf(dc.coolingType); + if (!coolingOk) return s; const tierConfig = DC_TIER_CONFIGS[dc.tier]; const maxCompute = maxComputeRacks(tierConfig.rackSlots, dc.tier); @@ -532,6 +547,11 @@ export const useGameStore = create()( retrofitState: null, coolingLevel: 0, redundancyLevel: 0, + coolingType: 'air' as CoolingType, + networkFabric: 'ethernet-100g' as NetworkFabric, + dcTrainingFlops: 0, + dcInferenceFlops: 0, + dcTotalVramGB: 0, }); } @@ -556,7 +576,10 @@ export const useGameStore = create()( const sku = RACK_SKU_CONFIGS[newSkuId]; const eraOrder: Era[] = ['startup', 'scaleup', 'bigtech', 'agi']; if (eraOrder.indexOf(s.meta.currentEra) < eraOrder.indexOf(sku.era)) return s; - if (sku.requiredResearch && !s.research.completedResearch.includes(sku.requiredResearch)) return s; + if (sku.requiredResearch.length > 0 && !sku.requiredResearch.every(r => s.research.completedResearch.includes(r))) return s; + + const coolingOk = COOLING_ORDER.indexOf(sku.requiredCooling) <= COOLING_ORDER.indexOf(dc.coolingType); + if (!coolingOk) return s; const pipelineCount = dc.deploymentCohorts.filter(c => c.stage !== 'decommission').reduce((sum, c) => sum + c.count, 0); const totalRacksToRetrofit = dc.computeRacksOnline + pipelineCount; @@ -604,12 +627,14 @@ export const useGameStore = create()( const sku = RACK_SKU_CONFIGS[skuId]; const eraOrder: Era[] = ['startup', 'scaleup', 'bigtech', 'agi']; if (eraOrder.indexOf(s.meta.currentEra) < eraOrder.indexOf(sku.era)) return s; - if (sku.requiredResearch && !s.research.completedResearch.includes(sku.requiredResearch)) return s; + if (sku.requiredResearch.length > 0 && !sku.requiredResearch.every(r => s.research.completedResearch.includes(r))) return s; let remainingMoney = s.economy.money; const dcUpdates = new Map(); for (const dc of found.campus.dataCenters) { + const coolingOk = COOLING_ORDER.indexOf(sku.requiredCooling) <= COOLING_ORDER.indexOf(dc.coolingType); + if (!coolingOk) continue; const { qty, cost } = computeFillForDC(dc, skuId, remainingMoney); if (qty <= 0) continue; @@ -649,7 +674,7 @@ export const useGameStore = create()( const sku = RACK_SKU_CONFIGS[skuId]; const eraOrder: Era[] = ['startup', 'scaleup', 'bigtech', 'agi']; if (eraOrder.indexOf(s.meta.currentEra) < eraOrder.indexOf(sku.era)) return s; - if (sku.requiredResearch && !s.research.completedResearch.includes(sku.requiredResearch)) return s; + if (sku.requiredResearch.length > 0 && !sku.requiredResearch.every(r => s.research.completedResearch.includes(r))) return s; let remainingMoney = s.economy.money; const allDcUpdates = new Map(); @@ -657,6 +682,8 @@ export const useGameStore = create()( for (const campus of cluster.campuses) { if (campus.status !== 'operational') continue; for (const dc of campus.dataCenters) { + const coolingOk = COOLING_ORDER.indexOf(sku.requiredCooling) <= COOLING_ORDER.indexOf(dc.coolingType); + if (!coolingOk) continue; const { qty, cost } = computeFillForDC(dc, skuId, remainingMoney); if (qty <= 0) continue; @@ -701,7 +728,7 @@ export const useGameStore = create()( const sku = RACK_SKU_CONFIGS[targetSkuId]; const eraOrder: Era[] = ['startup', 'scaleup', 'bigtech', 'agi']; if (eraOrder.indexOf(s.meta.currentEra) < eraOrder.indexOf(sku.era)) return s; - if (sku.requiredResearch && !s.research.completedResearch.includes(sku.requiredResearch)) return s; + if (sku.requiredResearch.length > 0 && !sku.requiredResearch.every(r => s.research.completedResearch.includes(r))) return s; const eligible: string[] = []; const skipped: string[] = []; @@ -792,6 +819,58 @@ export const useGameStore = create()( }; }), + upgradeCoolingType: (dataCenterId, targetCooling) => set((s) => { + const found = findDC(s.infrastructure, dataCenterId); + if (!found) return s; + const { dc } = found; + if (dc.status !== 'operational') return s; + + const currentIdx = COOLING_ORDER.indexOf(dc.coolingType); + const targetIdx = COOLING_ORDER.indexOf(targetCooling); + if (targetIdx <= currentIdx) return s; + + // Research gates + if (targetCooling === 'liquid' && !s.research.completedResearch.includes('liquid-cooling-tech')) return s; + if (targetCooling === 'immersion' && !s.research.completedResearch.includes('immersion-cooling-tech')) return s; + + const cost = COOLING_TYPE_CONFIGS[targetCooling].upgradeCost[dc.tier]; + if (s.economy.money < cost) return s; + + return { + economy: { ...s.economy, money: s.economy.money - cost }, + infrastructure: updateDCInInfra(s.infrastructure, dataCenterId, (d) => ({ + ...d, + coolingType: targetCooling, + })), + }; + }), + + upgradeNetworkFabric: (dataCenterId, targetFabric) => set((s) => { + const found = findDC(s.infrastructure, dataCenterId); + if (!found) return s; + const { dc } = found; + if (dc.status !== 'operational') return s; + + const currentIdx = FABRIC_ORDER.indexOf(dc.networkFabric); + const targetIdx = FABRIC_ORDER.indexOf(targetFabric); + if (targetIdx <= currentIdx) return s; + + // InfiniBand requires research + if ((targetFabric === 'infiniband-ndr' || targetFabric === 'infiniband-xdr') + && !s.research.completedResearch.includes('infiniband-networking')) return s; + + const cost = NETWORK_FABRIC_CONFIGS[targetFabric].upgradeCost[dc.tier]; + if (s.economy.money < cost) return s; + + return { + economy: { ...s.economy, money: s.economy.money - cost }, + infrastructure: updateDCInInfra(s.infrastructure, dataCenterId, (d) => ({ + ...d, + networkFabric: targetFabric, + })), + }; + }), + // --- Non-infrastructure actions (unchanged) --- startTraining: (job) => set((s) => ({ @@ -979,7 +1058,7 @@ export const useGameStore = create()( notifications: [{ id: uuid(), title: 'Save Reset', - message: 'Your save was reset due to a major infrastructure redesign — Hypercluster scale! Build clusters, campuses, and data centers.', + message: 'Your save was reset due to a major rack system overhaul — 20 SKUs with training/inference specialization, VRAM, cooling tech, interconnects, and AMD/ASIC vendors!', type: 'info' as const, tick: 0, read: false, diff --git a/packages/game-engine/src/data/techTree.ts b/packages/game-engine/src/data/techTree.ts index ca32836..c2eb182 100644 --- a/packages/game-engine/src/data/techTree.ts +++ b/packages/game-engine/src/data/techTree.ts @@ -25,7 +25,7 @@ export const TECH_TREE: ResearchNode[] = [ { id: 'advanced-gpu-arch', name: 'Advanced GPU Architecture', - description: 'Unlocks procurement of NVIDIA A100 rack configurations.', + description: 'Unlocks NVIDIA A100 PCIe and SXM rack configurations.', era: 'startup', category: 'infrastructure', prerequisites: [], @@ -35,7 +35,7 @@ export const TECH_TREE: ResearchNode[] = [ { id: 'next-gen-gpu', name: 'Next-Gen GPU Architecture', - description: 'Unlocks procurement of NVIDIA H100 rack configurations.', + description: 'Unlocks NVIDIA H100 and H200 rack configurations.', era: 'scaleup', category: 'infrastructure', prerequisites: ['advanced-gpu-arch'], @@ -45,7 +45,7 @@ export const TECH_TREE: ResearchNode[] = [ { id: 'frontier-compute', name: 'Frontier Compute', - description: 'Unlocks procurement of NVIDIA B200 rack configurations.', + description: 'Unlocks NVIDIA B100 and B200 rack configurations.', era: 'bigtech', category: 'infrastructure', prerequisites: ['next-gen-gpu'], @@ -55,13 +55,73 @@ export const TECH_TREE: ResearchNode[] = [ { id: 'custom-silicon', name: 'Custom Silicon Design', - description: 'Design and fabricate custom AI ASICs for maximum efficiency.', + description: 'Unlocks custom Training and Inference ASIC configurations.', era: 'agi', category: 'infrastructure', prerequisites: ['frontier-compute'], cost: { researchPoints: 10, compute: 500, ticks: 900 }, effects: [{ type: 'unlock_rack', target: 'custom', value: 1 }], }, + { + id: 'amd-ecosystem', + name: 'AMD ROCm Ecosystem', + description: 'Adapt software stack for AMD GPUs. Unlocks MI250X, MI300X, MI325X racks.', + era: 'scaleup', + category: 'infrastructure', + prerequisites: ['advanced-gpu-arch'], + cost: { researchPoints: 2, compute: 30, ticks: 200 }, + effects: [{ type: 'unlock_rack', target: 'amd', value: 1 }], + }, + { + id: 'inference-specialization', + name: 'Inference Specialization', + description: 'Optimized inference kernels unlock L4, L40S, and custom inference racks.', + era: 'scaleup', + category: 'infrastructure', + prerequisites: ['quantization'], + cost: { researchPoints: 2, compute: 20, ticks: 150 }, + effects: [{ type: 'unlock_rack', target: 'inference', value: 1 }], + }, + { + id: 'rack-scale-compute', + name: 'Rack-Scale Computing', + description: 'Full NVLink domain architecture. Unlocks GB200 NVL72 — 72 GPUs in a single rack.', + era: 'agi', + category: 'infrastructure', + prerequisites: ['frontier-compute'], + cost: { researchPoints: 8, compute: 400, ticks: 720 }, + effects: [{ type: 'unlock_rack', target: 'gb200-nvl72', value: 1 }], + }, + { + id: 'liquid-cooling-tech', + name: 'Liquid Cooling Systems', + description: 'Enables liquid cooling upgrades for data centers. Required for SXM and high-power racks.', + era: 'scaleup', + category: 'infrastructure', + prerequisites: ['advanced-cooling'], + cost: { researchPoints: 2, compute: 25, ticks: 180 }, + effects: [{ type: 'unlock_feature', target: 'liquid-cooling', value: 1 }], + }, + { + id: 'immersion-cooling-tech', + name: 'Immersion Cooling', + description: 'Enables immersion cooling for maximum power density. Required for custom ASICs.', + era: 'bigtech', + category: 'infrastructure', + prerequisites: ['liquid-cooling-tech'], + cost: { researchPoints: 5, compute: 100, ticks: 400 }, + effects: [{ type: 'unlock_feature', target: 'immersion-cooling', value: 1 }], + }, + { + id: 'infiniband-networking', + name: 'InfiniBand Networking', + description: 'High-bandwidth interconnect for distributed training. Unlocks InfiniBand fabric upgrades.', + era: 'scaleup', + category: 'infrastructure', + prerequisites: ['network-engineering-i'], + cost: { researchPoints: 3, compute: 40, ticks: 240 }, + effects: [{ type: 'unlock_feature', target: 'infiniband', value: 1 }], + }, { id: 'dc-engineering-ii', name: 'DC Engineering II', diff --git a/packages/game-engine/src/systems/computeSystem.ts b/packages/game-engine/src/systems/computeSystem.ts index b4b63fe..d56fab8 100644 --- a/packages/game-engine/src/systems/computeSystem.ts +++ b/packages/game-engine/src/systems/computeSystem.ts @@ -3,19 +3,44 @@ import { FLOPS_TO_TOKENS_MULTIPLIER } from '@ai-tycoon/shared'; export interface CapacityResult { totalFlops: number; + totalTrainingFlops: number; + totalInferenceFlops: number; + totalVramGB: number; trainingAllocation: number; inferenceAllocation: number; + effectiveTrainingFlops: number; + effectiveInferenceFlops: number; tokensPerSecondCapacity: number; } export function computeCapacity(state: GameState, infrastructure: InfrastructureState): CapacityResult { - const totalFlops = infrastructure.totalFlops; + const { totalTrainingFlops, totalInferenceFlops, totalVramGB } = infrastructure; const trainingAllocation = state.compute.trainingAllocation; const inferenceAllocation = 1 - trainingAllocation; - const inferenceFlops = totalFlops * inferenceAllocation; - const tokensPerSecondCapacity = inferenceFlops * FLOPS_TO_TOKENS_MULTIPLIER; - return { totalFlops, trainingAllocation, inferenceAllocation, tokensPerSecondCapacity }; + // Training hardware can do inference at ~50% efficiency + // Inference hardware can do training at ~30% efficiency (no NVLink, poor scaling) + const effectiveTrainingFlops = + totalTrainingFlops * trainingAllocation + + totalInferenceFlops * trainingAllocation * 0.3; + + const effectiveInferenceFlops = + totalInferenceFlops * inferenceAllocation + + totalTrainingFlops * inferenceAllocation * 0.5; + + const tokensPerSecondCapacity = effectiveInferenceFlops * FLOPS_TO_TOKENS_MULTIPLIER; + + return { + totalFlops: totalTrainingFlops + totalInferenceFlops, + totalTrainingFlops, + totalInferenceFlops, + totalVramGB, + trainingAllocation, + inferenceAllocation, + effectiveTrainingFlops, + effectiveInferenceFlops, + tokensPerSecondCapacity, + }; } export function finalizeCompute(capacity: CapacityResult, totalTokenDemand: number): ComputeState { diff --git a/packages/game-engine/src/systems/infrastructureSystem.ts b/packages/game-engine/src/systems/infrastructureSystem.ts index 2233f62..f1fa0ed 100644 --- a/packages/game-engine/src/systems/infrastructureSystem.ts +++ b/packages/game-engine/src/systems/infrastructureSystem.ts @@ -2,7 +2,7 @@ import type { GameState, InfrastructureState, Cluster, Campus, DataCenter, DeploymentCohort, PipelineStage, RackSkuId, NetworkSwitch, SwitchTier, DCNetworkSummary, CampusNetworkSummary, ClusterNetworkSummary, - CampusRetrofitQueue, DCTier, + CampusRetrofitQueue, DCTier, IntraNodeInterconnect, NetworkFabric, RackSkuConfig, } from '@ai-tycoon/shared'; import { LOCATION_CONFIGS, @@ -19,6 +19,8 @@ import { T3_COUNT_PER_DC_TIER, SWITCH_REPAIR_COST_FRACTION, NETWORK_DEGRADATION, + COOLING_TYPE_CONFIGS, + NETWORK_FABRIC_CONFIGS, estimateNetworkSlots, } from '@ai-tycoon/shared'; import type { TickNotification } from '../tick'; @@ -435,6 +437,30 @@ function processNetworkTick( return { switchRepairCosts, notifications, dirty }; } +// --- Interconnect Training Multiplier --- + +const INTRA_NODE_BONUS: Record = { + 'pcie-gen4': 0.0, + 'pcie-gen5': 0.05, + 'nvlink-3': 0.15, + 'nvlink-4': 0.25, + 'nvlink-5': 0.35, + 'nvlink-domain': 0.50, + 'infinity-fabric': 0.10, + 'custom-mesh': 0.40, +}; + +function computeInterconnectMultiplier( + sku: RackSkuConfig, + rackCount: number, + fabric: NetworkFabric, +): number { + if (rackCount <= 1) return 1.0; + const intra = INTRA_NODE_BONUS[sku.intraNodeInterconnect] ?? 0; + const fabricBonus = NETWORK_FABRIC_CONFIGS[fabric].trainingScalingBonus; + return Math.min(1.0, 0.6 + intra + fabricBonus); +} + // --- Main Infrastructure Tick --- export function processInfrastructure(state: GameState): InfraTickResult { @@ -463,6 +489,9 @@ export function processInfrastructure(state: GameState): InfraTickResult { notifications.push(...netResult.notifications); let totalFlops = 0; + let totalTrainingFlops = 0; + let totalInferenceFlops = 0; + let totalVramGB = 0; let totalUptime = 0; let totalRackCount = 0; let totalComputeRackCount = 0; @@ -684,14 +713,23 @@ export function processInfrastructure(state: GameState): InfraTickResult { let usedPowerKW = 0; let dcFlops = 0; + let dcTrainingFlops = 0; + let dcInferenceFlops = 0; + let dcTotalVramGB = 0; if (dc.rackSkuId && computeRacksOnline > 0) { const sku = RACK_SKU_CONFIGS[dc.rackSkuId]; usedPowerKW = computeRacksOnline * sku.powerDrawKW; - dcFlops = effectiveComputeRacks * sku.flopsPerRack * networkSummary.effectiveFlopsFraction; + const bwFraction = networkSummary.effectiveFlopsFraction; + const interconnectMult = computeInterconnectMultiplier(sku, effectiveComputeRacks, dc.networkFabric); + dcTrainingFlops = effectiveComputeRacks * sku.trainingFlops * bwFraction * interconnectMult; + dcInferenceFlops = effectiveComputeRacks * sku.inferenceFlops * bwFraction; + dcTotalVramGB = computeRacksOnline * sku.totalVramGB; + dcFlops = dcTrainingFlops + dcInferenceFlops; } + const pue = COOLING_TYPE_CONFIGS[dc.coolingType].pueMultiplier; const energyCostPerTick = (tierConfig.baseEnergyCostPerTick + usedPowerKW * BASE_ENERGY_COST_PER_FLOP) - * location.energyCostMultiplier; + * location.energyCostMultiplier * pue; const maintenanceCostPerTick = totalRacksInDc * BASE_MAINTENANCE_PER_RACK; const currentUptime = totalRacksInDc > 0 ? effectiveComputeRacks / totalRacksInDc : 1; @@ -703,6 +741,9 @@ export function processInfrastructure(state: GameState): InfraTickResult { } totalFlops += dcFlops; + totalTrainingFlops += dcTrainingFlops; + totalInferenceFlops += dcInferenceFlops; + totalVramGB += dcTotalVramGB; totalRackCount += totalRacksInDc + netSlots; totalComputeRackCount += totalRacksInDc; totalDataCenterCount++; @@ -714,6 +755,7 @@ export function processInfrastructure(state: GameState): InfraTickResult { deploymentCohorts: updatedCohorts, networkSummary, effectiveComputeRacks, usedSlots, usedPowerKW, energyCostPerTick, maintenanceCostPerTick, currentUptime, + dcTrainingFlops, dcInferenceFlops, dcTotalVramGB, }; }); @@ -788,6 +830,9 @@ export function processInfrastructure(state: GameState): InfraTickResult { clusters, switchRegistry: registry, totalFlops, + totalTrainingFlops, + totalInferenceFlops, + totalVramGB, totalUptime: dcWithRacks > 0 ? totalUptime / dcWithRacks : 1, totalRackCount, totalComputeRackCount, diff --git a/packages/game-engine/src/systems/modelSystem.ts b/packages/game-engine/src/systems/modelSystem.ts index ec534f0..4352487 100644 --- a/packages/game-engine/src/systems/modelSystem.ts +++ b/packages/game-engine/src/systems/modelSystem.ts @@ -1,5 +1,5 @@ import type { GameState, ModelsState, TrainedModel, ModelCapabilities } from '@ai-tycoon/shared'; -import { uuid } from '@ai-tycoon/shared'; +import { uuid, VRAM_REQUIREMENTS_BY_GENERATION } from '@ai-tycoon/shared'; export interface ModelTickResult { modelsState: ModelsState; @@ -12,6 +12,11 @@ export function processModels(state: GameState): ModelTickResult { return { modelsState: state.models, modelCompleted: null }; } + const requiredVram = VRAM_REQUIREMENTS_BY_GENERATION[active.generation] ?? 0; + if (requiredVram > 0 && state.compute.totalVramGB < requiredVram) { + return { modelsState: state.models, modelCompleted: null }; + } + const researcherBoost = state.talent.departments.research.headcount * state.talent.departments.research.effectiveness; const engineerBoost = state.talent.departments.engineering.headcount * diff --git a/packages/shared/src/constants/gameBalance.ts b/packages/shared/src/constants/gameBalance.ts index 14a7496..6349ac2 100644 --- a/packages/shared/src/constants/gameBalance.ts +++ b/packages/shared/src/constants/gameBalance.ts @@ -1,4 +1,4 @@ -import type { DCTier, DCTierConfig, RackSkuId, RackSkuConfig, SwitchTier, SwitchTierConfig, CampusTierCost, ClusterCostConfig } from '../types/infrastructure'; +import type { DCTier, DCTierConfig, RackSkuId, RackSkuConfig, SwitchTier, SwitchTierConfig, CampusTierCost, ClusterCostConfig, CoolingType, CoolingTypeConfig, NetworkFabric, NetworkFabricConfig } from '../types/infrastructure'; export const TICK_INTERVAL_MS = 1000; export const MAX_OFFLINE_TICKS = 86_400; @@ -209,18 +209,87 @@ export function maxComputeRacks(totalSlots: number, dcTier: DCTier): number { return lo; } +// --- Cooling Type Configs --- + +export const COOLING_TYPE_CONFIGS: Record = { + air: { + name: 'Air Cooling', + upgradeCost: { small: 0, medium: 0, large: 0, mega: 0 }, + upgradeTimeTicks: 0, + pueMultiplier: 1.0, + }, + liquid: { + name: 'Liquid Cooling', + upgradeCost: { small: 200_000, medium: 600_000, large: 2_000_000, mega: 6_000_000 }, + upgradeTimeTicks: 300, + pueMultiplier: 0.85, + }, + immersion: { + name: 'Immersion Cooling', + upgradeCost: { small: 500_000, medium: 1_500_000, large: 5_000_000, mega: 15_000_000 }, + upgradeTimeTicks: 600, + pueMultiplier: 0.70, + }, +}; + +export const COOLING_ORDER: CoolingType[] = ['air', 'liquid', 'immersion']; + +// --- Network Fabric Configs --- + +export const NETWORK_FABRIC_CONFIGS: Record = { + 'ethernet-100g': { + name: '100G Ethernet', + upgradeCost: { small: 0, medium: 0, large: 0, mega: 0 }, + upgradeTimeTicks: 0, + trainingScalingBonus: 0, + }, + 'ethernet-400g': { + name: '400G Ethernet', + upgradeCost: { small: 100_000, medium: 300_000, large: 1_000_000, mega: 3_000_000 }, + upgradeTimeTicks: 200, + trainingScalingBonus: 0.10, + }, + 'infiniband-ndr': { + name: 'InfiniBand NDR', + upgradeCost: { small: 300_000, medium: 900_000, large: 3_000_000, mega: 9_000_000 }, + upgradeTimeTicks: 400, + trainingScalingBonus: 0.25, + }, + 'infiniband-xdr': { + name: 'InfiniBand XDR', + upgradeCost: { small: 800_000, medium: 2_400_000, large: 8_000_000, mega: 24_000_000 }, + upgradeTimeTicks: 600, + trainingScalingBonus: 0.40, + }, +}; + +export const FABRIC_ORDER: NetworkFabric[] = ['ethernet-100g', 'ethernet-400g', 'infiniband-ndr', 'infiniband-xdr']; + // --- Rack SKU Configs --- +export function skuTotalFlops(sku: RackSkuConfig): number { + return sku.trainingFlops + sku.inferenceFlops; +} + export const RACK_SKU_CONFIGS: Record = { + // === STARTUP ERA === 'consumer-x4': { id: 'consumer-x4', name: 'Consumer GPU x4', era: 'startup', + gpuVendor: 'nvidia', + gpuModel: 'RTX Consumer', gpuCount: 4, - flopsPerRack: 4, + trainingFlops: 2, + inferenceFlops: 4, + vramPerGpuGB: 12, + totalVramGB: 48, + requiredCooling: 'air', + intraNodeInterconnect: 'pcie-gen4', + intraNodeBandwidthGBps: 64, powerDrawKW: 0.4, baseCost: 3_200, - requiredResearch: null, + requiredResearch: [], pipelineTimeTicks: { manufacturing: 20, receiving: 10, installation: 15, testing: 15 }, testFailureRate: 0.05, productionFailureRate: 0.0002, @@ -230,11 +299,19 @@ export const RACK_SKU_CONFIGS: Record = { id: 't4-x4', name: 'NVIDIA T4 x4', era: 'startup', + gpuVendor: 'nvidia', + gpuModel: 'T4', gpuCount: 4, - flopsPerRack: 32, - powerDrawKW: 1.2, - baseCost: 20_000, - requiredResearch: null, + trainingFlops: 8, + inferenceFlops: 32, + vramPerGpuGB: 16, + totalVramGB: 64, + requiredCooling: 'air', + intraNodeInterconnect: 'pcie-gen4', + intraNodeBandwidthGBps: 64, + powerDrawKW: 0.5, + baseCost: 12_000, + requiredResearch: [], pipelineTimeTicks: { manufacturing: 30, receiving: 15, installation: 25, testing: 20 }, testFailureRate: 0.07, productionFailureRate: 0.0003, @@ -243,115 +320,370 @@ export const RACK_SKU_CONFIGS: Record = { 't4-x8': { id: 't4-x8', name: 'NVIDIA T4 x8', - era: 'scaleup', + era: 'startup', + gpuVendor: 'nvidia', + gpuModel: 'T4', gpuCount: 8, - flopsPerRack: 64, - powerDrawKW: 2.4, - baseCost: 38_000, - requiredResearch: null, + trainingFlops: 16, + inferenceFlops: 64, + vramPerGpuGB: 16, + totalVramGB: 128, + requiredCooling: 'air', + intraNodeInterconnect: 'pcie-gen4', + intraNodeBandwidthGBps: 64, + powerDrawKW: 1.0, + baseCost: 22_000, + requiredResearch: [], pipelineTimeTicks: { manufacturing: 40, receiving: 20, installation: 30, testing: 30 }, testFailureRate: 0.08, productionFailureRate: 0.0003, repairCostFraction: 0.12, }, - 'a100-x4': { - id: 'a100-x4', - name: 'NVIDIA A100 x4', + + // === SCALEUP ERA === + 'l4-x8': { + id: 'l4-x8', + name: 'NVIDIA L4 x8', era: 'scaleup', + gpuVendor: 'nvidia', + gpuModel: 'L4', + gpuCount: 8, + trainingFlops: 30, + inferenceFlops: 180, + vramPerGpuGB: 24, + totalVramGB: 192, + requiredCooling: 'air', + intraNodeInterconnect: 'pcie-gen5', + intraNodeBandwidthGBps: 128, + powerDrawKW: 0.8, + baseCost: 28_000, + requiredResearch: ['inference-specialization'], + pipelineTimeTicks: { manufacturing: 35, receiving: 15, installation: 25, testing: 25 }, + testFailureRate: 0.07, + productionFailureRate: 0.0002, + repairCostFraction: 0.10, + }, + 'a100-pcie-x4': { + id: 'a100-pcie-x4', + name: 'A100 PCIe x4', + era: 'scaleup', + gpuVendor: 'nvidia', + gpuModel: 'A100 PCIe 80GB', gpuCount: 4, - flopsPerRack: 160, - powerDrawKW: 4.0, - baseCost: 60_000, - requiredResearch: 'advanced-gpu-arch', + trainingFlops: 100, + inferenceFlops: 140, + vramPerGpuGB: 80, + totalVramGB: 320, + requiredCooling: 'air', + intraNodeInterconnect: 'pcie-gen4', + intraNodeBandwidthGBps: 64, + powerDrawKW: 2.0, + baseCost: 55_000, + requiredResearch: ['advanced-gpu-arch'], pipelineTimeTicks: { manufacturing: 60, receiving: 25, installation: 50, testing: 45 }, testFailureRate: 0.10, productionFailureRate: 0.0004, repairCostFraction: 0.15, }, - 'a100-x8': { - id: 'a100-x8', - name: 'NVIDIA A100 x8', + 'a100-sxm-x8': { + id: 'a100-sxm-x8', + name: 'A100 SXM x8', era: 'scaleup', + gpuVendor: 'nvidia', + gpuModel: 'A100 SXM 80GB', gpuCount: 8, - flopsPerRack: 320, - powerDrawKW: 8.0, + trainingFlops: 320, + inferenceFlops: 200, + vramPerGpuGB: 80, + totalVramGB: 640, + requiredCooling: 'liquid', + intraNodeInterconnect: 'nvlink-3', + intraNodeBandwidthGBps: 600, + powerDrawKW: 5.0, baseCost: 115_000, - requiredResearch: 'advanced-gpu-arch', + requiredResearch: ['advanced-gpu-arch'], pipelineTimeTicks: { manufacturing: 70, receiving: 30, installation: 55, testing: 55 }, testFailureRate: 0.12, productionFailureRate: 0.0004, repairCostFraction: 0.15, }, - 'h100-x4': { - id: 'h100-x4', - name: 'NVIDIA H100 x4', - era: 'bigtech', - gpuCount: 4, - flopsPerRack: 480, + 'mi250x-x8': { + id: 'mi250x-x8', + name: 'AMD MI250X x8', + era: 'scaleup', + gpuVendor: 'amd', + gpuModel: 'MI250X', + gpuCount: 8, + trainingFlops: 240, + inferenceFlops: 160, + vramPerGpuGB: 128, + totalVramGB: 1024, + requiredCooling: 'air', + intraNodeInterconnect: 'infinity-fabric', + intraNodeBandwidthGBps: 400, powerDrawKW: 5.6, - baseCost: 140_000, - requiredResearch: 'next-gen-gpu', - pipelineTimeTicks: { manufacturing: 80, receiving: 30, installation: 65, testing: 65 }, + baseCost: 80_000, + requiredResearch: ['amd-ecosystem'], + pipelineTimeTicks: { manufacturing: 75, receiving: 30, installation: 60, testing: 60 }, testFailureRate: 0.15, productionFailureRate: 0.0005, repairCostFraction: 0.18, }, - 'h100-x8': { - id: 'h100-x8', - name: 'NVIDIA H100 x8', + + // === BIG TECH ERA === + 'h100-pcie-x4': { + id: 'h100-pcie-x4', + name: 'H100 PCIe x4', era: 'bigtech', + gpuVendor: 'nvidia', + gpuModel: 'H100 PCIe 80GB', + gpuCount: 4, + trainingFlops: 180, + inferenceFlops: 480, + vramPerGpuGB: 80, + totalVramGB: 320, + requiredCooling: 'air', + intraNodeInterconnect: 'pcie-gen5', + intraNodeBandwidthGBps: 128, + powerDrawKW: 1.8, + baseCost: 130_000, + requiredResearch: ['next-gen-gpu'], + pipelineTimeTicks: { manufacturing: 80, receiving: 30, installation: 65, testing: 65 }, + testFailureRate: 0.12, + productionFailureRate: 0.0004, + repairCostFraction: 0.15, + }, + 'h100-sxm-x8': { + id: 'h100-sxm-x8', + name: 'H100 SXM x8', + era: 'bigtech', + gpuVendor: 'nvidia', + gpuModel: 'H100 SXM 80GB', gpuCount: 8, - flopsPerRack: 960, - powerDrawKW: 11.2, - baseCost: 270_000, - requiredResearch: 'next-gen-gpu', + trainingFlops: 960, + inferenceFlops: 600, + vramPerGpuGB: 80, + totalVramGB: 640, + requiredCooling: 'liquid', + intraNodeInterconnect: 'nvlink-4', + intraNodeBandwidthGBps: 900, + powerDrawKW: 7.0, + baseCost: 280_000, + requiredResearch: ['next-gen-gpu'], pipelineTimeTicks: { manufacturing: 90, receiving: 35, installation: 75, testing: 80 }, testFailureRate: 0.18, productionFailureRate: 0.0005, repairCostFraction: 0.18, }, - 'b200-x4': { - id: 'b200-x4', - name: 'NVIDIA B200 x4', + 'h200-sxm-x8': { + id: 'h200-sxm-x8', + name: 'H200 SXM x8', era: 'bigtech', - gpuCount: 4, - flopsPerRack: 1600, - powerDrawKW: 8.0, + gpuVendor: 'nvidia', + gpuModel: 'H200 SXM 141GB', + gpuCount: 8, + trainingFlops: 1000, + inferenceFlops: 650, + vramPerGpuGB: 141, + totalVramGB: 1128, + requiredCooling: 'liquid', + intraNodeInterconnect: 'nvlink-4', + intraNodeBandwidthGBps: 900, + powerDrawKW: 7.0, + baseCost: 340_000, + requiredResearch: ['next-gen-gpu'], + pipelineTimeTicks: { manufacturing: 95, receiving: 35, installation: 80, testing: 85 }, + testFailureRate: 0.18, + productionFailureRate: 0.0005, + repairCostFraction: 0.18, + }, + 'mi300x-x8': { + id: 'mi300x-x8', + name: 'AMD MI300X x8', + era: 'bigtech', + gpuVendor: 'amd', + gpuModel: 'MI300X 192GB', + gpuCount: 8, + trainingFlops: 700, + inferenceFlops: 450, + vramPerGpuGB: 192, + totalVramGB: 1536, + requiredCooling: 'liquid', + intraNodeInterconnect: 'infinity-fabric', + intraNodeBandwidthGBps: 500, + powerDrawKW: 7.5, baseCost: 200_000, - requiredResearch: 'frontier-compute', + requiredResearch: ['amd-ecosystem'], + pipelineTimeTicks: { manufacturing: 100, receiving: 35, installation: 75, testing: 80 }, + testFailureRate: 0.20, + productionFailureRate: 0.0006, + repairCostFraction: 0.20, + }, + 'l40s-x8': { + id: 'l40s-x8', + name: 'NVIDIA L40S x8', + era: 'bigtech', + gpuVendor: 'nvidia', + gpuModel: 'L40S 48GB', + gpuCount: 8, + trainingFlops: 120, + inferenceFlops: 900, + vramPerGpuGB: 48, + totalVramGB: 384, + requiredCooling: 'air', + intraNodeInterconnect: 'pcie-gen5', + intraNodeBandwidthGBps: 128, + powerDrawKW: 3.5, + baseCost: 160_000, + requiredResearch: ['inference-specialization'], + pipelineTimeTicks: { manufacturing: 70, receiving: 25, installation: 55, testing: 50 }, + testFailureRate: 0.10, + productionFailureRate: 0.0003, + repairCostFraction: 0.12, + }, + 'b100-x8': { + id: 'b100-x8', + name: 'NVIDIA B100 x8', + era: 'bigtech', + gpuVendor: 'nvidia', + gpuModel: 'B100 192GB', + gpuCount: 8, + trainingFlops: 1800, + inferenceFlops: 1100, + vramPerGpuGB: 192, + totalVramGB: 1536, + requiredCooling: 'air', + intraNodeInterconnect: 'nvlink-5', + intraNodeBandwidthGBps: 1800, + powerDrawKW: 7.0, + baseCost: 320_000, + requiredResearch: ['frontier-compute'], pipelineTimeTicks: { manufacturing: 100, receiving: 40, installation: 80, testing: 80 }, testFailureRate: 0.20, productionFailureRate: 0.0006, repairCostFraction: 0.20, }, - 'b200-x8': { - id: 'b200-x8', - name: 'NVIDIA B200 x8', + + // === AGI ERA === + 'b200-sxm-x8': { + id: 'b200-sxm-x8', + name: 'B200 SXM x8', era: 'agi', + gpuVendor: 'nvidia', + gpuModel: 'B200 SXM 192GB', gpuCount: 8, - flopsPerRack: 3200, - powerDrawKW: 16.0, - baseCost: 380_000, - requiredResearch: 'frontier-compute', + trainingFlops: 3200, + inferenceFlops: 1800, + vramPerGpuGB: 192, + totalVramGB: 1536, + requiredCooling: 'liquid', + intraNodeInterconnect: 'nvlink-5', + intraNodeBandwidthGBps: 1800, + powerDrawKW: 10.0, + baseCost: 400_000, + requiredResearch: ['frontier-compute'], pipelineTimeTicks: { manufacturing: 120, receiving: 45, installation: 95, testing: 100 }, testFailureRate: 0.22, productionFailureRate: 0.0006, repairCostFraction: 0.20, }, - 'custom-x8': { - id: 'custom-x8', - name: 'Custom ASIC x8', + 'gb200-nvl72': { + id: 'gb200-nvl72', + name: 'GB200 NVL72', era: 'agi', + gpuVendor: 'nvidia', + gpuModel: 'B200 NVL72', + gpuCount: 72, + trainingFlops: 36_000, + inferenceFlops: 18_000, + vramPerGpuGB: 192, + totalVramGB: 13_824, + requiredCooling: 'liquid', + intraNodeInterconnect: 'nvlink-domain', + intraNodeBandwidthGBps: 14_400, + powerDrawKW: 120.0, + baseCost: 2_500_000, + requiredResearch: ['frontier-compute', 'rack-scale-compute'], + pipelineTimeTicks: { manufacturing: 180, receiving: 60, installation: 120, testing: 120 }, + testFailureRate: 0.28, + productionFailureRate: 0.0008, + repairCostFraction: 0.15, + }, + 'mi325x-x8': { + id: 'mi325x-x8', + name: 'AMD MI325X x8', + era: 'agi', + gpuVendor: 'amd', + gpuModel: 'MI325X 256GB', gpuCount: 8, - flopsPerRack: 6400, + trainingFlops: 2400, + inferenceFlops: 1400, + vramPerGpuGB: 256, + totalVramGB: 2048, + requiredCooling: 'liquid', + intraNodeInterconnect: 'infinity-fabric', + intraNodeBandwidthGBps: 600, + powerDrawKW: 7.5, + baseCost: 280_000, + requiredResearch: ['amd-ecosystem'], + pipelineTimeTicks: { manufacturing: 130, receiving: 40, installation: 90, testing: 95 }, + testFailureRate: 0.22, + productionFailureRate: 0.0006, + repairCostFraction: 0.20, + }, + 'custom-training-x8': { + id: 'custom-training-x8', + name: 'Training ASIC x8', + era: 'agi', + gpuVendor: 'custom', + gpuModel: 'Custom Training ASIC', + gpuCount: 8, + trainingFlops: 8000, + inferenceFlops: 2000, + vramPerGpuGB: 256, + totalVramGB: 2048, + requiredCooling: 'immersion', + intraNodeInterconnect: 'custom-mesh', + intraNodeBandwidthGBps: 3200, powerDrawKW: 20.0, - baseCost: 640_000, - requiredResearch: 'custom-silicon', + baseCost: 700_000, + requiredResearch: ['custom-silicon'], pipelineTimeTicks: { manufacturing: 140, receiving: 50, installation: 100, testing: 110 }, testFailureRate: 0.25, productionFailureRate: 0.0008, repairCostFraction: 0.20, }, + 'custom-inference-x16': { + id: 'custom-inference-x16', + name: 'Inference ASIC x16', + era: 'agi', + gpuVendor: 'custom', + gpuModel: 'Custom Inference ASIC', + gpuCount: 16, + trainingFlops: 800, + inferenceFlops: 12_000, + vramPerGpuGB: 32, + totalVramGB: 512, + requiredCooling: 'air', + intraNodeInterconnect: 'custom-mesh', + intraNodeBandwidthGBps: 1600, + powerDrawKW: 5.0, + baseCost: 500_000, + requiredResearch: ['custom-silicon', 'inference-specialization'], + pipelineTimeTicks: { manufacturing: 130, receiving: 45, installation: 90, testing: 100 }, + testFailureRate: 0.22, + productionFailureRate: 0.0007, + repairCostFraction: 0.18, + }, +}; + +export const VRAM_REQUIREMENTS_BY_GENERATION: Record = { + 1: 48, + 2: 192, + 3: 640, + 4: 1536, + 5: 4096, + 6: 16384, }; // --- Pipeline & Infrastructure Constants --- diff --git a/packages/shared/src/types/compute.ts b/packages/shared/src/types/compute.ts index 1f3c45d..52df024 100644 --- a/packages/shared/src/types/compute.ts +++ b/packages/shared/src/types/compute.ts @@ -1,7 +1,12 @@ export interface ComputeState { totalFlops: number; + totalTrainingFlops: number; + totalInferenceFlops: number; + totalVramGB: number; trainingAllocation: number; inferenceAllocation: number; + effectiveTrainingFlops: number; + effectiveInferenceFlops: number; inferenceUtilization: number; tokensPerSecondCapacity: number; tokensPerSecondDemand: number; @@ -9,8 +14,13 @@ export interface ComputeState { export const INITIAL_COMPUTE: ComputeState = { totalFlops: 0, + totalTrainingFlops: 0, + totalInferenceFlops: 0, + totalVramGB: 0, trainingAllocation: 0.5, inferenceAllocation: 0.5, + effectiveTrainingFlops: 0, + effectiveInferenceFlops: 0, inferenceUtilization: 0, tokensPerSecondCapacity: 0, tokensPerSecondDemand: 0, diff --git a/packages/shared/src/types/gameState.ts b/packages/shared/src/types/gameState.ts index 3c32416..30061d4 100644 --- a/packages/shared/src/types/gameState.ts +++ b/packages/shared/src/types/gameState.ts @@ -58,4 +58,4 @@ export const INITIAL_SETTINGS: GameSettings = { sfxVolume: 0.7, }; -export const SAVE_VERSION = 4; +export const SAVE_VERSION = 5; diff --git a/packages/shared/src/types/infrastructure.ts b/packages/shared/src/types/infrastructure.ts index 8b5bfec..1a0be53 100644 --- a/packages/shared/src/types/infrastructure.ts +++ b/packages/shared/src/types/infrastructure.ts @@ -75,12 +75,17 @@ export interface DataCenter { retrofitState: RetrofitState | null; coolingLevel: number; redundancyLevel: number; + coolingType: CoolingType; + networkFabric: NetworkFabric; effectiveComputeRacks: number; usedSlots: number; usedPowerKW: number; energyCostPerTick: number; maintenanceCostPerTick: number; currentUptime: number; + dcTrainingFlops: number; + dcInferenceFlops: number; + dcTotalVramGB: number; } // --- Network Topology (6-Tier Clos) --- @@ -141,13 +146,46 @@ export interface ClusterNetworkSummary { crossCampusBandwidth: number; } +// --- Cooling, Interconnect & Vendor Types --- + +export type CoolingType = 'air' | 'liquid' | 'immersion'; +export type GpuVendor = 'nvidia' | 'amd' | 'custom'; +export type IntraNodeInterconnect = + | 'pcie-gen4' | 'pcie-gen5' + | 'nvlink-3' | 'nvlink-4' | 'nvlink-5' | 'nvlink-domain' + | 'infinity-fabric' | 'custom-mesh'; +export type NetworkFabric = + | 'ethernet-100g' | 'ethernet-400g' + | 'infiniband-ndr' | 'infiniband-xdr'; + +export interface CoolingTypeConfig { + name: string; + upgradeCost: Record; + upgradeTimeTicks: number; + pueMultiplier: number; +} + +export interface NetworkFabricConfig { + name: string; + upgradeCost: Record; + upgradeTimeTicks: number; + trainingScalingBonus: number; +} + // --- Racks --- export type RackSkuId = + // Startup | 'consumer-x4' | 't4-x4' | 't4-x8' - | 'a100-x4' | 'a100-x8' - | 'h100-x4' | 'h100-x8' - | 'b200-x4' | 'b200-x8' | 'custom-x8'; + // Scaleup + | 'a100-pcie-x4' | 'a100-sxm-x8' | 'mi250x-x8' | 'l4-x8' + // Big Tech + | 'h100-pcie-x4' | 'h100-sxm-x8' | 'h200-sxm-x8' + | 'mi300x-x8' | 'l40s-x8' | 'b100-x8' + // AGI + | 'b200-sxm-x8' | 'gb200-nvl72' + | 'mi325x-x8' + | 'custom-training-x8' | 'custom-inference-x16'; export type PipelineStage = | 'ordered' | 'manufacturing' | 'receiving' @@ -164,11 +202,19 @@ export interface RackSkuConfig { id: RackSkuId; name: string; era: Era; + gpuVendor: GpuVendor; + gpuModel: string; gpuCount: number; - flopsPerRack: number; + trainingFlops: number; + inferenceFlops: number; + vramPerGpuGB: number; + totalVramGB: number; + requiredCooling: CoolingType; + intraNodeInterconnect: IntraNodeInterconnect; + intraNodeBandwidthGBps: number; powerDrawKW: number; baseCost: number; - requiredResearch: string | null; + requiredResearch: string[]; pipelineTimeTicks: PipelineTimings; testFailureRate: number; productionFailureRate: number; @@ -218,6 +264,9 @@ export interface InfrastructureState { clusters: Cluster[]; switchRegistry: Record; totalFlops: number; + totalTrainingFlops: number; + totalInferenceFlops: number; + totalVramGB: number; totalUptime: number; totalRackCount: number; totalComputeRackCount: number; @@ -229,6 +278,9 @@ export const INITIAL_INFRASTRUCTURE: InfrastructureState = { clusters: [], switchRegistry: {}, totalFlops: 0, + totalTrainingFlops: 0, + totalInferenceFlops: 0, + totalVramGB: 0, totalUptime: 1, totalRackCount: 0, totalComputeRackCount: 0,