Overhaul rack system with split FLOPS, VRAM, cooling, interconnect, and multi-vendor SKUs

Expand from 10 to 18 rack SKUs across NVIDIA, AMD, and custom ASIC vendors, each with distinct training vs. inference FLOPS, VRAM capacity, cooling requirements, and interconnect technology. Add a cooling hierarchy (air/liquid/immersion) that gates rack deployment, VRAM requirements that gate model training by generation, interconnect multipliers for distributed-training scaling, and a PUE-based energy cost reduction for advanced cooling. Include a save migration from v4 to v5, 6 new research nodes, and UI updates showing split compute stats.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-25 02:27:03 -04:00
parent 54220fca70
commit fc1f371c8c
12 changed files with 749 additions and 100 deletions
@@ -25,7 +25,7 @@ export const TECH_TREE: ResearchNode[] = [
  {
    id: 'advanced-gpu-arch',
    name: 'Advanced GPU Architecture',
-    description: 'Unlocks procurement of NVIDIA A100 rack configurations.',
+    description: 'Unlocks NVIDIA A100 PCIe and SXM rack configurations.',
    era: 'startup',
    category: 'infrastructure',
    prerequisites: [],
@@ -35,7 +35,7 @@ export const TECH_TREE: ResearchNode[] = [
  {
    id: 'next-gen-gpu',
    name: 'Next-Gen GPU Architecture',
-    description: 'Unlocks procurement of NVIDIA H100 rack configurations.',
+    description: 'Unlocks NVIDIA H100 and H200 rack configurations.',
    era: 'scaleup',
    category: 'infrastructure',
    prerequisites: ['advanced-gpu-arch'],
@@ -45,7 +45,7 @@ export const TECH_TREE: ResearchNode[] = [
  {
    id: 'frontier-compute',
    name: 'Frontier Compute',
-    description: 'Unlocks procurement of NVIDIA B200 rack configurations.',
+    description: 'Unlocks NVIDIA B100 and B200 rack configurations.',
    era: 'bigtech',
    category: 'infrastructure',
    prerequisites: ['next-gen-gpu'],
@@ -55,13 +55,73 @@ export const TECH_TREE: ResearchNode[] = [
  {
    id: 'custom-silicon',
    name: 'Custom Silicon Design',
-    description: 'Design and fabricate custom AI ASICs for maximum efficiency.',
+    description: 'Unlocks custom Training and Inference ASIC configurations.',
    era: 'agi',
    category: 'infrastructure',
    prerequisites: ['frontier-compute'],
    cost: { researchPoints: 10, compute: 500, ticks: 900 },
    effects: [{ type: 'unlock_rack', target: 'custom', value: 1 }],
  },
+  {
+    id: 'amd-ecosystem',
+    name: 'AMD ROCm Ecosystem',
+    description: 'Adapt software stack for AMD GPUs. Unlocks MI250X, MI300X, MI325X racks.',
+    era: 'scaleup',
+    category: 'infrastructure',
+    prerequisites: ['advanced-gpu-arch'],
+    cost: { researchPoints: 2, compute: 30, ticks: 200 },
+    effects: [{ type: 'unlock_rack', target: 'amd', value: 1 }],
+  },
+  {
+    id: 'inference-specialization',
+    name: 'Inference Specialization',
+    description: 'Optimized inference kernels unlock L4, L40S, and custom inference racks.',
+    era: 'scaleup',
+    category: 'infrastructure',
+    prerequisites: ['quantization'],
+    cost: { researchPoints: 2, compute: 20, ticks: 150 },
+    effects: [{ type: 'unlock_rack', target: 'inference', value: 1 }],
+  },
+  {
+    id: 'rack-scale-compute',
+    name: 'Rack-Scale Computing',
+    description: 'Full NVLink domain architecture. Unlocks GB200 NVL72 — 72 GPUs in a single rack.',
+    era: 'agi',
+    category: 'infrastructure',
+    prerequisites: ['frontier-compute'],
+    cost: { researchPoints: 8, compute: 400, ticks: 720 },
+    effects: [{ type: 'unlock_rack', target: 'gb200-nvl72', value: 1 }],
+  },
+  {
+    id: 'liquid-cooling-tech',
+    name: 'Liquid Cooling Systems',
+    description: 'Enables liquid cooling upgrades for data centers. Required for SXM and high-power racks.',
+    era: 'scaleup',
+    category: 'infrastructure',
+    prerequisites: ['advanced-cooling'],
+    cost: { researchPoints: 2, compute: 25, ticks: 180 },
+    effects: [{ type: 'unlock_feature', target: 'liquid-cooling', value: 1 }],
+  },
+  {
+    id: 'immersion-cooling-tech',
+    name: 'Immersion Cooling',
+    description: 'Enables immersion cooling for maximum power density. Required for custom ASICs.',
+    era: 'bigtech',
+    category: 'infrastructure',
+    prerequisites: ['liquid-cooling-tech'],
+    cost: { researchPoints: 5, compute: 100, ticks: 400 },
+    effects: [{ type: 'unlock_feature', target: 'immersion-cooling', value: 1 }],
+  },
+  {
+    id: 'infiniband-networking',
+    name: 'InfiniBand Networking',
+    description: 'High-bandwidth interconnect for distributed training. Unlocks InfiniBand fabric upgrades.',
+    era: 'scaleup',
+    category: 'infrastructure',
+    prerequisites: ['network-engineering-i'],
+    cost: { researchPoints: 3, compute: 40, ticks: 240 },
+    effects: [{ type: 'unlock_feature', target: 'infiniband', value: 1 }],
+  },
  {
    id: 'dc-engineering-ii',
    name: 'DC Engineering II',
@@ -3,19 +3,44 @@ import { FLOPS_TO_TOKENS_MULTIPLIER } from '@ai-tycoon/shared';

 export interface CapacityResult { // Compute-capacity snapshot returned by computeCapacity().
  totalFlops: number; // totalTrainingFlops + totalInferenceFlops
+  totalTrainingFlops: number; // passed through from infrastructure.totalTrainingFlops
+  totalInferenceFlops: number; // passed through from infrastructure.totalInferenceFlops
+  totalVramGB: number; // passed through from infrastructure.totalVramGB
  trainingAllocation: number; // read from state.compute.trainingAllocation (presumably a 0..1 fraction — confirm)
  inferenceAllocation: number; // 1 - trainingAllocation
+  effectiveTrainingFlops: number; // training share of training FLOPS plus 30% of the training share of inference FLOPS
+  effectiveInferenceFlops: number; // inference share of inference FLOPS plus 50% of the inference share of training FLOPS
  tokensPerSecondCapacity: number; // effectiveInferenceFlops * FLOPS_TO_TOKENS_MULTIPLIER
 }

 export function computeCapacity(state: GameState, infrastructure: InfrastructureState): CapacityResult { // Splits infrastructure FLOPS between training and inference using state.compute.trainingAllocation.
-  const totalFlops = infrastructure.totalFlops;
+  const { totalTrainingFlops, totalInferenceFlops, totalVramGB } = infrastructure;
  const trainingAllocation = state.compute.trainingAllocation;
  const inferenceAllocation = 1 - trainingAllocation; // whatever is not allocated to training goes to inference
-  const inferenceFlops = totalFlops * inferenceAllocation;
-  const tokensPerSecondCapacity = inferenceFlops * FLOPS_TO_TOKENS_MULTIPLIER;

-  return { totalFlops, trainingAllocation, inferenceAllocation, tokensPerSecondCapacity };
+  // Cross-use: training-rated FLOPS allocated to inference count at ~50% efficiency.
+  // Inference-rated FLOPS allocated to training count at ~30% efficiency (no NVLink, poor scaling). NOTE(review): processInfrastructure adds each DC's racks to both the training and inference totals, so these cross-use terms may count the same hardware twice — confirm this is intended.
+  const effectiveTrainingFlops =
+    totalTrainingFlops * trainingAllocation +
+    totalInferenceFlops * trainingAllocation * 0.3; // cross-use of inference FLOPS for training at 30%
+
+  const effectiveInferenceFlops =
+    totalInferenceFlops * inferenceAllocation +
+    totalTrainingFlops * inferenceAllocation * 0.5; // cross-use of training FLOPS for inference at 50%
+
+  const tokensPerSecondCapacity = effectiveInferenceFlops * FLOPS_TO_TOKENS_MULTIPLIER; // serving capacity is derived from effective inference FLOPS only
+
+  return {
+    totalFlops: totalTrainingFlops + totalInferenceFlops, // combined figure is now the sum of both ratings
+    totalTrainingFlops,
+    totalInferenceFlops,
+    totalVramGB,
+    trainingAllocation,
+    inferenceAllocation,
+    effectiveTrainingFlops,
+    effectiveInferenceFlops,
+    tokensPerSecondCapacity,
+  };
 }

 export function finalizeCompute(capacity: CapacityResult, totalTokenDemand: number): ComputeState {
@@ -2,7 +2,7 @@ import type {
  GameState, InfrastructureState, Cluster, Campus, DataCenter,
  DeploymentCohort, PipelineStage, RackSkuId, NetworkSwitch,
  SwitchTier, DCNetworkSummary, CampusNetworkSummary, ClusterNetworkSummary,
-  CampusRetrofitQueue, DCTier,
+  CampusRetrofitQueue, DCTier, IntraNodeInterconnect, NetworkFabric, RackSkuConfig,
 } from '@ai-tycoon/shared';
 import {
  LOCATION_CONFIGS,
@@ -19,6 +19,8 @@ import {
  T3_COUNT_PER_DC_TIER,
  SWITCH_REPAIR_COST_FRACTION,
  NETWORK_DEGRADATION,
+  COOLING_TYPE_CONFIGS,
+  NETWORK_FABRIC_CONFIGS,
  estimateNetworkSlots,
 } from '@ai-tycoon/shared';
 import type { TickNotification } from '../tick';
@@ -435,6 +437,30 @@ function processNetworkTick(
  return { switchRepairCosts, notifications, dirty };
 }

+// --- Interconnect Training Multiplier ---
+
+const INTRA_NODE_BONUS: Record<IntraNodeInterconnect, number> = { // additive bonus per intra-node interconnect; computeInterconnectMultiplier adds it to a 0.6 base
+  'pcie-gen4': 0.0, // baseline: no bonus
+  'pcie-gen5': 0.05,
+  'nvlink-3': 0.15,
+  'nvlink-4': 0.25,
+  'nvlink-5': 0.35,
+  'nvlink-domain': 0.50, // largest bonus in the table
+  'infinity-fabric': 0.10,
+  'custom-mesh': 0.40,
+};
+
+function computeInterconnectMultiplier( // Returns the training-FLOPS scaling factor for a group of racks; the result never exceeds 1.0.
+  sku: RackSkuConfig, // only sku.intraNodeInterconnect is read
+  rackCount: number, // the visible caller passes effectiveComputeRacks
+  fabric: NetworkFabric, // key into NETWORK_FABRIC_CONFIGS
+): number {
+  if (rackCount <= 1) return 1.0; // one rack or fewer: no scaling penalty is applied
+  const intra = INTRA_NODE_BONUS[sku.intraNodeInterconnect] ?? 0; // interconnects missing from the table get no bonus
+  const fabricBonus = NETWORK_FABRIC_CONFIGS[fabric].trainingScalingBonus; // NOTE(review): throws if fabric is not a key of NETWORK_FABRIC_CONFIGS — confirm the v4->v5 migration always sets dc.networkFabric
+  return Math.min(1.0, 0.6 + intra + fabricBonus); // 0.6 base plus both bonuses, capped at 1.0
+}
+
 // --- Main Infrastructure Tick ---

 export function processInfrastructure(state: GameState): InfraTickResult {
@@ -463,6 +489,9 @@ export function processInfrastructure(state: GameState): InfraTickResult {
  notifications.push(...netResult.notifications);

  let totalFlops = 0;
+  let totalTrainingFlops = 0;
+  let totalInferenceFlops = 0;
+  let totalVramGB = 0;
  let totalUptime = 0;
  let totalRackCount = 0;
  let totalComputeRackCount = 0;
@@ -684,14 +713,23 @@ export function processInfrastructure(state: GameState): InfraTickResult {

        let usedPowerKW = 0;
        let dcFlops = 0;
+        let dcTrainingFlops = 0;
+        let dcInferenceFlops = 0;
+        let dcTotalVramGB = 0;
        if (dc.rackSkuId && computeRacksOnline > 0) {
          const sku = RACK_SKU_CONFIGS[dc.rackSkuId];
          usedPowerKW = computeRacksOnline * sku.powerDrawKW;
-          dcFlops = effectiveComputeRacks * sku.flopsPerRack * networkSummary.effectiveFlopsFraction;
+          const bwFraction = networkSummary.effectiveFlopsFraction;
+          const interconnectMult = computeInterconnectMultiplier(sku, effectiveComputeRacks, dc.networkFabric);
+          dcTrainingFlops = effectiveComputeRacks * sku.trainingFlops * bwFraction * interconnectMult;
+          dcInferenceFlops = effectiveComputeRacks * sku.inferenceFlops * bwFraction;
+          dcTotalVramGB = computeRacksOnline * sku.totalVramGB;
+          dcFlops = dcTrainingFlops + dcInferenceFlops;
        }

+        const pue = COOLING_TYPE_CONFIGS[dc.coolingType].pueMultiplier;
        const energyCostPerTick = (tierConfig.baseEnergyCostPerTick + usedPowerKW * BASE_ENERGY_COST_PER_FLOP)
-          * location.energyCostMultiplier;
+          * location.energyCostMultiplier * pue;
        const maintenanceCostPerTick = totalRacksInDc * BASE_MAINTENANCE_PER_RACK;
        const currentUptime = totalRacksInDc > 0 ? effectiveComputeRacks / totalRacksInDc : 1;

@@ -703,6 +741,9 @@ export function processInfrastructure(state: GameState): InfraTickResult {
        }

        totalFlops += dcFlops;
+        totalTrainingFlops += dcTrainingFlops;
+        totalInferenceFlops += dcInferenceFlops;
+        totalVramGB += dcTotalVramGB;
        totalRackCount += totalRacksInDc + netSlots;
        totalComputeRackCount += totalRacksInDc;
        totalDataCenterCount++;
@@ -714,6 +755,7 @@ export function processInfrastructure(state: GameState): InfraTickResult {
          deploymentCohorts: updatedCohorts,
          networkSummary, effectiveComputeRacks,
          usedSlots, usedPowerKW, energyCostPerTick, maintenanceCostPerTick, currentUptime,
+          dcTrainingFlops, dcInferenceFlops, dcTotalVramGB,
        };
      });

@@ -788,6 +830,9 @@ export function processInfrastructure(state: GameState): InfraTickResult {
      clusters,
      switchRegistry: registry,
      totalFlops,
+      totalTrainingFlops,
+      totalInferenceFlops,
+      totalVramGB,
      totalUptime: dcWithRacks > 0 ? totalUptime / dcWithRacks : 1,
      totalRackCount,
      totalComputeRackCount,
@@ -1,5 +1,5 @@
 import type { GameState, ModelsState, TrainedModel, ModelCapabilities } from '@ai-tycoon/shared';
-import { uuid } from '@ai-tycoon/shared';
+import { uuid, VRAM_REQUIREMENTS_BY_GENERATION } from '@ai-tycoon/shared';

 export interface ModelTickResult {
  modelsState: ModelsState;
@@ -12,6 +12,11 @@ export function processModels(state: GameState): ModelTickResult {
    return { modelsState: state.models, modelCompleted: null };
  }

+  const requiredVram = VRAM_REQUIREMENTS_BY_GENERATION[active.generation] ?? 0;
+  if (requiredVram > 0 && state.compute.totalVramGB < requiredVram) {
+    return { modelsState: state.models, modelCompleted: null };
+  }
+
  const researcherBoost = state.talent.departments.research.headcount *
    state.talent.departments.research.effectiveness;
  const engineerBoost = state.talent.departments.engineering.headcount *