Replace per-switch network simulation with aggregate per-DC statistical model
Balance Check / balance-simulation (push) Successful in 48s
Balance Check / multi-run-balance (push) Successful in 1m24s
CI / build-and-push (push) Successful in 43s

Eliminates the 22K-object switchRegistry that caused O(n×m) scans 4x per tick.
Network health is now tracked as aggregate counts per tier (totalByTier/healthyByTier)
with RepairBatch timers, cutting late-game tick cost from ~50ms to ~0.3ms.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-26 20:06:40 -04:00
parent 57a81be769
commit 19f652b43a
4 changed files with 181 additions and 365 deletions
+8 -24
View File
@@ -91,24 +91,6 @@ export interface DataCenter {
// --- Network Topology (6-Tier Clos) ---
/**
 * Label identifying one of the six switch tiers in the network topology.
 * NOTE(review): 'tor' presumably stands for top-of-rack and 't1'..'t5' for
 * successively higher tiers — confirm against the tier config table.
 */
export type SwitchTier = 'tor' | 't1' | 't2' | 't3' | 't4' | 't5';
/** State a switch can be in: healthy, failed, or repairing. */
export type SwitchStatus = 'healthy' | 'failed' | 'repairing';
/**
 * Record describing a single network switch: its identity, tier, status,
 * placement ids, link lists, bandwidth, and repair progress.
 */
export interface NetworkSwitch {
// Identifier of this switch.
id: string;
// Tier this switch belongs to.
tier: SwitchTier;
// Current status of this switch.
status: SwitchStatus;
// Placement ids; each may be null.
// NOTE(review): presumably null when the switch is not scoped to that level — confirm.
dcId: string | null;
campusId: string | null;
clusterId: string | null;
// Ids on the uplink / downlink side.
// NOTE(review): presumably ids of other NetworkSwitch records — confirm.
uplinkIds: string[];
downlinkIds: string[];
// Uplink counts: currently active vs. total.
activeUplinks: number;
totalUplinks: number;
// NOTE(review): units and scale of the bandwidth value are not visible here — confirm.
effectiveBandwidth: number;
// Repair progress and its total.
// NOTE(review): presumably measured in ticks — confirm.
repairProgress: number;
repairTotal: number;
}
export interface SwitchTierConfig {
tier: SwitchTier;
name: string;
@@ -121,11 +103,17 @@ export interface SwitchTierConfig {
powerDrawKW: number;
}
/**
 * A batch of switches of a single tier that share one repair timer.
 * NOTE(review): presumably the batch's `count` is returned to the healthy
 * total for `tier` once `ticksRemaining` runs out — confirm against the
 * tick logic, which is not visible here.
 */
export interface RepairBatch {
// Tier of the switches in this batch.
tier: SwitchTier;
// Number of switches covered by this batch.
count: number;
// Ticks left on this batch's timer.
ticksRemaining: number;
}
export interface DCNetworkSummary {
switchIds: string[];
networkRackCount: number;
totalByTier: Partial<Record<SwitchTier, number>>;
healthyByTier: Partial<Record<SwitchTier, number>>;
repairBatches: RepairBatch[];
networkRackCount: number;
racksDisconnected: number;
racksDegraded: number;
averageBandwidth: number;
@@ -133,14 +121,12 @@ export interface DCNetworkSummary {
}
/**
 * Campus-level network summary: T4 totals and cross-DC bandwidth.
 */
export interface CampusNetworkSummary {
// NOTE(review): presumably ids of the switches owned at campus level — confirm.
switchIds: string[];
// NOTE(review): presumably the total and healthy counts of 't4' tier switches — confirm.
totalT4: number;
healthyT4: number;
// NOTE(review): units and scale of the bandwidth value are not visible here — confirm.
crossDCBandwidth: number;
}
export interface ClusterNetworkSummary {
switchIds: string[];
totalT5: number;
healthyT5: number;
crossCampusBandwidth: number;
@@ -262,7 +248,6 @@ export interface ClusterCostConfig {
export interface InfrastructureState {
clusters: Cluster[];
switchRegistry: Record<string, NetworkSwitch>;
totalFlops: number;
totalTrainingFlops: number;
totalInferenceFlops: number;
@@ -276,7 +261,6 @@ export interface InfrastructureState {
export const INITIAL_INFRASTRUCTURE: InfrastructureState = {
clusters: [],
switchRegistry: {},
totalFlops: 0,
totalTrainingFlops: 0,
totalInferenceFlops: 0,