Rework network to 6-tier Clos topology with individual switch entities
CI / build-and-push (push) Successful in 31s
CI / build-and-push (push) Successful in 31s
Replace aggregate network health stats with a full 6-tier Clos topology (ToR → T1 → T2 → T3 → T4 → T5) where every switch is an individually tracked entity with uplinks, repair pipelines, and failure cascades.

Key mechanics:
- Bottleneck bandwidth model (min along path) affects FLOPS and satisfaction
- Rackdown on full disconnect → racks re-enter testing pipeline on recovery
- Binomial failure sampling per tier, dirty-flag cascade optimization
- Flat switch registry for performance at scale
- Three new research nodes: network-redundancy, fast-repair, hot-standby

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -14,7 +14,8 @@ import {
|
||||
formatMoney, formatNumber, formatPercent,
|
||||
LOCATION_CONFIGS, DC_TIER_CONFIGS, RACK_SKU_CONFIGS,
|
||||
CAMPUS_TIER_COSTS, CLUSTER_COST_CONFIG, FIRST_CAMPUS_BUILD_TICKS,
|
||||
networkSlotsRequired, maxComputeRacks,
|
||||
estimateNetworkSlots, maxComputeRacks,
|
||||
SWITCH_TIER_CONFIGS,
|
||||
DC_UPGRADE_COST_FRACTION, DC_UPGRADE_INCREMENT,
|
||||
} from '@ai-tycoon/shared';
|
||||
import type {
|
||||
@@ -26,12 +27,14 @@ const ERA_ORDER: Era[] = ['startup', 'scaleup', 'bigtech', 'agi'];
|
||||
|
||||
const STAGE_LABELS: Record<PipelineStage, string> = {
|
||||
ordered: 'Ordered', manufacturing: 'Mfg', receiving: 'Recv',
|
||||
installation: 'Install', testing: 'Testing', repair: 'Repair', decommission: 'Decom',
|
||||
installation: 'Install', testing: 'Testing', repair: 'Repair',
|
||||
'network-down': 'Net Down', decommission: 'Decom',
|
||||
};
|
||||
|
||||
const STAGE_COLORS: Record<PipelineStage, string> = {
|
||||
ordered: 'bg-surface-600', manufacturing: 'bg-blue-500', receiving: 'bg-cyan-500',
|
||||
installation: 'bg-violet-500', testing: 'bg-amber-500', repair: 'bg-danger', decommission: 'bg-surface-500',
|
||||
installation: 'bg-violet-500', testing: 'bg-amber-500', repair: 'bg-danger',
|
||||
'network-down': 'bg-red-600', decommission: 'bg-surface-500',
|
||||
};
|
||||
|
||||
// ─── Shared Components ──────────────────────────────────────────
|
||||
@@ -112,7 +115,7 @@ function Breadcrumb({ nav }: { nav: InfraNav }) {
|
||||
|
||||
function DeploymentProgressBar({ dc }: { dc: DataCenter }) {
|
||||
const tierConfig = DC_TIER_CONFIGS[dc.tier];
|
||||
const maxCompute = maxComputeRacks(tierConfig.rackSlots);
|
||||
const maxCompute = maxComputeRacks(tierConfig.rackSlots, dc.tier);
|
||||
const pipelineRacks = dc.deploymentCohorts.filter(c => c.stage !== 'decommission').reduce((s, c) => s + c.count, 0);
|
||||
const totalTarget = dc.computeRacksOnline + pipelineRacks;
|
||||
const pct = totalTarget > 0 ? (dc.computeRacksOnline / totalTarget) * 100 : 0;
|
||||
@@ -157,23 +160,27 @@ function CohortStageBreakdown({ cohorts }: { cohorts: DeploymentCohort[] }) {
|
||||
}
|
||||
|
||||
function NetworkHealthIndicator({ dc }: { dc: DataCenter }) {
|
||||
const nh = dc.networkHealth;
|
||||
if (nh.tier1Required === 0) return null;
|
||||
const ns = dc.networkSummary;
|
||||
if (ns.switchIds.length === 0) return null;
|
||||
|
||||
const allHealthy = nh.tier1Healthy === nh.tier1Required
|
||||
&& nh.tier2Healthy === nh.tier2Required
|
||||
&& nh.tier3Healthy === nh.tier3Required;
|
||||
const hasDisconnected = ns.racksDisconnected > 0;
|
||||
const hasDegraded = ns.racksDegraded > 0;
|
||||
const coreDown = (ns.healthyByTier?.t3 ?? 0) < (ns.totalByTier?.t3 ?? 0);
|
||||
|
||||
const color = nh.tier3Healthy < nh.tier3Required ? 'text-danger'
|
||||
: !allHealthy ? 'text-amber-400'
|
||||
const color = coreDown ? 'text-danger'
|
||||
: hasDisconnected ? 'text-danger'
|
||||
: hasDegraded ? 'text-amber-400'
|
||||
: 'text-green-400';
|
||||
|
||||
const bwPct = Math.round(ns.averageBandwidth * 100);
|
||||
|
||||
return (
|
||||
<div className={`flex items-center gap-1 text-xs ${color}`}>
|
||||
<Network size={12} />
|
||||
<span>
|
||||
{nh.tier3Healthy < nh.tier3Required ? 'Core Down'
|
||||
: !allHealthy ? `${nh.racksDisconnected} disconnected`
|
||||
{coreDown ? 'Core Down'
|
||||
: hasDisconnected ? `${ns.racksDisconnected} disconnected`
|
||||
: hasDegraded ? `${bwPct}% bandwidth`
|
||||
: 'Healthy'}
|
||||
</span>
|
||||
</div>
|
||||
@@ -661,7 +668,7 @@ function FillAllDCsModal({ campus, money, era, research, onConfirm, onClose }: {
|
||||
const { qty, cost } = computeFillForDC(dc, selectedSku, remaining);
|
||||
if (qty === 0) {
|
||||
const tierConfig = DC_TIER_CONFIGS[dc.tier];
|
||||
const maxCompute = maxComputeRacks(tierConfig.rackSlots);
|
||||
const maxCompute = maxComputeRacks(tierConfig.rackSlots, dc.tier);
|
||||
const pipelineCount = dc.deploymentCohorts.filter(c => c.stage !== 'decommission').reduce((sum, c) => sum + c.count, 0);
|
||||
const isFull = maxCompute - (dc.computeRacksOnline + pipelineCount) <= 0;
|
||||
return { dc, qty: 0, cost: 0, reason: isFull ? 'Already full' : 'No budget' };
|
||||
@@ -929,7 +936,7 @@ function CampusDetailView({ clusterId, campusId }: { clusterId: string; campusId
|
||||
const hasRetrofitQueue = !!campus.retrofitQueue;
|
||||
|
||||
const fillableDCs = operationalDCs.filter(dc => {
|
||||
const maxCompute = maxComputeRacks(tierConfig.rackSlots);
|
||||
const maxCompute = maxComputeRacks(tierConfig.rackSlots, dc.tier);
|
||||
const pipelineCount = dc.deploymentCohorts.filter(c => c.stage !== 'decommission').reduce((sum, c) => sum + c.count, 0);
|
||||
return maxCompute - (dc.computeRacksOnline + pipelineCount) > 0;
|
||||
});
|
||||
@@ -1124,12 +1131,12 @@ function DataCenterDetailView({ clusterId, campusId, datacenterId }: {
|
||||
if (!dc || !cluster) return <div className="text-surface-400">Data center not found.</div>;
|
||||
|
||||
const tierConfig = DC_TIER_CONFIGS[dc.tier];
|
||||
const maxCompute = maxComputeRacks(tierConfig.rackSlots);
|
||||
const maxCompute = maxComputeRacks(tierConfig.rackSlots, dc.tier);
|
||||
const pipelineCount = dc.deploymentCohorts.filter(c => c.stage !== 'decommission').reduce((s, c) => s + c.count, 0);
|
||||
const existingCompute = dc.computeRacksOnline + pipelineCount;
|
||||
const availableSlots = maxCompute - existingCompute;
|
||||
const sku = dc.rackSkuId ? RACK_SKU_CONFIGS[dc.rackSkuId] : null;
|
||||
const netSlots = networkSlotsRequired(existingCompute);
|
||||
const netSlots = estimateNetworkSlots(existingCompute, dc.tier);
|
||||
|
||||
const availableSkus = Object.values(RACK_SKU_CONFIGS).filter(s => {
|
||||
if (ERA_ORDER.indexOf(era) < ERA_ORDER.indexOf(s.era)) return false;
|
||||
@@ -1257,7 +1264,7 @@ function DataCenterDetailView({ clusterId, campusId, datacenterId }: {
|
||||
{(() => {
|
||||
const skuToUse = dc.rackSkuId ?? selectedSku!;
|
||||
const skuConfig = RACK_SKU_CONFIGS[skuToUse];
|
||||
const newNetSlots = networkSlotsRequired(existingCompute + deployQty);
|
||||
const newNetSlots = estimateNetworkSlots(existingCompute + deployQty, dc.tier);
|
||||
const addedNet = newNetSlots - netSlots;
|
||||
const totalCost = skuConfig.baseCost * deployQty;
|
||||
return (
|
||||
@@ -1358,24 +1365,51 @@ function DataCenterDetailView({ clusterId, campusId, datacenterId }: {
|
||||
<p className="text-sm text-surface-400">No racks online. Deploy racks to see network topology.</p>
|
||||
) : (
|
||||
<div className="space-y-3">
|
||||
{[
|
||||
{ label: 'Tier-1 (ToR)', required: dc.networkHealth.tier1Required, healthy: dc.networkHealth.tier1Healthy, desc: `1 per ${24} compute racks` },
|
||||
{ label: 'Tier-2 (Aggr)', required: dc.networkHealth.tier2Required, healthy: dc.networkHealth.tier2Healthy, desc: `1 per ${6} Tier-1 switches` },
|
||||
{ label: 'Tier-3 (Core)', required: dc.networkHealth.tier3Required, healthy: dc.networkHealth.tier3Healthy, desc: 'Redundant pair' },
|
||||
].map(tier => (
|
||||
<div key={tier.label} className="flex items-center justify-between p-3 border border-surface-600 rounded-lg">
|
||||
<div>
|
||||
<div className="font-medium text-sm">{tier.label}</div>
|
||||
<div className="text-xs text-surface-400">{tier.desc}</div>
|
||||
</div>
|
||||
<div className={`text-sm font-mono ${tier.healthy < tier.required ? 'text-danger' : 'text-green-400'}`}>
|
||||
{tier.healthy} / {tier.required}
|
||||
</div>
|
||||
{/* Bandwidth gauge */}
|
||||
<div className="p-3 border border-surface-600 rounded-lg">
|
||||
<div className="flex items-center justify-between mb-1">
|
||||
<span className="text-sm font-medium">Bandwidth</span>
|
||||
{/* Check the stricter threshold first: below 50% is danger, below 80% is warning, otherwise healthy. Testing `< 0.8` first would make the danger branch unreachable. */}
<span className={`text-sm font-mono ${dc.networkSummary.averageBandwidth < 0.5 ? 'text-danger' : dc.networkSummary.averageBandwidth < 0.8 ? 'text-warning' : 'text-green-400'}`}>
|
||||
{formatPercent(dc.networkSummary.averageBandwidth)}
|
||||
</span>
|
||||
</div>
|
||||
))}
|
||||
{dc.networkHealth.racksDisconnected > 0 && (
|
||||
<div className="w-full bg-surface-900 rounded-full h-2">
|
||||
<div
|
||||
className={`h-2 rounded-full transition-all ${dc.networkSummary.averageBandwidth >= 0.8 ? 'bg-green-500' : dc.networkSummary.averageBandwidth >= 0.5 ? 'bg-yellow-500' : 'bg-red-500'}`}
|
||||
style={{ width: `${dc.networkSummary.averageBandwidth * 100}%` }}
|
||||
/>
|
||||
</div>
|
||||
<div className="flex justify-between mt-1 text-xs text-surface-500">
|
||||
<span>Effective FLOPS: {formatPercent(dc.networkSummary.effectiveFlopsFraction)}</span>
|
||||
<span>{dc.networkSummary.racksDegraded} degraded</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Per-tier switch health */}
|
||||
{(['tor', 't1', 't2', 't3'] as const).map(tier => {
|
||||
const total = dc.networkSummary.totalByTier[tier] ?? 0;
|
||||
if (total === 0) return null;
|
||||
const healthy = dc.networkSummary.healthyByTier[tier] ?? 0;
|
||||
const failed = total - healthy;
|
||||
const config = SWITCH_TIER_CONFIGS[tier];
|
||||
return (
|
||||
<div key={tier} className="flex items-center justify-between p-3 border border-surface-600 rounded-lg">
|
||||
<div>
|
||||
<div className="font-medium text-sm">{config.name}</div>
|
||||
<div className="text-xs text-surface-400">
|
||||
{tier === 'tor' ? '1 per rack (embedded)' : `Fan-out ${config.fanOut}, ${config.uplinkCount} uplinks`}
|
||||
</div>
|
||||
</div>
|
||||
<div className={`text-sm font-mono ${failed > 0 ? 'text-danger' : 'text-green-400'}`}>
|
||||
{healthy} / {total}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
|
||||
{dc.networkSummary.racksDisconnected > 0 && (
|
||||
<div className="text-sm text-danger flex items-center gap-2 p-2">
|
||||
<Activity size={14} /> {dc.networkHealth.racksDisconnected} compute racks disconnected due to network failures
|
||||
<Activity size={14} /> {dc.networkSummary.racksDisconnected} compute racks disconnected due to network failures
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
Reference in New Issue
Block a user