EP GB200 (4 GPUs/node) support

- configs.cuh: NUM_MAX_NVL_PEERS 8 -> 4
- internode.cu: introduce NvlPackT (uint64_t for 8 peers, uint32_t for 4)
  to handle packed-bool loads of is_token_in_rank; relax SourceMeta
  static_assert; replace 4 uint64_t-coupled sites
- buffer.hpp/buffer.cc: relax NUM_MAX_NVL_PEERS assert (4 || 8); read
  MSCCLPP_EP_LOCAL_WORLD_SIZE env to override rdma_rank/nvl_rank
  partitioning when local world size != NUM_MAX_NVL_PEERS
- CMakeLists.txt (ext/ep): rpath / install fix
- pyproject.toml: MSCCLPP_BUILD_EXT_EP=ON
- src/core/atomicadd_kernel.cu, kernels/buffer.cuh, kernels/utils.cuh:
  related EP fixes
- test_internode_multirank.py: NUM_MAX_NVL_PEERS=4, rank % 4
This commit is contained in:
Qinghua Zhou
2026-05-08 01:42:21 +00:00
parent e87c66a85d
commit 5d16ac958e
10 changed files with 44 additions and 19 deletions

View File

@@ -44,7 +44,7 @@ import torch.distributed as dist
def init_dist():
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
local_rank = int(os.environ.get("LOCAL_RANK", rank % 8))
local_rank = int(os.environ.get("LOCAL_RANK", rank % 4))
torch.cuda.set_device(local_rank)
dist.init_process_group(
backend="nccl", world_size=world_size, rank=rank, device_id=torch.device(f"cuda:{local_rank}")
@@ -71,7 +71,7 @@ def main():
rank, num_ranks, local_rank, group = init_dist()
from mscclpp.ext import ep
NUM_MAX_NVL_PEERS = 8
NUM_MAX_NVL_PEERS = 4
assert (
num_ranks % NUM_MAX_NVL_PEERS == 0 and num_ranks > NUM_MAX_NVL_PEERS
), f"expected >1 node with {NUM_MAX_NVL_PEERS} GPUs each, got num_ranks={num_ranks}"