EP GB200 (4 GPUs/node) support
- configs.cuh: NUM_MAX_NVL_PEERS 8 -> 4
- internode.cu: introduce NvlPackT (uint64_t for 8 peers, uint32_t for 4) to handle packed-bool loads of is_token_in_rank; relax the SourceMeta static_assert; replace the four uint64_t-coupled sites (see the first sketch below)
- buffer.hpp/buffer.cc: relax the NUM_MAX_NVL_PEERS assert (4 || 8); read the MSCCLPP_EP_LOCAL_WORLD_SIZE env var to override rdma_rank/nvl_rank partitioning when the local world size != NUM_MAX_NVL_PEERS (see the second sketch below)
- CMakeLists.txt (ext/ep): rpath/install fix
- pyproject.toml: MSCCLPP_BUILD_EXT_EP=ON
- src/core/atomicadd_kernel.cu, kernels/buffer.cuh, kernels/utils.cuh: related EP fixes
- test_internode_multirank.py: NUM_MAX_NVL_PEERS=4, rank % 4
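A minimal sketch (not the actual internode.cu code) of how the NvlPackT selection could work, assuming is_token_in_rank stores one bool per NVL peer, contiguously per token; is_token_in_peer and the helper's layout assumptions are illustrative, only NvlPackT, NUM_MAX_NVL_PEERS, and the 8->uint64_t / 4->uint32_t mapping come from the commit message:

    #include <cstdint>
    #include <type_traits>

    constexpr int NUM_MAX_NVL_PEERS = 4;  // 8 on 8-GPU nodes, 4 on GB200

    // One NvlPackT load covers the bool flags of all NVL peers for a token:
    // uint64_t spans 8 one-byte bools, uint32_t spans 4.
    using NvlPackT =
        std::conditional_t<NUM_MAX_NVL_PEERS == 8, uint64_t, uint32_t>;
    static_assert(sizeof(NvlPackT) == NUM_MAX_NVL_PEERS * sizeof(bool),
                  "pack width must match the NVL peer count");

    // Hypothetical device helper: fetch all per-peer flags for one token in a
    // single (assumed suitably aligned) load, then test the peer's byte.
    __device__ __forceinline__ bool is_token_in_peer(
        const bool* is_token_in_rank, int token_idx, int peer) {
      NvlPackT pack = *reinterpret_cast<const NvlPackT*>(
          is_token_in_rank + token_idx * NUM_MAX_NVL_PEERS);
      return ((pack >> (8 * peer)) & 0xFF) != 0;
    }

Parameterizing the pack type is what lets the four formerly uint64_t-coupled load sites stay width-agnostic when the peer count drops to 4.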
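Similarly, a hedged sketch of the buffer.cc rank partitioning with the env override; the function names below are assumptions, only the MSCCLPP_EP_LOCAL_WORLD_SIZE variable and the rdma_rank/nvl_rank split come from the commit message:

    #include <cstdlib>
    #include <stdexcept>

    // Number of GPUs sharing one NVLink domain; overridable via the env var
    // when it differs from the compile-time NUM_MAX_NVL_PEERS (e.g. 4-GPU
    // GB200 nodes running a build that still allows 8 peers).
    int localWorldSize(int numMaxNvlPeers) {
      if (const char* env = std::getenv("MSCCLPP_EP_LOCAL_WORLD_SIZE")) {
        int v = std::atoi(env);
        if (v <= 0) throw std::runtime_error("invalid MSCCLPP_EP_LOCAL_WORLD_SIZE");
        return v;
      }
      return numMaxNvlPeers;
    }

    // rdma_rank picks the node (RDMA domain), nvl_rank the GPU within it.
    void partitionRank(int rank, int lws, int& rdma_rank, int& nvl_rank) {
      rdma_rank = rank / lws;
      nvl_rank = rank % lws;
    }

With 8 ranks across two 4-GPU GB200 nodes and MSCCLPP_EP_LOCAL_WORLD_SIZE=4, rank 5 would partition into rdma_rank=1, nvl_rank=1.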
test_internode_multirank.py:

@@ -44,7 +44,7 @@ import torch.distributed as dist
 def init_dist():
     rank = int(os.environ["RANK"])
     world_size = int(os.environ["WORLD_SIZE"])
-    local_rank = int(os.environ.get("LOCAL_RANK", rank % 8))
+    local_rank = int(os.environ.get("LOCAL_RANK", rank % 4))
     torch.cuda.set_device(local_rank)
     dist.init_process_group(
         backend="nccl", world_size=world_size, rank=rank, device_id=torch.device(f"cuda:{local_rank}")
@@ -71,7 +71,7 @@ def main():
     rank, num_ranks, local_rank, group = init_dist()
     from mscclpp.ext import ep
 
-    NUM_MAX_NVL_PEERS = 8
+    NUM_MAX_NVL_PEERS = 4
     assert (
         num_ranks % NUM_MAX_NVL_PEERS == 0 and num_ranks > NUM_MAX_NVL_PEERS
     ), f"expected >1 node with 8 GPUs each, got num_ranks={num_ranks}"