Support cross-node CudaIPC

This commit is contained in:
Qinghua Zhou
2026-03-21 10:41:32 +00:00
parent 9ef1fb7cee
commit 7e1cb7b8cf
5 changed files with 100 additions and 19 deletions

View File

@@ -62,6 +62,7 @@ void register_algorithm(nb::module_& m) {
.def_prop_ro("buffer_mode", &Algorithm::bufferMode)
.def_prop_ro("constraint", &Algorithm::constraint)
.def_prop_ro("type", &Algorithm::type)
.def("reset", &Algorithm::reset)
.def(
"execute",
[](Algorithm& self, std::shared_ptr<Communicator> comm, uintptr_t input, uintptr_t output,

View File

@@ -239,6 +239,10 @@ class MscclppAlltoAllV:
# Fast path: skip GPU copies + bootstrap exchange if split sizes unchanged
splits_key = (tuple(send_counts_bytes), tuple(recv_counts_bytes))
if splits_key != self._cached_splits_key:
# Clear cached contexts to free RegisteredMemory for old (possibly freed) tensors.
# Without this, stale CUDA IPC handles accumulate and eventually SIGSEGV.
if hasattr(self._algo, 'reset'):
self._algo.reset()
# Copy counts/displacements to GPU
self._d_send_counts.copy_(torch.tensor(send_counts_bytes, dtype=torch.int64))
self._d_send_displs.copy_(torch.tensor(send_displs_bytes, dtype=torch.int64))
@@ -259,16 +263,24 @@ class MscclppAlltoAllV:
stream = torch.cuda.current_stream()
cuda_stream = stream.cuda_stream
input_size = self._cached_input_size
output_size = self._cached_output_size
# Use the full underlying storage size (not just the view's active data)
# for the context key, so that reusing views of the same tensor with
# different split sizes doesn't create new contexts (which leak
# RegisteredMemory for stale buffers).
try:
input_alloc_size = input.untyped_storage().size()
output_alloc_size = output.untyped_storage().size()
except Exception:
input_alloc_size = input.nelement() * input.element_size()
output_alloc_size = output.nelement() * output.element_size()
# Execute the optimized kernel
result = self._algo.execute(
self._comm,
input.data_ptr(),
output.data_ptr(),
input_size,
output_size,
input_alloc_size,
output_alloc_size,
_torch_dtype_to_mscclpp(dtype),
ReduceOp.NOP,
cuda_stream,

View File

@@ -92,19 +92,31 @@ def main():
# otherwise gloo avoids IB configuration issues on some clusters.
# Set ALLTOALLV_BACKEND=nccl to enable torch baseline comparison.
backend = os.environ.get("ALLTOALLV_BACKEND", "gloo")
# For multi-node: detect a routable IP instead of 127.0.0.1
# For multi-node: MASTER_ADDR must be set to rank 0's routable IP.
# Single-node auto-detects; multi-node requires it from the launcher.
if "MASTER_ADDR" not in os.environ:
if rank == 0:
os.environ["MASTER_ADDR"] = _get_routable_ip()
else:
# Non-zero ranks: MASTER_ADDR must be set externally for multi-node
os.environ["MASTER_ADDR"] = "127.0.0.1"
# Check if we're single-node (all ranks on same host)
n_gpus = torch.cuda.device_count()
if world_size <= n_gpus:
# Likely single-node; 127.0.0.1 works
os.environ["MASTER_ADDR"] = "127.0.0.1"
else:
raise RuntimeError(
f"Rank {rank}: MASTER_ADDR not set for multi-node run "
f"(world_size={world_size} > local GPUs={n_gpus}). "
f"Set it in your launcher, e.g.:\n"
f" mpirun -x MASTER_ADDR=<node0_ip> -x MASTER_PORT=29500 ..."
)
os.environ.setdefault("MASTER_PORT", "29500")
os.environ["RANK"] = str(rank)
os.environ["WORLD_SIZE"] = str(world_size)
if backend == "nccl":
dist.init_process_group(backend="nccl", rank=rank, world_size=world_size,
device_id=torch.device(f"cuda:{local_rank}"))
# Don't use device_id= eager init — it triggers an immediate NCCL allreduce
# that fails on some platforms (e.g. GB200 with NCCL 2.28.9).
dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
else:
dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
@@ -341,6 +353,13 @@ def main():
msg_sizes = [1 << s for s in range(10, 28) if s % 2 == 0]
msg_sizes.append(128 * 1024 * 1024)
# Pre-compute max split sizes across all sweep iterations to allocate
# fixed-size tensors. Reusing the same tensors keeps the NativeAlgorithm
# context key stable (same ptrs + sizes) and avoids the context cache
# leak that causes SIGSEGV when stale RegisteredMemory accumulates.
max_in_elems = 0
max_out_elems = 0
sweep_params = [] # (avg_msg_size, in_splits, out_splits)
for avg_msg_size in msg_sizes:
random.seed(12345)
avg_elems = avg_msg_size // 4
@@ -348,19 +367,27 @@ def main():
for i in range(world_size):
row = [max(1, int(avg_elems * (0.5 + random.random()))) for _ in range(world_size)]
send_matrix.append(row)
in_splits = send_matrix[rank]
out_splits = [send_matrix[j][rank] for j in range(world_size)]
max_in_elems = max(max_in_elems, sum(in_splits))
max_out_elems = max(max_out_elems, sum(out_splits))
sweep_params.append((avg_msg_size, in_splits, out_splits))
inp = torch.randn(sum(in_splits), dtype=torch.float32, device='cuda')
out = torch.empty(sum(out_splits), dtype=torch.float32, device='cuda')
# Allocate once at max size
inp = torch.randn(max_in_elems, dtype=torch.float32, device='cuda')
out = torch.empty(max_out_elems, dtype=torch.float32, device='cuda')
for avg_msg_size, in_splits, out_splits in sweep_params:
n_warmup = 3 if avg_msg_size >= 16 * 1024 * 1024 else 5
n_iters = 5 if avg_msg_size >= 64 * 1024 * 1024 else (10 if avg_msg_size >= 4 * 1024 * 1024 else 20)
m_lat, m_bw = bench_alltoallv(mscclpp_fn, inp, out, in_splits, out_splits, n_warmup, n_iters)
# Use views into the fixed buffers (same data_ptr → same context key)
inp_view = inp[:sum(in_splits)]
out_view = out[:sum(out_splits)]
m_lat, m_bw = bench_alltoallv(mscclpp_fn, inp_view, out_view, in_splits, out_splits, n_warmup, n_iters)
if use_torch_baseline:
t_lat, t_bw = bench_alltoallv(torch_fn, inp, out, in_splits, out_splits, n_warmup, n_iters)
t_lat, t_bw = bench_alltoallv(torch_fn, inp_view, out_view, in_splits, out_splits, n_warmup, n_iters)
print_row(fmt_size(avg_msg_size), m_lat, m_bw, t_lat, t_bw)
else:
print_row(fmt_size(avg_msg_size), m_lat, m_bw)

View File

@@ -164,8 +164,46 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
this->remoteMemMap = gpuIpcMem->map();
this->data = this->remoteMemMap.get();
} else if (transports.has(Transport::CudaIpc) && getHostHash() != this->hostHash) {
WARN(GPU, "Skipping CudaIpc map for cross-node peer (local hostHash=", getHostHash(),
", remote hostHash=", this->hostHash, ")");
// Cross-node CudaIpc: try available handle types in order of preference.
// On GB200 NVSwitch, both Fabric and RuntimeIpc handles work cross-node.
// On H100 (no NVSwitch across nodes), none of these will work.
auto entry = getTransportInfo(Transport::CudaIpc);
bool mapped = false;
// 1) Try Fabric handle first (works on any NVSwitch-connected system)
if (!mapped && (entry.gpuIpcMemHandle.typeFlags & GpuIpcMemHandle::Type::Fabric)) {
GpuIpcMemHandle fabricOnlyHandle = entry.gpuIpcMemHandle;
fabricOnlyHandle.typeFlags = GpuIpcMemHandle::Type::Fabric;
try {
auto gpuIpcMem = GpuIpcMem::create(fabricOnlyHandle);
this->remoteMemMap = gpuIpcMem->map();
this->data = this->remoteMemMap.get();
mapped = true;
INFO(GPU, "Mapped cross-node CudaIpc memory via Fabric handle at pointer ", this->data);
} catch (const std::exception& e) {
INFO(GPU, "Fabric handle mapping failed (will try RuntimeIpc): ", e.what());
}
}
// 2) Try RuntimeIpc handle (cudaIpcOpenMemHandle — works on GB200 NVSwitch cross-node)
if (!mapped && (entry.gpuIpcMemHandle.typeFlags & GpuIpcMemHandle::Type::RuntimeIpc)) {
GpuIpcMemHandle runtimeOnlyHandle = entry.gpuIpcMemHandle;
runtimeOnlyHandle.typeFlags = GpuIpcMemHandle::Type::RuntimeIpc;
try {
auto gpuIpcMem = GpuIpcMem::create(runtimeOnlyHandle);
this->remoteMemMap = gpuIpcMem->map();
this->data = this->remoteMemMap.get();
mapped = true;
INFO(GPU, "Mapped cross-node CudaIpc memory via RuntimeIpc handle at pointer ", this->data);
} catch (const std::exception& e) {
INFO(GPU, "RuntimeIpc handle mapping failed for cross-node peer: ", e.what());
}
}
if (!mapped) {
WARN(GPU, "Skipping CudaIpc map for cross-node peer (all handle types failed, local hostHash=",
getHostHash(), ", remote hostHash=", this->hostHash, ")");
}
}
if (this->data != nullptr) {
INFO(GPU, "Opened CUDA IPC handle at pointer ", this->data);

View File

@@ -90,11 +90,14 @@ void AlltoallvFullmesh::initialize(std::shared_ptr<Communicator> comm) {
int nRanksPerNode = comm->bootstrap()->getNranksPerNode();
int localGpuIdx = rank % nRanksPerNode;
// Use hybrid connections: CudaIpc for intra-node, IB for inter-node
bool hasIB = getIBDeviceCount() > 0;
// Use hybrid connections: CudaIpc for intra-node, IB for inter-node.
// On systems where CudaIpc works across nodes (e.g. GB200 NVSwitch),
// set MSCCLPP_FORCE_CUDAIPC=1 to skip IB and use CudaIpc for all peers.
const char* forceCudaIpc = std::getenv("MSCCLPP_FORCE_CUDAIPC");
bool useIB = (getIBDeviceCount() > 0) && !(forceCudaIpc && std::string(forceCudaIpc) == "1");
bool isMultiNode = (worldSize_ > nRanksPerNode);
if (hasIB && isMultiNode) {
if (useIB && isMultiNode) {
this->conns_ = setupHybridConnections(comm, localGpuIdx);
// Check if any connections are actually inter-node
hasRemotePeers_ = false;