mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 17:00:22 +00:00
ext/ep: route LL internode atomics over NVL72 fabric (Proposal A)
Azure CX-7 RoCE has IBV_ATOMIC_NONE, so the proxy-emulated atomicAdd hangs LL internode tests. Bypass RDMA atomics for the LL path by routing peer pointers through cuMem fabric IPC over the NVL72 NVSwitch fabric (intra-node CUDA-IPC, cross-node fabric handles imported via nvidia-imex). LL kernels then perform direct st.global + atomicAdd on peer pointers. - buffer.cc: allocate rdma_buffer_ptr via mscclpp::detail::gpuCallocPhysical (POSIX_FD|FABRIC handle types) so it is eligible for cuMem fabric IPC. - buffer.cc: lift the LL IPC fast-path gate from low_latency_mode && num_rdma_ranks==1 to low_latency_mode alone; drop the cudaIpcGet/OpenMemHandle exchange and resolve peer bases from RegisteredMemory::data() (mscclpp's CudaIpc transport handles fabric-handle import). - buffer.hpp: peer_rdma_bases is a std::vector<void*> sized to num_ranks (was a fixed-size NUM_MAX_NVL_PEERS array); the destructor relies on RegisteredMemory shared_ptrs for IPC-mapping cleanup. Validated on 2x Azure GB200 (8 ranks): LL dispatch/combine PASS with bit-exact results; LL bench at 128 tokens/hidden=7168/topk=8 hits 39.9us dispatch, 37.7us combine (~3 TB/s aggregated).
This commit is contained in:
@@ -180,12 +180,9 @@ Buffer::~Buffer() noexcept(false) {
|
||||
// failed, so there is nothing to tear down.
|
||||
}
|
||||
|
||||
// Intra-node LL IPC fast-path teardown.
|
||||
// LL fast-path teardown. RegisteredMemory shared_ptrs in ll_memory_channels
|
||||
// own the peer IPC mappings; we just release the device-side base array.
|
||||
if (ll_ipc_ready) {
|
||||
for (int i = 0; i < num_ranks; ++i) {
|
||||
if (i == rank or peer_rdma_bases[i] == nullptr) continue;
|
||||
CUDA_CHECK(cudaIpcCloseMemHandle(peer_rdma_bases[i]));
|
||||
}
|
||||
if (peer_rdma_bases_gpu != nullptr) {
|
||||
CUDA_CHECK(cudaFree(peer_rdma_bases_gpu));
|
||||
peer_rdma_bases_gpu = nullptr;
|
||||
@@ -331,9 +328,17 @@ void Buffer::sync(const std::vector<int>& device_ids,
|
||||
EP_HOST_ASSERT(communicator != nullptr);
|
||||
EP_HOST_ASSERT(bootstrap != nullptr);
|
||||
|
||||
// Allocate the RDMA buffer
|
||||
CUDA_CHECK(cudaMalloc(&rdma_buffer_ptr, num_rdma_bytes));
|
||||
CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));
|
||||
// Allocate the RDMA buffer.
|
||||
//
|
||||
// Use mscclpp's `gpuCallocPhysical` (cuMemCreate + cuMemMap with the
|
||||
// POSIX_FD|FABRIC handle types) instead of plain cudaMalloc. This makes
|
||||
// the allocation eligible for cuMem fabric IPC, which lets the LL fast
|
||||
// path map the buffer across the NVL72 fabric via nvidia-imex and
|
||||
// perform atomicAdd over NVLink rather than RDMA. Cross-node HT (which
|
||||
// still goes through PortChannel/IB) is unaffected — the IB MR
|
||||
// registration in `registerMemory(..., all_transport)` below handles
|
||||
// physical-allocator-backed pointers identically to cudaMalloc'd ones.
|
||||
rdma_buffer_ptr = mscclpp::detail::gpuCallocPhysical(num_rdma_bytes);
|
||||
bootstrap->barrier();
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
|
||||
@@ -466,31 +471,21 @@ void Buffer::sync(const std::vector<int>& device_ids,
|
||||
// NVLink's multi-TB/s). We additionally set up CUDA-IPC peer pointers
|
||||
// to each peer's `rdma_buffer_ptr` plus a set of per-peer MemoryChannels
|
||||
// for a barrier ring. The LL kernels select this path at launch time.
|
||||
// Cross-node LL is unaffected: this block is a no-op there.
|
||||
// Cross-node LL uses cuMem fabric IPC (Proposal A): peers map
|
||||
// `rdma_buffer_ptr` through the NVL72 NVSwitch fabric via nvidia-imex,
|
||||
// and the LL kernels do direct `st.global` + atomicAdd through those
|
||||
// peer pointers. This bypasses the RDMA path entirely (Azure CX-7 RoCE
|
||||
// has IBV_ATOMIC_NONE which makes the proxy-emulated atomicAdd hang).
|
||||
// Requires nvidia-imex active on every rank's host with a shared
|
||||
// `nodes_config.cfg` covering all node IPs.
|
||||
// ------------------------------------------------------------------
|
||||
if (low_latency_mode and num_rdma_ranks == 1) {
|
||||
EP_HOST_ASSERT(num_ranks == num_nvl_ranks);
|
||||
EP_HOST_ASSERT(num_ranks <= NUM_MAX_NVL_PEERS);
|
||||
|
||||
// 1. Exchange CUDA IPC handles for rdma_buffer_ptr via bootstrap.
|
||||
CUDA_CHECK(cudaIpcGetMemHandle(&rdma_ipc_handles[rank], rdma_buffer_ptr));
|
||||
std::vector<cudaIpcMemHandle_t> all_rdma_handles(num_ranks);
|
||||
all_rdma_handles[rank] = rdma_ipc_handles[rank];
|
||||
bootstrap->allGather(all_rdma_handles.data(), sizeof(cudaIpcMemHandle_t));
|
||||
|
||||
peer_rdma_bases[rank] = rdma_buffer_ptr;
|
||||
for (int r = 0; r < num_ranks; ++r) {
|
||||
if (r == rank) continue;
|
||||
rdma_ipc_handles[r] = all_rdma_handles[r];
|
||||
CUDA_CHECK(cudaIpcOpenMemHandle(&peer_rdma_bases[r], rdma_ipc_handles[r], cudaIpcMemLazyEnablePeerAccess));
|
||||
}
|
||||
CUDA_CHECK(cudaMalloc(&peer_rdma_bases_gpu, sizeof(void*) * NUM_MAX_NVL_PEERS));
|
||||
CUDA_CHECK(
|
||||
cudaMemcpy(peer_rdma_bases_gpu, peer_rdma_bases, sizeof(void*) * NUM_MAX_NVL_PEERS, cudaMemcpyHostToDevice));
|
||||
|
||||
// 2. Build MemoryChannels for the per-peer barrier ring. These use
|
||||
// CUDA IPC connections (distinct tag from the existing port-channel
|
||||
// machinery) so setup does not interfere with cross-node fallback.
|
||||
if (low_latency_mode) {
|
||||
// Reuse the local RDMA registration's CudaIpc transport entry. The
|
||||
// existing `local_rdma_buffer_mem` was registered with `all_transport`
|
||||
// (= ipc | ib), so its CudaIpc TransportInfo is already populated
|
||||
// with the FABRIC handle (when supported by the underlying physical
|
||||
// allocation). We need a separate registration only because the
|
||||
// remote-side `recvMemory` below is tagged independently.
|
||||
constexpr int kLlIpcTag = 2;
|
||||
auto rdma_mem_ipc = communicator->registerMemory(rdma_buffer_ptr, num_rdma_bytes, ipc_transport);
|
||||
std::vector<std::shared_future<mscclpp::RegisteredMemory>> remote_futures(num_ranks);
|
||||
@@ -513,11 +508,28 @@ void Buffer::sync(const std::vector<int>& device_ids,
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve peer base pointers from the (now mapped) remote
|
||||
// RegisteredMemory. `data()` returns the locally-mapped peer pointer;
|
||||
// for fabric handles this address lives in the cuMem fabric VA range
|
||||
// and is dereferenceable from the GPU.
|
||||
peer_rdma_bases.assign(num_ranks, nullptr);
|
||||
peer_rdma_bases[rank] = rdma_buffer_ptr;
|
||||
std::vector<mscclpp::RegisteredMemory> remote_mems(num_ranks);
|
||||
for (int r = 0; r < num_ranks; ++r) {
|
||||
if (r == rank) continue;
|
||||
remote_mems[r] = remote_futures[r].get();
|
||||
peer_rdma_bases[r] = remote_mems[r].data();
|
||||
}
|
||||
CUDA_CHECK(cudaMalloc(&peer_rdma_bases_gpu, sizeof(void*) * num_ranks));
|
||||
CUDA_CHECK(cudaMemcpy(peer_rdma_bases_gpu, peer_rdma_bases.data(),
|
||||
sizeof(void*) * num_ranks, cudaMemcpyHostToDevice));
|
||||
|
||||
// Build MemoryChannels for the per-peer barrier ring.
|
||||
std::vector<mscclpp::MemoryChannelDeviceHandle> ll_handles(num_ranks);
|
||||
for (int r = 0; r < num_ranks; ++r) {
|
||||
if (r == rank) continue;
|
||||
auto sema = std::make_shared<mscclpp::MemoryDevice2DeviceSemaphore>(*communicator, ll_ipc_conns[r]);
|
||||
ll_memory_channels.emplace_back(sema, remote_futures[r].get(), rdma_mem_ipc);
|
||||
ll_memory_channels.emplace_back(sema, remote_mems[r], rdma_mem_ipc);
|
||||
ll_handles[r] = ll_memory_channels.rbegin()->deviceHandle();
|
||||
}
|
||||
ll_memory_channel_handles_device_ptr =
|
||||
|
||||
@@ -90,12 +90,14 @@ struct Buffer {
|
||||
std::shared_ptr<mscclpp::PortChannelDeviceHandle> port_channel_handles_device_ptr;
|
||||
std::shared_ptr<mscclpp::MemoryChannelDeviceHandle> memory_channel_handles_device_ptr;
|
||||
|
||||
// Intra-node LL only: peer-mapped RDMA buffer pointers (CUDA IPC).
|
||||
// ``peer_rdma_bases[r]`` aliases rank ``r``'s ``rdma_buffer_ptr`` via
|
||||
// ``cudaIpcOpenMemHandle`` (lazy peer access). Populated in ``sync()`` when
|
||||
// ``low_latency_mode && num_rdma_ranks == 1``; null otherwise.
|
||||
cudaIpcMemHandle_t rdma_ipc_handles[NUM_MAX_NVL_PEERS];
|
||||
void* peer_rdma_bases[NUM_MAX_NVL_PEERS] = {nullptr};
|
||||
// LL fast path: peer-mapped RDMA buffer pointers.
|
||||
// ``peer_rdma_bases[r]`` aliases rank ``r``'s ``rdma_buffer_ptr`` through
|
||||
// mscclpp's CudaIpc transport. Intranode peers use POSIX-FD CUDA IPC;
|
||||
// cross-node peers use cuMem fabric handles routed through nvidia-imex
|
||||
// over the NVL72 NVSwitch fabric (Proposal A — replaces RDMA atomicAdd
|
||||
// with NVLink atomics, since Azure CX-7 RoCE has IBV_ATOMIC_NONE).
|
||||
// Populated in ``sync()`` when ``low_latency_mode``; empty otherwise.
|
||||
std::vector<void*> peer_rdma_bases;
|
||||
void** peer_rdma_bases_gpu = nullptr;
|
||||
// MemoryChannels over CUDA IPC used only for the LL barrier ring.
|
||||
std::vector<mscclpp::MemoryChannel> ll_memory_channels;
|
||||
|
||||
Reference in New Issue
Block a user