Use cudaMalloc instead of GpuBuffer for communication buffers in alltoallv_test

This commit is contained in:
Qinghua Zhou
2026-03-05 14:58:44 +00:00
parent 237302258d
commit 3d3e272d3b
2 changed files with 22 additions and 8 deletions

View File

@@ -30,11 +30,12 @@ struct SemaphoreStub::Impl {
};
std::shared_ptr<uint64_t> SemaphoreStub::Impl::gpuCallocToken([[maybe_unused]] std::shared_ptr<Context> context) {
#if (CUDA_NVLS_API_AVAILABLE)
if (isNvlsSupported()) {
return context->pimpl_->getToken();
}
#endif // CUDA_NVLS_API_AVAILABLE
// Always use cudaMalloc-based allocation for semaphore tokens.
// On GB200 NVL, CudaIpc connections may span nodes that share an NVLink domain.
// TokenPool uses cuMemCreate (physical alloc) whose IPC handles require either
// Fabric (IMEX daemon) or PosixFd (host-local unix socket) — both can fail
// cross-node. cudaMalloc-based tokens use RuntimeIpc (cudaIpcGetMemHandle /
// cudaIpcOpenMemHandle) which works across the shared NVLink domain.
#if defined(MSCCLPP_USE_ROCM)
return detail::gpuCallocUncachedShared<uint64_t>();
#else // !defined(MSCCLPP_USE_ROCM)

View File

@@ -256,9 +256,22 @@ bool AllToAllVTestEngine::isInPlace() const { return false; }
// Constructs the all-to-all-v test engine, registering it with the base engine
// under the benchmark name "alltoallv". inPlace_ is forced to false: this
// engine always runs with separate send/receive buffers (consistent with
// isInPlace() returning false, per the hunk header above).
AllToAllVTestEngine::AllToAllVTestEngine(const TestArgs& args) : BaseTestEngine(args, "alltoallv") { inPlace_ = false; }
void AllToAllVTestEngine::allocateBuffer() {
sendBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
recvBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
expectedBuff_ = std::shared_ptr<int[]>(new int[args_.maxBytes / sizeof(int)]);
// Use cudaMalloc instead of GpuBuffer for communication buffers.
// GpuBuffer uses cuMemCreate (physical alloc) on NVLS-capable GPUs, which only
// supports Fabric and PosixFd IPC handles. Cross-node on GB200 NVL, PosixFd fails
// (host-local unix socket) and Fabric requires the IMEX daemon. cudaMalloc memory
// supports RuntimeIpc (cudaIpcGetMemHandle/cudaIpcOpenMemHandle), which works
// cross-node over the shared NVLink domain.
size_t numElems = args_.maxBytes / sizeof(int);
int* sendPtr = nullptr;
int* recvPtr = nullptr;
CUDATHROW(cudaMalloc(&sendPtr, numElems * sizeof(int)));
CUDATHROW(cudaMemset(sendPtr, 0, numElems * sizeof(int)));
CUDATHROW(cudaMalloc(&recvPtr, numElems * sizeof(int)));
CUDATHROW(cudaMemset(recvPtr, 0, numElems * sizeof(int)));
sendBuff_ = std::shared_ptr<int>(sendPtr, [](int* p) { cudaFree(p); });
recvBuff_ = std::shared_ptr<int>(recvPtr, [](int* p) { cudaFree(p); });
expectedBuff_ = std::shared_ptr<int[]>(new int[numElems]);
localSendBuffV = sendBuff_.get();
localRecvBuffV = recvBuff_.get();