Use cudaMalloc instead of GpuBuffer for communication buffers in alltoallv_test

This commit is contained in:
Qinghua Zhou
2026-03-05 14:58:44 +00:00
parent 237302258d
commit 3d3e272d3b
2 changed files with 22 additions and 8 deletions

View File

@@ -30,11 +30,12 @@ struct SemaphoreStub::Impl {
};
std::shared_ptr<uint64_t> SemaphoreStub::Impl::gpuCallocToken([[maybe_unused]] std::shared_ptr<Context> context) {
#if (CUDA_NVLS_API_AVAILABLE)
if (isNvlsSupported()) {
return context->pimpl_->getToken();
}
#endif // CUDA_NVLS_API_AVAILABLE
// Always use cudaMalloc-based allocation for semaphore tokens.
// On GB200 NVL, CudaIpc connections may span nodes that share an NVLink domain.
// TokenPool uses cuMemCreate (physical alloc) whose IPC handles require either
// Fabric (IMEX daemon) or PosixFd (host-local unix socket) — both can fail
// cross-node. cudaMalloc-based tokens use RuntimeIpc (cudaIpcGetMemHandle /
// cudaIpcOpenMemHandle) which works across the shared NVLink domain.
#if defined(MSCCLPP_USE_ROCM)
return detail::gpuCallocUncachedShared<uint64_t>();
#else // !defined(MSCCLPP_USE_ROCM)

View File

@@ -256,9 +256,22 @@ bool AllToAllVTestEngine::isInPlace() const { return false; }
// Constructs the all-to-all-v test engine, registering it with the base engine
// under the benchmark name "alltoallv". inPlace_ is forced to false: this
// engine always runs with separate send/receive buffers (consistent with
// isInPlace() returning false, per the hunk header above).
AllToAllVTestEngine::AllToAllVTestEngine(const TestArgs& args) : BaseTestEngine(args, "alltoallv") { inPlace_ = false; }
void AllToAllVTestEngine::allocateBuffer() {
sendBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
recvBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
expectedBuff_ = std::shared_ptr<int[]>(new int[args_.maxBytes / sizeof(int)]);
// Use cudaMalloc instead of GpuBuffer for communication buffers.
// GpuBuffer uses cuMemCreate (physical alloc) on NVLS-capable GPUs, which only
// supports Fabric and PosixFd IPC handles. Cross-node on GB200 NVL, PosixFd fails
// (host-local unix socket) and Fabric requires the IMEX daemon. cudaMalloc memory
// supports RuntimeIpc (cudaIpcGetMemHandle/cudaIpcOpenMemHandle), which works
// cross-node over the shared NVLink domain.
size_t numElems = args_.maxBytes / sizeof(int);
int* sendPtr = nullptr;
int* recvPtr = nullptr;
CUDATHROW(cudaMalloc(&sendPtr, numElems * sizeof(int)));
CUDATHROW(cudaMemset(sendPtr, 0, numElems * sizeof(int)));
CUDATHROW(cudaMalloc(&recvPtr, numElems * sizeof(int)));
CUDATHROW(cudaMemset(recvPtr, 0, numElems * sizeof(int)));
sendBuff_ = std::shared_ptr<int>(sendPtr, [](int* p) { cudaFree(p); });
recvBuff_ = std::shared_ptr<int>(recvPtr, [](int* p) { cudaFree(p); });
expectedBuff_ = std::shared_ptr<int[]>(new int[numElems]);
localSendBuffV = sendBuff_.get();
localRecvBuffV = recvBuff_.get();