Mirror of https://github.com/microsoft/mscclpp.git (last synced 2026-04-20 06:49:29 +00:00).
Use cudaMalloc instead of GpuBuffer for communication buffers in alltoallv_test
This commit is contained in the branches/tags listed below:
@@ -30,11 +30,12 @@ struct SemaphoreStub::Impl {
|
||||
};
|
||||
|
||||
std::shared_ptr<uint64_t> SemaphoreStub::Impl::gpuCallocToken([[maybe_unused]] std::shared_ptr<Context> context) {
|
||||
#if (CUDA_NVLS_API_AVAILABLE)
|
||||
if (isNvlsSupported()) {
|
||||
return context->pimpl_->getToken();
|
||||
}
|
||||
#endif // CUDA_NVLS_API_AVAILABLE
|
||||
// Always use cudaMalloc-based allocation for semaphore tokens.
|
||||
// On GB200 NVL, CudaIpc connections may span nodes that share an NVLink domain.
|
||||
// TokenPool uses cuMemCreate (physical alloc) whose IPC handles require either
|
||||
// Fabric (IMEX daemon) or PosixFd (host-local unix socket) — both can fail
|
||||
// cross-node. cudaMalloc-based tokens use RuntimeIpc (cudaIpcGetMemHandle /
|
||||
// cudaIpcOpenMemHandle) which works across the shared NVLink domain.
|
||||
#if defined(MSCCLPP_USE_ROCM)
|
||||
return detail::gpuCallocUncachedShared<uint64_t>();
|
||||
#else // !defined(MSCCLPP_USE_ROCM)
|
||||
|
||||
@@ -256,9 +256,22 @@ bool AllToAllVTestEngine::isInPlace() const { return false; }
|
||||
// Constructs the alltoallv test engine; this collective never runs in place,
// so the in-place flag is forced off at construction time.
AllToAllVTestEngine::AllToAllVTestEngine(const TestArgs& args) : BaseTestEngine(args, "alltoallv") {
  inPlace_ = false;
}
|
||||
|
||||
void AllToAllVTestEngine::allocateBuffer() {
|
||||
sendBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
|
||||
recvBuff_ = mscclpp::GpuBuffer<int>(args_.maxBytes / sizeof(int)).memory();
|
||||
expectedBuff_ = std::shared_ptr<int[]>(new int[args_.maxBytes / sizeof(int)]);
|
||||
// Use cudaMalloc instead of GpuBuffer for communication buffers.
|
||||
// GpuBuffer uses cuMemCreate (physical alloc) on NVLS-capable GPUs, which only
|
||||
// supports Fabric and PosixFd IPC handles. Cross-node on GB200 NVL, PosixFd fails
|
||||
// (host-local unix socket) and Fabric requires the IMEX daemon. cudaMalloc memory
|
||||
// supports RuntimeIpc (cudaIpcGetMemHandle/cudaIpcOpenMemHandle), which works
|
||||
// cross-node over the shared NVLink domain.
|
||||
size_t numElems = args_.maxBytes / sizeof(int);
|
||||
int* sendPtr = nullptr;
|
||||
int* recvPtr = nullptr;
|
||||
CUDATHROW(cudaMalloc(&sendPtr, numElems * sizeof(int)));
|
||||
CUDATHROW(cudaMemset(sendPtr, 0, numElems * sizeof(int)));
|
||||
CUDATHROW(cudaMalloc(&recvPtr, numElems * sizeof(int)));
|
||||
CUDATHROW(cudaMemset(recvPtr, 0, numElems * sizeof(int)));
|
||||
sendBuff_ = std::shared_ptr<int>(sendPtr, [](int* p) { cudaFree(p); });
|
||||
recvBuff_ = std::shared_ptr<int>(recvPtr, [](int* p) { cudaFree(p); });
|
||||
expectedBuff_ = std::shared_ptr<int[]>(new int[numElems]);
|
||||
|
||||
localSendBuffV = sendBuff_.get();
|
||||
localRecvBuffV = recvBuff_.get();
|
||||
|
||||
Reference in New Issue
Block a user