From 2c3f125d4c1481b53bfd2a3c267e15946f7db4d8 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 6 Apr 2026 03:29:54 +0000 Subject: [PATCH] add changes from ib and connection --- src/core/connection.cc | 8 ------ src/core/ib.cc | 58 +++++++++++++++++++++++++----------------- 2 files changed, 34 insertions(+), 32 deletions(-) diff --git a/src/core/connection.cc b/src/core/connection.cc index d0fb19e7..8b6c0afb 100644 --- a/src/core/connection.cc +++ b/src/core/connection.cc @@ -309,14 +309,6 @@ IBConnection::IBConnection(std::shared_ptr context, const Endpoint& loc // Pre-post receive requests for incoming WRITE_WITH_IMM notifications. // The recv CQE guarantees the preceding data WRITE has been committed to GPU memory. auto qp = qp_.lock(); - // dataDirectEnabled_ = localImpl.ibSignalGpuMr_ && localImpl.ibSignalGpuMr_->isDataDirect() && - // localSignalGpuMap_ && localSignalGpuMap_->valid(); - dataDirectEnabled_ = true; - if (dataDirectEnabled_) { - INFO(CONN, "IBConnection: Data Direct enabled"); - } - - // Pre-post receive requests for incoming write-with-imm int maxRecvWr = localEndpoint.config().ib.maxRecvWr; for (int i = 0; i < maxRecvWr; ++i) { qp->stageRecv(/*wrId=*/0); diff --git a/src/core/ib.cc b/src/core/ib.cc index f4972f46..557f0426 100644 --- a/src/core/ib.cc +++ b/src/core/ib.cc @@ -84,40 +84,50 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nu if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) { #if !defined(MSCCLPP_USE_ROCM) int fd = -1; + size_t rangeSize = pages * pageSize; + + // Obtain a DMA-BUF file descriptor for the GPU memory range. On platforms with a CPU-GPU + // bridge that reorders posted writes (e.g., Grace/GB200 NVLink-C2C), the PCIe mapping flag + // routes DMA through the Data Direct engine for correct ordering and higher throughput. + // Fall back to the default (non-PCIe) mapping if the flag is unsupported. +#if (CUDA_VERSION >= 12030) + CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, + CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE); + if (cuRes != CUDA_SUCCESS || fd < 0) { + if (fd >= 0) ::close(fd); + fd = -1; + } + bool usedPcieFlag = (fd >= 0); +#endif // CUDA_VERSION >= 12030 + if (fd < 0) { + MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + } + + // Register the DMA-BUF memory region. When Data Direct is available, use the mlx5dv API + // which enables hardware-level Data Direct routing for the MR. Otherwise use standard verbs. size_t offsetInDmaBuf = buffIntPtr % pageSize; int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC; #if defined(MSCCLPP_USE_MLX5DV) - if (isMlx5 && MLX5DV::isAvailable()) { - // DATA_DIRECT requires a PCIe BAR1-mapped DMA-BUF fd (CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE). - // This matches the perftest approach for achieving full bandwidth with DATA_DIRECT. - CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, - CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, - CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE); - if (cuRes == CUDA_SUCCESS && fd >= 0) { - mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags); - if (mr_ != nullptr) { - isDataDirect_ = true; - } else { - INFO(NET, "mlx5dv_reg_dmabuf_mr failed with PCIe DMA-BUF, falling back to regular DMA-BUF"); - ::close(fd); - fd = -1; - } - } else { - INFO(NET, "cuMemGetHandleForAddressRange with PCIE flag failed (", cuRes, "), falling back"); - if (fd >= 0) { ::close(fd); fd = -1; } - } + if (isDataDirect) { + mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags); } #endif if (mr_ == nullptr) { - if (fd < 0) { - MSCCLPP_CUTHROW( - cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - } mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags); } - if (fd >= 0) ::close(fd); + + // If MR registration failed with a PCIe-mapped fd, retry with the default mapping. +#if (CUDA_VERSION >= 12030) + if (mr_ == nullptr && usedPcieFlag) { + ::close(fd); + MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags); + } +#endif // CUDA_VERSION >= 12030 + + ::close(fd); if (mr_ == nullptr) { THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")"); }