Add changes from ib and connection (IbMr DMA-BUF registration; IBConnection Data Direct flag)

This commit is contained in:
Ubuntu
2026-04-06 03:29:54 +00:00
parent e487f831e6
commit 2c3f125d4c
2 changed files with 34 additions and 32 deletions

View File

@@ -309,14 +309,6 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
// Pre-post receive requests for incoming WRITE_WITH_IMM notifications.
// The recv CQE guarantees the preceding data WRITE has been committed to GPU memory.
auto qp = qp_.lock();
// dataDirectEnabled_ = localImpl.ibSignalGpuMr_ && localImpl.ibSignalGpuMr_->isDataDirect() &&
// localSignalGpuMap_ && localSignalGpuMap_->valid();
dataDirectEnabled_ = true;
if (dataDirectEnabled_) {
INFO(CONN, "IBConnection: Data Direct enabled");
}
// Pre-post receive requests for incoming write-with-imm
int maxRecvWr = localEndpoint.config().ib.maxRecvWr;
for (int i = 0; i < maxRecvWr; ++i) {
qp->stageRecv(/*wrId=*/0);

View File

@@ -84,40 +84,50 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nu
if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) {
#if !defined(MSCCLPP_USE_ROCM)
int fd = -1;
size_t rangeSize = pages * pageSize;
// Obtain a DMA-BUF file descriptor for the GPU memory range. On platforms with a CPU-GPU
// bridge that reorders posted writes (e.g., Grace/GB200 NVLink-C2C), the PCIe mapping flag
// routes DMA through the Data Direct engine for correct ordering and higher throughput.
// Fall back to the default (non-PCIe) mapping if the flag is unsupported.
#if (CUDA_VERSION >= 12030)
CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
if (cuRes != CUDA_SUCCESS || fd < 0) {
if (fd >= 0) ::close(fd);
fd = -1;
}
bool usedPcieFlag = (fd >= 0);
#endif // CUDA_VERSION >= 12030
if (fd < 0) {
MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
}
// Register the DMA-BUF memory region. When Data Direct is available, use the mlx5dv API
// which enables hardware-level Data Direct routing for the MR. Otherwise use standard verbs.
size_t offsetInDmaBuf = buffIntPtr % pageSize;
int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;
#if defined(MSCCLPP_USE_MLX5DV)
if (isMlx5 && MLX5DV::isAvailable()) {
// DATA_DIRECT requires a PCIe BAR1-mapped DMA-BUF fd (CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE).
// This matches the perftest approach for achieving full bandwidth with DATA_DIRECT.
CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize,
CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
if (cuRes == CUDA_SUCCESS && fd >= 0) {
mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
if (mr_ != nullptr) {
isDataDirect_ = true;
} else {
INFO(NET, "mlx5dv_reg_dmabuf_mr failed with PCIe DMA-BUF, falling back to regular DMA-BUF");
::close(fd);
fd = -1;
}
} else {
INFO(NET, "cuMemGetHandleForAddressRange with PCIE flag failed (", cuRes, "), falling back");
if (fd >= 0) { ::close(fd); fd = -1; }
}
if (isDataDirect) {
mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
}
#endif
if (mr_ == nullptr) {
if (fd < 0) {
MSCCLPP_CUTHROW(
cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
}
mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
}
if (fd >= 0) ::close(fd);
// If MR registration failed with a PCIe-mapped fd, retry with the default mapping.
#if (CUDA_VERSION >= 12030)
if (mr_ == nullptr && usedPcieFlag) {
::close(fd);
MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
}
#endif // CUDA_VERSION >= 12030
::close(fd);
if (mr_ == nullptr) {
THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")");
}