mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 08:50:21 +00:00
add changes from ib and connection
This commit is contained in:
@@ -309,14 +309,6 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
|
||||
// Pre-post receive requests for incoming WRITE_WITH_IMM notifications.
|
||||
// The recv CQE guarantees the preceding data WRITE has been committed to GPU memory.
|
||||
auto qp = qp_.lock();
|
||||
// dataDirectEnabled_ = localImpl.ibSignalGpuMr_ && localImpl.ibSignalGpuMr_->isDataDirect() &&
|
||||
// localSignalGpuMap_ && localSignalGpuMap_->valid();
|
||||
dataDirectEnabled_ = true;
|
||||
if (dataDirectEnabled_) {
|
||||
INFO(CONN, "IBConnection: Data Direct enabled");
|
||||
}
|
||||
|
||||
// Pre-post receive requests for incoming write-with-imm
|
||||
int maxRecvWr = localEndpoint.config().ib.maxRecvWr;
|
||||
for (int i = 0; i < maxRecvWr; ++i) {
|
||||
qp->stageRecv(/*wrId=*/0);
|
||||
|
||||
@@ -84,40 +84,50 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nu
|
||||
if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) {
|
||||
#if !defined(MSCCLPP_USE_ROCM)
|
||||
int fd = -1;
|
||||
size_t rangeSize = pages * pageSize;
|
||||
|
||||
// Obtain a DMA-BUF file descriptor for the GPU memory range. On platforms with a CPU-GPU
|
||||
// bridge that reorders posted writes (e.g., Grace/GB200 NVLink-C2C), the PCIe mapping flag
|
||||
// routes DMA through the Data Direct engine for correct ordering and higher throughput.
|
||||
// Fall back to the default (non-PCIe) mapping if the flag is unsupported.
|
||||
#if (CUDA_VERSION >= 12030)
|
||||
CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
|
||||
CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
|
||||
if (cuRes != CUDA_SUCCESS || fd < 0) {
|
||||
if (fd >= 0) ::close(fd);
|
||||
fd = -1;
|
||||
}
|
||||
bool usedPcieFlag = (fd >= 0);
|
||||
#endif // CUDA_VERSION >= 12030
|
||||
if (fd < 0) {
|
||||
MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
}
|
||||
|
||||
// Register the DMA-BUF memory region. When Data Direct is available, use the mlx5dv API
|
||||
// which enables hardware-level Data Direct routing for the MR. Otherwise use standard verbs.
|
||||
size_t offsetInDmaBuf = buffIntPtr % pageSize;
|
||||
int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
|
||||
IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;
|
||||
|
||||
#if defined(MSCCLPP_USE_MLX5DV)
|
||||
if (isMlx5 && MLX5DV::isAvailable()) {
|
||||
// DATA_DIRECT requires a PCIe BAR1-mapped DMA-BUF fd (CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE).
|
||||
// This matches the perftest approach for achieving full bandwidth with DATA_DIRECT.
|
||||
CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize,
|
||||
CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
|
||||
CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
|
||||
if (cuRes == CUDA_SUCCESS && fd >= 0) {
|
||||
mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
|
||||
if (mr_ != nullptr) {
|
||||
isDataDirect_ = true;
|
||||
} else {
|
||||
INFO(NET, "mlx5dv_reg_dmabuf_mr failed with PCIe DMA-BUF, falling back to regular DMA-BUF");
|
||||
::close(fd);
|
||||
fd = -1;
|
||||
}
|
||||
} else {
|
||||
INFO(NET, "cuMemGetHandleForAddressRange with PCIE flag failed (", cuRes, "), falling back");
|
||||
if (fd >= 0) { ::close(fd); fd = -1; }
|
||||
}
|
||||
if (isDataDirect) {
|
||||
mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
|
||||
}
|
||||
#endif
|
||||
if (mr_ == nullptr) {
|
||||
if (fd < 0) {
|
||||
MSCCLPP_CUTHROW(
|
||||
cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
}
|
||||
mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
|
||||
}
|
||||
if (fd >= 0) ::close(fd);
|
||||
|
||||
// If MR registration failed with a PCIe-mapped fd, retry with the default mapping.
|
||||
#if (CUDA_VERSION >= 12030)
|
||||
if (mr_ == nullptr && usedPcieFlag) {
|
||||
::close(fd);
|
||||
MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
|
||||
}
|
||||
#endif // CUDA_VERSION >= 12030
|
||||
|
||||
::close(fd);
|
||||
if (mr_ == nullptr) {
|
||||
THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user