This commit is contained in:
Changho Hwang
2026-03-05 22:59:33 +00:00
parent 3b56b08bcb
commit 448ceb66f6
4 changed files with 36 additions and 13 deletions

View File

@@ -1,7 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Find the GDRCopy libraries
# Find the GDRCopy libraries (>= 2.5 required for gdr_pin_buffer_v2 / GDR_PIN_FLAG_FORCE_PCIE)
#
# The following variables are optionally searched for defaults
# GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found
@@ -32,6 +32,17 @@ find_library(GDRCOPY_LIBRARIES
/usr/lib
/usr/lib/x86_64-linux-gnu)
if(GDRCOPY_INCLUDE_DIRS)
  # Probe the located header for gdr_pin_buffer_v2 to reject GDRCopy installs
  # that are too old. If the symbol is missing, the include dir is marked
  # NOTFOUND so find_package_handle_standard_args() reports GDRCopy as not found.
  include(CheckSymbolExists)
  include(CMakePushCheckState)
  # Save and restore the CMAKE_REQUIRED_* state around the probe so that a
  # CMAKE_REQUIRED_INCLUDES value set by the including project is not wiped out
  # (a bare unset() after the check would discard it).
  cmake_push_check_state()
  set(CMAKE_REQUIRED_INCLUDES ${GDRCOPY_INCLUDE_DIRS})
  check_symbol_exists(gdr_pin_buffer_v2 "gdrapi.h" GDRCOPY_HAS_PIN_BUFFER_V2)
  cmake_pop_check_state()
  if(NOT GDRCOPY_HAS_PIN_BUFFER_V2)
    message(STATUS "GDRCopy found but too old (gdr_pin_buffer_v2 not available). Requires >= 2.5.")
    set(GDRCOPY_INCLUDE_DIRS GDRCOPY_INCLUDE_DIRS-NOTFOUND)
  endif()
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
mark_as_advanced(GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)

View File

@@ -316,15 +316,17 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
localSignalGpuPtr_ = reinterpret_cast<uint64_t*>(localImpl.ibSignalGpuBuffer_.get());
}
// When the QP is mlx5 and the signal GPU buffer MR is a Data Direct DMABUF
// (registered via mlx5dv_reg_dmabuf_mr with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT),
// and the semaphore token write also goes through Data Direct (via GDRCopy to a
// Data Direct DMABUF MR), all writes are visible in GPU memory when the CQE is
// polled. This allows reading the token from imm_data instead of the signal GPU buffer.
// Data Direct requires all three conditions:
// 1. Signal GPU buffer MR registered with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
// 2. Local signal GPU GDRCopy mapping pinned with GDR_PIN_FLAG_FORCE_PCIE
// 3. remoteUpdateDstAddr GDRCopy mapping is valid (verified later, in
//    setRemoteUpdateDstAddr, which disables Data Direct if the mapping is not valid)
// When all conditions are met, RDMA data writes and GDRCopy token writes both go
// through the Data Direct engine, guaranteeing GPU memory visibility at CQE poll time.
auto qp = qp_.lock();
dataDirectEnabled_ = localImpl.ibSignalGpuMr_ && localImpl.ibSignalGpuMr_->isDataDirect();
dataDirectEnabled_ = localImpl.ibSignalGpuMr_ && localImpl.ibSignalGpuMr_->isDataDirect() &&
localSignalGpuMap_ && localSignalGpuMap_->valid();
if (dataDirectEnabled_) {
INFO(CONN, "IBConnection: Data Direct enabled (mlx5 + DMABUF)");
INFO(CONN, "IBConnection: Data Direct enabled");
}
// Pre-post receive requests for incoming write-with-imm
@@ -361,6 +363,11 @@ void IBConnection::setRemoteUpdateDstAddr(std::shared_ptr<uint64_t> gpuMem) {
if (gdrEnabled()) {
if (gpuMem) {
remoteUpdateDstAddrMap_ = std::make_unique<GdrMap>(std::move(gpuMem), localGpuDeviceId_);
// Data Direct requires the token write mapping to also use FORCE_PCIE
if (dataDirectEnabled_ && !(remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid())) {
dataDirectEnabled_ = false;
INFO(CONN, "IBConnection: Data Direct disabled (remoteUpdateDstAddr GDRCopy mapping not available)");
}
} else {
remoteUpdateDstAddrMap_.reset();
}

View File

@@ -80,7 +80,12 @@ GdrContext::~GdrContext() {
// GdrMap
GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId)
: ctx_(gdrContext()), gpuMem_(std::move(gpuMem)), mh_{}, barPtr_(nullptr), hostDstPtr_(nullptr), mappedSize_(0) {
: ctx_(gdrContext()),
gpuMem_(std::move(gpuMem)),
mh_{},
barPtr_(nullptr),
hostDstPtr_(nullptr),
mappedSize_(0) {
// Ensure CUDA device context is active for gdr_pin_buffer
CudaDeviceGuard deviceGuard(deviceId);

View File

@@ -125,10 +125,10 @@ class IBConnection : public BaseConnection {
uint64_t* localSignalGpuPtr_;
// When true, recvThreadFunc reads the token from imm_data (from CQE) instead of the
// signal GPU buffer via GDRCopy. Enabled when the QP is mlx5 and the signal GPU buffer
// MR is a Data Direct DMABUF. Memory consistency is guaranteed because both the RDMA
// data write and the semaphore token write (via GDRCopy) go through the Data Direct path,
// so all writes are visible in GPU memory when the CQE is polled.
// signal GPU buffer via GDRCopy. Enabled only when all Data Direct conditions are met:
// the signal GPU buffer MR is registered with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT,
// and all GDRCopy mappings (local signal buffer and remoteUpdateDstAddr) are valid,
// so both RDMA data writes and GDRCopy token writes go through the Data Direct engine.
bool dataDirectEnabled_;
void recvThreadFunc();