Mirror of https://github.com/microsoft/mscclpp.git, synced 2026-05-11 08:50:21 +00:00

Commit: fix data direct

.github/copilot-instructions.md (vendored, 2 lines changed)
@@ -43,7 +43,7 @@ For testing after successful build:
 # To run tests with two GPUs - two is enough for most tests
 mpirun -np 2 ./build/bin/mp_unit_tests
 # To run tests excluding IB-related ones (when IB is not available)
-mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib*
+mpirun -np 2 ./build/bin/mp_unit_tests --filter=-*Ib*
 ```

 For building a Python package:
@@ -35,7 +35,9 @@ find_library(GDRCOPY_LIBRARIES
 if(GDRCOPY_INCLUDE_DIRS)
   include(CheckSymbolExists)
   set(CMAKE_REQUIRED_INCLUDES ${GDRCOPY_INCLUDE_DIRS})
   set(CMAKE_REQUIRED_LIBRARIES ${GDRCOPY_LIBRARIES})
   check_symbol_exists(gdr_pin_buffer_v2 "gdrapi.h" GDRCOPY_HAS_PIN_BUFFER_V2)
   unset(CMAKE_REQUIRED_LIBRARIES)
   unset(CMAKE_REQUIRED_INCLUDES)
   if(NOT GDRCOPY_HAS_PIN_BUFFER_V2)
     message(STATUS "GDRCopy found but too old (gdr_pin_buffer_v2 not available). Requires >= 2.5.")
@@ -33,5 +33,6 @@ find_library(MLX5_LIBRARIES
   /usr/lib/x86_64-linux-gnu)

 include(FindPackageHandleStandardArgs)

 find_package_handle_standard_args(MLX5 DEFAULT_MSG MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
 mark_as_advanced(MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
@@ -38,7 +38,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_order memoryOrder) {
   return cuda::atomic_ref<T, Scope>{*ptr}.fetch_add(val, memoryOrder);
 }

-#elif defined(MSCCLPP_DEVICE_HIP)
+#else  // !defined(MSCCLPP_DEVICE_CUDA)

 constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED;
 constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE;
@@ -46,7 +46,6 @@ constexpr auto memoryOrderRelease = __ATOMIC_RELEASE;
 constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL;
 constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST;

-// HIP does not have thread scope enums like CUDA
 constexpr auto scopeSystem = 0;
 constexpr auto scopeDevice = 0;
@@ -65,7 +64,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrder) {
   return __atomic_fetch_add(ptr, val, memoryOrder);
 }

-#endif  // defined(MSCCLPP_DEVICE_HIP)
+#endif  // !defined(MSCCLPP_DEVICE_CUDA)

 }  // namespace mscclpp
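The `#elif MSCCLPP_DEVICE_HIP` → `#else` change means the GCC/Clang `__atomic` builtin path now covers every non-CUDA build, not just HIP. A minimal runnable sketch of that fallback shape (the template is a stand-in for the pattern above, not mscclpp's actual header):

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the non-CUDA branch: atomicFetchAdd lowers to a compiler builtin.
template <typename T>
T atomicFetchAdd(T* ptr, const T& val, int memoryOrder) {
  return __atomic_fetch_add(ptr, val, memoryOrder);
}

int main() {
  uint64_t counter = 41;
  uint64_t old = atomicFetchAdd(&counter, uint64_t{1}, __ATOMIC_RELAXED);
  std::printf("old=%lu new=%lu\n", (unsigned long)old, (unsigned long)counter);
  return 0;
}
```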
@@ -7,6 +7,7 @@
 #include <mscclpp/npkit/npkit.hpp>
 #endif

+#include <mscclpp/atomic_device.hpp>
 #include <mscclpp/numa.hpp>
 #include <mscclpp/utils.hpp>
 #include <sstream>
@@ -219,29 +220,18 @@ void IBConnection::recvThreadFunc() {
       continue;
     }

-    // Read the token value from the incoming write-with-imm completion.
-    if (dataDirectEnabled_) {
-      // Data Direct path: the signal GPU buffer MR was registered with
-      // MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT, and the semaphore token is also written
-      // through Data Direct (via GDRCopy). Both writes go through the same path, so
-      // all data is visible in GPU memory when the CQE is polled. Read from imm_data.
-      newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));
-    } else {
-      // Slow path: read the 64-bit token from the local signal GPU buffer via volatile load.
-      // localSignalGpuPtr_ points to either a GDRCopy BAR1 mapping (CUDA) or the
-      // GPU buffer directly (ROCm system-coherent/uncached memory).
-      newValueHost = *static_cast<volatile uint64_t*>(localSignalGpuPtr_);
-    }
+    // Read the token from imm_data (always available and correct in the CQE).
+    newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));

-    // Read token address from the local stored address (set by setSignalForwardingDst)
-    if (remoteUpdateDstAddr_ != 0) {
-      uint64_t* dstPtr = reinterpret_cast<uint64_t*>(remoteUpdateDstAddr_);
-      if (remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid()) {
-        // Direct host-side write to GPU memory via GDRCopy BAR1 mapping
-        remoteUpdateDstAddrMap_->copyTo(&newValueHost, sizeof(uint64_t));
+    // Forward the token to the semaphore's inbound token address via atomicStore
+    // through the GDRCopy BAR1 mapping. The GPU reads with system-scope acquire.
+    if (signalAddr_ != 0) {
+      if (signalGdrMap_ && signalGdrMap_->valid()) {
+        atomicStore(signalGdrMap_->hostPtr(), newValueHost, memoryOrderRelaxed);
       } else {
-        *dstPtr = newValueHost;
+        // For HIP/ROCm.
+        // NOTE: may need a fix in the future to ensure BAR1 mapping.
+        *reinterpret_cast<volatile uint64_t*>(signalAddr_) = newValueHost;
       }
     }
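The receiver-side flow after this hunk is: poll the recv CQ, read the token from `imm_data`, forward it to the semaphore's inbound token, and replenish the consumed recv WR. A hedged, self-contained sketch of that loop — `FakeQp` is an assumption that stands in for `mscclpp::IbQp`, and `std::atomic` stands in for the BAR1-mapped store:

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>

// Stand-in for mscclpp::IbQp (an assumption, not the real API). Simulates a recv
// CQ that delivers one WRITE_WITH_IMM completion carrying token 42 in imm_data.
struct FakeQp {
  bool delivered = false;
  int pollRecvCq() { return delivered ? 0 : 1; }
  uint64_t getRecvWcImmData(int) { delivered = true; return 42; }
  void stageRecv(uint64_t) {}  // replenish one recv WR
  void postRecv() {}
};

int main() {
  FakeQp qp;
  std::atomic<uint64_t> inboundToken{0};  // stands in for the BAR1-mapped GPU token
  std::atomic<bool> stop{false};

  std::thread recvThread([&] {
    while (!stop.load(std::memory_order_relaxed)) {
      int n = qp.pollRecvCq();  // completions for 0-byte WRITE_WITH_IMM
      for (int i = 0; i < n; ++i) {
        uint64_t token = qp.getRecvWcImmData(i);               // token rides in imm_data
        inboundToken.store(token, std::memory_order_release);  // forward to semaphore
        qp.stageRecv(/*wrId=*/0);
      }
      if (n > 0) qp.postRecv();
    }
  });

  while (inboundToken.load(std::memory_order_acquire) != 42) std::this_thread::yield();
  stop.store(true, std::memory_order_relaxed);
  recvThread.join();
  std::printf("forwarded token: 42\n");
  return 0;
}
```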
@@ -259,12 +249,10 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint)
       remoteTransport_(remoteEndpoint.transport()),
       atomicSrc_(std::make_unique<uint64_t>(0)),
       ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_),
+      gdrSignalForwarding_(false),
       stopRecvThread_(false),
       localGpuDeviceId_(localEndpoint.device().id),
-      remoteUpdateDstAddr_(0),
-      remoteSignalGpuMrInfo_{0, 0},
-      localSignalGpuPtr_(nullptr),
-      dataDirectEnabled_(false) {
+      signalAddr_(0) {
   qp_ = getImpl(localEndpoint).ibQp_;
   qp_.lock()->rtr(getImpl(remoteEndpoint).ibQpInfo_);
   qp_.lock()->rts();
@@ -274,105 +262,89 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint)

   if (ibNoAtomic_) {
 #if defined(MSCCLPP_USE_CUDA)
+    // On CUDA, HostNoAtomic requires GDRCopy for CPU→GPU signal forwarding through BAR1.
     if (!gdrEnabled()) {
-      std::string reason = "unknown";
-      switch (gdrStatus()) {
-        case GdrStatus::NotBuilt:
-          reason = "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
-          break;
-        case GdrStatus::Disabled:
-          reason = "GDRCopy is disabled via MSCCLPP_FORCE_DISABLE_GDR environment variable";
-          break;
-        case GdrStatus::DriverMissing:
-          reason = "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
-          break;
-        case GdrStatus::OpenFailed:
-          reason = "gdr_open() failed; GDRCopy driver may be misconfigured";
-          break;
-        default:
-          break;
-      }
-      THROW(CONN, Error, ErrorCode::InvalidUsage, "IB host-no-atomic mode on CUDA requires GDRCopy: ", reason);
+      THROW(CONN, Error, ErrorCode::InvalidUsage,
+            "IB host-no-atomic mode on CUDA requires GDRCopy: ", gdrStatusMessage());
     }
+    gdrSignalForwarding_ = true;
-#endif
+#endif  // defined(MSCCLPP_USE_CUDA)

+    // On platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200
+    // NVLink-C2C), HostNoAtomic requires Data Direct for correct memory ordering. Data Direct
+    // routes NIC DMA through the PCIe Data Direct engine, bypassing the bridge. It is available
+    // on Virtual Function (VF) devices. On platforms without such a bridge (x86, non-Grace
+    // aarch64), HostNoAtomic works without Data Direct.
+    //
+    // We cannot reliably detect the bridge at compile time or runtime, so we emit a warning
+    // when the device is not a VF. If data corruption occurs, switching to VF devices with
+    // Data Direct or using IbMode::Host with RDMA atomics will resolve it.
+    {
+      IbCtx* ibCtx = getImpl(*context).getIbContext(transport_);
+      if (!ibCtx->isVirtualFunction()) {
+        WARN(CONN,
+             "IB HostNoAtomic mode without a Virtual Function (VF) device may cause data corruption "
+             "on platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200). "
+             "Device ",
+             ibCtx->getDevName(),
+             " is not a VF. "
+             "If you experience data corruption, use VF devices with Data Direct or IbMode::Host.");
+      }
+    }

-    // Extract remote endpoint's signal GPU buffer MR info for write-with-imm destination
-    const auto& remoteImpl = getImpl(remoteEndpoint);
-    remoteSignalGpuMrInfo_ = remoteImpl.ibSignalGpuMrInfo_;
-
-    // Create a GDR mapping of the local signal GPU buffer. recvThreadFunc reads the
-    // 64-bit token via localSignalGpuPtr_, which points to the BAR1-mapped host address
-    // (CUDA/GDRCopy) or the GPU buffer directly (ROCm system-coherent memory).
-    const auto& localImpl = getImpl(localEndpoint);
-    if (gdrEnabled() && localImpl.ibSignalGpuBuffer_) {
-      localSignalGpuMap_ =
-          std::make_unique<GdrMap>(std::static_pointer_cast<void>(localImpl.ibSignalGpuBuffer_), localGpuDeviceId_);
-    }
-    if (localSignalGpuMap_ && localSignalGpuMap_->valid()) {
-      // Use the BAR1-mapped host pointer; uncacheable MMIO ensures ordered volatile reads.
-      localSignalGpuPtr_ = localSignalGpuMap_->hostPtr();
-    } else if (localImpl.ibSignalGpuBuffer_) {
-      // ROCm: GPU memory is system-coherent, so direct volatile read is safe.
-      localSignalGpuPtr_ = reinterpret_cast<uint64_t*>(localImpl.ibSignalGpuBuffer_.get());
-    }
-
-    // Data Direct requires all three conditions:
-    // 1. Signal GPU buffer MR registered with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
-    // 2. Local signal GPU GDRCopy mapping pinned with GDR_PIN_FLAG_FORCE_PCIE
-    // 3. (signal forwarding dst GDRCopy mapping checked at setSignalForwardingDst time)
-    // When all conditions are met, RDMA data writes and GDRCopy token writes both go
-    // through the Data Direct engine, guaranteeing GPU memory visibility at CQE poll time.
+    // Pre-post receive requests for incoming WRITE_WITH_IMM notifications.
+    // The recv CQE guarantees the preceding data WRITE has been committed to GPU memory.
     auto qp = qp_.lock();
-    dataDirectEnabled_ = localImpl.ibSignalGpuMr_ && localImpl.ibSignalGpuMr_->isDataDirect() &&
-                         localSignalGpuMap_ && localSignalGpuMap_->valid();
-    if (dataDirectEnabled_) {
-      INFO(CONN, "IBConnection: Data Direct enabled");
-    }
-
-    // Pre-post receive requests for incoming write-with-imm
     int maxRecvWr = localEndpoint.config().ib.maxRecvWr;
     for (int i = 0; i < maxRecvWr; ++i) {
       qp->stageRecv(/*wrId=*/0);
     }
     qp->postRecv();
-    // Start the background thread to poll recv CQ
-    recvThread_ = std::thread([this]() { this->recvThreadFunc(); });
-    INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with no-atomic mode");
+    // The recv thread is started later in startSignalForwarding() when the semaphore
+    // provides the signal forwarding destination. This ensures the thread lifetime is
+    // bounded by the GdrMap lifetime (created before start, destroyed after stop).
+    INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with signal forwarding (HostNoAtomic) mode");
   } else {
     INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with atomic mode");
   }
 }
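The ordering contract the comments above rely on — the recv thread publishes the token after the data write has landed, and the GPU observes the data once it sees the token with acquire ordering — can be made concrete with a host-only analogue. The real code pairs a relaxed store through an uncached BAR1 mapping with a `cuda::atomic_ref` system-scope acquire on the device; `std::atomic` is used here only to make the release/acquire pairing runnable:

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>

int main() {
  uint64_t payload = 0;                    // stands in for the RDMA-written data
  std::atomic<uint64_t> inboundToken{0};   // stands in for the semaphore token

  std::thread producer([&] {
    payload = 123;                                     // "data WRITE"
    inboundToken.store(7, std::memory_order_release);  // publish token after data
  });

  while (inboundToken.load(std::memory_order_acquire) != 7) {}  // "GPU-side" wait
  std::printf("payload=%lu\n", (unsigned long)payload);         // guaranteed 123
  producer.join();
  return 0;
}
```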
-IBConnection::~IBConnection() {
+IBConnection::~IBConnection() { stopSignalForwarding(); }

 Transport IBConnection::transport() const { return transport_; }

 Transport IBConnection::remoteTransport() const { return remoteTransport_; }

-bool IBConnection::usesSignalForwarding() const { return ibNoAtomic_; }
+bool IBConnection::isSignalForwarding() const { return ibNoAtomic_; }

-void IBConnection::setSignalForwardingDst(std::shared_ptr<uint64_t> mem) {
-  remoteUpdateDstAddr_ = reinterpret_cast<uint64_t>(mem.get());
-  if (gdrEnabled()) {
-    if (mem) {
-      remoteUpdateDstAddrMap_ = std::make_unique<GdrMap>(std::move(mem), localGpuDeviceId_);
-      // Data Direct requires the token write mapping to also use FORCE_PCIE
-      if (dataDirectEnabled_ && !(remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid())) {
-        dataDirectEnabled_ = false;
-        INFO(CONN, "IBConnection: Data Direct disabled (signal forwarding dst GDRCopy mapping not available)");
-      }
-    } else {
-      remoteUpdateDstAddrMap_.reset();
-    }
-  }
-  INFO(CONN, "IBConnection setSignalForwardingDst: ", (void*)remoteUpdateDstAddr_);
-}
+void IBConnection::startSignalForwarding(std::shared_ptr<uint64_t> mem) {
+  // Set up the forwarding destination and GdrMap, then start the recv thread.
+  // Order: set address → create GdrMap → start thread.
+  signalAddr_ = reinterpret_cast<uint64_t>(mem.get());
+  if (gdrSignalForwarding_) {
+    signalGdrMap_ = std::make_unique<GdrMap>(std::move(mem), localGpuDeviceId_);
+  }
+  if (ibNoAtomic_) {
+    stopRecvThread_.store(false, std::memory_order_relaxed);
+    recvThread_ = std::thread([this]() { this->recvThreadFunc(); });
+  }
+  INFO(CONN, "IBConnection startSignalForwarding: ", (void*)signalAddr_);
+}
+
+void IBConnection::stopSignalForwarding() {
+  // Stop the recv thread, then tear down GdrMap and address.
+  // Order: stop thread → destroy GdrMap → clear address.
+  if (ibNoAtomic_) {
+    stopRecvThread_.store(true, std::memory_order_relaxed);
+    if (recvThread_.joinable()) {
+      recvThread_.join();
+    }
+  }
+  if (gdrSignalForwarding_) {
+    signalGdrMap_.reset();
+  }
+  signalAddr_ = 0;
+  INFO(CONN, "IBConnection stopSignalForwarding");
+}
 void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
@@ -425,27 +397,23 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) {
   *src = newValue;

   if (ibNoAtomic_) {
-    // Use RDMA write-with-imm instead of atomic operation.
-    // Write the token value (8 bytes) from the local host buffer to the remote signal GPU buffer,
-    // with newValue also in imm_data (32-bit). The remote's recvThreadFunc reads the token from
-    // the signal GPU buffer and forwards it to the semaphore's inbound token address.
-
-    // Put newValue in imm_data (truncated to 32-bit; semaphore counters should fit)
+    // Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the token in imm_data.
+    // The receiver's recv thread polls the CQE, which guarantees the preceding data WRITE
+    // has been committed to GPU memory. The recv thread then forwards the token to the
+    // semaphore's inbound token via GDRCopy atomicStore.
     unsigned int immData = static_cast<unsigned int>(newValue);

-    // Write the real token value into the host buffer, then RDMA write host->remote GPU
-    *atomicSrc_ = newValue;
-    qp_.lock()->stageSendWriteWithImm(atomicSrcTransportInfo_.ibMr, remoteSignalGpuMrInfo_,
-                                      /*size=*/sizeof(uint64_t), /*wrId=*/0,
+    qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
+                                      /*size=*/0, /*wrId=*/0,
                                       /*srcOffset=*/0, /*dstOffset=*/0,
                                       /*signaled=*/true, /*immData=*/immData);
     qp_.lock()->postSend();
-    INFO(CONN, "IBConnection write-with-imm: value ", oldValue, " -> ", newValue);
+    INFO(CONN, "IBConnection signal forwarding: value ", oldValue, " -> ", newValue);
   } else {
     qp_.lock()->stageSendAtomicAdd(atomicSrcTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue,
                                    /*signaled=*/true);
     qp_.lock()->postSend();
-    INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
+    INFO(CONN, "IBConnection atomic write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
          " -> ", newValue);
   }
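Under the hood, `stageSendWriteWithImm` with `size=0` stages exactly the verb shown below. A hedged sketch in raw libibverbs of the 0-byte WRITE_WITH_IMM send — `qp`, `remoteAddr`, and `rkey` are assumed to come from an already-connected RC QP and the peer's exchanged MR info:

```cpp
#include <arpa/inet.h>          // htonl
#include <infiniband/verbs.h>
#include <cstdint>
#include <cstring>

int postSignal(ibv_qp* qp, uint64_t remoteAddr, uint32_t rkey, uint32_t token) {
  ibv_send_wr wr;
  std::memset(&wr, 0, sizeof(wr));
  wr.wr_id = 0;
  wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;  // generates a recv CQE on the peer
  wr.send_flags = IBV_SEND_SIGNALED;
  wr.imm_data = htonl(token);              // token travels in the 32-bit imm_data
  wr.sg_list = nullptr;                    // zero-byte payload: no scatter/gather
  wr.num_sge = 0;
  wr.wr.rdma.remote_addr = remoteAddr;     // still required even for 0 bytes
  wr.wr.rdma.rkey = rkey;

  ibv_send_wr* bad = nullptr;
  return ibv_post_send(qp, &wr, &bad);     // 0 on success
}
```

Because the payload is zero bytes, the only information delivered is the imm_data and the CQE itself, which is why the token must fit in 32 bits.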
@@ -53,21 +53,6 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
                    ->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
                               config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
     ibQpInfo_ = ibQp_->getInfo();
-
-    // Allocate a 64-bit signal GPU buffer for write-with-imm data payload (ibNoAtomic_ only).
-    if (ibNoAtomic_ && config_.device.type == DeviceType::GPU && config_.device.id >= 0) {
-      CudaDeviceGuard deviceGuard(config_.device.id);
-#if defined(MSCCLPP_DEVICE_HIP)
-      ibSignalGpuBuffer_ = detail::gpuCallocUncachedShared<uint64_t>();
-#else
-      ibSignalGpuBuffer_ = detail::gpuCallocShared<uint64_t>();
-#endif
-      ibSignalGpuMr_ =
-          contextImpl.getIbContext(config_.transport)->registerMr(ibSignalGpuBuffer_.get(), sizeof(uint64_t));
-      ibSignalGpuMrInfo_ = ibSignalGpuMr_->getInfo();
-    } else {
-      ibSignalGpuMrInfo_ = {0, 0};
-    }
   } else if (config_.transport == Transport::Ethernet) {
     // Configuring Ethernet Interfaces
     abortFlag_ = 0;
@@ -90,9 +75,6 @@ Endpoint::Impl::Impl(const std::vector<char>& serialization) {
     ibLocal_ = false;
     it = detail::deserialize(it, ibQpInfo_);
     it = detail::deserialize(it, ibNoAtomic_);
-    if (ibNoAtomic_) {
-      it = detail::deserialize(it, ibSignalGpuMrInfo_);
-    }
   } else if (config_.transport == Transport::Ethernet) {
     it = detail::deserialize(it, socketAddress_);
   }
@@ -123,9 +105,6 @@ MSCCLPP_API_CPP std::vector<char> Endpoint::serialize() const {
   if (AllIBTransports.has(pimpl_->config_.transport)) {
     detail::serialize(data, pimpl_->ibQpInfo_);
     detail::serialize(data, pimpl_->ibNoAtomic_);
-    if (pimpl_->ibNoAtomic_) {
-      detail::serialize(data, pimpl_->ibSignalGpuMrInfo_);
-    }
   } else if (pimpl_->config_.transport == Transport::Ethernet) {
     detail::serialize(data, pimpl_->socketAddress_);
   }
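Note that `ibSignalGpuMrInfo_` is dropped from *both* `serialize()` and the deserializing constructor — removing it on only one side would shift every later field. A minimal illustration of that symmetry requirement, with simplified stand-ins for mscclpp's `detail::serialize`/`detail::deserialize` helpers:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

template <typename T>
void serialize(std::vector<char>& out, const T& v) {
  const char* p = reinterpret_cast<const char*>(&v);
  out.insert(out.end(), p, p + sizeof(T));
}

template <typename T>
const char* deserialize(const char* it, T& v) {
  std::memcpy(&v, it, sizeof(T));
  return it + sizeof(T);
}

int main() {
  std::vector<char> data;
  uint32_t qpn = 17;
  bool noAtomic = true;
  serialize(data, qpn);            // field order here...
  serialize(data, noAtomic);

  uint32_t qpn2 = 0;
  bool noAtomic2 = false;
  const char* it = data.data();
  it = deserialize(it, qpn2);      // ...must match field order here
  it = deserialize(it, noAtomic2);
  assert(qpn2 == 17 && noAtomic2);
  return 0;
}
```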
src/core/gdr.cc (112 lines changed)
@@ -5,6 +5,7 @@

 #if defined(MSCCLPP_USE_GDRCOPY)

 #include <gdrapi.h>
+#include <unistd.h>

 #include <mscclpp/env.hpp>
@@ -12,9 +13,11 @@

 #include "logger.hpp"

+#ifndef GPU_PAGE_SHIFT
 #define GPU_PAGE_SHIFT 16
 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
 #define GPU_PAGE_MASK (~(GPU_PAGE_SIZE - 1))
+#endif

 namespace mscclpp {
@@ -45,6 +48,23 @@ GdrStatus gdrStatus() { return gdrContext()->status(); }

 bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; }

+const char* gdrStatusMessage() {
+  switch (gdrStatus()) {
+    case GdrStatus::Ok:
+      return "GDRCopy initialized successfully";
+    case GdrStatus::NotBuilt:
+      return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
+    case GdrStatus::Disabled:
+      return "GDRCopy is disabled via MSCCLPP_FORCE_DISABLE_GDR environment variable";
+    case GdrStatus::DriverMissing:
+      return "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
+    case GdrStatus::OpenFailed:
+      return "gdr_open() failed; GDRCopy driver may be misconfigured";
+    default:
+      return "unknown GDRCopy status";
+  }
+}
+
 GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr) {
   if (env()->forceDisableGdr) {
     INFO(GPU, "GDRCopy disabled via MSCCLPP_FORCE_DISABLE_GDR");
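Centralizing the status strings in `gdrStatusMessage()` lets every caller produce an actionable error (this is what the IBConnection constructor above switches to). A hedged usage sketch — the include of the mscclpp-internal `gdr.hpp` header is an assumption about its path:

```cpp
#include <stdexcept>
#include <string>

#include "gdr.hpp"  // mscclpp-internal header (path assumed)

// Fail fast with the human-readable reason when GDRCopy is required.
void requireGdr() {
  if (!mscclpp::gdrEnabled()) {
    throw std::runtime_error(std::string("GDRCopy required but unavailable: ") +
                             mscclpp::gdrStatusMessage());
  }
}
```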
@@ -77,53 +97,79 @@ GdrContext::~GdrContext() {
   }
 }

-// GdrMap
+// GdrMap::Impl — real implementation with GDRCopy
+
+struct GdrMap::Impl {
+  std::shared_ptr<GdrContext> ctx;
+  std::shared_ptr<void> gpuMem;
+  gdr_mh_t mh;
+  void* barPtr;
+  uint64_t* hostDstPtr;
+  size_t mappedSize;
+};

-GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId)
-    : ctx_(gdrContext()),
-      gpuMem_(std::move(gpuMem)),
-      mh_{},
-      barPtr_(nullptr),
-      hostDstPtr_(nullptr),
-      mappedSize_(0) {
+GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId) : pimpl_(std::make_unique<Impl>()) {
+  pimpl_->ctx = gdrContext();
+  pimpl_->gpuMem = std::move(gpuMem);
+  pimpl_->mh = {};
+  pimpl_->barPtr = nullptr;
+  pimpl_->hostDstPtr = nullptr;
+  pimpl_->mappedSize = 0;
+
   // Ensure CUDA device context is active for gdr_pin_buffer
   CudaDeviceGuard deviceGuard(deviceId);

-  uint64_t gpuAddr = reinterpret_cast<uint64_t>(gpuMem_.get());
+  uint64_t gpuAddr = reinterpret_cast<uint64_t>(pimpl_->gpuMem.get());
   // Align to GPU page boundary and pin one page around the target address
   unsigned long alignedAddr = gpuAddr & GPU_PAGE_MASK;
   unsigned long pageOffset = gpuAddr - alignedAddr;
-  mappedSize_ = GPU_PAGE_SIZE;
+  pimpl_->mappedSize = GPU_PAGE_SIZE;

-  int ret = gdr_pin_buffer_v2(ctx_->handle(), alignedAddr, mappedSize_, GDR_PIN_FLAG_FORCE_PCIE, &mh_);
+  // Pin the GPU memory for GDRCopy BAR1 mapping. Try GDR_PIN_FLAG_FORCE_PCIE first for optimal
+  // ordering on platforms that support it (e.g., GB200). Fall back to flags=0 if FORCE_PCIE is
+  // not supported. Both paths work correctly: CPU writes via atomicStore, GPU reads via
+  // system-scope acquire.
+  int ret =
+      gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, GDR_PIN_FLAG_FORCE_PCIE, &pimpl_->mh);
   if (ret != 0) {
-    THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr,
-          ". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
+    ret = gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, 0, &pimpl_->mh);
+    if (ret != 0) {
+      THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr,
+            ". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
+    }
   }

-  ret = gdr_map(ctx_->handle(), mh_, &barPtr_, mappedSize_);
+  ret = gdr_map(pimpl_->ctx->handle(), pimpl_->mh, &pimpl_->barPtr, pimpl_->mappedSize);
   if (ret != 0) {
-    (void)gdr_unpin_buffer(ctx_->handle(), mh_);
+    (void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh);
     THROW(GPU, Error, ErrorCode::InternalError, "gdr_map failed (ret=", ret, ") for addr ", (void*)gpuAddr);
   }

-  hostDstPtr_ = reinterpret_cast<uint64_t*>(reinterpret_cast<char*>(barPtr_) + pageOffset);
+  pimpl_->hostDstPtr = reinterpret_cast<uint64_t*>(reinterpret_cast<char*>(pimpl_->barPtr) + pageOffset);

-  INFO(GPU, "GDRCopy mapping established: GPU addr ", (void*)gpuAddr, " -> host ptr ", (const void*)hostDstPtr_);
+  INFO(GPU, "GDRCopy mapping established: GPU addr ", (void*)gpuAddr, " -> host ptr ", (const void*)pimpl_->hostDstPtr);
 }

 GdrMap::~GdrMap() {
-  if (barPtr_ != nullptr) {
-    (void)gdr_unmap(ctx_->handle(), mh_, barPtr_, mappedSize_);
-  }
-  if (hostDstPtr_ != nullptr) {
-    (void)gdr_unpin_buffer(ctx_->handle(), mh_);
+  if (pimpl_) {
+    if (pimpl_->barPtr != nullptr) {
+      (void)gdr_unmap(pimpl_->ctx->handle(), pimpl_->mh, pimpl_->barPtr, pimpl_->mappedSize);
+    }
+    if (pimpl_->hostDstPtr != nullptr) {
+      (void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh);
+    }
   }
 }

-void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(mh_, hostDstPtr_, src, size); }
+bool GdrMap::valid() const { return pimpl_ && pimpl_->hostDstPtr != nullptr; }

-void GdrMap::copyFrom(void* dst, size_t size) const { gdr_copy_from_mapping(mh_, dst, hostDstPtr_, size); }
+uint64_t* GdrMap::hostPtr() const { return pimpl_ ? pimpl_->hostDstPtr : nullptr; }
+
+void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(pimpl_->mh, pimpl_->hostDstPtr, src, size); }
+
+void GdrMap::copyFrom(void* dst, size_t size) const {
+  gdr_copy_from_mapping(pimpl_->mh, dst, pimpl_->hostDstPtr, size);
+}

 }  // namespace mscclpp
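The constructor pins whole 64 KiB GPU pages and keeps the offset of the target address within the pinned page. A runnable worked example of exactly that alignment arithmetic (the address is illustrative):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t GPU_PAGE_SIZE = 1UL << 16;          // 64 KiB GPU page
  const uint64_t GPU_PAGE_MASK = ~(GPU_PAGE_SIZE - 1);

  uint64_t gpuAddr = 0x7f0000012348ULL;              // example device address
  uint64_t alignedAddr = gpuAddr & GPU_PAGE_MASK;    // 0x7f0000010000
  uint64_t pageOffset = gpuAddr - alignedAddr;       // 0x2348

  // gdr_pin_buffer_v2 pins [alignedAddr, alignedAddr + GPU_PAGE_SIZE);
  // hostDstPtr = barPtr + pageOffset points back at the original address.
  std::printf("aligned=%#lx offset=%#lx\n", (unsigned long)alignedAddr,
              (unsigned long)pageOffset);
  return 0;
}
```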
@@ -135,6 +181,24 @@ GdrStatus gdrStatus() { return GdrStatus::NotBuilt; }

 bool gdrEnabled() { return false; }

+const char* gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; }
+
+// GdrMap::Impl — stub (no GDRCopy)
+
+struct GdrMap::Impl {};
+
+GdrMap::GdrMap(std::shared_ptr<void> /*gpuMem*/, int /*deviceId*/) {}
+
+GdrMap::~GdrMap() = default;
+
+bool GdrMap::valid() const { return false; }
+
+uint64_t* GdrMap::hostPtr() const { return nullptr; }
+
+void GdrMap::copyTo(const void* /*src*/, size_t /*size*/) {}
+
+void GdrMap::copyFrom(void* /*dst*/, size_t /*size*/) const {}
+
 }  // namespace mscclpp

 #endif  // !defined(MSCCLPP_USE_GDRCOPY)
src/core/ib.cc (115 lines changed)
@@ -67,8 +67,7 @@ static inline bool isDmabufSupportedByGpu(int gpuId) {
   return ret;
 }

-IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5)
-    : mr_(nullptr), buff_(buff), size_(0), isDmabuf_(false), isDataDirect_(false) {
+IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nullptr), buff_(buff), size_(0) {
   if (size == 0) {
     THROW(NET, Error, ErrorCode::InvalidUsage, "invalid MR size: 0");
   }
@@ -91,11 +90,8 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5)
   int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
                     IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;
 #if defined(MSCCLPP_USE_MLX5DV)
-  if (isMlx5 && MLX5DV::isAvailable()) {
+  if (isDataDirect && MLX5DV::isAvailable()) {
     mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
-    if (mr_ != nullptr) {
-      isDataDirect_ = true;
-    }
   }
 #endif
   if (mr_ == nullptr) {
@@ -105,7 +101,6 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5)
   if (mr_ == nullptr) {
     THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")");
   }
-  isDmabuf_ = true;
 #else  // defined(MSCCLPP_USE_ROCM)
   THROW(NET, Error, ErrorCode::InvalidUsage, "We don't support DMABUF on HIP platforms yet");
 #endif  // defined(MSCCLPP_USE_ROCM)
@@ -145,12 +140,8 @@ const void* IbMr::getBuff() const { return buff_; }

 uint32_t IbMr::getLkey() const { return mr_->lkey; }

-bool IbMr::isDmabuf() const { return isDmabuf_; }
-
-bool IbMr::isDataDirect() const { return isDataDirect_; }
-
-IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum,
-           int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic, bool isMlx5)
+IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum,
+           int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic)
     : portNum_(portNum),
       gidIndex_(gidIndex),
       info_(),
@@ -171,8 +162,7 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum,
       maxSendWr_(maxSendWr),
       maxWrPerSend_(maxWrPerSend),
       maxRecvWr_(maxRecvWr),
-      noAtomic_(noAtomic),
-      isMlx5_(isMlx5) {
+      noAtomic_(noAtomic) {
   sendCq_ = IBVerbs::ibv_create_cq(ctx, maxSendCqSize, nullptr, nullptr, 0);
   if (sendCq_ == nullptr) {
     THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")");
@@ -186,47 +176,21 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum,
     }
   }

-  struct ibv_qp* qp = nullptr;
-#if defined(MSCCLPP_USE_MLX5DV)
-  if (isMlx5_) {
-    struct ibv_qp_init_attr_ex qpInitAttrEx = {};
-    qpInitAttrEx.sq_sig_all = 0;
-    qpInitAttrEx.send_cq = sendCq_;
-    qpInitAttrEx.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_;
-    qpInitAttrEx.qp_type = IBV_QPT_RC;
-    qpInitAttrEx.cap.max_send_wr = maxSendWr;
-    qpInitAttrEx.cap.max_recv_wr = maxRecvWr;
-    qpInitAttrEx.cap.max_send_sge = 1;
-    qpInitAttrEx.cap.max_recv_sge = 1;
-    qpInitAttrEx.cap.max_inline_data = 0;
-    qpInitAttrEx.pd = pd;
-    qpInitAttrEx.comp_mask = IBV_QP_INIT_ATTR_PD;
-
-    struct mlx5dv_qp_init_attr mlx5QpAttr = {};
-
-    qp = MLX5DV::mlx5dv_create_qp(ctx, &qpInitAttrEx, &mlx5QpAttr);
-    if (qp == nullptr) {
-      THROW(NET, IbError, errno, "mlx5dv_create_qp failed (errno ", errno, ")");
-    }
-  } else
-#endif  // defined(MSCCLPP_USE_MLX5DV)
-  {
-    struct ibv_qp_init_attr qpInitAttr = {};
-    qpInitAttr.sq_sig_all = 0;
-    qpInitAttr.send_cq = sendCq_;
-    // Use separate recv CQ if created, otherwise use the send CQ
-    qpInitAttr.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_;
-    qpInitAttr.qp_type = IBV_QPT_RC;
-    qpInitAttr.cap.max_send_wr = maxSendWr;
-    qpInitAttr.cap.max_recv_wr = maxRecvWr;
-    qpInitAttr.cap.max_send_sge = 1;
-    qpInitAttr.cap.max_recv_sge = 1;
-    qpInitAttr.cap.max_inline_data = 0;
-
-    qp = IBVerbs::ibv_create_qp(pd, &qpInitAttr);
-    if (qp == nullptr) {
-      THROW(NET, IbError, errno, "ibv_create_qp failed (errno ", errno, ")");
-    }
-  }
+  struct ibv_qp_init_attr qpInitAttr = {};
+  qpInitAttr.sq_sig_all = 0;
+  qpInitAttr.send_cq = sendCq_;
+  // Use separate recv CQ if created, otherwise use the send CQ
+  qpInitAttr.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_;
+  qpInitAttr.qp_type = IBV_QPT_RC;
+  qpInitAttr.cap.max_send_wr = maxSendWr;
+  qpInitAttr.cap.max_recv_wr = maxRecvWr;
+  qpInitAttr.cap.max_send_sge = 1;
+  qpInitAttr.cap.max_recv_sge = 1;
+  qpInitAttr.cap.max_inline_data = 0;
+
+  struct ibv_qp* qp = IBVerbs::ibv_create_qp(pd, &qpInitAttr);
+  if (qp == nullptr) {
+    THROW(NET, IbError, errno, "ibv_create_qp failed (errno ", errno, ")");
+  }

   struct ibv_port_attr portAttr;
@@ -483,12 +447,29 @@ std::string IbQp::getRecvWcStatusString(int idx) const { return IBVerbs::ibv_wc_

 unsigned int IbQp::getRecvWcImmData(int idx) const { return ntohl((*recvWcs_)[idx].imm_data); }

 IbCtx::IbCtx(const std::string& devName)
-    : devName_(devName), ctx_(nullptr), pd_(nullptr), supportsRdmaAtomics_(false), isMlx5_(false) {
+    : devName_(devName),
+      ctx_(nullptr),
+      pd_(nullptr),
+      supportsRdmaAtomics_(false),
+      isMlx5_(false),
+      dataDirect_(false),
+      isVF_(false) {
   int num;
   struct ibv_device** devices = IBVerbs::ibv_get_device_list(&num);
   for (int i = 0; i < num; ++i) {
     if (std::string(devices[i]->name) == devName_) {
       ctx_ = IBVerbs::ibv_open_device(devices[i]);

+      // Detect if this IB device is a Virtual Function (VF).
+      // VFs have a 'physfn' sysfs symlink pointing to their parent PF; PFs do not.
+      {
+        std::string physfnPath = "/sys/class/infiniband/" + devName_ + "/device/physfn";
+        isVF_ = (access(physfnPath.c_str(), F_OK) == 0);
+        if (isVF_) {
+          INFO(NET, "IB device ", devName_, " is a Virtual Function (Data Direct ordering available)");
+        }
+      }

 #if defined(MSCCLPP_USE_MLX5DV)
       if (MLX5DV::isAvailable()) {
         isMlx5_ = MLX5DV::mlx5dv_is_supported(devices[i]);
@@ -509,6 +490,20 @@ IbCtx::IbCtx(const std::string& devName)
     THROW(NET, IbError, errno, "ibv_alloc_pd failed (errno ", errno, ")");
   }

+  // Detect Data Direct support via mlx5dv_get_data_direct_sysfs_path
+#if defined(MSCCLPP_USE_MLX5DV)
+  if (isMlx5_ && MLX5DV::isAvailable()) {
+    char sysfsPath[256];
+    int ret = MLX5DV::mlx5dv_get_data_direct_sysfs_path(ctx_, sysfsPath, sizeof(sysfsPath));
+    if (ret == 0) {
+      dataDirect_ = true;
+      INFO(NET, "IB device ", devName_, " supports Data Direct (sysfs: ", sysfsPath, ")");
+    } else {
+      INFO(NET, "IB device ", devName_, " does not support Data Direct");
+    }
+  }
+#endif  // defined(MSCCLPP_USE_MLX5DV)
+
   // Query and cache RDMA atomics capability
   struct ibv_device_attr attr = {};
   if (IBVerbs::ibv_query_device(ctx_, &attr) == 0) {
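With VF and Data Direct detection moved into `IbCtx`, callers can inspect the device's capabilities up front. A hedged usage sketch of the new accessors (the internal `ib.hpp` include path is an assumption):

```cpp
#include <cstdio>

#include "ib.hpp"  // mscclpp-internal header (path assumed)

// Dump the capability bits this commit adds alongside the existing ones.
void reportIbCaps(const mscclpp::IbCtx& ctx) {
  std::printf("dev=%s atomics=%d mlx5=%d dataDirect=%d vf=%d\n",
              ctx.getDevName().c_str(), (int)ctx.supportsRdmaAtomics(),
              (int)ctx.isMlx5(), (int)ctx.supportsDataDirect(),
              (int)ctx.isVirtualFunction());
}
```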
@@ -579,17 +574,21 @@ std::shared_ptr<IbQp> IbCtx::createQp(int port, int gidIndex, int maxSendCqSize,
     THROW(NET, Error, ErrorCode::InvalidUsage, "invalid IB port: ", port);
   }
   return std::shared_ptr<IbQp>(new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr,
-                                        maxRecvWr, maxWrPerSend, noAtomic, isMlx5_));
+                                        maxRecvWr, maxWrPerSend, noAtomic));
 }

 std::unique_ptr<const IbMr> IbCtx::registerMr(void* buff, std::size_t size) {
-  return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size, isMlx5_));
+  return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size, dataDirect_));
 }

 bool IbCtx::supportsRdmaAtomics() const { return supportsRdmaAtomics_; }

 bool IbCtx::isMlx5() const { return isMlx5_; }

+bool IbCtx::supportsDataDirect() const { return dataDirect_; }
+
+bool IbCtx::isVirtualFunction() const { return isVF_; }
+
 MSCCLPP_API_CPP int getIBDeviceCount() {
   int num;
   IBVerbs::ibv_get_device_list(&num);
@@ -699,8 +698,6 @@ IbMr::~IbMr() {}
 IbMrInfo IbMr::getInfo() const { return IbMrInfo(); }
 const void* IbMr::getBuff() const { return nullptr; }
 uint32_t IbMr::getLkey() const { return 0; }
-bool IbMr::isDmabuf() const { return false; }
-bool IbMr::isDataDirect() const { return false; }

 IbQp::~IbQp() {}
 void IbQp::rtr(const IbQpInfo& /*info*/) {}
@@ -10,19 +10,37 @@

 #include "logger.hpp"

+// NOTE: MRC_SUPPORT is a temporary macro that makes the current MRC implementation work.
+// MRC_SUPPORT is needed because the current libibverbs implementation of MRC does not provide
+// all symbols that we need, so we need to load some symbols from the original libibverbs.
+// This macro will be removed (set to 0) once MRC provides all necessary symbols.
+// Non-MRC environments are not affected by this macro as long as the VMRC_LIBIBVERBS_SO
+// environment variable is not set.
+#define MRC_SUPPORT 1
+#if (MRC_SUPPORT)
+#include <cstdlib>
+#include <set>
+#endif  // (MRC_SUPPORT)

 namespace mscclpp {

 static std::unique_ptr<void, int (*)(void*)> globalIBVerbsHandle(nullptr, &::dlclose);
+#if (MRC_SUPPORT)
+static std::unique_ptr<void, int (*)(void*)> globalOrigIBVerbsHandle(nullptr, &::dlclose);
+#endif  // (MRC_SUPPORT)

 void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
+#if (MRC_SUPPORT)
+  static std::set<std::string> mrcSymbols = {
+      "ibv_get_device_list", "ibv_get_device_name", "ibv_open_device", "ibv_close_device", "ibv_query_qp",
+      "ibv_create_cq",       "ibv_destroy_cq",      "ibv_create_qp",   "ibv_modify_qp",    "ibv_destroy_qp",
+  };
+#endif  // (MRC_SUPPORT)
   if (!globalIBVerbsHandle) {
     if (mscclpp::env()->ibvSo != "") {
       void* handle = ::dlopen(mscclpp::env()->ibvSo.c_str(), RTLD_NOW);
       if (handle) {
         globalIBVerbsHandle.reset(handle);
       } else {
         THROW(NET, SysError, errno, "Failed to load libibverbs library specified by MSCCLPP_IBV_SO ('",
               mscclpp::env()->ibvSo, "'): ", std::string(::dlerror()));
       }
     } else {
       const char* possibleLibNames[] = {"libibverbs.so", "libibverbs.so.1", nullptr};
@@ -38,7 +56,26 @@ void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
       THROW(NET, SysError, errno, "Failed to open libibverbs: ", std::string(::dlerror()));
     }
   }
+#if (MRC_SUPPORT)
+  // In MRC mode, `VMRC_LIBIBVERBS_SO` should be set.
+  char* vmrcLibibverbsSo = ::getenv("VMRC_LIBIBVERBS_SO");
+  void* ptr;
+  if (vmrcLibibverbsSo != nullptr && mrcSymbols.find(symbol) == mrcSymbols.end()) {
+    // If we are in MRC mode and the symbol is not in the table, get it from the original libibverbs.
+    if (!globalOrigIBVerbsHandle) {
+      void* handle = ::dlopen(vmrcLibibverbsSo, RTLD_NOW);
+      if (!handle) {
+        THROW(NET, SysError, errno, "Failed to open ", std::string(vmrcLibibverbsSo));
+      }
+      globalOrigIBVerbsHandle.reset(handle);
+    }
+    ptr = ::dlsym(globalOrigIBVerbsHandle.get(), symbol.c_str());
+  } else {
+    ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
+  }
+#else  // !(MRC_SUPPORT)
   void* ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
+#endif  // !(MRC_SUPPORT)
   if (!ptr && !allowReturnNull) {
     THROW(NET, SysError, errno, "Failed to load libibverbs symbol: ", symbol);
   }
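The pattern here is two library handles with a per-symbol routing table: symbols the replacement library implements resolve from it, everything else falls back to the stock library. A hedged, stripped-down sketch of that shape (library names and the symbol set are illustrative, not the MRC configuration):

```cpp
#include <dlfcn.h>

#include <set>
#include <string>

// Resolve `sym` from an override library when it provides it, otherwise from
// the stock libibverbs — mirroring the MRC_SUPPORT split above.
void* resolve(const std::string& sym) {
  static void* overrideLib = ::dlopen("liboverride-ibverbs.so", RTLD_NOW);  // may be null
  static void* stockLib = ::dlopen("libibverbs.so.1", RTLD_NOW);
  static const std::set<std::string> providedByOverride = {"ibv_create_qp", "ibv_modify_qp"};

  if (overrideLib && providedByOverride.count(sym)) {
    return ::dlsym(overrideLib, sym.c_str());
  }
  return stockLib ? ::dlsym(stockLib, sym.c_str()) : nullptr;
}
```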
@@ -37,16 +37,18 @@ class BaseConnection {

   virtual void flush(int64_t timeoutUsec = -1) = 0;

-  /// Set the local address where forwarded signals should be written.
-  /// This is called by the receiver to specify where incoming signals should be forwarded.
-  /// Default implementation is a no-op for connections that don't need it.
-  /// @param mem Shared pointer to the memory for incoming writes (nullptr to clear).
-  virtual void setSignalForwardingDst(std::shared_ptr<uint64_t> /*mem*/) {}
+  /// Start signal forwarding to the given memory address.
+  /// Called by the semaphore to specify where incoming signals should be written.
+  /// @param mem Shared pointer to the GPU memory for the signal token.
+  virtual void startSignalForwarding(std::shared_ptr<uint64_t> /*mem*/) {}
+
+  /// Stop signal forwarding and release associated resources.
+  virtual void stopSignalForwarding() {}

   /// Whether this connection uses signal forwarding (e.g., IB host-no-atomic mode).
   /// When true, the semaphore must allocate a separate inboundToken_ for the recv thread to write to.
   /// When false, the NIC writes directly to the semaphore's registered memory (e.g., via atomics).
-  virtual bool usesSignalForwarding() const { return false; }
+  virtual bool isSignalForwarding() const { return false; }

   virtual Transport transport() const = 0;

@@ -105,31 +107,20 @@ class IBConnection : public BaseConnection {
   // For write-with-imm mode (HostNoAtomic): uses RDMA write-with-imm to signal
   // instead of atomic operations, with a host thread forwarding to GPU for memory consistency.
   bool ibNoAtomic_;
+  bool gdrSignalForwarding_;  // ibNoAtomic_ && gdrEnabled() — decided once at construction
   std::thread recvThread_;
   std::atomic<bool> stopRecvThread_;
   int localGpuDeviceId_;  // Local GPU device ID for CUDA context and GDR mapping

-  // Write-with-imm design:
-  // - Sender: 8-byte RDMA write-with-imm from local host buffer to remote signal GPU buffer,
-  //   carrying the token value both as RDMA payload and in imm_data (32-bit).
-  // - Receiver: reads the full 64-bit token from the local signal GPU buffer (via BAR1 or
-  //   volatile read), then writes it to remoteUpdateDstAddr_ (the semaphore's inbound token).
-  uint64_t remoteUpdateDstAddr_;
+  // Signal forwarding design (HostNoAtomic mode):
+  // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the token value in imm_data (32-bit).
+  // - Receiver: CPU recv thread polls recv CQ for WRITE_WITH_IMM completions (CQE), reads
+  //   the token from imm_data, then writes it to signalAddr_ (the semaphore's
+  //   inbound token) via atomicStore through the GDRCopy BAR1 mapping. The GPU reads
+  //   inboundToken with system-scope acquire ordering.
+  uint64_t signalAddr_;

-  // Remote endpoint's signal GPU buffer MR info (destination for RDMA write-with-imm).
-  // The local host buffer (atomicSrc_ / atomicSrcTransportInfo_.ibMr) serves as the source.
-  IbMrInfo remoteSignalGpuMrInfo_;
-
-  std::unique_ptr<GdrMap> remoteUpdateDstAddrMap_;
-  std::unique_ptr<GdrMap> localSignalGpuMap_;
-  uint64_t* localSignalGpuPtr_;
-
-  // When true, recvThreadFunc reads the token from imm_data (from CQE) instead of the
-  // signal GPU buffer via GDRCopy. Enabled only when all Data Direct conditions are met:
-  // the signal GPU buffer MR is registered with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT,
-  // and all GDRCopy mappings (local signal buffer and remoteUpdateDstAddr) are valid,
-  // so both RDMA data writes and GDRCopy token writes go through the Data Direct engine.
-  bool dataDirectEnabled_;
+  std::unique_ptr<GdrMap> signalGdrMap_;

   void recvThreadFunc();

@@ -137,12 +128,15 @@ class IBConnection : public BaseConnection {
   IBConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint);
   ~IBConnection();

-  /// Set the local address where forwarded signals should be written.
-  /// Must be called before the remote sends any updateAndSync in host-no-atomic mode.
-  /// @param mem Shared pointer to the memory for incoming writes (nullptr to clear).
-  void setSignalForwardingDst(std::shared_ptr<uint64_t> mem) override;
+  /// Start signal forwarding to the given memory address.
+  /// Must be called before the remote sends any updateAndSync in HostNoAtomic mode.
+  /// @param mem Shared pointer to the GPU memory for the signal token.
+  void startSignalForwarding(std::shared_ptr<uint64_t> mem) override;

-  bool usesSignalForwarding() const override;
+  /// Stop signal forwarding and release associated resources.
+  void stopSignalForwarding() override;
+
+  bool isSignalForwarding() const override;

   Transport transport() const override;
@@ -6,7 +6,6 @@

 #include <memory>
 #include <mscclpp/core.hpp>
-#include <mscclpp/gpu_utils.hpp>
 #include <vector>

 #include "ib.hpp"
@@ -30,13 +29,6 @@ struct Endpoint::Impl {
   std::shared_ptr<IbQp> ibQp_;
   IbQpInfo ibQpInfo_;

-  // Signal GPU buffer for write-with-imm data payload (ibNoAtomic_ only).
-  // Each endpoint allocates a 64-bit GPU buffer and registers it as an IB MR.
-  // The MR info is serialized/exchanged so the remote can RDMA-write to it.
-  std::shared_ptr<uint64_t> ibSignalGpuBuffer_;
-  std::unique_ptr<const IbMr> ibSignalGpuMr_;
-  IbMrInfo ibSignalGpuMrInfo_;

   // The following are only used for Ethernet and are undefined for other transports.
   std::unique_ptr<Socket> socket_;
   SocketAddress socketAddress_;
@@ -4,6 +4,10 @@
 #ifndef MSCCLPP_GDR_HPP_
 #define MSCCLPP_GDR_HPP_

+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
 namespace mscclpp {

 enum class GdrStatus {
@@ -20,25 +24,14 @@ GdrStatus gdrStatus();
 /// Whether the global GDRCopy context is enabled (shorthand for gdrStatus() == GdrStatus::Ok).
 bool gdrEnabled();

-}  // namespace mscclpp
+/// Return a human-readable error message for the current GDRCopy status.
+const char* gdrStatusMessage();

-#include <cstddef>
-#include <cstdint>
-#include <memory>
-
-#if defined(MSCCLPP_USE_GDRCOPY)
-
-#include <gdrapi.h>
-
-namespace mscclpp {
-
 class GdrContext;

-/// RAII wrapper for a per-connection GDRCopy BAR1 mapping of a GPU address.
+/// RAII wrapper for a GDRCopy BAR1 mapping of a GPU address.
+/// When GDRCopy is not available, all operations are no-ops and valid() returns false.
 class GdrMap {
  public:
   /// Pin and map a GPU address for direct host-side access.
   /// Holds a shared reference to the GPU memory to keep it alive.
   /// @param gpuMem Shared pointer to the GPU memory (e.g. from gpuCallocShared).
   /// @param deviceId The CUDA device ID for setting context.
   GdrMap(std::shared_ptr<void> gpuMem, int deviceId);
@@ -48,10 +41,10 @@ class GdrMap {
   GdrMap& operator=(const GdrMap&) = delete;

   /// Whether the mapping was established successfully.
-  bool valid() const { return hostDstPtr_ != nullptr; }
+  bool valid() const;

   /// Return the BAR1-mapped host pointer to the GPU location.
-  uint64_t* hostPtr() const { return hostDstPtr_; }
+  uint64_t* hostPtr() const;

   /// Copy data from host memory to the mapped GPU location.
   void copyTo(const void* src, size_t size);
@@ -60,36 +53,10 @@ class GdrMap {
   void copyFrom(void* dst, size_t size) const;

  private:
-  std::shared_ptr<GdrContext> ctx_;
-  std::shared_ptr<void> gpuMem_;
-  gdr_mh_t mh_;
-  void* barPtr_;
-  uint64_t* hostDstPtr_;
-  size_t mappedSize_;
+  struct Impl;
+  std::unique_ptr<Impl> pimpl_;
 };

 }  // namespace mscclpp

-#else  // !defined(MSCCLPP_USE_GDRCOPY)
-
-namespace mscclpp {
-
-/// Stub GdrMap when GDRCopy is not available.
-class GdrMap {
- public:
-  GdrMap(std::shared_ptr<void> /*gpuMem*/, int /*deviceId*/) {}
-  ~GdrMap() = default;
-
-  GdrMap(const GdrMap&) = delete;
-  GdrMap& operator=(const GdrMap&) = delete;
-
-  bool valid() const { return false; }
-  void copyTo(const void* /*src*/, size_t /*size*/) {}
-  void copyFrom(void* /*dst*/, size_t /*size*/) const {}
-  uint64_t* hostPtr() const { return nullptr; }
-};
-
-}  // namespace mscclpp
-
-#endif  // !defined(MSCCLPP_USE_GDRCOPY)
 #endif  // MSCCLPP_GDR_HPP_
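The header change above replaces two `#ifdef`-selected class definitions with a single pimpl class, so callers compile identically whether or not the GDRCopy-backed `Impl` exists. A minimal, self-contained illustration of that idiom (names are illustrative):

```cpp
#include <memory>

// "widget.hpp" — stable interface; no third-party header (e.g., gdrapi.h) leaks out.
class Widget {
 public:
  Widget();
  ~Widget();  // must be defined where Impl is complete
  bool valid() const;

 private:
  struct Impl;                   // only forward-declared here
  std::unique_ptr<Impl> pimpl_;
};

// "widget.cc" — one of several possible Impl definitions can live here.
struct Widget::Impl {
  bool ok = true;
};
Widget::Widget() : pimpl_(std::make_unique<Impl>()) {}
Widget::~Widget() = default;
bool Widget::valid() const { return pimpl_ && pimpl_->ok; }

int main() { return Widget{}.valid() ? 0 : 1; }
```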
@@ -34,17 +34,13 @@ class IbMr {
   IbMrInfo getInfo() const;
   const void* getBuff() const;
   uint32_t getLkey() const;
-  bool isDmabuf() const;
-  bool isDataDirect() const;

  private:
-  IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5);
+  IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect);

   ibv_mr* mr_;
   void* buff_;
   std::size_t size_;
-  bool isDmabuf_;
-  bool isDataDirect_;

   friend class IbCtx;
 };
@@ -92,7 +88,6 @@ class IbQp {
   int getRecvWcStatus(int idx) const;
   std::string getRecvWcStatusString(int idx) const;
   unsigned int getRecvWcImmData(int idx) const;
-  bool isMlx5() const { return isMlx5_; }

  private:
   struct SendWrInfo {
@@ -106,7 +101,7 @@ class IbQp {
   };

   IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
-       int maxRecvWr, int maxWrPerSend, bool noAtomic, bool isMlx5);
+       int maxRecvWr, int maxWrPerSend, bool noAtomic);
   SendWrInfo getNewSendWrInfo();
   RecvWrInfo getNewRecvWrInfo();

@@ -134,7 +129,6 @@ class IbQp {
   const int maxWrPerSend_;
   const int maxRecvWr_;
   const bool noAtomic_;
-  const bool isMlx5_;

   friend class IbCtx;
 };
@@ -150,6 +144,8 @@ class IbCtx {
   std::unique_ptr<const IbMr> registerMr(void* buff, std::size_t size);
   bool supportsRdmaAtomics() const;
   bool isMlx5() const;
+  bool supportsDataDirect() const;
+  bool isVirtualFunction() const;
 #else
   IbCtx([[maybe_unused]] const std::string& devName) {}
   ~IbCtx() {}
@@ -160,6 +156,8 @@ class IbCtx {
   }
   bool supportsRdmaAtomics() const { return false; }
   bool isMlx5() const { return false; }
+  bool supportsDataDirect() const { return false; }
+  bool isVirtualFunction() const { return false; }
 #endif

   const std::string& getDevName() const { return devName_; };
@@ -173,6 +171,8 @@ class IbCtx {
   ibv_pd* pd_;
   bool supportsRdmaAtomics_;
   bool isMlx5_;
+  bool dataDirect_;
+  bool isVF_;
 };

 }  // namespace mscclpp
@@ -6,7 +6,7 @@

 #if defined(MSCCLPP_USE_MLX5DV)

 #include <infiniband/mlx5dv.h>
 #include <infiniband/verbs.h>

 #include <string>

@@ -19,14 +19,14 @@ struct MLX5DV {
   /// Check if the given IB device supports mlx5 Direct Verbs.
   static bool mlx5dv_is_supported(struct ibv_device* device);

-  /// Create a QP using mlx5dv extensions.
-  static struct ibv_qp* mlx5dv_create_qp(struct ibv_context* ctx, struct ibv_qp_init_attr_ex* qpAttr,
-                                         struct mlx5dv_qp_init_attr* mlx5QpAttr);
-
   /// Register a DMABUF memory region using mlx5dv extensions.
   /// Returns nullptr if mlx5dv_reg_dmabuf_mr is not available in this rdma-core version.
   static struct ibv_mr* mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
                                              int access);

+  /// Query the Data Direct sysfs path for the given IB context.
+  /// Returns 0 on success (device supports Data Direct), non-zero otherwise.
+  static int mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len);
+
  private:
   static void* dlsym(const std::string& symbol, bool allowReturnNull = false);
@@ -3,9 +3,19 @@

 #if defined(MSCCLPP_USE_MLX5DV)

+// _GNU_SOURCE is required for dlvsym()
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include "mlx5dv_wrapper.hpp"

 #include <dlfcn.h>
 #include <infiniband/mlx5dv.h>

+#ifndef MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
+#define MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT (1 << 0)
+#endif
+
 #include <memory>

@@ -72,14 +82,6 @@ bool MLX5DV::mlx5dv_is_supported(struct ibv_device* device) {
   return impl(device);
 }

-struct ibv_qp* MLX5DV::mlx5dv_create_qp(struct ibv_context* ctx, struct ibv_qp_init_attr_ex* qpAttr,
-                                        struct mlx5dv_qp_init_attr* mlx5QpAttr) {
-  using FuncType = struct ibv_qp* (*)(struct ibv_context*, struct ibv_qp_init_attr_ex*, struct mlx5dv_qp_init_attr*);
-  static FuncType impl = nullptr;
-  if (!impl) impl = reinterpret_cast<FuncType>(MLX5DV::dlsym("mlx5dv_create_qp"));
-  return impl(ctx, qpAttr, mlx5QpAttr);
-}
-
 struct ibv_mr* MLX5DV::mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
                                             int access) {
   // mlx5dv_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access) — the last arg is mlx5-specific flags.
@@ -92,12 +94,27 @@ struct ibv_mr* MLX5DV::mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
     resolved = true;
   }
   if (!impl) return nullptr;
-#ifndef MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
-#define MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT (1 << 0)
-#endif
   return impl(pd, offset, length, iova, fd, access, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT);
 }

+int MLX5DV::mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len) {
+  using FuncType = int (*)(struct ibv_context*, char*, size_t);
+  static FuncType impl = nullptr;
+  static bool resolved = false;
+  if (!resolved) {
+    if (globalMLX5Handle) {
+      void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_get_data_direct_sysfs_path", "MLX5_1.25");
+      if (!ptr) {
+        ptr = MLX5DV::dlsym("mlx5dv_get_data_direct_sysfs_path", /*allowReturnNull=*/true);
+      }
+      impl = ptr ? reinterpret_cast<FuncType>(ptr) : nullptr;
+    }
+    resolved = true;
+  }
+  if (!impl) return -1;
+  return impl(context, buf, buf_len);
+}
+
 }  // namespace mscclpp

 #endif  // defined(MSCCLPP_USE_MLX5DV)
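The new wrapper tries a versioned lookup first and degrades gracefully when the symbol is absent in older rdma-core. A hedged sketch of that dlvsym-then-dlsym fallback in isolation — the handle source, the version tag, and the generic function signature are illustrative assumptions:

```cpp
#ifndef _GNU_SOURCE
#define _GNU_SOURCE  // dlvsym() is a GNU extension
#endif
#include <dlfcn.h>

using QueryFn = int (*)(void*, char*, unsigned long);

// Prefer the exact symbol version; a mismatched tag returns nullptr instead of
// silently binding a different ABI. Fall back to an unversioned lookup.
QueryFn resolveVersioned(void* handle) {
  void* ptr = dlvsym(handle, "mlx5dv_get_data_direct_sysfs_path", "MLX5_1.25");
  if (!ptr) {
    ptr = dlsym(handle, "mlx5dv_get_data_direct_sysfs_path");
  }
  return reinterpret_cast<QueryFn>(ptr);  // may be nullptr on old rdma-core
}
```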
@@ -123,19 +123,18 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& semaphore)
     THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2DeviceSemaphore should be GPU");
   }
   auto connImpl = BaseConnection::getImpl(connection());
-  if (connImpl->usesSignalForwarding()) {
-    // Signal forwarding mode: the recv thread writes the token to GPU memory.
-    // Allocate a separate inbound token via plain cudaMalloc (not TokenPool/VMM)
-    // so that it is always compatible with GDRCopy pinning (VMM memory cannot be pinned by gdr_pin_buffer).
+  if (connImpl->isSignalForwarding()) {
+    // Signal forwarding (HostNoAtomic): the receiver's recv thread polls the recv CQ for
+    // WRITE_WITH_IMM completions, then forwards the token to inboundToken_ via GDRCopy.
     CudaDeviceGuard deviceGuard(connection().localDevice().id);
 #if defined(MSCCLPP_USE_ROCM)
     inboundToken_ = detail::gpuCallocUncachedShared<uint64_t>();
 #else
     inboundToken_ = detail::gpuCallocShared<uint64_t>();
 #endif
-    connImpl->setSignalForwardingDst(inboundToken_);
+    connImpl->startSignalForwarding(inboundToken_);
   }
-  // When usesSignalForwarding() is false (e.g., atomic mode), inboundToken_ stays null
+  // When isSignalForwarding() is false (atomic mode), inboundToken_ stays null
   // and the GPU polls the SemaphoreStub token directly (the NIC atomic target).
 }

@@ -144,9 +143,9 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator,

 MSCCLPP_API_CPP Host2DeviceSemaphore::~Host2DeviceSemaphore() {
   if (inboundToken_) {
-    // Clear the connection's signal forwarding destination (and any associated GdrMap)
+    // Clear the connection's signal forwarding destination (and GdrMap)
     // before inboundToken_ is freed, to avoid use-after-free on the pinned GPU memory.
-    BaseConnection::getImpl(connection())->setSignalForwardingDst(nullptr);
+    BaseConnection::getImpl(connection())->stopSignalForwarding();
   }
 }

@@ -158,7 +157,7 @@ MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {

 MSCCLPP_API_CPP Host2DeviceSemaphore::DeviceHandle Host2DeviceSemaphore::deviceHandle() const {
   Host2DeviceSemaphore::DeviceHandle device;
-  // If inboundToken_ is allocated (host-no-atomic mode), the GPU polls it.
+  // If inboundToken_ is allocated (signal forwarding mode), the GPU polls it.
   // Otherwise (atomic mode), the GPU polls the SemaphoreStub token directly,
   // which is the same address targeted by the NIC's atomic operation.
   device.inboundToken =
@@ -178,12 +177,12 @@ MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(const Semaphore& semaphore)
     THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2HostSemaphore should be CPU");
   }
   auto connImpl = BaseConnection::getImpl(connection());
-  if (connImpl->usesSignalForwarding()) {
+  if (connImpl->isSignalForwarding()) {
     // Signal forwarding mode: tell the recv thread where to write the incoming token.
     // Non-owning shared_ptr: Host2HostSemaphore outlives the connection, so the memory stays valid.
     auto token =
         std::shared_ptr<uint64_t>(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()), [](uint64_t*) {});
-    connImpl->setSignalForwardingDst(std::move(token));
+    connImpl->startSignalForwarding(std::move(token));
   }
 }
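The `Host2HostSemaphore` path above uses a no-op deleter to hand non-owned memory to an API that wants shared ownership. That trick in isolation, as a runnable sketch — the caller must guarantee the pointee outlives every copy of the pointer:

```cpp
#include <cstdint>
#include <memory>

int main() {
  uint64_t token = 0;  // owned elsewhere (here: the stack)

  // Non-owning shared_ptr: the custom deleter does nothing, so destruction of
  // the shared_ptr never frees `token`.
  auto nonOwning = std::shared_ptr<uint64_t>(&token, [](uint64_t*) {});
  *nonOwning = 7;      // writes through to `token`

  return token == 7 ? 0 : 1;
}
```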
@@ -285,6 +285,9 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
       passed++;
     } else {
       std::cout << "[  FAILED  ] " << fullName << std::endl;
+      if (!gCurrentTestFailureMessage.empty()) {
+        std::cout << "  Reason: " << gCurrentTestFailureMessage << std::endl;
+      }
       failed++;
     }
   }
@@ -3,8 +3,12 @@

#include <mpi.h>

#include <atomic>
#include <mscclpp/atomic_device.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <thread>

#include "gdr.hpp"
#include "mp_unit_tests.hpp"
#include "utils_internal.hpp"

@@ -41,7 +45,10 @@ void IbPeerToPeerTest::SetUp() {

ibCtx = std::make_shared<mscclpp::IbCtx>(ibDevName);
bool noAtomic = !ibCtx->supportsRdmaAtomics();
qp = ibCtx->createQp(-1, ib_gid_index, 1024, 1, 8192, 0, 64, noAtomic);
// When atomics are not supported, the MemoryConsistency test uses
// write-with-imm which requires recv WRs on the receiver side.
int maxRecvWr = noAtomic ? 64 : 0;
qp = ibCtx->createQp(-1, ib_gid_index, 1024, 1, 8192, maxRecvWr, 64, noAtomic);

qpInfo[gEnv->rank] = qp->getInfo();
bootstrap->allGather(qpInfo.data(), sizeof(mscclpp::IbQpInfo));
@@ -199,15 +206,34 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
// This test needs only two ranks
return;
}
if (!ibCtx->supportsRdmaAtomics()) {
GTEST_SKIP() << "This test requires RDMA atomics support.";
}

// Use atomic path if supported by the IB device.
bool useAtomic = ibCtx->supportsRdmaAtomics();

const uint64_t signalPeriod = 1024;
const uint64_t maxIter = 10000;
const uint64_t nelem = 65536 + 1;
auto data = mscclpp::detail::gpuCallocUnique<uint64_t>(nelem);

// For no-atomic mode: allocate a separate signal buffer for write-with-imm destination.
// The sender writes-with-imm to this buffer; the receiver's CPU thread reads the imm_data
// from the recv CQ and writes the iteration value to data[0] via GDRCopy atomicStore.
std::shared_ptr<uint64_t> signalBuf;
std::unique_ptr<const mscclpp::IbMr> signalMr;
std::array<mscclpp::IbMrInfo, 2> signalMrInfo{};
if (!useAtomic) {
signalBuf = mscclpp::detail::gpuCallocShared<uint64_t>(1);
signalMr = ibCtx->registerMr(signalBuf.get(), sizeof(uint64_t));
signalMrInfo[gEnv->rank] = signalMr->getInfo();
bootstrap->allGather(signalMrInfo.data(), sizeof(mscclpp::IbMrInfo));

// Pre-post recv WRs for write-with-imm on both ranks
for (int i = 0; i < 64; ++i) {
qp->stageRecv(0);
}
qp->postRecv();
}

registerBufferAndConnect(data.get(), sizeof(uint64_t) * nelem);

uint64_t res = 0;
@@ -226,6 +252,40 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
ASSERT_EQ(*ptrCurIter, 0);
ASSERT_EQ(*ptrResult, 0);

// For no-atomic mode: create a GDRCopy mapping for data[0] and start a CPU thread that
// polls recv CQ and forwards the signal via GDRCopy BAR1 write — the same mechanism
// used by IBConnection::recvThreadFunc for port channels.
std::atomic<bool> stopRecvThread(false);
std::thread recvThread;
std::unique_ptr<mscclpp::GdrMap> dataGdrMap;
if (!useAtomic) {
if (!mscclpp::gdrEnabled()) {
SKIP_TEST() << "No-atomic mode requires GDRCopy but it is not available.";
}
// Create GDRCopy BAR1 mapping for data[0] — same as how connection.cc maps inboundToken_
dataGdrMap =
std::make_unique<mscclpp::GdrMap>(std::shared_ptr<void>(data.get(), [](void*) {}), // non-owning shared_ptr
cudaDevId);

recvThread = std::thread([&]() {
while (!stopRecvThread.load(std::memory_order_relaxed)) {
int wcNum = qp->pollRecvCq();
if (wcNum <= 0) continue;
for (int i = 0; i < wcNum; ++i) {
int status = qp->getRecvWcStatus(i);
if (status != static_cast<int>(mscclpp::WsStatus::Success)) continue;
uint64_t val = static_cast<uint64_t>(qp->getRecvWcImmData(i));
// Write the iteration value to data[0] via GDRCopy BAR1 atomicStore —
// same pattern as IBConnection::recvThreadFunc.
mscclpp::atomicStore(dataGdrMap->hostPtr(), val, mscclpp::memoryOrderRelaxed);
// Re-post recv
qp->stageRecv(0);
qp->postRecv();
}
}
});
}

kernelMemoryConsistency<<<1, 1024>>>(data.get(), ptrCurIter, ptrResult, nelem, maxIter);
MSCCLPP_CUDATHROW(cudaGetLastError());

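The device side of this test, kernelMemoryConsistency, is not part of this diff. Based on the launch site above and how res is interpreted below (bit 2 means the receiver got stuck, other nonzero bits mean stale data), its polling loop plausibly looks like this single-thread sketch (illustrative only; the real kernel runs 1024 threads and may differ):

// Illustrative sketch of the consistency-checking loop; not the actual kernel.
__global__ void kernelMemoryConsistencySketch(uint64_t* data, volatile uint64_t* curIter,
                                              volatile uint64_t* result, uint64_t nelem, uint64_t maxIter) {
  for (uint64_t iter = 1; iter <= maxIter; ++iter) {
    // Wait for the sender's signal: data[0] advances once per iteration.
    uint64_t spin = 0;
    while (mscclpp::atomicLoad(data, mscclpp::memoryOrderAcquire) < iter) {
      if (++spin == 1000000000ULL) {
        *result |= 2;  // receiver is stuck
        return;
      }
    }
    // Every payload element should already carry this iteration's value.
    for (uint64_t i = 1; i < nelem; ++i) {
      if (data[i] != iter) *result |= 1;  // memory inconsistency observed
    }
    *curIter = iter;  // report progress to the host
  }
}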
@@ -247,6 +307,11 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
}

MSCCLPP_CUDATHROW(cudaDeviceSynchronize());

if (!useAtomic) {
stopRecvThread.store(true, std::memory_order_relaxed);
if (recvThread.joinable()) recvThread.join();
}
} else if (gEnv->rank == 1) {
// Sender
std::vector<uint64_t> hostBuffer(nelem, 0);
@@ -267,15 +332,20 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
stageSendWrite(sizeof(uint64_t) * (nelem - 1), 0, sizeof(uint64_t), sizeof(uint64_t), signaled);
qp->postSend();

#if 0
// For reference: send the first element using a normal send. This should occasionally see a wrong result.
stageSendWrite(sizeof(uint64_t), 0, 0, 0, false);
qp->postSend();
#else
// Send the first element using AtomicAdd. This should see the correct result.
stageSendAtomicAdd(0, 0, 1, false);
qp->postSend();
#endif
if (useAtomic) {
// Send the first element using AtomicAdd. The non-posted PCIe atomic operation
// provides end-to-end ordering: data[1..N] are guaranteed visible when data[0] updates.
stageSendAtomicAdd(0, 0, 1, false);
qp->postSend();
} else {
// No-atomic mode: send a 0-byte WRITE_WITH_IMM carrying the iteration in imm_data.
// The receiver's CPU thread polls the recv CQ and writes the value to data[0]
// via GDRCopy atomicStore.
// QP ordering guarantees data[1..N] WRITE completes before this write-with-imm.
const mscclpp::IbMrInfo& remoteSignalMrInfo = signalMrInfo[(gEnv->rank == 1) ? 0 : 1];
qp->stageSendWriteWithImm(nullptr, remoteSignalMrInfo, 0, 0, 0, 0, false, static_cast<unsigned int>(iter));
qp->postSend();
}

if (signaled) {
int wcNum = qp->pollSendCq();
@@ -296,13 +366,23 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
}
}

if (res & 2) {
FAIL() << "The receiver is stuck at iteration " << iter << ".";
} else if (res != 0 && res != 1) {
FAIL() << "Unknown error is detected at iteration " << iter << ". res =" << res;
if (useAtomic) {
// With RDMA atomics, memory consistency must be guaranteed.
if (res & 2) {
FAIL() << "The receiver is stuck at iteration " << iter << ".";
}
EXPECT_EQ(res, 0);
} else {
if (res == 0) {
// No-atomic path works correctly here.
} else if (res & 2) {
SKIP_TEST() << "No-atomic signal forwarding: receiver stuck at iteration " << iter
<< ". NIC DMA and CPU writes are not ordered on this platform.";
} else {
SKIP_TEST() << "No-atomic signal forwarding: memory inconsistency detected at iteration " << iter
<< ". NIC DMA and CPU writes are not ordered on this platform.";
}
}

EXPECT_EQ(res, 0);
}
TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
@@ -311,7 +391,7 @@ TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
return;
}
if (!ibCtx->supportsRdmaAtomics()) {
GTEST_SKIP() << "This test requires RDMA atomics support.";
SKIP_TEST() << "This test requires RDMA atomics support.";
}

mscclpp::Timer timeout(3);

@@ -4,9 +4,24 @@
#include <cstdint>
#include <mscclpp/concurrency_device.hpp>

#include "gdr.hpp"
#include "mp_unit_tests.hpp"
#include "utils_internal.hpp"

// Skip the current test if HostNoAtomic mode is not supported.
// On CUDA, HostNoAtomic requires GDRCopy for BAR1 signal forwarding.
// On ROCm, HostNoAtomic uses direct volatile writes and does not need GDRCopy.
#if defined(MSCCLPP_USE_CUDA)
#define REQUIRE_HOST_NO_ATOMIC \
do { \
if (!mscclpp::gdrEnabled()) { \
SKIP_TEST() << "HostNoAtomic requires GDRCopy: " << mscclpp::gdrStatusMessage(); \
} \
} while (0)
#else
#define REQUIRE_HOST_NO_ATOMIC // No extra requirements on non-CUDA platforms.
#endif

void PortChannelOneToOneTest::SetUp() {
// Use only two ranks
setNumRanksToUse(2);
@@ -272,6 +287,7 @@ TEST(PortChannelOneToOneTest, PingPongPerfIbHostMode) {

TEST(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
REQUIRE_HOST_NO_ATOMIC;
testPingPongPerf(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
}
@@ -465,16 +481,19 @@ TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {

TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
REQUIRE_HOST_NO_ATOMIC;
testPacketPingPongPerf(true, IbMode::HostNoAtomic);
}

TEST(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
REQUIRE_HOST_NO_ATOMIC;
testPingPong(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
}

TEST(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
REQUIRE_HOST_NO_ATOMIC;
testPacketPingPong(true, IbMode::HostNoAtomic);
}

@@ -4,6 +4,7 @@
target_sources(unit_tests PRIVATE
unit_tests_main.cc
core_tests.cc
gdr_tests.cu
gpu_utils_tests.cc
errors_tests.cc
fifo_tests.cu

251
test/unit/gdr_tests.cu
Normal file
@@ -0,0 +1,251 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <mscclpp/atomic_device.hpp>
#include <mscclpp/errors.hpp>
#include <mscclpp/gpu_utils.hpp>

#include "../framework.hpp"
#include "gdr.hpp"

// GdrStatus and gdrEnabled

class GdrStatusTest : public ::mscclpp::test::TestCase {};

TEST(GdrStatusTest, StatusIsValid) {
// gdrStatus() should return one of the defined enum values
auto status = mscclpp::gdrStatus();
ASSERT_TRUE(status == mscclpp::GdrStatus::Ok || status == mscclpp::GdrStatus::NotBuilt ||
status == mscclpp::GdrStatus::Disabled || status == mscclpp::GdrStatus::DriverMissing ||
status == mscclpp::GdrStatus::OpenFailed);
}

TEST(GdrStatusTest, EnabledConsistentWithStatus) {
// gdrEnabled() should be true iff gdrStatus() == Ok
EXPECT_EQ(mscclpp::gdrEnabled(), mscclpp::gdrStatus() == mscclpp::GdrStatus::Ok);
}

// GdrMap tests — only run when GDRCopy is available

class GdrMapTest : public ::mscclpp::test::TestCase {
protected:
void SetUp() override {
if (!mscclpp::gdrEnabled()) {
SKIP_TEST() << "GDRCopy not enabled on this platform.";
}
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_));
// Try creating a GDRCopy mapping to check if pin+map works on this platform.
try {
auto testMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap testMap(std::static_pointer_cast<void>(testMem), deviceId_);
} catch (const std::exception&) {
SKIP_TEST() << "GDRCopy mapping not supported on this platform.";
}
}

int deviceId_ = 0;
};

TEST(GdrMapTest, BasicMapping) {
// Allocate GPU memory via cudaMalloc (not VMM) and create a GDRCopy mapping
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);

ASSERT_TRUE(map.valid());
EXPECT_NE(map.hostPtr(), nullptr);
}

TEST(GdrMapTest, CopyToAndFrom) {
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
ASSERT_TRUE(map.valid());

// Write a value to GPU via GDRCopy
uint64_t writeVal = 0xDEADBEEFCAFE0123ULL;
map.copyTo(&writeVal, sizeof(uint64_t));

// Read it back via GDRCopy
uint64_t readVal = 0;
map.copyFrom(&readVal, sizeof(uint64_t));
EXPECT_EQ(readVal, writeVal);

// Also verify via cudaMemcpy
uint64_t cudaVal = 0;
MSCCLPP_CUDATHROW(cudaMemcpy(&cudaVal, gpuMem.get(), sizeof(uint64_t), cudaMemcpyDeviceToHost));
EXPECT_EQ(cudaVal, writeVal);
}

TEST(GdrMapTest, CopyToVisibleFromGpu) {
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
ASSERT_TRUE(map.valid());

// Write via GDRCopy, verify GPU sees it via cudaMemcpy
uint64_t val = 42;
map.copyTo(&val, sizeof(uint64_t));

uint64_t result = 0;
MSCCLPP_CUDATHROW(cudaMemcpy(&result, gpuMem.get(), sizeof(uint64_t), cudaMemcpyDeviceToHost));
EXPECT_EQ(result, 42);
}

TEST(GdrMapTest, MultipleWritesReadBack) {
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
ASSERT_TRUE(map.valid());

// Write multiple values sequentially and verify each
for (uint64_t i = 1; i <= 100; ++i) {
map.copyTo(&i, sizeof(uint64_t));
uint64_t readback = 0;
map.copyFrom(&readback, sizeof(uint64_t));
EXPECT_EQ(readback, i);
if (readback != i) break;
}
}

TEST(GdrMapTest, HostPtrIsWritable) {
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
ASSERT_TRUE(map.valid());

// Write directly through the hostPtr (volatile store)
volatile uint64_t* ptr = reinterpret_cast<volatile uint64_t*>(map.hostPtr());
*ptr = 12345;

// Read back via GDRCopy
uint64_t readback = 0;
map.copyFrom(&readback, sizeof(uint64_t));
EXPECT_EQ(readback, 12345);
}

TEST(GdrMapTest, HostPtrIsReadable) {
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
ASSERT_TRUE(map.valid());

// Write via GDRCopy copyTo (same BAR1 path as the read)
uint64_t val = 99999;
map.copyTo(&val, sizeof(uint64_t));

// Read through the hostPtr (volatile load via BAR1)
volatile uint64_t* ptr = reinterpret_cast<volatile uint64_t*>(map.hostPtr());
EXPECT_EQ(*ptr, 99999);
}

TEST(GdrMapTest, DestroyDoesNotCrash) {
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
{
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
ASSERT_TRUE(map.valid());
uint64_t val = 1;
map.copyTo(&val, sizeof(uint64_t));
}
// After GdrMap is destroyed, gpuMem should still be valid
uint64_t result = 0;
MSCCLPP_CUDATHROW(cudaMemcpy(&result, gpuMem.get(), sizeof(uint64_t), cudaMemcpyDeviceToHost));
EXPECT_EQ(result, 1);
}

// GPU kernel: polls signalFromCpu until it reaches the current iteration value, then writes
// that value to ackToHost. Repeats for maxIter iterations. The GPU polls signalFromCpu with
// plain volatile loads and uses plain stores to ackToHost (host-pinned memory visible to the CPU).
__global__ void kernelGdrVisibilityPingPong(volatile uint64_t* signalFromCpu, volatile uint64_t* ackToHost,
uint64_t maxIter) {
for (uint64_t iter = 1; iter <= maxIter; ++iter) {
// Poll until CPU writes the expected iteration value via GDRCopy BAR1
while (*signalFromCpu < iter) {
}
// Ack back to CPU via host-pinned memory
*ackToHost = iter;
}
}

TEST(GdrMapTest, CpuGpuVisibilityPingPong) {
const uint64_t maxIter = 10000;

// signalBuf: GPU memory mapped via GDRCopy BAR1. CPU writes here, GPU polls.
auto signalBuf = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap signalMap(std::static_pointer_cast<void>(signalBuf), deviceId_);
ASSERT_TRUE(signalMap.valid());

// ackBuf: host-pinned memory (gpuCallocHostShared). GPU writes here, CPU polls.
auto ackBuf = mscclpp::detail::gpuCallocHostShared<uint64_t>(1);
volatile uint64_t* ackPtr = reinterpret_cast<volatile uint64_t*>(ackBuf.get());
*ackPtr = 0;

// Launch kernel — it will poll signalBuf and write ackBuf for each iteration
kernelGdrVisibilityPingPong<<<1, 1>>>(signalBuf.get(), ackBuf.get(), maxIter);
MSCCLPP_CUDATHROW(cudaGetLastError());

for (uint64_t iter = 1; iter <= maxIter; ++iter) {
// CPU writes iteration value to GPU via GDRCopy BAR1
uint64_t val = iter;
signalMap.copyTo(&val, sizeof(uint64_t));

// CPU polls host-pinned ack until GPU confirms it saw the value
int spin = 0;
while (*ackPtr < iter) {
if (++spin > 100000000) {
FAIL() << "GPU did not ack iteration " << iter << " (ack=" << *ackPtr << ")";
}
}
}

MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ackPtr, maxIter);
}

// GPU kernel that polls a counter using system-scope acquire loads.
// Once counter reaches the current iteration value, writes that value as the ack.
__global__ void kernelCounterWait(uint64_t* counter, volatile uint64_t* ackToHost, uint64_t maxIter) {
for (uint64_t iter = 1; iter <= maxIter; ++iter) {
// System-scope acquire load — matches the atomicStore(relaxed) on the CPU side
uint64_t got;
do {
got = mscclpp::atomicLoad(counter, mscclpp::memoryOrderAcquire);
} while (got < iter);
// Ack back
*ackToHost = iter;
}
}

// Test the GDRCopy counter pattern used by HostNoAtomic mode:
// - GPU memory allocated via gpuCallocShared (cudaMalloc)
// - GdrMap for BAR1 mapping
// - CPU writes via atomicStore(relaxed) through GDRCopy BAR1 mapping
// - GPU reads via atomicLoad with memory_order_acquire
TEST(GdrMapTest, AtomicStoreCounterPingPong) {
const uint64_t maxIter = 10000;

// Allocate GPU memory via gpuCallocShared
auto counterBuf = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap counterMap(std::static_pointer_cast<void>(counterBuf), deviceId_);
ASSERT_TRUE(counterMap.valid());

// Ack buffer: host-pinned memory
auto ackBuf = mscclpp::detail::gpuCallocHostShared<uint64_t>(1);
volatile uint64_t* ackPtr = reinterpret_cast<volatile uint64_t*>(ackBuf.get());
*ackPtr = 0;

// Launch kernel — polls counterBuf with system-scope acquire load
kernelCounterWait<<<1, 1>>>(counterBuf.get(), ackBuf.get(), maxIter);
MSCCLPP_CUDATHROW(cudaGetLastError());

for (uint64_t iter = 1; iter <= maxIter; ++iter) {
// CPU writes counter via atomicStore (relaxed — GPU uses acquire on read)
mscclpp::atomicStore(counterMap.hostPtr(), iter, mscclpp::memoryOrderRelaxed);

// Wait for GPU ack
int spin = 0;
while (*ackPtr < iter) {
if (++spin > 100000000) {
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
FAIL() << "GPU did not ack iteration " << iter;
}
}
}

MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ackPtr, maxIter);
}
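Taken together, these tests exercise the host-side pattern that HostNoAtomic signal forwarding relies on, end to end. A condensed sketch of that pattern using only the APIs shown above (deviceId and incomingValue are illustrative placeholders):

// Host side: map a cudaMalloc'd token through GDRCopy BAR1 and forward a value into it.
auto token = mscclpp::detail::gpuCallocShared<uint64_t>(1);  // plain cudaMalloc, pinnable by GDRCopy
mscclpp::GdrMap map(std::static_pointer_cast<void>(token), deviceId);
mscclpp::atomicStore(map.hostPtr(), incomingValue, mscclpp::memoryOrderRelaxed);
// Device side: poll the same token with a system-scope acquire load, as in kernelCounterWait:
//   while (mscclpp::atomicLoad(token.get(), mscclpp::memoryOrderAcquire) < expected) {}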