fix data direct

Changho Hwang
2026-04-01 10:20:43 +00:00
parent 80f554ebaf
commit 67f9933ba1
21 changed files with 735 additions and 365 deletions

View File

@@ -43,7 +43,7 @@ For testing after successful build:
# To run tests with two GPUs - two is enough for most tests
mpirun -np 2 ./build/bin/mp_unit_tests
# To run tests excluding IB-related ones (when IB is not available)
mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib*
mpirun -np 2 ./build/bin/mp_unit_tests --filter=-*Ib*
```
For building a Python package:

View File

@@ -35,7 +35,9 @@ find_library(GDRCOPY_LIBRARIES
if(GDRCOPY_INCLUDE_DIRS)
include(CheckSymbolExists)
set(CMAKE_REQUIRED_INCLUDES ${GDRCOPY_INCLUDE_DIRS})
set(CMAKE_REQUIRED_LIBRARIES ${GDRCOPY_LIBRARIES})
check_symbol_exists(gdr_pin_buffer_v2 "gdrapi.h" GDRCOPY_HAS_PIN_BUFFER_V2)
unset(CMAKE_REQUIRED_LIBRARIES)
unset(CMAKE_REQUIRED_INCLUDES)
if(NOT GDRCOPY_HAS_PIN_BUFFER_V2)
message(STATUS "GDRCopy found but too old (gdr_pin_buffer_v2 not available). Requires >= 2.5.")

View File

@@ -33,5 +33,6 @@ find_library(MLX5_LIBRARIES
/usr/lib/x86_64-linux-gnu)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MLX5 DEFAULT_MSG MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
mark_as_advanced(MLX5_INCLUDE_DIRS MLX5_LIBRARIES)

View File

@@ -38,7 +38,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_o
return cuda::atomic_ref<T, Scope>{*ptr}.fetch_add(val, memoryOrder);
}
#elif defined(MSCCLPP_DEVICE_HIP)
#else // !defined(MSCCLPP_DEVICE_CUDA)
constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED;
constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE;
@@ -46,7 +46,6 @@ constexpr auto memoryOrderRelease = __ATOMIC_RELEASE;
constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL;
constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST;
// HIP does not have thread scope enums like CUDA
constexpr auto scopeSystem = 0;
constexpr auto scopeDevice = 0;
@@ -65,7 +64,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrde
return __atomic_fetch_add(ptr, val, memoryOrder);
}
#endif // defined(MSCCLPP_DEVICE_HIP)
#endif // !defined(MSCCLPP_DEVICE_CUDA)
} // namespace mscclpp
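
For context, a minimal sketch of how the unified helpers above are meant to be called. This is illustrative only and assumes the default (system) thread scope; the point is that `memoryOrderRelaxed` resolves to `cuda::memory_order_relaxed` on the CUDA path and to `__ATOMIC_RELAXED` on the non-CUDA path, so the same call compiles on both backends:

```cpp
#include <cstdint>

#include <mscclpp/atomic_device.hpp>

// Illustrative sketch: one source line, two backends.
MSCCLPP_HOST_DEVICE_INLINE uint64_t bumpToken(uint64_t* token) {
  return mscclpp::atomicFetchAdd(token, static_cast<uint64_t>(1), mscclpp::memoryOrderRelaxed);
}
```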

View File

@@ -7,6 +7,7 @@
#include <mscclpp/npkit/npkit.hpp>
#endif
#include <mscclpp/atomic_device.hpp>
#include <mscclpp/numa.hpp>
#include <mscclpp/utils.hpp>
#include <sstream>
@@ -219,29 +220,18 @@ void IBConnection::recvThreadFunc() {
continue;
}
// Read the token value from the incoming write-with-imm completion.
if (dataDirectEnabled_) {
// Data Direct path: the signal GPU buffer MR was registered with
// MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT, and the semaphore token is also written
// through Data Direct (via GDRCopy). Both writes go through the same path, so
// all data is visible in GPU memory when the CQE is polled. Read from imm_data.
newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));
} else {
// Slow path: read the 64-bit token from the local signal GPU buffer via volatile load.
// localSignalGpuPtr_ points to either a GDRCopy BAR1 mapping (CUDA) or the
// GPU buffer directly (ROCm system-coherent/uncached memory).
newValueHost = *static_cast<volatile uint64_t*>(localSignalGpuPtr_);
}
// Read the token from imm_data (always available and correct in the CQE).
newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));
// Read token address from the local stored address (set by setSignalForwardingDst)
if (remoteUpdateDstAddr_ != 0) {
uint64_t* dstPtr = reinterpret_cast<uint64_t*>(remoteUpdateDstAddr_);
if (remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid()) {
// Direct host-side write to GPU memory via GDRCopy BAR1 mapping
remoteUpdateDstAddrMap_->copyTo(&newValueHost, sizeof(uint64_t));
// Forward the token to the semaphore's inbound token address via atomicStore
// through the GDRCopy BAR1 mapping. The GPU reads with system-scope acquire.
if (signalAddr_ != 0) {
if (signalGdrMap_ && signalGdrMap_->valid()) {
atomicStore(signalGdrMap_->hostPtr(), newValueHost, memoryOrderRelaxed);
} else {
*dstPtr = newValueHost;
// For HIP/ROCm.
// NOTE: may need a fix in the future to ensure BAR1 mapping.
*reinterpret_cast<volatile uint64_t*>(signalAddr_) = newValueHost;
}
}
@@ -259,12 +249,10 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
remoteTransport_(remoteEndpoint.transport()),
atomicSrc_(std::make_unique<uint64_t>(0)),
ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_),
gdrSignalForwarding_(false),
stopRecvThread_(false),
localGpuDeviceId_(localEndpoint.device().id),
remoteUpdateDstAddr_(0),
remoteSignalGpuMrInfo_{0, 0},
localSignalGpuPtr_(nullptr),
dataDirectEnabled_(false) {
signalAddr_(0) {
qp_ = getImpl(localEndpoint).ibQp_;
qp_.lock()->rtr(getImpl(remoteEndpoint).ibQpInfo_);
qp_.lock()->rts();
@@ -274,105 +262,89 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
if (ibNoAtomic_) {
#if defined(MSCCLPP_USE_CUDA)
// On CUDA, HostNoAtomic requires GDRCopy for CPU→GPU signal forwarding through BAR1.
if (!gdrEnabled()) {
std::string reason = "unknown";
switch (gdrStatus()) {
case GdrStatus::NotBuilt:
reason = "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
break;
case GdrStatus::Disabled:
reason = "GDRCopy is disabled via MSCCLPP_FORCE_DISABLE_GDR environment variable";
break;
case GdrStatus::DriverMissing:
reason = "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
break;
case GdrStatus::OpenFailed:
reason = "gdr_open() failed; GDRCopy driver may be misconfigured";
break;
default:
break;
THROW(CONN, Error, ErrorCode::InvalidUsage,
"IB host-no-atomic mode on CUDA requires GDRCopy: ", gdrStatusMessage());
}
gdrSignalForwarding_ = true;
#endif // defined(MSCCLPP_USE_CUDA)
// On platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200
// NVLink-C2C), HostNoAtomic requires Data Direct for correct memory ordering. Data Direct
// routes NIC DMA through the PCIe Data Direct engine, bypassing the bridge. It is available
// on Virtual Function (VF) devices. On platforms without such a bridge (x86, non-Grace
// aarch64), HostNoAtomic works without Data Direct.
//
// We cannot reliably detect the bridge at compile time or runtime, so we emit a warning
// when the device is not a VF. If data corruption occurs, switching to VF devices with
// Data Direct or using IbMode::Host with RDMA atomics will resolve it.
{
IbCtx* ibCtx = getImpl(*context).getIbContext(transport_);
if (!ibCtx->isVirtualFunction()) {
WARN(CONN,
"IB HostNoAtomic mode without a Virtual Function (VF) device may cause data corruption "
"on platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200). "
"Device ",
ibCtx->getDevName(),
" is not a VF. "
"If you experience data corruption, use VF devices with Data Direct or IbMode::Host.");
}
THROW(CONN, Error, ErrorCode::InvalidUsage, "IB host-no-atomic mode on CUDA requires GDRCopy: ", reason);
}
#endif
// Extract remote endpoint's signal GPU buffer MR info for write-with-imm destination
const auto& remoteImpl = getImpl(remoteEndpoint);
remoteSignalGpuMrInfo_ = remoteImpl.ibSignalGpuMrInfo_;
// Create a GDR mapping of the local signal GPU buffer. recvThreadFunc reads the
// 64-bit token via localSignalGpuPtr_, which points to the BAR1-mapped host address
// (CUDA/GDRCopy) or the GPU buffer directly (ROCm system-coherent memory).
const auto& localImpl = getImpl(localEndpoint);
if (gdrEnabled() && localImpl.ibSignalGpuBuffer_) {
localSignalGpuMap_ =
std::make_unique<GdrMap>(std::static_pointer_cast<void>(localImpl.ibSignalGpuBuffer_), localGpuDeviceId_);
}
if (localSignalGpuMap_ && localSignalGpuMap_->valid()) {
// Use the BAR1-mapped host pointer; uncacheable MMIO ensures ordered volatile reads.
localSignalGpuPtr_ = localSignalGpuMap_->hostPtr();
} else if (localImpl.ibSignalGpuBuffer_) {
// ROCm: GPU memory is system-coherent, so direct volatile read is safe.
localSignalGpuPtr_ = reinterpret_cast<uint64_t*>(localImpl.ibSignalGpuBuffer_.get());
}
// Data Direct requires all three conditions:
// 1. Signal GPU buffer MR registered with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
// 2. Local signal GPU GDRCopy mapping pinned with GDR_PIN_FLAG_FORCE_PCIE
// 3. (signal forwarding dst GDRCopy mapping checked at setSignalForwardingDst time)
// When all conditions are met, RDMA data writes and GDRCopy token writes both go
// through the Data Direct engine, guaranteeing GPU memory visibility at CQE poll time.
// Pre-post receive requests for incoming WRITE_WITH_IMM notifications.
// The recv CQE guarantees the preceding data WRITE has been committed to GPU memory.
auto qp = qp_.lock();
dataDirectEnabled_ = localImpl.ibSignalGpuMr_ && localImpl.ibSignalGpuMr_->isDataDirect() &&
localSignalGpuMap_ && localSignalGpuMap_->valid();
if (dataDirectEnabled_) {
INFO(CONN, "IBConnection: Data Direct enabled");
}
// Pre-post receive requests for incoming write-with-imm
int maxRecvWr = localEndpoint.config().ib.maxRecvWr;
for (int i = 0; i < maxRecvWr; ++i) {
qp->stageRecv(/*wrId=*/0);
}
qp->postRecv();
// Start the background thread to poll recv CQ
recvThread_ = std::thread([this]() { this->recvThreadFunc(); });
INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with no-atomic mode");
// The recv thread is started later in startSignalForwarding() when the semaphore
// provides the signal forwarding destination. This ensures the thread lifetime is
// bounded by the GdrMap lifetime (created before start, destroyed after stop).
INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with signal forwarding (HostNoAtomic) mode");
} else {
INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with atomic mode");
}
}
IBConnection::~IBConnection() {
IBConnection::~IBConnection() { stopSignalForwarding(); }
Transport IBConnection::transport() const { return transport_; }
Transport IBConnection::remoteTransport() const { return remoteTransport_; }
bool IBConnection::isSignalForwarding() const { return ibNoAtomic_; }
void IBConnection::startSignalForwarding(std::shared_ptr<uint64_t> mem) {
// Set up the forwarding destination and GdrMap, then start the recv thread.
// Order: set address → create GdrMap → start thread.
signalAddr_ = reinterpret_cast<uint64_t>(mem.get());
if (gdrSignalForwarding_) {
signalGdrMap_ = std::make_unique<GdrMap>(std::move(mem), localGpuDeviceId_);
}
if (ibNoAtomic_) {
stopRecvThread_.store(false, std::memory_order_relaxed);
recvThread_ = std::thread([this]() { this->recvThreadFunc(); });
}
INFO(CONN, "IBConnection startSignalForwarding: ", (void*)signalAddr_);
}
void IBConnection::stopSignalForwarding() {
// Stop the recv thread, then tear down GdrMap and address.
// Order: stop thread → destroy GdrMap → clear address.
if (ibNoAtomic_) {
stopRecvThread_.store(true, std::memory_order_relaxed);
if (recvThread_.joinable()) {
recvThread_.join();
}
}
}
Transport IBConnection::transport() const { return transport_; }
Transport IBConnection::remoteTransport() const { return remoteTransport_; }
bool IBConnection::usesSignalForwarding() const { return ibNoAtomic_; }
void IBConnection::setSignalForwardingDst(std::shared_ptr<uint64_t> mem) {
remoteUpdateDstAddr_ = reinterpret_cast<uint64_t>(mem.get());
if (gdrEnabled()) {
if (mem) {
remoteUpdateDstAddrMap_ = std::make_unique<GdrMap>(std::move(mem), localGpuDeviceId_);
// Data Direct requires the token write mapping to also use FORCE_PCIE
if (dataDirectEnabled_ && !(remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid())) {
dataDirectEnabled_ = false;
INFO(CONN, "IBConnection: Data Direct disabled (signal forwarding dst GDRCopy mapping not available)");
}
} else {
remoteUpdateDstAddrMap_.reset();
}
if (gdrSignalForwarding_) {
signalGdrMap_.reset();
}
INFO(CONN, "IBConnection setSignalForwardingDst: ", (void*)remoteUpdateDstAddr_);
signalAddr_ = 0;
INFO(CONN, "IBConnection stopSignalForwarding");
}
void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
@@ -425,27 +397,23 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
*src = newValue;
if (ibNoAtomic_) {
// Use RDMA write-with-imm instead of atomic operation.
// Write the token value (8 bytes) from the local host buffer to the remote signal GPU buffer,
// with newValue also in imm_data (32-bit). The remote's recvThreadFunc reads the token from
// the signal GPU buffer and forwards it to the semaphore's inbound token address.
// Put newValue in imm_data (truncated to 32-bit; semaphore counters should fit)
// Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the token in imm_data.
// The receiver's recv thread polls the CQE, which guarantees the preceding data WRITE
// has been committed to GPU memory. The recv thread then forwards the token to the
// semaphore's inbound token via GDRCopy atomicStore.
unsigned int immData = static_cast<unsigned int>(newValue);
// Write the real token value into the host buffer, then RDMA write host->remote GPU
*atomicSrc_ = newValue;
qp_.lock()->stageSendWriteWithImm(atomicSrcTransportInfo_.ibMr, remoteSignalGpuMrInfo_,
/*size=*/sizeof(uint64_t), /*wrId=*/0,
qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
/*size=*/0, /*wrId=*/0,
/*srcOffset=*/0, /*dstOffset=*/0,
/*signaled=*/true, /*immData=*/immData);
qp_.lock()->postSend();
INFO(CONN, "IBConnection write-with-imm: value ", oldValue, " -> ", newValue);
INFO(CONN, "IBConnection signal forwarding: value ", oldValue, " -> ", newValue);
} else {
qp_.lock()->stageSendAtomicAdd(atomicSrcTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue,
/*signaled=*/true);
qp_.lock()->postSend();
INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
INFO(CONN, "IBConnection atomic write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
" -> ", newValue);
}
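
To summarize the new signaling path in one place, here is a condensed sketch of the two sides in HostNoAtomic mode. It is not a complete program: the names and signatures are the ones used in the diff above, and error handling plus recv-WR re-posting are omitted.

```cpp
// Sender side (updateAndSync): the data WRITEs are posted first; the token then travels in the
// imm_data of a 0-byte WRITE_WITH_IMM. RC QP ordering guarantees the data WRITEs have landed
// in remote GPU memory before the IMM completion is generated at the receiver.
qp->stageSendWriteWithImm(/*srcMr=*/nullptr, dstMrInfo, /*size=*/0, /*wrId=*/0,
                          /*srcOffset=*/0, /*dstOffset=*/0,
                          /*signaled=*/true, /*immData=*/static_cast<unsigned int>(newValue));
qp->postSend();

// Receiver side (recvThreadFunc): poll the recv CQ, take the token from imm_data, and forward
// it to the semaphore's inbound token through the GDRCopy BAR1 mapping.
int wcNum = qp->pollRecvCq();
for (int i = 0; i < wcNum; ++i) {
  uint64_t token = static_cast<uint64_t>(qp->getRecvWcImmData(i));
  atomicStore(signalGdrMap_->hostPtr(), token, memoryOrderRelaxed);
}
```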

View File

@@ -53,21 +53,6 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
ibQpInfo_ = ibQp_->getInfo();
// Allocate a 64-bit signal GPU buffer for write-with-imm data payload (ibNoAtomic_ only).
if (ibNoAtomic_ && config_.device.type == DeviceType::GPU && config_.device.id >= 0) {
CudaDeviceGuard deviceGuard(config_.device.id);
#if defined(MSCCLPP_DEVICE_HIP)
ibSignalGpuBuffer_ = detail::gpuCallocUncachedShared<uint64_t>();
#else
ibSignalGpuBuffer_ = detail::gpuCallocShared<uint64_t>();
#endif
ibSignalGpuMr_ =
contextImpl.getIbContext(config_.transport)->registerMr(ibSignalGpuBuffer_.get(), sizeof(uint64_t));
ibSignalGpuMrInfo_ = ibSignalGpuMr_->getInfo();
} else {
ibSignalGpuMrInfo_ = {0, 0};
}
} else if (config_.transport == Transport::Ethernet) {
// Configuring Ethernet Interfaces
abortFlag_ = 0;
@@ -90,9 +75,6 @@ Endpoint::Impl::Impl(const std::vector<char>& serialization) {
ibLocal_ = false;
it = detail::deserialize(it, ibQpInfo_);
it = detail::deserialize(it, ibNoAtomic_);
if (ibNoAtomic_) {
it = detail::deserialize(it, ibSignalGpuMrInfo_);
}
} else if (config_.transport == Transport::Ethernet) {
it = detail::deserialize(it, socketAddress_);
}
@@ -123,9 +105,6 @@ MSCCLPP_API_CPP std::vector<char> Endpoint::serialize() const {
if (AllIBTransports.has(pimpl_->config_.transport)) {
detail::serialize(data, pimpl_->ibQpInfo_);
detail::serialize(data, pimpl_->ibNoAtomic_);
if (pimpl_->ibNoAtomic_) {
detail::serialize(data, pimpl_->ibSignalGpuMrInfo_);
}
} else if (pimpl_->config_.transport == Transport::Ethernet) {
detail::serialize(data, pimpl_->socketAddress_);
}

View File

@@ -5,6 +5,7 @@
#if defined(MSCCLPP_USE_GDRCOPY)
#include <gdrapi.h>
#include <unistd.h>
#include <mscclpp/env.hpp>
@@ -12,9 +13,11 @@
#include "logger.hpp"
#ifndef GPU_PAGE_SHIFT
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
#define GPU_PAGE_MASK (~(GPU_PAGE_SIZE - 1))
#endif
namespace mscclpp {
@@ -45,6 +48,23 @@ GdrStatus gdrStatus() { return gdrContext()->status(); }
bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; }
const char* gdrStatusMessage() {
switch (gdrStatus()) {
case GdrStatus::Ok:
return "GDRCopy initialized successfully";
case GdrStatus::NotBuilt:
return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
case GdrStatus::Disabled:
return "GDRCopy is disabled via MSCCLPP_FORCE_DISABLE_GDR environment variable";
case GdrStatus::DriverMissing:
return "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
case GdrStatus::OpenFailed:
return "gdr_open() failed; GDRCopy driver may be misconfigured";
default:
return "unknown GDRCopy status";
}
}
GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr) {
if (env()->forceDisableGdr) {
INFO(GPU, "GDRCopy disabled via MSCCLPP_FORCE_DISABLE_GDR");
@@ -77,53 +97,79 @@ GdrContext::~GdrContext() {
}
}
// GdrMap
// GdrMap::Impl — real implementation with GDRCopy
struct GdrMap::Impl {
std::shared_ptr<GdrContext> ctx;
std::shared_ptr<void> gpuMem;
gdr_mh_t mh;
void* barPtr;
uint64_t* hostDstPtr;
size_t mappedSize;
};
GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId) : pimpl_(std::make_unique<Impl>()) {
pimpl_->ctx = gdrContext();
pimpl_->gpuMem = std::move(gpuMem);
pimpl_->mh = {};
pimpl_->barPtr = nullptr;
pimpl_->hostDstPtr = nullptr;
pimpl_->mappedSize = 0;
GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId)
: ctx_(gdrContext()),
gpuMem_(std::move(gpuMem)),
mh_{},
barPtr_(nullptr),
hostDstPtr_(nullptr),
mappedSize_(0) {
// Ensure CUDA device context is active for gdr_pin_buffer
CudaDeviceGuard deviceGuard(deviceId);
uint64_t gpuAddr = reinterpret_cast<uint64_t>(gpuMem_.get());
uint64_t gpuAddr = reinterpret_cast<uint64_t>(pimpl_->gpuMem.get());
// Align to GPU page boundary and pin one page around the target address
unsigned long alignedAddr = gpuAddr & GPU_PAGE_MASK;
unsigned long pageOffset = gpuAddr - alignedAddr;
mappedSize_ = GPU_PAGE_SIZE;
pimpl_->mappedSize = GPU_PAGE_SIZE;
int ret = gdr_pin_buffer_v2(ctx_->handle(), alignedAddr, mappedSize_, GDR_PIN_FLAG_FORCE_PCIE, &mh_);
// Pin the GPU memory for GDRCopy BAR1 mapping. Try GDR_PIN_FLAG_FORCE_PCIE first for optimal
// ordering on platforms that support it (e.g., GB200). Fall back to flags=0 if FORCE_PCIE is
// not supported. Both paths work correctly: CPU writes via atomicStore, GPU reads via
// system-scope acquire.
int ret =
gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, GDR_PIN_FLAG_FORCE_PCIE, &pimpl_->mh);
if (ret != 0) {
THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr,
". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
ret = gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, 0, &pimpl_->mh);
if (ret != 0) {
THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr,
". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
}
}
ret = gdr_map(ctx_->handle(), mh_, &barPtr_, mappedSize_);
ret = gdr_map(pimpl_->ctx->handle(), pimpl_->mh, &pimpl_->barPtr, pimpl_->mappedSize);
if (ret != 0) {
(void)gdr_unpin_buffer(ctx_->handle(), mh_);
(void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh);
THROW(GPU, Error, ErrorCode::InternalError, "gdr_map failed (ret=", ret, ") for addr ", (void*)gpuAddr);
}
hostDstPtr_ = reinterpret_cast<uint64_t*>(reinterpret_cast<char*>(barPtr_) + pageOffset);
pimpl_->hostDstPtr = reinterpret_cast<uint64_t*>(reinterpret_cast<char*>(pimpl_->barPtr) + pageOffset);
INFO(GPU, "GDRCopy mapping established: GPU addr ", (void*)gpuAddr, " -> host ptr ", (const void*)hostDstPtr_);
INFO(GPU, "GDRCopy mapping established: GPU addr ", (void*)gpuAddr, " -> host ptr ", (const void*)pimpl_->hostDstPtr);
}
GdrMap::~GdrMap() {
if (barPtr_ != nullptr) {
(void)gdr_unmap(ctx_->handle(), mh_, barPtr_, mappedSize_);
}
if (hostDstPtr_ != nullptr) {
(void)gdr_unpin_buffer(ctx_->handle(), mh_);
if (pimpl_) {
if (pimpl_->barPtr != nullptr) {
(void)gdr_unmap(pimpl_->ctx->handle(), pimpl_->mh, pimpl_->barPtr, pimpl_->mappedSize);
}
if (pimpl_->hostDstPtr != nullptr) {
(void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh);
}
}
}
void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(mh_, hostDstPtr_, src, size); }
bool GdrMap::valid() const { return pimpl_ && pimpl_->hostDstPtr != nullptr; }
void GdrMap::copyFrom(void* dst, size_t size) const { gdr_copy_from_mapping(mh_, dst, hostDstPtr_, size); }
uint64_t* GdrMap::hostPtr() const { return pimpl_ ? pimpl_->hostDstPtr : nullptr; }
void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(pimpl_->mh, pimpl_->hostDstPtr, src, size); }
void GdrMap::copyFrom(void* dst, size_t size) const {
gdr_copy_from_mapping(pimpl_->mh, dst, pimpl_->hostDstPtr, size);
}
} // namespace mscclpp
@@ -135,6 +181,24 @@ GdrStatus gdrStatus() { return GdrStatus::NotBuilt; }
bool gdrEnabled() { return false; }
const char* gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; }
// GdrMap::Impl — stub (no GDRCopy)
struct GdrMap::Impl {};
GdrMap::GdrMap(std::shared_ptr<void> /*gpuMem*/, int /*deviceId*/) {}
GdrMap::~GdrMap() = default;
bool GdrMap::valid() const { return false; }
uint64_t* GdrMap::hostPtr() const { return nullptr; }
void GdrMap::copyTo(const void* /*src*/, size_t /*size*/) {}
void GdrMap::copyFrom(void* /*dst*/, size_t /*size*/) const {}
} // namespace mscclpp
#endif // !defined(MSCCLPP_USE_GDRCOPY)
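
As a worked example of the page alignment done in the `GdrMap` constructor above: GPU_PAGE_SIZE is 64 KiB, so GPU_PAGE_MASK clears the low 16 bits of the address. The address below is hypothetical and chosen only to make the arithmetic visible.

```cpp
uint64_t gpuAddr     = 0x7f4a00122348ULL;          // hypothetical GPU virtual address
uint64_t alignedAddr = gpuAddr & GPU_PAGE_MASK;    // 0x7f4a00120000: start of the 64 KiB GPU page
uint64_t pageOffset  = gpuAddr - alignedAddr;      // 0x2348: offset of the target within the page
// gdr_pin_buffer_v2 pins [alignedAddr, alignedAddr + GPU_PAGE_SIZE), gdr_map exposes that page
// as barPtr, and the usable host pointer is barPtr + pageOffset, aliasing the original address.
```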

View File

@@ -67,8 +67,7 @@ static inline bool isDmabufSupportedByGpu(int gpuId) {
return ret;
}
IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5)
: mr_(nullptr), buff_(buff), size_(0), isDmabuf_(false), isDataDirect_(false) {
IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nullptr), buff_(buff), size_(0) {
if (size == 0) {
THROW(NET, Error, ErrorCode::InvalidUsage, "invalid MR size: 0");
}
@@ -91,11 +90,8 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5)
int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;
#if defined(MSCCLPP_USE_MLX5DV)
if (isMlx5 && MLX5DV::isAvailable()) {
if (isDataDirect && MLX5DV::isAvailable()) {
mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
if (mr_ != nullptr) {
isDataDirect_ = true;
}
}
#endif
if (mr_ == nullptr) {
@@ -105,7 +101,6 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5)
if (mr_ == nullptr) {
THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")");
}
isDmabuf_ = true;
#else // defined(MSCCLPP_USE_ROCM)
THROW(NET, Error, ErrorCode::InvalidUsage, "We don't support DMABUF on HIP platforms yet");
#endif // defined(MSCCLPP_USE_ROCM)
@@ -145,12 +140,8 @@ const void* IbMr::getBuff() const { return buff_; }
uint32_t IbMr::getLkey() const { return mr_->lkey; }
bool IbMr::isDmabuf() const { return isDmabuf_; }
bool IbMr::isDataDirect() const { return isDataDirect_; }
IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum,
int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic, bool isMlx5)
int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic)
: portNum_(portNum),
gidIndex_(gidIndex),
info_(),
@@ -171,8 +162,7 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
maxSendWr_(maxSendWr),
maxWrPerSend_(maxWrPerSend),
maxRecvWr_(maxRecvWr),
noAtomic_(noAtomic),
isMlx5_(isMlx5) {
noAtomic_(noAtomic) {
sendCq_ = IBVerbs::ibv_create_cq(ctx, maxSendCqSize, nullptr, nullptr, 0);
if (sendCq_ == nullptr) {
THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")");
@@ -186,47 +176,21 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
}
}
struct ibv_qp* qp = nullptr;
#if defined(MSCCLPP_USE_MLX5DV)
if (isMlx5_) {
struct ibv_qp_init_attr_ex qpInitAttrEx = {};
qpInitAttrEx.sq_sig_all = 0;
qpInitAttrEx.send_cq = sendCq_;
qpInitAttrEx.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_;
qpInitAttrEx.qp_type = IBV_QPT_RC;
qpInitAttrEx.cap.max_send_wr = maxSendWr;
qpInitAttrEx.cap.max_recv_wr = maxRecvWr;
qpInitAttrEx.cap.max_send_sge = 1;
qpInitAttrEx.cap.max_recv_sge = 1;
qpInitAttrEx.cap.max_inline_data = 0;
qpInitAttrEx.pd = pd;
qpInitAttrEx.comp_mask = IBV_QP_INIT_ATTR_PD;
struct ibv_qp_init_attr qpInitAttr = {};
qpInitAttr.sq_sig_all = 0;
qpInitAttr.send_cq = sendCq_;
// Use separate recv CQ if created, otherwise use the send CQ
qpInitAttr.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_;
qpInitAttr.qp_type = IBV_QPT_RC;
qpInitAttr.cap.max_send_wr = maxSendWr;
qpInitAttr.cap.max_recv_wr = maxRecvWr;
qpInitAttr.cap.max_send_sge = 1;
qpInitAttr.cap.max_recv_sge = 1;
qpInitAttr.cap.max_inline_data = 0;
struct mlx5dv_qp_init_attr mlx5QpAttr = {};
qp = MLX5DV::mlx5dv_create_qp(ctx, &qpInitAttrEx, &mlx5QpAttr);
if (qp == nullptr) {
THROW(NET, IbError, errno, "mlx5dv_create_qp failed (errno ", errno, ")");
}
} else
#endif // defined(MSCCLPP_USE_MLX5DV)
{
struct ibv_qp_init_attr qpInitAttr = {};
qpInitAttr.sq_sig_all = 0;
qpInitAttr.send_cq = sendCq_;
// Use separate recv CQ if created, otherwise use the send CQ
qpInitAttr.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_;
qpInitAttr.qp_type = IBV_QPT_RC;
qpInitAttr.cap.max_send_wr = maxSendWr;
qpInitAttr.cap.max_recv_wr = maxRecvWr;
qpInitAttr.cap.max_send_sge = 1;
qpInitAttr.cap.max_recv_sge = 1;
qpInitAttr.cap.max_inline_data = 0;
qp = IBVerbs::ibv_create_qp(pd, &qpInitAttr);
if (qp == nullptr) {
THROW(NET, IbError, errno, "ibv_create_qp failed (errno ", errno, ")");
}
struct ibv_qp* qp = IBVerbs::ibv_create_qp(pd, &qpInitAttr);
if (qp == nullptr) {
THROW(NET, IbError, errno, "ibv_create_qp failed (errno ", errno, ")");
}
struct ibv_port_attr portAttr;
@@ -483,12 +447,29 @@ std::string IbQp::getRecvWcStatusString(int idx) const { return IBVerbs::ibv_wc_
unsigned int IbQp::getRecvWcImmData(int idx) const { return ntohl((*recvWcs_)[idx].imm_data); }
IbCtx::IbCtx(const std::string& devName)
: devName_(devName), ctx_(nullptr), pd_(nullptr), supportsRdmaAtomics_(false), isMlx5_(false) {
: devName_(devName),
ctx_(nullptr),
pd_(nullptr),
supportsRdmaAtomics_(false),
isMlx5_(false),
dataDirect_(false),
isVF_(false) {
int num;
struct ibv_device** devices = IBVerbs::ibv_get_device_list(&num);
for (int i = 0; i < num; ++i) {
if (std::string(devices[i]->name) == devName_) {
ctx_ = IBVerbs::ibv_open_device(devices[i]);
// Detect if this IB device is a Virtual Function (VF).
// VFs have a 'physfn' sysfs symlink pointing to their parent PF; PFs do not.
{
std::string physfnPath = "/sys/class/infiniband/" + devName_ + "/device/physfn";
isVF_ = (access(physfnPath.c_str(), F_OK) == 0);
if (isVF_) {
INFO(NET, "IB device ", devName_, " is a Virtual Function (Data Direct ordering available)");
}
}
#if defined(MSCCLPP_USE_MLX5DV)
if (MLX5DV::isAvailable()) {
isMlx5_ = MLX5DV::mlx5dv_is_supported(devices[i]);
@@ -509,6 +490,20 @@ IbCtx::IbCtx(const std::string& devName)
THROW(NET, IbError, errno, "ibv_alloc_pd failed (errno ", errno, ")");
}
// Detect Data Direct support via mlx5dv_get_data_direct_sysfs_path
#if defined(MSCCLPP_USE_MLX5DV)
if (isMlx5_ && MLX5DV::isAvailable()) {
char sysfsPath[256];
int ret = MLX5DV::mlx5dv_get_data_direct_sysfs_path(ctx_, sysfsPath, sizeof(sysfsPath));
if (ret == 0) {
dataDirect_ = true;
INFO(NET, "IB device ", devName_, " supports Data Direct (sysfs: ", sysfsPath, ")");
} else {
INFO(NET, "IB device ", devName_, " does not support Data Direct");
}
}
#endif // defined(MSCCLPP_USE_MLX5DV)
// Query and cache RDMA atomics capability
struct ibv_device_attr attr = {};
if (IBVerbs::ibv_query_device(ctx_, &attr) == 0) {
@@ -579,17 +574,21 @@ std::shared_ptr<IbQp> IbCtx::createQp(int port, int gidIndex, int maxSendCqSize,
THROW(NET, Error, ErrorCode::InvalidUsage, "invalid IB port: ", port);
}
return std::shared_ptr<IbQp>(new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr,
maxRecvWr, maxWrPerSend, noAtomic, isMlx5_));
maxRecvWr, maxWrPerSend, noAtomic));
}
std::unique_ptr<const IbMr> IbCtx::registerMr(void* buff, std::size_t size) {
return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size, isMlx5_));
return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size, dataDirect_));
}
bool IbCtx::supportsRdmaAtomics() const { return supportsRdmaAtomics_; }
bool IbCtx::isMlx5() const { return isMlx5_; }
bool IbCtx::supportsDataDirect() const { return dataDirect_; }
bool IbCtx::isVirtualFunction() const { return isVF_; }
MSCCLPP_API_CPP int getIBDeviceCount() {
int num;
IBVerbs::ibv_get_device_list(&num);
@@ -699,8 +698,6 @@ IbMr::~IbMr() {}
IbMrInfo IbMr::getInfo() const { return IbMrInfo(); }
const void* IbMr::getBuff() const { return nullptr; }
uint32_t IbMr::getLkey() const { return 0; }
bool IbMr::isDmabuf() const { return false; }
bool IbMr::isDataDirect() const { return false; }
IbQp::~IbQp() {}
void IbQp::rtr(const IbQpInfo& /*info*/) {}

View File

@@ -10,19 +10,37 @@
#include "logger.hpp"
// NOTE: MRC_SUPPORT is a temporary macro that makes the current MRC implementation work.
// MRC_SUPPORT is needed because the current libibverbs implementation of MRC does not provide
// all symbols that we need, so we need to load some symbols from the original libibverbs.
// This macro will be removed (set to 0) once MRC provides all necessary symbols.
// Non-MRC environments are not affected by this macro as long as the VMRC_LIBIBVERBS_SO
// environment variable is not set.
#define MRC_SUPPORT 1
#if (MRC_SUPPORT)
#include <cstdlib>
#include <set>
#endif // (MRC_SUPPORT)
namespace mscclpp {
static std::unique_ptr<void, int (*)(void*)> globalIBVerbsHandle(nullptr, &::dlclose);
#if (MRC_SUPPORT)
static std::unique_ptr<void, int (*)(void*)> globalOrigIBVerbsHandle(nullptr, &::dlclose);
#endif // (MRC_SUPPORT)
void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
#if (MRC_SUPPORT)
static std::set<std::string> mrcSymbols = {
"ibv_get_device_list", "ibv_get_device_name", "ibv_open_device", "ibv_close_device", "ibv_query_qp",
"ibv_create_cq", "ibv_destroy_cq", "ibv_create_qp", "ibv_modify_qp", "ibv_destroy_qp",
};
#endif // (MRC_SUPPORT)
if (!globalIBVerbsHandle) {
if (mscclpp::env()->ibvSo != "") {
void* handle = ::dlopen(mscclpp::env()->ibvSo.c_str(), RTLD_NOW);
if (handle) {
globalIBVerbsHandle.reset(handle);
} else {
THROW(NET, SysError, errno, "Failed to load libibverbs library specified by MSCCLPP_IBV_SO ('",
mscclpp::env()->ibvSo, "'): ", std::string(::dlerror()));
}
} else {
const char* possibleLibNames[] = {"libibverbs.so", "libibverbs.so.1", nullptr};
@@ -38,7 +56,26 @@ void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
THROW(NET, SysError, errno, "Failed to open libibverbs: ", std::string(::dlerror()));
}
}
#if (MRC_SUPPORT)
// In MRC mode, `VMRC_LIBIBVERBS_SO` should be set.
char* vmrcLibibverbsSo = ::getenv("VMRC_LIBIBVERBS_SO");
void* ptr;
if (vmrcLibibverbsSo != nullptr && mrcSymbols.find(symbol) == mrcSymbols.end()) {
// If we are in MRC mode and the symbol is not in the table, get it from the original libibverbs.
if (!globalOrigIBVerbsHandle) {
void* handle = ::dlopen(vmrcLibibverbsSo, RTLD_NOW);
if (!handle) {
THROW(NET, SysError, errno, "Failed to open ", std::string(vmrcLibibverbsSo));
}
globalOrigIBVerbsHandle.reset(handle);
}
ptr = ::dlsym(globalOrigIBVerbsHandle.get(), symbol.c_str());
} else {
ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
}
#else // !(MRC_SUPPORT)
void* ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
#endif // !(MRC_SUPPORT)
if (!ptr && !allowReturnNull) {
THROW(NET, SysError, errno, "Failed to load libibverbs symbol: ", symbol);
}

View File

@@ -37,16 +37,18 @@ class BaseConnection {
virtual void flush(int64_t timeoutUsec = -1) = 0;
/// Set the local address where forwarded signals should be written.
/// This is called by the receiver to specify where incoming signals should be forwarded.
/// Default implementation is a no-op for connections that don't need it.
/// @param mem Shared pointer to the memory for incoming writes (nullptr to clear).
virtual void setSignalForwardingDst(std::shared_ptr<uint64_t> /*mem*/) {}
/// Start signal forwarding to the given memory address.
/// Called by the semaphore to specify where incoming signals should be written.
/// @param mem Shared pointer to the GPU memory for the signal token.
virtual void startSignalForwarding(std::shared_ptr<uint64_t> /*mem*/) {}
/// Stop signal forwarding and release associated resources.
virtual void stopSignalForwarding() {}
/// Whether this connection uses signal forwarding (e.g., IB host-no-atomic mode).
/// When true, the semaphore must allocate a separate inboundToken_ for the recv thread to write to.
/// When false, the NIC writes directly to the semaphore's registered memory (e.g., via atomics).
virtual bool usesSignalForwarding() const { return false; }
virtual bool isSignalForwarding() const { return false; }
virtual Transport transport() const = 0;
@@ -105,31 +107,20 @@ class IBConnection : public BaseConnection {
// For write-with-imm mode (HostNoAtomic): uses RDMA write-with-imm to signal
// instead of atomic operations, with a host thread forwarding to GPU for memory consistency.
bool ibNoAtomic_;
bool gdrSignalForwarding_; // ibNoAtomic_ && gdrEnabled() — decided once at construction
std::thread recvThread_;
std::atomic<bool> stopRecvThread_;
int localGpuDeviceId_; // Local GPU device ID for CUDA context and GDR mapping
// Write-with-imm design:
// - Sender: 8-byte RDMA write-with-imm from local host buffer to remote signal GPU buffer,
// carrying the token value both as RDMA payload and in imm_data (32-bit).
// - Receiver: reads the full 64-bit token from the local signal GPU buffer (via BAR1 or
// volatile read), then writes it to remoteUpdateDstAddr_ (the semaphore's inbound token).
uint64_t remoteUpdateDstAddr_;
// Signal forwarding design (HostNoAtomic mode):
// - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the token value in imm_data (32-bit).
// - Receiver: CPU recv thread polls recv CQ for WRITE_WITH_IMM completions (CQE), reads
// the token from imm_data, then writes it to signalAddr_ (the semaphore's
// inbound token) via atomicStore through the GDRCopy BAR1 mapping. The GPU reads
// inboundToken with system-scope acquire ordering.
uint64_t signalAddr_;
// Remote endpoint's signal GPU buffer MR info (destination for RDMA write-with-imm).
// The local host buffer (atomicSrc_ / atomicSrcTransportInfo_.ibMr) serves as the source.
IbMrInfo remoteSignalGpuMrInfo_;
std::unique_ptr<GdrMap> remoteUpdateDstAddrMap_;
std::unique_ptr<GdrMap> localSignalGpuMap_;
uint64_t* localSignalGpuPtr_;
// When true, recvThreadFunc reads the token from imm_data (from CQE) instead of the
// signal GPU buffer via GDRCopy. Enabled only when all Data Direct conditions are met:
// the signal GPU buffer MR is registered with MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT,
// and all GDRCopy mappings (local signal buffer and remoteUpdateDstAddr) are valid,
// so both RDMA data writes and GDRCopy token writes go through the Data Direct engine.
bool dataDirectEnabled_;
std::unique_ptr<GdrMap> signalGdrMap_;
void recvThreadFunc();
@@ -137,12 +128,15 @@ class IBConnection : public BaseConnection {
IBConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint);
~IBConnection();
/// Set the local address where forwarded signals should be written.
/// Must be called before the remote sends any updateAndSync in host-no-atomic mode.
/// @param mem Shared pointer to the memory for incoming writes (nullptr to clear).
void setSignalForwardingDst(std::shared_ptr<uint64_t> mem) override;
/// Start signal forwarding to the given memory address.
/// Must be called before the remote sends any updateAndSync in HostNoAtomic mode.
/// @param mem Shared pointer to the GPU memory for the signal token.
void startSignalForwarding(std::shared_ptr<uint64_t> mem) override;
bool usesSignalForwarding() const override;
/// Stop signal forwarding and release associated resources.
void stopSignalForwarding() override;
bool isSignalForwarding() const override;
Transport transport() const override;

View File

@@ -6,7 +6,6 @@
#include <memory>
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <vector>
#include "ib.hpp"
@@ -30,13 +29,6 @@ struct Endpoint::Impl {
std::shared_ptr<IbQp> ibQp_;
IbQpInfo ibQpInfo_;
// Signal GPU buffer for write-with-imm data payload (ibNoAtomic_ only).
// Each endpoint allocates a 64-bit GPU buffer and registers it as an IB MR.
// The MR info is serialized/exchanged so the remote can RDMA-write to it.
std::shared_ptr<uint64_t> ibSignalGpuBuffer_;
std::unique_ptr<const IbMr> ibSignalGpuMr_;
IbMrInfo ibSignalGpuMrInfo_;
// The following are only used for Ethernet and are undefined for other transports.
std::unique_ptr<Socket> socket_;
SocketAddress socketAddress_;

View File

@@ -4,6 +4,10 @@
#ifndef MSCCLPP_GDR_HPP_
#define MSCCLPP_GDR_HPP_
#include <cstddef>
#include <cstdint>
#include <memory>
namespace mscclpp {
enum class GdrStatus {
@@ -20,25 +24,14 @@ GdrStatus gdrStatus();
/// Whether the global GDRCopy context is enabled (shorthand for gdrStatus() == GdrStatus::Ok).
bool gdrEnabled();
} // namespace mscclpp
/// Return a human-readable error message for the current GDRCopy status.
const char* gdrStatusMessage();
#include <cstddef>
#include <cstdint>
#include <memory>
#if defined(MSCCLPP_USE_GDRCOPY)
#include <gdrapi.h>
namespace mscclpp {
class GdrContext;
/// RAII wrapper for a per-connection GDRCopy BAR1 mapping of a GPU address.
/// RAII wrapper for a GDRCopy BAR1 mapping of a GPU address.
/// When GDRCopy is not available, all operations are no-ops and valid() returns false.
class GdrMap {
public:
/// Pin and map a GPU address for direct host-side access.
/// Holds a shared reference to the GPU memory to keep it alive.
/// @param gpuMem Shared pointer to the GPU memory (e.g. from gpuCallocShared).
/// @param deviceId The CUDA device ID for setting context.
GdrMap(std::shared_ptr<void> gpuMem, int deviceId);
@@ -48,10 +41,10 @@ class GdrMap {
GdrMap& operator=(const GdrMap&) = delete;
/// Whether the mapping was established successfully.
bool valid() const { return hostDstPtr_ != nullptr; }
bool valid() const;
/// Return the BAR1-mapped host pointer to the GPU location.
uint64_t* hostPtr() const { return hostDstPtr_; }
uint64_t* hostPtr() const;
/// Copy data from host memory to the mapped GPU location.
void copyTo(const void* src, size_t size);
@@ -60,36 +53,10 @@ class GdrMap {
void copyFrom(void* dst, size_t size) const;
private:
std::shared_ptr<GdrContext> ctx_;
std::shared_ptr<void> gpuMem_;
gdr_mh_t mh_;
void* barPtr_;
uint64_t* hostDstPtr_;
size_t mappedSize_;
struct Impl;
std::unique_ptr<Impl> pimpl_;
};
} // namespace mscclpp
#else // !defined(MSCCLPP_USE_GDRCOPY)
namespace mscclpp {
/// Stub GdrMap when GDRCopy is not available.
class GdrMap {
public:
GdrMap(std::shared_ptr<void> /*gpuMem*/, int /*deviceId*/) {}
~GdrMap() = default;
GdrMap(const GdrMap&) = delete;
GdrMap& operator=(const GdrMap&) = delete;
bool valid() const { return false; }
void copyTo(const void* /*src*/, size_t /*size*/) {}
void copyFrom(void* /*dst*/, size_t /*size*/) const {}
uint64_t* hostPtr() const { return nullptr; }
};
} // namespace mscclpp
#endif // !defined(MSCCLPP_USE_GDRCOPY)
#endif // MSCCLPP_GDR_HPP_
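
A minimal usage sketch of the interface declared above (the new unit tests at the end of this commit exercise the same pattern). It assumes pinning succeeds: in GDRCopy-enabled builds the constructor throws on pin/map failure, while the stub build simply reports `valid() == false`.

```cpp
#include <mscclpp/gpu_utils.hpp>

#include "gdr.hpp"

// Hedged sketch: pin one 64-bit GPU word and access it from the host through BAR1.
void gdrMapExample(int deviceId) {
  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>();  // plain cudaMalloc-backed memory
  mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId);
  if (!map.valid()) return;                   // GDRCopy not built/enabled: stub is a no-op
  uint64_t token = 42;
  map.copyTo(&token, sizeof(token));          // host -> GPU through the BAR1 mapping
  uint64_t readBack = 0;
  map.copyFrom(&readBack, sizeof(readBack));  // GPU -> host through the BAR1 mapping
}
```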

View File

@@ -34,17 +34,13 @@ class IbMr {
IbMrInfo getInfo() const;
const void* getBuff() const;
uint32_t getLkey() const;
bool isDmabuf() const;
bool isDataDirect() const;
private:
IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isMlx5);
IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect);
ibv_mr* mr_;
void* buff_;
std::size_t size_;
bool isDmabuf_;
bool isDataDirect_;
friend class IbCtx;
};
@@ -92,7 +88,6 @@ class IbQp {
int getRecvWcStatus(int idx) const;
std::string getRecvWcStatusString(int idx) const;
unsigned int getRecvWcImmData(int idx) const;
bool isMlx5() const { return isMlx5_; }
private:
struct SendWrInfo {
@@ -106,7 +101,7 @@ class IbQp {
};
IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
int maxRecvWr, int maxWrPerSend, bool noAtomic, bool isMlx5);
int maxRecvWr, int maxWrPerSend, bool noAtomic);
SendWrInfo getNewSendWrInfo();
RecvWrInfo getNewRecvWrInfo();
@@ -134,7 +129,6 @@ class IbQp {
const int maxWrPerSend_;
const int maxRecvWr_;
const bool noAtomic_;
const bool isMlx5_;
friend class IbCtx;
};
@@ -150,6 +144,8 @@ class IbCtx {
std::unique_ptr<const IbMr> registerMr(void* buff, std::size_t size);
bool supportsRdmaAtomics() const;
bool isMlx5() const;
bool supportsDataDirect() const;
bool isVirtualFunction() const;
#else
IbCtx([[maybe_unused]] const std::string& devName) {}
~IbCtx() {}
@@ -160,6 +156,8 @@ class IbCtx {
}
bool supportsRdmaAtomics() const { return false; }
bool isMlx5() const { return false; }
bool supportsDataDirect() const { return false; }
bool isVirtualFunction() const { return false; }
#endif
const std::string& getDevName() const { return devName_; };
@@ -173,6 +171,8 @@ class IbCtx {
ibv_pd* pd_;
bool supportsRdmaAtomics_;
bool isMlx5_;
bool dataDirect_;
bool isVF_;
};
} // namespace mscclpp

View File

@@ -6,7 +6,7 @@
#if defined(MSCCLPP_USE_MLX5DV)
#include <infiniband/mlx5dv.h>
#include <infiniband/verbs.h>
#include <string>
@@ -19,14 +19,14 @@ struct MLX5DV {
/// Check if the given IB device supports mlx5 Direct Verbs.
static bool mlx5dv_is_supported(struct ibv_device* device);
/// Create a QP using mlx5dv extensions.
static struct ibv_qp* mlx5dv_create_qp(struct ibv_context* ctx, struct ibv_qp_init_attr_ex* qpAttr,
struct mlx5dv_qp_init_attr* mlx5QpAttr);
/// Register a DMABUF memory region using mlx5dv extensions.
/// Returns nullptr if mlx5dv_reg_dmabuf_mr is not available in this rdma-core version.
static struct ibv_mr* mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
int access);
int access);
/// Query the Data Direct sysfs path for the given IB context.
/// Returns 0 on success (device supports Data Direct), non-zero otherwise.
static int mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len);
private:
static void* dlsym(const std::string& symbol, bool allowReturnNull = false);

View File

@@ -3,9 +3,19 @@
#if defined(MSCCLPP_USE_MLX5DV)
// _GNU_SOURCE is required for dlvsym()
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "mlx5dv_wrapper.hpp"
#include <dlfcn.h>
#include <infiniband/mlx5dv.h>
#ifndef MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
#define MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT (1 << 0)
#endif
#include <memory>
@@ -72,14 +82,6 @@ bool MLX5DV::mlx5dv_is_supported(struct ibv_device* device) {
return impl(device);
}
struct ibv_qp* MLX5DV::mlx5dv_create_qp(struct ibv_context* ctx, struct ibv_qp_init_attr_ex* qpAttr,
struct mlx5dv_qp_init_attr* mlx5QpAttr) {
using FuncType = struct ibv_qp* (*)(struct ibv_context*, struct ibv_qp_init_attr_ex*, struct mlx5dv_qp_init_attr*);
static FuncType impl = nullptr;
if (!impl) impl = reinterpret_cast<FuncType>(MLX5DV::dlsym("mlx5dv_create_qp"));
return impl(ctx, qpAttr, mlx5QpAttr);
}
struct ibv_mr* MLX5DV::mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
int access) {
// mlx5dv_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access) — the last arg is mlx5-specific flags.
@@ -92,12 +94,27 @@ struct ibv_mr* MLX5DV::mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset,
resolved = true;
}
if (!impl) return nullptr;
#ifndef MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
#define MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT (1 << 0)
#endif
return impl(pd, offset, length, iova, fd, access, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT);
}
int MLX5DV::mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len) {
using FuncType = int (*)(struct ibv_context*, char*, size_t);
static FuncType impl = nullptr;
static bool resolved = false;
if (!resolved) {
if (globalMLX5Handle) {
void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_get_data_direct_sysfs_path", "MLX5_1.25");
if (!ptr) {
ptr = MLX5DV::dlsym("mlx5dv_get_data_direct_sysfs_path", /*allowReturnNull=*/true);
}
impl = ptr ? reinterpret_cast<FuncType>(ptr) : nullptr;
}
resolved = true;
}
if (!impl) return -1;
return impl(context, buf, buf_len);
}
} // namespace mscclpp
#endif // defined(MSCCLPP_USE_MLX5DV)

View File

@@ -123,19 +123,18 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& sema
THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2DeviceSemaphore should be GPU");
}
auto connImpl = BaseConnection::getImpl(connection());
if (connImpl->usesSignalForwarding()) {
// Signal forwarding mode: the recv thread writes the token to GPU memory.
// Allocate a separate inbound token via plain cudaMalloc (not TokenPool/VMM)
// so that it is always compatible with GDRCopy pinning (VMM memory cannot be pinned by gdr_pin_buffer).
if (connImpl->isSignalForwarding()) {
// Signal forwarding (HostNoAtomic): the receiver's recv thread polls the recv CQ for
// WRITE_WITH_IMM completions, then forwards the token to inboundToken_ via GDRCopy.
CudaDeviceGuard deviceGuard(connection().localDevice().id);
#if defined(MSCCLPP_USE_ROCM)
inboundToken_ = detail::gpuCallocUncachedShared<uint64_t>();
#else
inboundToken_ = detail::gpuCallocShared<uint64_t>();
#endif
connImpl->setSignalForwardingDst(inboundToken_);
connImpl->startSignalForwarding(inboundToken_);
}
// When usesSignalForwarding() is false (e.g., atomic mode), inboundToken_ stays null
// When isSignalForwarding() is false (atomic mode), inboundToken_ stays null
// and the GPU polls the SemaphoreStub token directly (the NIC atomic target).
}
@@ -144,9 +143,9 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communi
MSCCLPP_API_CPP Host2DeviceSemaphore::~Host2DeviceSemaphore() {
if (inboundToken_) {
// Clear the connection's signal forwarding destination (and any associated GdrMap)
// Clear the connection's signal forwarding destination (and GdrMap)
// before inboundToken_ is freed, to avoid use-after-free on the pinned GPU memory.
BaseConnection::getImpl(connection())->setSignalForwardingDst(nullptr);
BaseConnection::getImpl(connection())->stopSignalForwarding();
}
}
@@ -158,7 +157,7 @@ MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {
MSCCLPP_API_CPP Host2DeviceSemaphore::DeviceHandle Host2DeviceSemaphore::deviceHandle() const {
Host2DeviceSemaphore::DeviceHandle device;
// If inboundToken_ is allocated (host-no-atomic mode), the GPU polls it.
// If inboundToken_ is allocated (signal forwarding mode), the GPU polls it.
// Otherwise (atomic mode), the GPU polls the SemaphoreStub token directly,
// which is the same address targeted by the NIC's atomic operation.
device.inboundToken =
@@ -178,12 +177,12 @@ MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(const Semaphore& semaphor
THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2HostSemaphore should be CPU");
}
auto connImpl = BaseConnection::getImpl(connection());
if (connImpl->usesSignalForwarding()) {
if (connImpl->isSignalForwarding()) {
// Signal forwarding mode: tell the recv thread where to write the incoming token.
// Non-owning shared_ptr: Host2HostSemaphore outlives the connection, so the memory stays valid.
auto token =
std::shared_ptr<uint64_t>(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()), [](uint64_t*) {});
connImpl->setSignalForwardingDst(std::move(token));
connImpl->startSignalForwarding(std::move(token));
}
}
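
For completeness, a hedged sketch of the device-side counterpart that the comments above refer to: the GPU spins on `inboundToken` with a system-scope acquire load, which pairs with the recv thread's store through the GDRCopy BAR1 mapping. This is not the actual semaphore kernel, just the shape of the wait, assuming mscclpp's `atomicLoad` helper with its default (system) scope.

```cpp
#include <cstdint>

#include <mscclpp/atomic_device.hpp>

// Illustrative device-side wait: return once the forwarded token reaches the expected value.
__device__ void waitInboundToken(uint64_t* inboundToken, uint64_t expected) {
  while (mscclpp::atomicLoad(inboundToken, mscclpp::memoryOrderAcquire) < expected) {
  }
}
```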

View File

@@ -285,6 +285,9 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
passed++;
} else {
std::cout << "[ FAILED ] " << fullName << std::endl;
if (!gCurrentTestFailureMessage.empty()) {
std::cout << " Reason: " << gCurrentTestFailureMessage << std::endl;
}
failed++;
}
}

View File

@@ -3,8 +3,12 @@
#include <mpi.h>
#include <atomic>
#include <mscclpp/atomic_device.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <thread>
#include "gdr.hpp"
#include "mp_unit_tests.hpp"
#include "utils_internal.hpp"
@@ -41,7 +45,10 @@ void IbPeerToPeerTest::SetUp() {
ibCtx = std::make_shared<mscclpp::IbCtx>(ibDevName);
bool noAtomic = !ibCtx->supportsRdmaAtomics();
qp = ibCtx->createQp(-1, ib_gid_index, 1024, 1, 8192, 0, 64, noAtomic);
// When atomics are not supported, the MemoryConsistency test uses
// write-with-imm which requires recv WRs on the receiver side.
int maxRecvWr = noAtomic ? 64 : 0;
qp = ibCtx->createQp(-1, ib_gid_index, 1024, 1, 8192, maxRecvWr, 64, noAtomic);
qpInfo[gEnv->rank] = qp->getInfo();
bootstrap->allGather(qpInfo.data(), sizeof(mscclpp::IbQpInfo));
@@ -199,15 +206,34 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
// This test needs only two ranks
return;
}
if (!ibCtx->supportsRdmaAtomics()) {
GTEST_SKIP() << "This test requires RDMA atomics support.";
}
// Use atomic path if supported by the IB device.
bool useAtomic = ibCtx->supportsRdmaAtomics();
const uint64_t signalPeriod = 1024;
const uint64_t maxIter = 10000;
const uint64_t nelem = 65536 + 1;
auto data = mscclpp::detail::gpuCallocUnique<uint64_t>(nelem);
// For no-atomic mode: allocate a separate signal buffer for write-with-imm destination.
// The sender writes-with-imm to this buffer; the receiver's CPU thread reads the imm_data
// from the recv CQ and writes the iteration value to data[0] via GDRCopy atomicStore.
std::shared_ptr<uint64_t> signalBuf;
std::unique_ptr<const mscclpp::IbMr> signalMr;
std::array<mscclpp::IbMrInfo, 2> signalMrInfo{};
if (!useAtomic) {
signalBuf = mscclpp::detail::gpuCallocShared<uint64_t>(1);
signalMr = ibCtx->registerMr(signalBuf.get(), sizeof(uint64_t));
signalMrInfo[gEnv->rank] = signalMr->getInfo();
bootstrap->allGather(signalMrInfo.data(), sizeof(mscclpp::IbMrInfo));
// Pre-post recv WRs for write-with-imm on both ranks
for (int i = 0; i < 64; ++i) {
qp->stageRecv(0);
}
qp->postRecv();
}
registerBufferAndConnect(data.get(), sizeof(uint64_t) * nelem);
uint64_t res = 0;
@@ -226,6 +252,40 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
ASSERT_EQ(*ptrCurIter, 0);
ASSERT_EQ(*ptrResult, 0);
// For no-atomic mode: create a GDRCopy mapping for data[0] and start a CPU thread that
// polls recv CQ and forwards the signal via GDRCopy BAR1 write — the same mechanism
// used by IBConnection::recvThreadFunc for port channels.
std::atomic<bool> stopRecvThread(false);
std::thread recvThread;
std::unique_ptr<mscclpp::GdrMap> dataGdrMap;
if (!useAtomic) {
if (!mscclpp::gdrEnabled()) {
SKIP_TEST() << "No-atomic mode requires GDRCopy but it is not available.";
}
// Create GDRCopy BAR1 mapping for data[0] — same as how connection.cc maps inboundToken_
dataGdrMap =
std::make_unique<mscclpp::GdrMap>(std::shared_ptr<void>(data.get(), [](void*) {}), // non-owning shared_ptr
cudaDevId);
recvThread = std::thread([&]() {
while (!stopRecvThread.load(std::memory_order_relaxed)) {
int wcNum = qp->pollRecvCq();
if (wcNum <= 0) continue;
for (int i = 0; i < wcNum; ++i) {
int status = qp->getRecvWcStatus(i);
if (status != static_cast<int>(mscclpp::WsStatus::Success)) continue;
uint64_t val = static_cast<uint64_t>(qp->getRecvWcImmData(i));
// Write the iteration value to data[0] via GDRCopy BAR1 atomicStore —
// same pattern as IBConnection::recvThreadFunc.
mscclpp::atomicStore(dataGdrMap->hostPtr(), val, mscclpp::memoryOrderRelaxed);
// Re-post recv
qp->stageRecv(0);
qp->postRecv();
}
}
});
}
kernelMemoryConsistency<<<1, 1024>>>(data.get(), ptrCurIter, ptrResult, nelem, maxIter);
MSCCLPP_CUDATHROW(cudaGetLastError());
@@ -247,6 +307,11 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
}
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
if (!useAtomic) {
stopRecvThread.store(true, std::memory_order_relaxed);
if (recvThread.joinable()) recvThread.join();
}
} else if (gEnv->rank == 1) {
// Sender
std::vector<uint64_t> hostBuffer(nelem, 0);
@@ -267,15 +332,20 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
stageSendWrite(sizeof(uint64_t) * (nelem - 1), 0, sizeof(uint64_t), sizeof(uint64_t), signaled);
qp->postSend();
#if 0
// For reference: send the first element using a plain RDMA write. This should occasionally see a wrong result.
stageSendWrite(sizeof(uint64_t), 0, 0, 0, false);
qp->postSend();
#else
// Send the first element using AtomicAdd. This should see the correct result.
stageSendAtomicAdd(0, 0, 1, false);
qp->postSend();
#endif
if (useAtomic) {
// Send the first element using AtomicAdd. The non-posted PCIe atomic operation
// provides end-to-end ordering: data[1..N] are guaranteed visible when data[0] updates.
stageSendAtomicAdd(0, 0, 1, false);
qp->postSend();
} else {
// No-atomic mode: send a 0-byte WRITE_WITH_IMM carrying the iteration in imm_data.
// The receiver's CPU thread polls the recv CQ and writes the value to data[0]
// via GDRCopy atomicStore.
// QP ordering guarantees data[1..N] WRITE completes before this write-with-imm.
const mscclpp::IbMrInfo& remoteSignalMrInfo = signalMrInfo[(gEnv->rank == 1) ? 0 : 1];
qp->stageSendWriteWithImm(nullptr, remoteSignalMrInfo, 0, 0, 0, 0, false, static_cast<unsigned int>(iter));
qp->postSend();
}
if (signaled) {
int wcNum = qp->pollSendCq();
@@ -296,13 +366,23 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
}
}
if (res & 2) {
FAIL() << "The receiver is stuck at iteration " << iter << ".";
} else if (res != 0 && res != 1) {
FAIL() << "Unknown error is detected at iteration " << iter << ". res =" << res;
if (useAtomic) {
// With RDMA atomics, memory consistency must be guaranteed.
if (res & 2) {
FAIL() << "The receiver is stuck at iteration " << iter << ".";
}
EXPECT_EQ(res, 0);
} else {
if (res == 0) {
// No-atomic path works correctly here.
} else if (res & 2) {
SKIP_TEST() << "No-atomic signal forwarding: receiver stuck at iteration " << iter
<< ". NIC DMA and CPU writes are not ordered on this platform.";
} else {
SKIP_TEST() << "No-atomic signal forwarding: memory inconsistency detected at iteration " << iter
<< ". NIC DMA and CPU writes are not ordered on this platform.";
}
}
EXPECT_EQ(res, 0);
}
TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
@@ -311,7 +391,7 @@ TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
return;
}
if (!ibCtx->supportsRdmaAtomics()) {
GTEST_SKIP() << "This test requires RDMA atomics support.";
SKIP_TEST() << "This test requires RDMA atomics support.";
}
mscclpp::Timer timeout(3);

View File

@@ -4,9 +4,24 @@
#include <cstdint>
#include <mscclpp/concurrency_device.hpp>
#include "gdr.hpp"
#include "mp_unit_tests.hpp"
#include "utils_internal.hpp"
// Skip the current test if HostNoAtomic mode is not supported.
// On CUDA, HostNoAtomic requires GDRCopy for BAR1 signal forwarding.
// On ROCm, HostNoAtomic uses direct volatile writes and does not need GDRCopy.
#if defined(MSCCLPP_USE_CUDA)
#define REQUIRE_HOST_NO_ATOMIC \
do { \
if (!mscclpp::gdrEnabled()) { \
SKIP_TEST() << "HostNoAtomic requires GDRCopy: " << mscclpp::gdrStatusMessage(); \
} \
} while (0)
#else
#define REQUIRE_HOST_NO_ATOMIC // No extra requirements on non-CUDA platforms.
#endif
void PortChannelOneToOneTest::SetUp() {
// Use only two ranks
setNumRanksToUse(2);
@@ -272,6 +287,7 @@ TEST(PortChannelOneToOneTest, PingPongPerfIbHostMode) {
TEST(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
REQUIRE_HOST_NO_ATOMIC;
testPingPongPerf(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
}
@@ -465,16 +481,19 @@ TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
REQUIRE_HOST_NO_ATOMIC;
testPacketPingPongPerf(true, IbMode::HostNoAtomic);
}
TEST(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
REQUIRE_HOST_NO_ATOMIC;
testPingPong(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
}
TEST(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
REQUIRE_HOST_NO_ATOMIC;
testPacketPingPong(true, IbMode::HostNoAtomic);
}

View File

@@ -4,6 +4,7 @@
target_sources(unit_tests PRIVATE
unit_tests_main.cc
core_tests.cc
gdr_tests.cu
gpu_utils_tests.cc
errors_tests.cc
fifo_tests.cu

test/unit/gdr_tests.cu (new file, 251 lines)
View File

@@ -0,0 +1,251 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <mscclpp/atomic_device.hpp>
#include <mscclpp/errors.hpp>
#include <mscclpp/gpu_utils.hpp>
#include "../framework.hpp"
#include "gdr.hpp"
// GdrStatus and gdrEnabled
class GdrStatusTest : public ::mscclpp::test::TestCase {};
TEST(GdrStatusTest, StatusIsValid) {
// gdrStatus() should return one of the defined enum values
auto status = mscclpp::gdrStatus();
ASSERT_TRUE(status == mscclpp::GdrStatus::Ok || status == mscclpp::GdrStatus::NotBuilt ||
status == mscclpp::GdrStatus::Disabled || status == mscclpp::GdrStatus::DriverMissing ||
status == mscclpp::GdrStatus::OpenFailed);
}
TEST(GdrStatusTest, EnabledConsistentWithStatus) {
// gdrEnabled() should be true iff gdrStatus() == Ok
EXPECT_EQ(mscclpp::gdrEnabled(), mscclpp::gdrStatus() == mscclpp::GdrStatus::Ok);
}
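A short usage sketch of the gating these two tests pin down: GdrMap construction is only attempted when gdrEnabled() returns true (signalBuf and deviceId are assumed to exist, as in the fixtures below):

if (mscclpp::gdrEnabled()) {  // equivalent to mscclpp::gdrStatus() == mscclpp::GdrStatus::Ok
  mscclpp::GdrMap map(signalBuf, deviceId);
  // ... drive signals through map.hostPtr() or map.copyTo() ...
} else {
  // No BAR1 mapping is possible; gdrStatus() reports NotBuilt, Disabled, DriverMissing, or
  // OpenFailed, and mscclpp::gdrStatusMessage() gives a readable reason (as used by
  // REQUIRE_HOST_NO_ATOMIC earlier in this commit).
}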
// GdrMap tests — only run when GDRCopy is available
class GdrMapTest : public ::mscclpp::test::TestCase {
protected:
void SetUp() override {
if (!mscclpp::gdrEnabled()) {
SKIP_TEST() << "GDRCopy not enabled on this platform.";
}
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_));
// Try creating a GDRCopy mapping to check if pin+map works on this platform.
try {
auto testMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap testMap(std::static_pointer_cast<void>(testMem), deviceId_);
} catch (const std::exception&) {
SKIP_TEST() << "GDRCopy mapping not supported on this platform.";
}
}
int deviceId_ = 0;
};
TEST(GdrMapTest, BasicMapping) {
// Allocate GPU memory via cudaMalloc (not VMM) and create a GDRCopy mapping
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
ASSERT_TRUE(map.valid());
EXPECT_NE(map.hostPtr(), nullptr);
}
TEST(GdrMapTest, CopyToAndFrom) {
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
ASSERT_TRUE(map.valid());
// Write a value to GPU via GDRCopy
uint64_t writeVal = 0xDEADBEEFCAFE0123ULL;
map.copyTo(&writeVal, sizeof(uint64_t));
// Read it back via GDRCopy
uint64_t readVal = 0;
map.copyFrom(&readVal, sizeof(uint64_t));
EXPECT_EQ(readVal, writeVal);
// Also verify via cudaMemcpy
uint64_t cudaVal = 0;
MSCCLPP_CUDATHROW(cudaMemcpy(&cudaVal, gpuMem.get(), sizeof(uint64_t), cudaMemcpyDeviceToHost));
EXPECT_EQ(cudaVal, writeVal);
}
TEST(GdrMapTest, CopyToVisibleFromGpu) {
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
ASSERT_TRUE(map.valid());
// Write via GDRCopy, verify GPU sees it via cudaMemcpy
uint64_t val = 42;
map.copyTo(&val, sizeof(uint64_t));
uint64_t result = 0;
MSCCLPP_CUDATHROW(cudaMemcpy(&result, gpuMem.get(), sizeof(uint64_t), cudaMemcpyDeviceToHost));
EXPECT_EQ(result, 42);
}
TEST(GdrMapTest, MultipleWritesReadBack) {
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
ASSERT_TRUE(map.valid());
// Write multiple values sequentially and verify each
for (uint64_t i = 1; i <= 100; ++i) {
map.copyTo(&i, sizeof(uint64_t));
uint64_t readback = 0;
map.copyFrom(&readback, sizeof(uint64_t));
EXPECT_EQ(readback, i);
if (readback != i) break;
}
}
TEST(GdrMapTest, HostPtrIsWritable) {
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
ASSERT_TRUE(map.valid());
// Write directly through the hostPtr (volatile store)
volatile uint64_t* ptr = reinterpret_cast<volatile uint64_t*>(map.hostPtr());
*ptr = 12345;
// Read back via GDRCopy
uint64_t readback = 0;
map.copyFrom(&readback, sizeof(uint64_t));
EXPECT_EQ(readback, 12345);
}
TEST(GdrMapTest, HostPtrIsReadable) {
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
ASSERT_TRUE(map.valid());
// Write via GDRCopy copyTo (same BAR1 path as the read)
uint64_t val = 99999;
map.copyTo(&val, sizeof(uint64_t));
// Read through the hostPtr (volatile load via BAR1)
volatile uint64_t* ptr = reinterpret_cast<volatile uint64_t*>(map.hostPtr());
EXPECT_EQ(*ptr, 99999);
}
TEST(GdrMapTest, DestroyDoesNotCrash) {
auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
{
mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
ASSERT_TRUE(map.valid());
uint64_t val = 1;
map.copyTo(&val, sizeof(uint64_t));
}
// After GdrMap is destroyed, gpuMem should still be valid
uint64_t result = 0;
MSCCLPP_CUDATHROW(cudaMemcpy(&result, gpuMem.get(), sizeof(uint64_t), cudaMemcpyDeviceToHost));
EXPECT_EQ(result, 1);
}
// GPU kernel: polls signalFromCpu until it reaches the current iteration value, then writes that value
// to ackToHost. Repeats for maxIter iterations. The GPU spins with plain volatile loads on signalFromCpu
// and acks with a plain volatile store to ackToHost (host-pinned memory visible to the CPU).
__global__ void kernelGdrVisibilityPingPong(volatile uint64_t* signalFromCpu, volatile uint64_t* ackToHost,
uint64_t maxIter) {
for (uint64_t iter = 1; iter <= maxIter; ++iter) {
// Poll until CPU writes the expected iteration value via GDRCopy BAR1
while (*signalFromCpu < iter) {
}
// Ack back to CPU via host-pinned memory
*ackToHost = iter;
}
}
TEST(GdrMapTest, CpuGpuVisibilityPingPong) {
const uint64_t maxIter = 10000;
// signalBuf: GPU memory mapped via GDRCopy BAR1. CPU writes here, GPU polls.
auto signalBuf = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap signalMap(std::static_pointer_cast<void>(signalBuf), deviceId_);
ASSERT_TRUE(signalMap.valid());
// ackBuf: host-pinned memory (gpuCallocHostShared). GPU writes here, CPU polls.
auto ackBuf = mscclpp::detail::gpuCallocHostShared<uint64_t>(1);
volatile uint64_t* ackPtr = reinterpret_cast<volatile uint64_t*>(ackBuf.get());
*ackPtr = 0;
// Launch kernel — it will poll signalBuf and write ackBuf for each iteration
kernelGdrVisibilityPingPong<<<1, 1>>>(signalBuf.get(), ackBuf.get(), maxIter);
MSCCLPP_CUDATHROW(cudaGetLastError());
for (uint64_t iter = 1; iter <= maxIter; ++iter) {
// CPU writes iteration value to GPU via GDRCopy BAR1
uint64_t val = iter;
signalMap.copyTo(&val, sizeof(uint64_t));
// CPU polls host-pinned ack until GPU confirms it saw the value
int spin = 0;
while (*ackPtr < iter) {
if (++spin > 100000000) {
FAIL() << "GPU did not ack iteration " << iter << " (ack=" << *ackPtr << ")";
}
}
}
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ackPtr, maxIter);
}
// GPU kernel that polls a counter using system-scope acquire load.
// When counter >= expectedIter, writes ack.
__global__ void kernelCounterWait(uint64_t* counter, volatile uint64_t* ackToHost, uint64_t maxIter) {
for (uint64_t iter = 1; iter <= maxIter; ++iter) {
// System-scope acquire load — matches the atomicStore(relaxed) on the CPU side
uint64_t got;
do {
got = mscclpp::atomicLoad(counter, mscclpp::memoryOrderAcquire);
} while (got < iter);
// Ack back
*ackToHost = iter;
}
}
// Test the GDRCopy counter pattern used by HostNoAtomic mode:
// - GPU memory allocated via gpuCallocShared (cudaMalloc)
// - GdrMap for BAR1 mapping
// - CPU writes via atomicStore(relaxed) through GDRCopy BAR1 mapping
// - GPU reads via atomicLoad with memory_order_acquire
TEST(GdrMapTest, AtomicStoreCounterPingPong) {
const uint64_t maxIter = 10000;
// Allocate GPU memory via gpuCallocShared
auto counterBuf = mscclpp::detail::gpuCallocShared<uint64_t>(1);
mscclpp::GdrMap counterMap(std::static_pointer_cast<void>(counterBuf), deviceId_);
ASSERT_TRUE(counterMap.valid());
// Ack buffer: host-pinned memory
auto ackBuf = mscclpp::detail::gpuCallocHostShared<uint64_t>(1);
volatile uint64_t* ackPtr = reinterpret_cast<volatile uint64_t*>(ackBuf.get());
*ackPtr = 0;
// Launch kernel — polls counterBuf with system-scope acquire load
kernelCounterWait<<<1, 1>>>(counterBuf.get(), ackBuf.get(), maxIter);
MSCCLPP_CUDATHROW(cudaGetLastError());
for (uint64_t iter = 1; iter <= maxIter; ++iter) {
// CPU writes counter via atomicStore (relaxed — GPU uses acquire on read)
mscclpp::atomicStore(reinterpret_cast<uint64_t*>(counterMap.hostPtr()), iter, mscclpp::memoryOrderRelaxed);
// Wait for GPU ack
int spin = 0;
while (*ackPtr < iter) {
if (++spin > 100000000) {
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
FAIL() << "GPU did not ack iteration " << iter;
}
}
}
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ackPtr, maxIter);
}