64-bit token reconstruction

2026-05-12 01:10:22 +00:00 · 2026-04-01 21:00:54 +00:00
parent 4cf53328ad
commit 848b89b59c
2 changed files with 27 additions and 10 deletions
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -198,6 +198,8 @@ void IBConnection::recvThreadFunc() {
    }
  }

+  uint32_t lastImmData = 0;
+  uint64_t immHighBits = 0;
  uint64_t newValueHost = 0;

  auto qp = qp_.lock();
@@ -220,8 +222,15 @@ void IBConnection::recvThreadFunc() {
        continue;
      }

-      // Read the token from imm_data (always available and correct in the CQE).
-      newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));
+      // Read the lower 32 bits of the token from imm_data. Reconstruct the full 64-bit value using
+      // wrap-around detection: tokens increase monotonically and by less than 2^32 per signal, so if
+      // the new lower 32 bits are less than the previous ones, the upper 32 bits incremented by 1.
+      uint32_t immData = qp->getRecvWcImmData(i);
+      if (immData < lastImmData) {
+        immHighBits += (1ULL << 32);
+      }
+      lastImmData = immData;
+      newValueHost = immHighBits | static_cast<uint64_t>(immData);

      // Forward the token to the semaphore's inbound token address via atomicStore
      // through the GDRCopy BAR1 mapping. The GPU reads with system-scope acquire.
@@ -397,10 +406,17 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
  *src = newValue;

  if (ibNoAtomic_) {
-    // Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the token in imm_data.
-    // The receiver's recv thread polls the CQE, which guarantees the preceding data WRITE
-    // has been committed to GPU memory. The recv thread then forwards the token to the
-    // semaphore's inbound token via GDRCopy atomicStore.
+    // Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the lower 32 bits of the
+    // token in imm_data. The receiver reconstructs the full 64-bit value using wrap-around
+    // detection (tokens are monotonically increasing, so a decrease in the lower 32 bits
+    // indicates the upper 32 bits incremented by 1).
+    if (newValue <= oldValue) {
+      WARN(CONN, "IBConnection signal forwarding: token is not monotonically increasing: ", oldValue, " -> ",
+           newValue);
+    } else if (newValue - oldValue >= (1ULL << 32)) {
+      WARN(CONN, "IBConnection signal forwarding: token increment too large for 32-bit wrap-around detection: ",
+           oldValue, " -> ", newValue, " (delta ", newValue - oldValue, " >= 2^32)");
+    }
    unsigned int immData = static_cast<unsigned int>(newValue);
    qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
                                      /*size=*/0, /*wrId=*/0,
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -113,13 +113,14 @@ class IBConnection : public BaseConnection {
  int localGpuDeviceId_;  // Local GPU device ID for CUDA context and GDR mapping

  // Signal forwarding design (HostNoAtomic mode):
-  // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the token value in imm_data (32-bit).
+  // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the lower 32 bits of the token in imm_data.
  // - Receiver: CPU recv thread polls recv CQ for WRITE_WITH_IMM completions (CQE), reads
-  //   the token from imm_data, then writes it to signalAddr_ (the semaphore's
-  //   inbound token) via atomicStore through the GDRCopy BAR1 mapping. The GPU reads
-  //   inboundToken with system-scope acquire ordering.
+  //   the lower 32 bits from imm_data, reconstructs the full 64-bit token using wrap-around
+  //   detection (monotonically increasing tokens: if lower 32 bits decrease, the upper half
+  //   incremented), then writes it to signalAddr_ via atomicStore through GDRCopy BAR1.
  uint64_t signalAddr_;

+
  std::unique_ptr<GdrMap> signalGdrMap_;

  void recvThreadFunc();