From 848b89b59c2f61b1834e6aaf32e4bdabc857a1ef Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 1 Apr 2026 21:00:54 +0000
Subject: [PATCH] 64-bit token reconstruction

---
 src/core/connection.cc          | 28 ++++++++++++++++++++++------
 src/core/include/connection.hpp |  9 +++++----
 2 files changed, 27 insertions(+), 10 deletions(-)
diff --git a/src/core/connection.cc b/src/core/connection.cc
index 9767a315..db978943 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -198,6 +198,8 @@ void IBConnection::recvThreadFunc() {
     }
   }
 
+  uint32_t lastImmData = 0;
+  uint64_t immHighBits = 0;
   uint64_t newValueHost = 0;
 
   auto qp = qp_.lock();
@@ -220,8 +222,15 @@ void IBConnection::recvThreadFunc() {
         continue;
       }
 
-      // Read the token from imm_data (always available and correct in the CQE).
-      newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));
+      // Read the lower 32 bits of the token from imm_data and reconstruct the full 64-bit value.
+      // Tokens increase monotonically by less than 2^32 per signal (the sender warns otherwise),
+      // so lower 32 bits smaller than the previous ones mean the upper 32 bits incremented by 1.
+      uint32_t immData = qp->getRecvWcImmData(i);
+      if (immData < lastImmData) {
+        immHighBits += (1ULL << 32);
+      }
+      lastImmData = immData;
+      newValueHost = immHighBits | static_cast<uint64_t>(immData);
 
       // Forward the token to the semaphore's inbound token address via atomicStore
       // through the GDRCopy BAR1 mapping. The GPU reads with system-scope acquire.
@@ -397,10 +406,17 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
   *src = newValue;
 
   if (ibNoAtomic_) {
-    // Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the token in imm_data.
-    // The receiver's recv thread polls the CQE, which guarantees the preceding data WRITE
-    // has been committed to GPU memory. The recv thread then forwards the token to the
-    // semaphore's inbound token via GDRCopy atomicStore.
+    // Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the lower 32 bits of the
+    // token in imm_data. The receiver reconstructs the full 64-bit value using wrap-around
+    // detection (tokens are monotonically increasing, so a decrease in the lower 32 bits
+    // indicates the upper 32 bits incremented by 1).
+    if (newValue <= oldValue) {
+      WARN(CONN, "IBConnection signal forwarding: token is not monotonically increasing: ", oldValue, " -> ",
+           newValue);
+    } else if (newValue - oldValue >= (1ULL << 32)) {
+      WARN(CONN, "IBConnection signal forwarding: token increment too large for 32-bit wrap-around detection: ",
+           oldValue, " -> ", newValue, " (delta ", newValue - oldValue, " >= 2^32)");
+    }
     unsigned int immData = static_cast<unsigned int>(newValue);
     qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
                                       /*size=*/0, /*wrId=*/0,
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index 47b03d6c..432ce9ab 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -113,13 +113,14 @@ class IBConnection : public BaseConnection {
   int localGpuDeviceId_;  // Local GPU device ID for CUDA context and GDR mapping
 
   // Signal forwarding design (HostNoAtomic mode):
-  // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the token value in imm_data (32-bit).
+  // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the lower 32 bits of the token in imm_data.
   // - Receiver: CPU recv thread polls recv CQ for WRITE_WITH_IMM completions (CQE), reads
-  //   the token from imm_data, then writes it to signalAddr_ (the semaphore's
-  //   inbound token) via atomicStore through the GDRCopy BAR1 mapping. The GPU reads
-  //   inboundToken with system-scope acquire ordering.
+  //   the lower 32 bits from imm_data, reconstructs the full 64-bit token using wrap-around
+  //   detection (monotonically increasing tokens: if lower 32 bits decrease, the upper half
+  //   incremented), then writes it to signalAddr_ via atomicStore through GDRCopy BAR1.
   uint64_t signalAddr_;
 
+
   std::unique_ptr<GdrMap> signalGdrMap_;
 
   void recvThreadFunc();