From 848b89b59c2f61b1834e6aaf32e4bdabc857a1ef Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 1 Apr 2026 21:00:54 +0000 Subject: [PATCH] 64-bit token reconstruction --- src/core/connection.cc | 28 ++++++++++++++++++++++------ src/core/include/connection.hpp | 9 +++++---- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/core/connection.cc b/src/core/connection.cc index 9767a315..db978943 100644 --- a/src/core/connection.cc +++ b/src/core/connection.cc @@ -198,6 +198,8 @@ void IBConnection::recvThreadFunc() { } } + uint32_t lastImmData = 0; + uint64_t immHighBits = 0; uint64_t newValueHost = 0; auto qp = qp_.lock(); @@ -220,8 +222,15 @@ void IBConnection::recvThreadFunc() { continue; } - // Read the token from imm_data (always available and correct in the CQE). - newValueHost = static_cast(qp->getRecvWcImmData(i)); + // Read the lower 32 bits of the token from imm_data. Reconstruct the full 64-bit value + // using wrap-around detection: tokens increase monotonically, so if the new lower 32 bits + // are less than the previous value, the upper 32 bits must have incremented by 1. + uint32_t immData = qp->getRecvWcImmData(i); + if (immData < lastImmData) { + immHighBits += (1ULL << 32); + } + lastImmData = immData; + newValueHost = immHighBits | static_cast(immData); // Forward the token to the semaphore's inbound token address via atomicStore // through the GDRCopy BAR1 mapping. The GPU reads with system-scope acquire. @@ -397,10 +406,17 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6 *src = newValue; if (ibNoAtomic_) { - // Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the token in imm_data. - // The receiver's recv thread polls the CQE, which guarantees the preceding data WRITE - // has been committed to GPU memory. The recv thread then forwards the token to the - // semaphore's inbound token via GDRCopy atomicStore. + // Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the lower 32 bits of the + // token in imm_data. The receiver reconstructs the full 64-bit value using wrap-around + // detection (tokens are monotonically increasing, so a decrease in the lower 32 bits + // indicates the upper 32 bits incremented by 1). + if (newValue <= oldValue) { + WARN(CONN, "IBConnection signal forwarding: token is not monotonically increasing: ", oldValue, " -> ", + newValue); + } else if (newValue - oldValue >= (1ULL << 32)) { + WARN(CONN, "IBConnection signal forwarding: token increment too large for 32-bit wrap-around detection: ", + oldValue, " -> ", newValue, " (delta ", newValue - oldValue, " >= 2^32)"); + } unsigned int immData = static_cast(newValue); qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo, /*size=*/0, /*wrId=*/0, diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp index 47b03d6c..432ce9ab 100644 --- a/src/core/include/connection.hpp +++ b/src/core/include/connection.hpp @@ -113,13 +113,14 @@ class IBConnection : public BaseConnection { int localGpuDeviceId_; // Local GPU device ID for CUDA context and GDR mapping // Signal forwarding design (HostNoAtomic mode): - // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the token value in imm_data (32-bit). + // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the lower 32 bits of the token in imm_data. // - Receiver: CPU recv thread polls recv CQ for WRITE_WITH_IMM completions (CQE), reads - // the token from imm_data, then writes it to signalAddr_ (the semaphore's - // inbound token) via atomicStore through the GDRCopy BAR1 mapping. The GPU reads - // inboundToken with system-scope acquire ordering. + // the lower 32 bits from imm_data, reconstructs the full 64-bit token using wrap-around + // detection (monotonically increasing tokens: if lower 32 bits decrease, the upper half + // incremented), then writes it to signalAddr_ via atomicStore through GDRCopy BAR1. uint64_t signalAddr_; + std::unique_ptr signalGdrMap_; void recvThreadFunc();