mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
64-bit token reconstruction
This commit is contained in:
@@ -198,6 +198,8 @@ void IBConnection::recvThreadFunc() {
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t lastImmData = 0;
|
||||
uint64_t immHighBits = 0;
|
||||
uint64_t newValueHost = 0;
|
||||
|
||||
auto qp = qp_.lock();
|
||||
@@ -220,8 +222,15 @@ void IBConnection::recvThreadFunc() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Read the token from imm_data (always available and correct in the CQE).
|
||||
newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));
|
||||
// Read the lower 32 bits of the token from imm_data. Reconstruct the full 64-bit value
|
||||
// using wrap-around detection: tokens increase monotonically, so if the new lower 32 bits
|
||||
// are less than the previous value, the upper 32 bits must have incremented by 1.
|
||||
uint32_t immData = qp->getRecvWcImmData(i);
|
||||
if (immData < lastImmData) {
|
||||
immHighBits += (1ULL << 32);
|
||||
}
|
||||
lastImmData = immData;
|
||||
newValueHost = immHighBits | static_cast<uint64_t>(immData);
|
||||
|
||||
// Forward the token to the semaphore's inbound token address via atomicStore
|
||||
// through the GDRCopy BAR1 mapping. The GPU reads with system-scope acquire.
|
||||
@@ -397,10 +406,17 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
|
||||
*src = newValue;
|
||||
|
||||
if (ibNoAtomic_) {
|
||||
// Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the token in imm_data.
|
||||
// The receiver's recv thread polls the CQE, which guarantees the preceding data WRITE
|
||||
// has been committed to GPU memory. The recv thread then forwards the token to the
|
||||
// semaphore's inbound token via GDRCopy atomicStore.
|
||||
// Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the lower 32 bits of the
|
||||
// token in imm_data. The receiver reconstructs the full 64-bit value using wrap-around
|
||||
// detection (tokens are monotonically increasing, so a decrease in the lower 32 bits
|
||||
// indicates the upper 32 bits incremented by 1).
|
||||
if (newValue <= oldValue) {
|
||||
WARN(CONN, "IBConnection signal forwarding: token is not monotonically increasing: ", oldValue, " -> ",
|
||||
newValue);
|
||||
} else if (newValue - oldValue >= (1ULL << 32)) {
|
||||
WARN(CONN, "IBConnection signal forwarding: token increment too large for 32-bit wrap-around detection: ",
|
||||
oldValue, " -> ", newValue, " (delta ", newValue - oldValue, " >= 2^32)");
|
||||
}
|
||||
unsigned int immData = static_cast<unsigned int>(newValue);
|
||||
qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
|
||||
/*size=*/0, /*wrId=*/0,
|
||||
|
||||
@@ -113,13 +113,14 @@ class IBConnection : public BaseConnection {
|
||||
int localGpuDeviceId_; // Local GPU device ID for CUDA context and GDR mapping
|
||||
|
||||
// Signal forwarding design (HostNoAtomic mode):
|
||||
// - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the token value in imm_data (32-bit).
|
||||
// - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the lower 32 bits of the token in imm_data.
|
||||
// - Receiver: CPU recv thread polls recv CQ for WRITE_WITH_IMM completions (CQE), reads
|
||||
// the token from imm_data, then writes it to signalAddr_ (the semaphore's
|
||||
// inbound token) via atomicStore through the GDRCopy BAR1 mapping. The GPU reads
|
||||
// inboundToken with system-scope acquire ordering.
|
||||
// the lower 32 bits from imm_data, reconstructs the full 64-bit token using wrap-around
|
||||
// detection (monotonically increasing tokens: if lower 32 bits decrease, the upper half
|
||||
// incremented), then writes it to signalAddr_ via atomicStore through GDRCopy BAR1.
|
||||
uint64_t signalAddr_;
|
||||
|
||||
|
||||
std::unique_ptr<GdrMap> signalGdrMap_;
|
||||
|
||||
void recvThreadFunc();
|
||||
|
||||
Reference in New Issue
Block a user