diff --git a/src/ext/ep/kernels/internode.cu b/src/ext/ep/kernels/internode.cu
index a5b3b165..b64205e0 100644
--- a/src/ext/ep/kernels/internode.cu
+++ b/src/ext/ep/kernels/internode.cu
@@ -892,7 +892,13 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
         if (lane_id < kNumRDMARanks and not kCachedMode)
             send_rdma_head[token_idx * kNumRDMARanks + lane_id] = rdma_tail_idx;
 
-        // Update last token tail
+        // Update last token tail. In-loop writes are sequenced by the
+        // per-channel sequential lock and the warp-stride order of the
+        // token loop, so monotonicity is guaranteed and a plain
+        // st_release_cta is both correct and faster than atomicMax
+        // (which may serialize through L2 if the compiler cannot infer
+        // the shared address space). The epilogue writes outside the
+        // seq-lock contract for the highest in-rank slot and still needs atomicMax.
         if (last_rdma_tail_idx >= 0)
             st_release_cta(const_cast<int*>(rdma_send_channel_tail + lane_id), last_rdma_tail_idx + 1);
         last_rdma_tail_idx = rdma_tail_idx;
@@ -962,9 +968,9 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
             ;
         __syncwarp();
 
-        // Update last token tail
+        // Update last token tail (epilogue). See the in-loop note on why atomicMax is required here.
         if (last_rdma_tail_idx >= 0)
-            st_release_cta(const_cast<int*>(rdma_send_channel_tail + lane_id), last_rdma_tail_idx + 1);
+            atomicMax(const_cast<int*>(rdma_send_channel_tail + lane_id), last_rdma_tail_idx + 1);
 
         // Release sequential lock
         lane_id == 0 ? (rdma_send_next_token_idx += 1) : 0;
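
Reviewer note: the sketch below is not the patched code. It is a minimal, self-contained CUDA toy (the helper st_release_cta is re-declared here under the assumed st.release.cta semantics, sm_70+ only; all names are hypothetical) that demonstrates the monotonicity argument from the new comment: while every writer to a tail slot is ordered and non-decreasing, a plain release store is safe, but once an unordered writer such as the epilogue joins in, a plain store can move the tail backwards, whereas atomicMax cannot.

// tail_monotonicity_sketch.cu -- hypothetical, illustrative only.
// Build: nvcc -arch=sm_70 tail_monotonicity_sketch.cu
#include <cstdio>

// Re-declaration of a DeepEP-style release-store helper (assumption: the
// real helper in this repo has these semantics). st.release needs sm_70+.
__device__ __forceinline__ void st_release_cta(int* ptr, int val) {
    asm volatile("st.release.cta.s32 [%0], %1;" :: "l"(ptr), "r"(val) : "memory");
}

// Two warps race to publish a tail value for the same slot with no ordering
// between them, mimicking an in-loop writer racing the epilogue writer.
__global__ void tail_writers(int* tail, bool use_atomic_max) {
    const int warp_id = threadIdx.x / 32, lane = threadIdx.x % 32;
    const int my_tail = (warp_id == 0) ? 10 : 7;  // warp 0 advanced further
    if (lane == 0) {
        if (use_atomic_max)
            atomicMax(tail, my_tail);       // tail can only grow: ends at 10
        else
            st_release_cta(tail, my_tail);  // last writer wins: 7 or 10
    }
}

int main() {
    int* tail;
    cudaMallocManaged(&tail, sizeof(int));

    *tail = 0;
    tail_writers<<<1, 64>>>(tail, true);
    cudaDeviceSynchronize();
    printf("atomicMax   tail = %d (always 10)\n", *tail);

    *tail = 0;
    tail_writers<<<1, 64>>>(tail, false);
    cudaDeviceSynchronize();
    printf("plain store tail = %d (10 or 7, timing-dependent)\n", *tail);

    cudaFree(tail);
    return 0;
}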
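
The asymmetry in the patch follows the same cost argument: the in-loop write sits on the hot path once per token, where st.release.cta is a single store instruction, while atomicMax is a read-modify-write through the memory pipeline; the epilogue runs once per lane after the token loop, so paying for atomicMax there is cheap and drops the ordering assumption that no longer holds outside the sequential lock.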