mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 17:00:22 +00:00
ext/ep: WIP Phase 4 NVLS HT B2 third multimem barrier in notify_dispatch
Replace the cross-node port_channel signal/wait at the tail of
notify_dispatch (the "B2" barrier — third NVLS barrier after b0 and
b1) with an epoch-monotonic NVLS multimem.red.add.u64 at slot
+16 on the per-rank multicast region.
Root cause this fixes: with port_channel signal/wait still in place,
exactly one CTA thread (the one with `barrier_thread_id != rdma_rank`)
hangs in IB wait() because Azure CX-7 RoCE has the same broken
semaphore path that failed in Phase 1-3. All other threads then
deadlock at the subsequent `if constexpr (!kLowLatencyMode)
__syncthreads()` waiting for the IB thread, so notify_dispatch never
reaches the second barrier_device or the dispatch kernel launch.
Diagnostic prints confirmed all ranks reached "pass final-barrier"
only AFTER the print, never the "DONE" print.
Validated cross-node on 2x Azure GB200:
* All 8 ranks pass NVLS b0, b1, AND b2 barriers.
* `notify_dispatch DONE` prints for all ranks.
* Dispatch kernel enters (`[ph4-K] dispatch entry rank=0..3`).
* Sender hot path runs; cooperative-copy `[ph4-d]` print fires
showing valid fabric VA dst_p (0xbf80..0xbfe0 range) and
local src_p — confirming the int4 cooperative store at least
starts executing.
Next failure: dispatch kernel hits "CUDA error: unspecified launch
failure" shortly after the first `[ph4-d]` cooperative copy. Almost
certainly an illegal memory access in `dst_p[k] = src_p[k]` (wrong
n_int4, wrong slot offset, or stale peer base). To be debugged in
next iteration.
This commit is contained in:
@@ -504,7 +504,25 @@ __global__ void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_co
|
||||
__syncthreads();
|
||||
if (thread_id == 0) printf("[ph4-N] rank=%d enter final-barrier run=%d ch=%d\n", rank, (int)run_barrier, barrier_channel_idx);
|
||||
|
||||
if (run_barrier) {
|
||||
if (nvls_mc_ptr != nullptr) {
|
||||
// Phase 4 / NVLS HT "B2": replace the cross-node port_channel
|
||||
// signal/wait pair with an epoch-monotonic NVLS multimem barrier
|
||||
// (slot at +16). Bypasses the broken IB signal/wait path on Azure
|
||||
// CX-7 RoCE that hangs the entire CTA at the !kLowLatencyMode
|
||||
// __syncthreads below (thread 33's wait never completes).
|
||||
if (thread_id == 0) {
|
||||
if (rank == 0) printf("[nvls] rank=%d epoch=%llu enter b2\n", rank, (unsigned long long)nvls_epoch);
|
||||
uint64_t* mc_b2 = reinterpret_cast<uint64_t*>(static_cast<char*>(nvls_mc_ptr) + nvls_off_barrier + 16);
|
||||
uint64_t* dev_b2 = reinterpret_cast<uint64_t*>(static_cast<char*>(nvls_dev_ptr) + nvls_off_barrier + 16);
|
||||
asm volatile("multimem.red.release.sys.global.add.u64 [%0], 1;" ::"l"(mc_b2) : "memory");
|
||||
const uint64_t expected = nvls_epoch * static_cast<uint64_t>(num_ranks);
|
||||
uint64_t v;
|
||||
do {
|
||||
asm volatile("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(v) : "l"(dev_b2) : "memory");
|
||||
} while (v < expected);
|
||||
if (rank == 0) printf("[nvls] rank=%d epoch=%llu pass b2 v=%llu\n", rank, (unsigned long long)nvls_epoch, (unsigned long long)v);
|
||||
}
|
||||
} else if (run_barrier) {
|
||||
port_channel_handles[barrier_channel_idx].signal();
|
||||
port_channel_handles[barrier_channel_idx].wait();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user