From 441bfa5265d5480f4e5f80a271cd4399449f44f0 Mon Sep 17 00:00:00 2001
From: Qinghua Zhou <qinghuazhou@microsoft.com>
Date: Thu, 23 Apr 2026 15:58:57 +0000
Subject: [PATCH] ext/ep: keep self slots in LL sema+port-channel loops (fixes
 cross-node)

The prior commit skipped r==rank in the semaphore and port-channel
build loops on the theory that the self-slot handshake skew was the
cause of LL direction asymmetry. That was wrong (the real bug was
int32 atomic alignment), and skipping self breaks other code paths
that assume every rank slot is represented -- cross-node HT and LL
failed with cudaErrorInvalidResourceHandle at the first barrier after
Buffer init. Restore the self-inclusive loop.
---
 src/ext/ep/buffer.cc | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)
diff --git a/src/ext/ep/buffer.cc b/src/ext/ep/buffer.cc
index 989594b6..98746abb 100644
--- a/src/ext/ep/buffer.cc
+++ b/src/ext/ep/buffer.cc
@@ -312,11 +312,7 @@ void Buffer::sync(const std::vector<int> &device_ids,
             memory_ids[r] = proxy_service->addMemory(std::move(mem));
         }
 
-        // Rank -> vector of connections. Keep a self CUDA-IPC connection
-        // because other code paths assume every rank is represented in the
-        // connections map. The self slot in the semaphore/port-channel loops
-        // below is skipped so the kernel indexing still hits peer handles
-        // only (the same-rank kernel branch uses a direct warp copy).
+        // Rank -> vector of connections
         std::unordered_map<int, std::vector<mscclpp::Connection>> connections;
         const mscclpp::EndpointConfig ipc_cfg(ipc_transport);
         const mscclpp::EndpointConfig ib_cfg(ib_transport);
@@ -337,14 +333,11 @@ void Buffer::sync(const std::vector<int> &device_ids,
         }
 
         // Rank -> vector of semaphore IDs. Iterate peers in sorted rank order so
-        // semaphore pairings between nodes line up deterministically. Skip self:
-        // the kernel's same-rank path uses a direct warp copy and the self
-        // port-channel slot is filled with a zero-initialized placeholder.
+        // semaphore pairings between nodes line up deterministically.
         std::unordered_map<int, std::vector<mscclpp::SemaphoreId>> sema_ids;
         const int num_semaphores_per_rank = 16;
         for (int i = 0; i < num_semaphores_per_rank; ++i) {
             for (int r = 0; r < num_ranks; ++r) {
-                if (r == rank) continue;
                 auto conn_it = connections.find(r);
                 EP_HOST_ASSERT(conn_it != connections.end());
                 auto& conns = conn_it->second;
@@ -358,16 +351,12 @@ void Buffer::sync(const std::vector<int> &device_ids,
         //
         // The kernels index `port_channel_handles[channel_id * num_ranks + peer_rank]`
         // where peer_rank is a GLOBAL rank in [0..num_ranks). So the outer stride must
-        // be num_ranks with peers in ascending rank order. The self slot is filled
-        // with a zero-initialized placeholder handle that the kernels never touch.
+        // be num_ranks with peers in ascending rank order. Iterating `memory_ids` (an
+        // `unordered_map`) yields hash order and would misroute signals, deadlocking.
         const int num_port_channels_per_rank = num_semaphores_per_rank;
         std::vector<mscclpp::PortChannelDeviceHandle> port_channel_handles;
         for (int i = 0; i < num_port_channels_per_rank; ++i) {
             for (int r = 0; r < num_ranks; ++r) {
-                if (r == rank) {
-                    port_channel_handles.emplace_back(mscclpp::PortChannelDeviceHandle{});
-                    continue;
-                }
                 auto mem_it = memory_ids.find(r);
                 EP_HOST_ASSERT(mem_it != memory_ids.end());
                 auto memory_id = mem_it->second;