From a073ca7bef75c4831348abd8a6b57f64a44099f0 Mon Sep 17 00:00:00 2001 From: Qinghua Zhou Date: Thu, 23 Apr 2026 06:42:34 +0000 Subject: [PATCH] ext/ep: restore self CUDA-IPC connection (was needed by HT/LL paths) Dropping the self ipc_cfg connection caused cudaErrorInvalidResourceHandle on multi-node launches. Keep the self connection (needed by other code paths that assume every rank is in the connections map) but continue to skip the self slot in the semaphore + port-channel construction loops so the kernel's [local_expert*num_ranks + dst_rank] indexing hits only peer handles; the self slot is a zero-initialized placeholder since the kernel's same-rank branch uses a direct warp copy. --- src/ext/ep/buffer.cc | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/ext/ep/buffer.cc b/src/ext/ep/buffer.cc index 798d8ac2..989594b6 100644 --- a/src/ext/ep/buffer.cc +++ b/src/ext/ep/buffer.cc @@ -312,16 +312,18 @@ void Buffer::sync(const std::vector &device_ids, memory_ids[r] = proxy_service->addMemory(std::move(mem)); } - // Rank -> vector of connections. Skip self: the kernel's same-rank - // path uses a direct warp copy (see internode_ll.cu `dst_rank != rank` - // check) and never dereferences the self-slot port channel. Creating - // a self CUDA-IPC connection + self semaphore previously skewed the - // cross-rank `buildAndAddSemaphore` handshake sequence between ranks, - // leading to asymmetric semaphore pairings that prevented atomicAdd - // signals from being delivered in one direction during LL dispatch. + // Rank -> vector of connections. Keep a self CUDA-IPC connection + // because other code paths assume every rank is represented in the + // connections map. The self slot in the semaphore/port-channel loops + // below is skipped so the kernel indexing still hits peer handles + // only (the same-rank kernel branch uses a direct warp copy). 
std::unordered_map<int, std::vector<std::shared_ptr<mscclpp::Connection>>> connections; + const mscclpp::EndpointConfig ipc_cfg(ipc_transport); const mscclpp::EndpointConfig ib_cfg(ib_transport); + // Self connection for local memory (CUDA IPC). + connections[rank].emplace_back(communicator->connect(ipc_cfg, rank, kRdmaTag).get()); + // Remote IB connections (multi-QP per peer). const int num_ib_connections_per_rank = 12; // #QPs per rank (mirrors DeepEP). for (int r = 0; r < num_ranks; ++r) { @@ -335,9 +337,9 @@ } // Rank -> vector of semaphore IDs. Iterate peers in sorted rank order so - // semaphore pairings between nodes line up deterministically. Self is - // skipped so both sides see an identical sequence of cross-rank - // `buildAndAddSemaphore` calls. + // semaphore pairings between nodes line up deterministically. Skip self: + // the kernel's same-rank path uses a direct warp copy and the self + // port-channel slot is filled with a zero-initialized placeholder. std::unordered_map<int, std::vector<mscclpp::SemaphoreId>> sema_ids; const int num_semaphores_per_rank = 16; for (int i = 0; i < num_semaphores_per_rank; ++i) { @@ -363,7 +365,6 @@ for (int i = 0; i < num_port_channels_per_rank; ++i) { for (int r = 0; r < num_ranks; ++r) { if (r == rank) { - // Placeholder; indexed but never dispatched by kernels. port_channel_handles.emplace_back(mscclpp::PortChannelDeviceHandle{}); continue; }