From da60eb7f46d3f0765e3e835956cbfcf65c033bbd Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Thu, 11 Dec 2025 15:15:58 -0800
Subject: [PATCH] Add an IB multi-node tutorial (#702)

---
 docs/tutorials/04-port-channel.md             | 70 +++++++++++++++-
 .../04-port-channel/bidir_port_channel.cu     | 82 ++++++++++++-------
 2 files changed, 123 insertions(+), 29 deletions(-)

diff --git a/docs/tutorials/04-port-channel.md b/docs/tutorials/04-port-channel.md
index 4e0238c1..6a10a556 100644
--- a/docs/tutorials/04-port-channel.md
+++ b/docs/tutorials/04-port-channel.md
@@ -35,7 +35,7 @@ Note that this example is **NOT** a performance benchmark. The performance numbe
 
 ## Code Overview
 
-The example code implements a bidirectional data transfer using a `PortChannel` between two GPUs on the same machine. The code is similar to the [Memory Channel](./03-memory-channel.md) tutorial, with the main difference being that the construction of a `PortChannel` is done by a `ProxyService` instance. We need to "add" the pre-built `Semaphore` and `RegisteredMemory` objects to the `ProxyService`, which return `SemaphoreId` and `MemoryId`s, respectively:
+The example code implements a bidirectional data transfer using a `PortChannel` between two GPUs. The code is similar to the [Memory Channel](./03-memory-channel.md) tutorial, with the main difference being that the construction of a `PortChannel` is done by a `ProxyService` instance. We need to "add" the pre-built `Semaphore` and `RegisteredMemory` objects to the `ProxyService`, which return `SemaphoreId` and `MemoryId`s, respectively:
 
 ```cpp
 mscclpp::ProxyService proxyService;
@@ -102,6 +102,74 @@ The device handle methods of `PortChannel` are thread-safe except when the numbe
 Advanced users may want to customize the behavior of `ProxyService` to support custom request types or transport mechanisms, which can be done by subclassing `BaseProxyService`. See an example in [class AllGatherProxyService](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allgather_test.cu#L503).
 ```
 
+## Cross-node Execution
+
+This section explains how to run the example code with two GPUs on different nodes using InfiniBand (or RoCE) transport.
+
+### Running the Example across Nodes
+
+```{note}
+Before running the code across nodes, make sure that your environment meets the [prerequisites of GPUDirect RDMA](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#common-prerequisites) and the RDMA networking is properly configured.
+```
+
+Run the program on two nodes with command line arguments:
+
+```
+./bidir_port_channel [<ip_port> <rank> <gpu_id> <transport>]
+```
+
+For example, assume we use `192.168.0.1:50000` as the bootstrap IP address and port, and both nodes use GPU 0 with InfiniBand device index 0 (`IB0`).
+
+On Node 0 (Rank 0):
+```bash
+$ ./bidir_port_channel 192.168.0.1:50000 0 0 IB0
+```
+
+On Node 1 (Rank 1):
+```bash
+$ ./bidir_port_channel 192.168.0.1:50000 1 0 IB0
+```
+
+You should see output indicating successful data transfer.
+
+```{tip}
+The example code also supports running two instances on the same node. For example:
+
+Terminal 1: `./bidir_port_channel 127.0.0.1:50000 0 0 IB0`
+
+Terminal 2: `./bidir_port_channel 127.0.0.1:50000 1 1 IB1`
+```
+
+```{tip}
+If your bootstrap IP address is not on the default network interface of your node, you can specify the network interface by passing `interface_name:ip:port` as the first argument (such as `eth1:192.168.0.1:50000`).
+```
+
+### What's Happening in Terms of InfiniBand?
+
+When we use InfiniBand transport, each `Endpoint` holds a unique InfiniBand context, and each `Connection` holds a unique InfiniBand queue pair (QP). Therefore, multiple `Semaphore`s and `PortChannel`s will share the same QP if they are created from the same `Connection`. If you want multiple QPs between two endpoints, you need to create multiple parallel `Connection`s, and then create `Semaphore`s and `PortChannel`s from different `Connection`s.
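+
+For illustration, here is a minimal sketch that reuses the `connect` and `buildSemaphore` calls from the example code; the variable names (`connA`, `semaA`, etc.) are hypothetical:
+
+```cpp
+// Two parallel connections to the same remote rank map to two distinct QPs.
+auto connA = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get();
+auto connB = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get();
+// Semaphores (and the PortChannels built on them) inherit the QP of their Connection.
+auto semaA = comm.buildSemaphore(connA, remoteRank).get();
+auto semaB = comm.buildSemaphore(connB, remoteRank).get();
+```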
+
+The `PortChannel` methods have the following behavior in terms of InfiniBand operations:
+- `put()`: Posts an RDMA Write operation to the QP to transfer data.
+- `signal()`: Asynchronously triggers a PCIe flush on the remote side (e.g., by an RDMA atomic operation) to ensure all previous RDMA Writes are visible to the remote GPU.
+- `wait()`: Polls the completion queue (CQ) of the QP until the corresponding signal is received.
+- `poll()`: Non-blocking version of `wait()`; checks the CQ for the signal.
+- `flush()`: Ensures the CQ is drained and all previous operations are completed.
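+
+To make this concrete, here is a minimal device-side sketch. The kernel name and the offset/size signature of `put()` are illustrative assumptions; see `bidir_port_channel.cu` below for the exact device handle usage:
+
+```cpp
+// One thread pushes `bytes` to the peer and waits for the peer's signal.
+__global__ void putSignalWaitKernel(mscclpp::PortChannelDeviceHandle *devHandle, uint64_t bytes) {
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    devHandle->put(0 /*dstOffset*/, 0 /*srcOffset*/, bytes);  // posts an RDMA Write on the QP
+    devHandle->signal();  // makes the preceding writes visible to the remote GPU
+    devHandle->wait();    // polls the CQ until the peer's signal arrives
+  }
+}
+```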
+
+For simplicity, the example code does not pass any InfiniBand-specific parameters in the endpoint configuration. This can be done as in the following example:
+
+```cpp
+mscclpp::EndpointConfig epConfig;
+epConfig.transport = mscclpp::Transport::IB0;
+epConfig.device = {mscclpp::DeviceType::GPU, 0}; // GPU 0
+// InfiniBand-specific parameters
+epConfig.ib.maxCqSize = 8192;
+epConfig.ib.maxCqPollNum = 4;
+// Create an endpoint and establish a connection
+auto conn = comm.connect(epConfig, remoteRank).get();
+```
+
+See all available InfiniBand-specific parameters in {cpp:struct}`mscclpp::EndpointConfig::Ib`.
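+
+For comparison, when no InfiniBand-specific tuning is needed, the example code simply passes the transport and device in place of a full `EndpointConfig`:
+
+```cpp
+auto conn = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get();
+```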
log("Failed to wait for processes."); + return -1; + } + if (status0 != 0 || status1 != 0) { + log("One of the processes failed."); + return -1; + } + log("Succeed!"); + return 0; + } else if (argc == 5) { + std::string ipPort = argv[1]; + int rank = std::atoi(argv[2]); + int gpuId = std::atoi(argv[3]); + mscclpp::Transport transport = parseTransport(argv[4]); + worker(rank, gpuId, ipPort, transport); + log("Rank ", rank, ": Succeed!"); + return 0; + } else { + std::cerr << "Usage: " << argv[0] << " [ ]" << std::endl; + return -1; + } }