From da60eb7f46d3f0765e3e835956cbfcf65c033bbd Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Thu, 11 Dec 2025 15:15:58 -0800
Subject: [PATCH] Add an IB multi-node tutorial (#702)

---
 docs/tutorials/04-port-channel.md             | 70 +++++++++++++++-
 .../04-port-channel/bidir_port_channel.cu     | 82 ++++++++++++-------
 2 files changed, 123 insertions(+), 29 deletions(-)

diff --git a/docs/tutorials/04-port-channel.md b/docs/tutorials/04-port-channel.md
index 4e0238c1..6a10a556 100644
--- a/docs/tutorials/04-port-channel.md
+++ b/docs/tutorials/04-port-channel.md
@@ -35,7 +35,7 @@ Note that this example is **NOT** a performance benchmark. The performance numbe
 
 ## Code Overview
 
-The example code implements a bidirectional data transfer using a `PortChannel` between two GPUs on the same machine. The code is similar to the [Memory Channel](./03-memory-channel.md) tutorial, with the main difference being that the construction of a `PortChannel` is done by a `ProxyService` instance. We need to "add" the pre-built `Semaphore` and `RegisteredMemory` objects to the `ProxyService`, which return `SemaphoreId` and `MemoryId`s, respectively:
+The example code implements a bidirectional data transfer using a `PortChannel` between two GPUs. The code is similar to the [Memory Channel](./03-memory-channel.md) tutorial, with the main difference being that the construction of a `PortChannel` is done by a `ProxyService` instance. We need to "add" the pre-built `Semaphore` and `RegisteredMemory` objects to the `ProxyService`, which return `SemaphoreId` and `MemoryId`s, respectively:
 
 ```cpp
 mscclpp::ProxyService proxyService;
@@ -102,6 +102,74 @@ The device handle methods of `PortChannel` are thread-safe except when the numbe
 Advanced users may want to customize the behavior of `ProxyService` to support custom request types or transport mechanisms, which can be done by subclassing `BaseProxyService`. See an example in [class AllGatherProxyService](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allgather_test.cu#L503).
 ```
 
+## Cross-node Execution
+
+This section explains how to run the example code with two GPUs on different nodes using InfiniBand (or RoCE) transport.
+
+### Running the Example across Nodes
+
+```{note}
+Before running the code across nodes, make sure that your environment meets the [prerequisites of GPUDirect RDMA](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#common-prerequisites) and the RDMA networking is properly configured.
+```
+
+Run the program on two nodes with command line arguments:
+
+```
+./bidir_port_channel [<ip_port> <rank> <gpu_id> <transport>]
+```
+
+For example, assume we use `192.168.0.1:50000` as the bootstrap IP address and port, and both nodes use GPU 0 with InfiniBand device index 0 (`IB0`).
+
+On Node 0 (Rank 0):
+```bash
+$ ./bidir_port_channel 192.168.0.1:50000 0 0 IB0
+```
+
+On Node 1 (Rank 1):
+```bash
+$ ./bidir_port_channel 192.168.0.1:50000 1 0 IB0
+```
+
+You should see output indicating successful data transfer.
+
+```{tip}
+The example code also supports running two instances on the same node. For example:
+
+Terminal 1: `./bidir_port_channel 127.0.0.1:50000 0 0 IB0`
+
+Terminal 2: `./bidir_port_channel 127.0.0.1:50000 1 1 IB1`
+```
+
+```{tip}
+If your bootstrap IP address is not on the default network interface of your node, you can specify the network interface by passing `interface_name:ip:port` as the first argument (such as `eth1:192.168.0.1:50000`).
+```
+
+### What's Happening in Terms of InfiniBand?
+
+When we use InfiniBand transport, each `Endpoint` holds a unique InfiniBand context, and each `Connection` holds a unique InfiniBand queue pair (QP). Therefore, multiple `Semaphore`s and `PortChannel`s will share the same QP if they are created from the same `Connection`. If you want multiple QPs between two endpoints, you need to create multiple parallel `Connection`s, and then create `Semaphore`s and `PortChannel`s from different `Connection`s.
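+
+For illustration, here is a minimal sketch that reuses the `connect` and `buildSemaphore` calls from the example code; the variable names (`connA`, `semaA`, etc.) are hypothetical:
+
+```cpp
+// Two parallel connections to the same remote rank map to two distinct QPs.
+auto connA = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get();
+auto connB = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get();
+// Semaphores (and the PortChannels built on them) inherit the QP of their Connection.
+auto semaA = comm.buildSemaphore(connA, remoteRank).get();
+auto semaB = comm.buildSemaphore(connB, remoteRank).get();
+```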
+
+The `PortChannel` methods have the following behavior in terms of InfiniBand operations:
+- `put()`: Posts an RDMA Write operation to the QP to transfer data.
+- `signal()`: Asynchronously triggers a PCIe flush on the remote side (e.g., by an RDMA atomic operation) to ensure all previous RDMA Writes are visible to the remote GPU.
+- `wait()`: Polls the completion queue (CQ) of the QP until the corresponding signal is received.
+- `poll()`: Non-blocking version of `wait()`; checks the CQ for the signal.
+- `flush()`: Ensures the CQ is drained and all previous operations are completed.
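+
+To make this concrete, here is a minimal device-side sketch. The kernel name and the offset/size signature of `put()` are illustrative assumptions; see `bidir_port_channel.cu` below for the exact device handle usage:
+
+```cpp
+// One thread pushes `bytes` to the peer and waits for the peer's signal.
+__global__ void putSignalWaitKernel(mscclpp::PortChannelDeviceHandle *devHandle, uint64_t bytes) {
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    devHandle->put(0 /*dstOffset*/, 0 /*srcOffset*/, bytes);  // posts an RDMA Write on the QP
+    devHandle->signal();  // makes the preceding writes visible to the remote GPU
+    devHandle->wait();    // polls the CQ until the peer's signal arrives
+  }
+}
+```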
+
+For simplicity, the example code does not pass any InfiniBand-specific parameters in the endpoint configuration. This can be done as in the following example:
+
+```cpp
+mscclpp::EndpointConfig epConfig;
+epConfig.transport = mscclpp::Transport::IB0;
+epConfig.device = {mscclpp::DeviceType::GPU, 0}; // GPU 0
+// InfiniBand-specific parameters
+epConfig.ib.maxCqSize = 8192;
+epConfig.ib.maxCqPollNum = 4;
+// Create an endpoint and establish a connection
+auto conn = comm.connect(epConfig, remoteRank).get();
+```
+
+See all available InfiniBand-specific parameters in {cpp:struct}`mscclpp::EndpointConfig::Ib`.
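+
+For comparison, when no InfiniBand-specific tuning is needed, the example code simply passes the transport and device in place of a full `EndpointConfig`:
+
+```cpp
+auto conn = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get();
+```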
log("Failed to wait for processes."); + return -1; + } + if (status0 != 0 || status1 != 0) { + log("One of the processes failed."); + return -1; + } + log("Succeed!"); + return 0; + } else if (argc == 5) { + std::string ipPort = argv[1]; + int rank = std::atoi(argv[2]); + int gpuId = std::atoi(argv[3]); + mscclpp::Transport transport = parseTransport(argv[4]); + worker(rank, gpuId, ipPort, transport); + log("Rank ", rank, ": Succeed!"); + return 0; + } else { + std::cerr << "Usage: " << argv[0] << " [ ]" << std::endl; + return -1; + } }