[Doc] mscclpp docs (#348)

Generate docs for mscclpp. Set up a GitHub Action to auto-deploy the GitHub Pages docs; link here: https://microsoft.github.io/mscclpp --------- Co-authored-by: Changho Hwang <changhohwang@microsoft.com> Co-authored-by: Caio Rocha <caiorocha@microsoft.com>
2026-05-04 05:31:27 +00:00 · 2024-10-17 23:08:31 -07:00
parent 0c150e5166
commit 4136153a76
21 changed files with 575 additions and 19 deletions
--- a/docs/getting-started/tutorials/customized-proxy-service.md
+++ b/docs/getting-started/tutorials/customized-proxy-service.md
@@ -0,0 +1 @@
+# Customize the Proxy Service
--- a/docs/getting-started/tutorials/index.rst
+++ b/docs/getting-started/tutorials/index.rst
@@ -0,0 +1,16 @@
+Tutorials
+----------
+
+This tutorial section provides a step-by-step guide to help you get started with the C++/Python API.
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Tutorials
+   :hidden:
+
+   initialization
+   proxy-channel
+   sm-channel
+   packet-api
+   customized-proxy-service
+   python-api
--- a/docs/getting-started/tutorials/initialization.md
+++ b/docs/getting-started/tutorials/initialization.md
@@ -0,0 +1,71 @@
+# Communication initialization with the MSCCL++ API
+
+In this tutorial, you will write a simple program to initialize communication between eight GPUs using the MSCCL++ C++ API. You will also learn how to use the Python API to initialize communication.
+
+## Prerequisites
+A system with eight GPUs is required to run this tutorial.
+
+Also make sure that you have installed MSCCL++ on your system. If not, please follow the [quick start](../quickstart.md).
+
+## Initialize Communication with C++ API
+We will set up a mesh topology with eight GPUs, in which each GPU is connected to every other GPU. The following code shows how to initialize communication with the MSCCL++ C++ API.
+
+```cpp
+#include <mscclpp/core.hpp>
+#include <mscclpp/gpu_utils.hpp>
+#include <mscclpp/proxy_channel.hpp>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+// Local shorthand for mscclpp::DeviceHandle<T>.
+template <class T>
+using DeviceHandle = mscclpp::DeviceHandle<T>;
+// Table of proxy-channel device handles with eight slots; setupMeshTopology()
+// fills it with cudaMemcpyToSymbol.
+__constant__ DeviceHandle<mscclpp::SimpleProxyChannel> constProxyChans[8];
+
+/// Connects this rank to every other rank and copies one SimpleProxyChannel
+/// device handle per peer into `constProxyChans`.
+///
+/// @param rank      Rank of the calling process.
+/// @param worldsize Total number of ranks taking part.
+/// @param data      Buffer that is registered with the communicator and sent to each peer.
+/// @param dataSize  Size of `data` as passed to registerMemory (presumably in bytes).
+/// @throws std::runtime_error if more channels were built than `constProxyChans` can hold.
+void setupMeshTopology(int rank, int worldsize, void* data, size_t dataSize) {
+  std::string ip_port = "10.0.0.4:50000";
+  auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, worldsize);
+  bootstrap->initialize(ip_port);
+  mscclpp::Communicator comm(bootstrap);
+  mscclpp::ProxyService proxyService;
+
+  // Every connection and memory registration in this example uses CUDA IPC.
+  const mscclpp::Transport transport = mscclpp::Transport::CudaIpc;
+
+  std::vector<mscclpp::SemaphoreId> semaphoreIds;
+  std::vector<mscclpp::RegisteredMemory> localMemories;
+  std::vector<mscclpp::NonblockingFuture<std::shared_ptr<mscclpp::Connection>>> connections(worldsize);
+  std::vector<mscclpp::NonblockingFuture<mscclpp::RegisteredMemory>> remoteMemories;
+
+  for (int r = 0; r < worldsize; ++r) {
+    if (r == rank) continue;
+    // Connect with all other ranks
+    connections[r] = comm.connectOnSetup(r, 0, transport);
+    auto memory = comm.registerMemory(data, dataSize, transport);
+    // localMemories and remoteMemories are filled in the same peer order, so
+    // entry i of each refers to the same peer when channels are built below.
+    localMemories.push_back(memory);
+    comm.sendMemoryOnSetup(memory, r, 0);
+    remoteMemories.push_back(comm.recvMemoryOnSetup(r, 0));
+  }
+
+  comm.setup();
+
+  // The connection futures are read with get() only after the setup() call above.
+  for (int r = 0; r < worldsize; ++r) {
+    if (r == rank) continue;
+    semaphoreIds.push_back(proxyService.buildAndAddSemaphore(comm, connections[r].get()));
+  }
+
+  // A second setup pass follows the semaphore registration above.
+  comm.setup();
+
+  std::vector<DeviceHandle<mscclpp::SimpleProxyChannel>> proxyChannels;
+  for (size_t i = 0; i < semaphoreIds.size(); ++i) {
+    proxyChannels.push_back(mscclpp::deviceHandle(mscclpp::SimpleProxyChannel(
+        proxyService.proxyChannel(semaphoreIds[i]), proxyService.addMemory(remoteMemories[i].get()),
+        proxyService.addMemory(localMemories[i]))));
+  }
+
+  // Refuse to copy more handles than the constant-memory table can hold.
+  if (proxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle<mscclpp::SimpleProxyChannel>)) {
+    throw std::runtime_error("unexpected error");
+  }
+  // NOTE(review): CUDACHECK is not defined in this snippet; it is assumed to be
+  // a user-provided CUDA error-checking macro.
+  CUDACHECK(cudaMemcpyToSymbol(constProxyChans, proxyChannels.data(),
+                              sizeof(DeviceHandle<mscclpp::SimpleProxyChannel>) * proxyChannels.size()));
+}
+```
--- a/docs/getting-started/tutorials/packet-api.md
+++ b/docs/getting-started/tutorials/packet-api.md
@@ -0,0 +1 @@
+# Packet API for latency sensitive applications
--- a/docs/getting-started/tutorials/proxy-channel.md
+++ b/docs/getting-started/tutorials/proxy-channel.md
@@ -0,0 +1,3 @@
+# Offload communication to CPU with ProxyChannel
+
+TBU
--- a/docs/getting-started/tutorials/python-api.md
+++ b/docs/getting-started/tutorials/python-api.md
@@ -0,0 +1,92 @@
+# Working with Python API
+
+We provide a Python API that helps you initialize and set up channels easily.
+In this tutorial, you will write a simple program to initialize communication between eight GPUs using MSCCL++ Python API.
+
+## Setup Channel with Python API
+
+We will set up a mesh topology with eight GPUs, in which each GPU is connected to every other GPU. The following code shows how to initialize communication with the MSCCL++ Python API.
+```python
+from mpi4py import MPI
+import cupy as cp
+
+from mscclpp import (
+    ProxyService,
+    Transport,
+)
+import mscclpp.comm as mscclpp_comm
+
+def create_connection(group: mscclpp_comm.CommGroup, transport: str):
+    remote_nghrs = list(range(group.nranks))
+    remote_nghrs.remove(group.my_rank)
+    if transport == "NVLink":
+        tran = Transport.CudaIpc
+    elif transport == "IB":
+        tran = group.my_ib_device(group.my_rank % 8)
+    else:
+        assert False
+    connections = group.make_connection(remote_nghrs, tran)
+    return connections
+
+if __name__ == "__main__":
+    mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
+    connections = create_connection(mscclpp_group, "NVLink")
+    nelems = 1024
+    memory = cp.zeros(nelem, dtype=cp.int32)
+    proxy_service = ProxyService()
+    simple_channels = group.make_proxy_channels(proxy_service, memory, connections)
+    proxy_service.start_proxy()
+    mscclpp_group.barrier()
+    launch_kernel(mscclpp_group.my_rank, mscclpp_group.nranks, simple_channels, memory)
+    cp.cuda.runtime.deviceSynchronize()
+    mscclpp_group.barrier()
+```
+
+### Launch Kernel with Python API
+We provide some Python utilities to help you launch a kernel from Python. Here is an example.
+```python
+from mscclpp.utils import KernelBuilder, pack
+
+def launch_kernel(my_rank: int, nranks: int, simple_channels: List[SimpleProxyChannel], memory: cp.ndarray):
+    file_dir = os.path.dirname(os.path.abspath(__file__))
+    kernel = KernelBuilder(file="test.cu", kernel_name="test", file_dir=file_dir).get_compiled_kernel()
+    params = b""
+    first_arg = next(iter(simple_channels.values()))
+    size_of_channels = len(first_arg.device_handle().raw)
+    device_handles = []
+    for rank in range(nranks):
+        if rank == my_rank:
+            device_handles.append(
+                bytes(size_of_channels)
+            )  # just zeros for semaphores that do not exist
+        else:
+            device_handles.append(simple_channels[rank].device_handle().raw)
+    # keep a reference to the device handles so that they don't get garbage collected
+    d_channels = cp.asarray(memoryview(b"".join(device_handles)), dtype=cp.uint8)
+    params = pack(d_channels, my_rank, nranks, memory.size)
+
+    nblocks = 1
+    nthreads = 512
+    kernel.launch_kernel(params, nblocks, nthreads, 0, None)
+```
+
+The test kernel is defined in `test.cu` as follows:
+```cuda
+#include <mscclpp/packet_device.hpp>
+#include <mscclpp/proxy_channel_device.hpp>
+
+// Note: channels[my_rank] is not a valid channel; that slot exists only so the
+// array can be indexed directly by rank.
+//
+// Each thread whose index is another rank's id calls putWithSignalAndFlush on
+// that rank's channel, passing this rank's offset for both offset arguments,
+// and then waits on the same channel. All other threads do nothing further.
+extern "C" __global__ void __launch_bounds__(1024, 1)
+    simple_proxy_channel(mscclpp::SimpleProxyChannelDeviceHandle* channels, int my_rank, int nranks,
+                         int num_elements) {
+    const int peer = threadIdx.x;
+    // The buffer of `num_elements` ints is split evenly across ranks.
+    const uint64_t bytes_per_rank = (num_elements * sizeof(int)) / nranks;
+    const uint64_t local_offset = bytes_per_rank * my_rank;
+    __syncthreads();
+    if (peer >= nranks || peer == my_rank) return;
+    channels[peer].putWithSignalAndFlush(local_offset, local_offset, bytes_per_rank);
+    channels[peer].wait();
+}
+```
--- a/docs/getting-started/tutorials/sm-channel.md
+++ b/docs/getting-started/tutorials/sm-channel.md
@@ -0,0 +1,3 @@
+# Using SmChannel for Intra-Node Communication
+
+TBU
				`@@ -0,0 +1 @@`
				`# Packet API for latency sensitive applications`