[Doc] mscclpp docs (#348)

Generate docs for mscclpp.
Set up a GitHub Action to auto-deploy the GitHub Pages site.
doc link here: https://microsoft.github.io/mscclpp

---------

Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
Co-authored-by: Caio Rocha <caiorocha@microsoft.com>
This commit is contained in:
Binyang Li
2024-10-17 23:08:31 -07:00
committed by GitHub
parent 0c150e5166
commit 4136153a76
21 changed files with 575 additions and 19 deletions

View File

@@ -0,0 +1 @@
# Customize the Proxy Service

View File

@@ -0,0 +1,16 @@
Tutorials
----------
This tutorial section provides a step-by-step guide to help you get started with the C++/Python API.

.. toctree::
   :maxdepth: 1
   :caption: Tutorials
   :hidden:

   initialization
   proxy-channel
   sm-channel
   packet-api
   customized-proxy-service
   python-api

View File

@@ -0,0 +1,71 @@
# Initializing Communication with the MSCCL++ API
In this tutorial, you will write a simple program to initialize communication between eight GPUs using MSCCL++ C++ API. You will also learn how to use the Python API to initialize communication.
## Prerequisites
A system with eight GPUs is required to run this tutorial.
Also make sure that you have installed MSCCL++ on your system. If not, please follow the [quick start](../quickstart.md).
## Initialize Communication with C++ API
We will set up a mesh topology with eight GPUs, in which each GPU is connected to every other GPU. The following code shows how to initialize communication with the MSCCL++ C++ API.
```cpp
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/proxy_channel.hpp>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>
// Shorthand for mscclpp::DeviceHandle<T>, used throughout this example.
template <class T>
using DeviceHandle = mscclpp::DeviceHandle<T>;
// Device constant-memory array with room for 8 proxy channel handles.
// setupMeshTopology() fills it from the host with cudaMemcpyToSymbol.
__constant__ DeviceHandle<mscclpp::SimpleProxyChannel> constProxyChans[8];
void setupMeshTopology(int rank, int worldsize, void* data, size_t dataSize) {
std::string ip_port = "10.0.0.4:50000";
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, worldsize);
bootstrap->initialize(ip_port);
mscclpp::Communicator comm(bootstrap);
mscclpp::ProxyService proxyService;
std::vector<mscclpp::SemaphoreId> semaphoreIds;
std::vector<mscclpp::RegisteredMemory> localMemories;
std::vector<mscclpp::NonblockingFuture<std::shared_ptr<mscclpp::Connection>>> connections(world_size);
std::vector<mscclpp::NonblockingFuture<mscclpp::RegisteredMemory>> remoteMemories;
for (int r = 0; r < world_size; ++r) {
if (r == rank) continue;
mscclpp::Transport transport = mscclpp::Transport::CudaIpc;
// Connect with all other ranks
connections[r] = comm.connectOnSetup(r, 0, transport);
auto memory = comm.registerMemory(data, dataSize, mscclpp::Transport::CudaIpc | ibTransport);
localMemories.push_back(memory);
comm.sendMemoryOnSetup(memory, r, 0);
remoteMemories.push_back(comm.recvMemoryOnSetup(r, 0));
}
comm.setup();
for (int r = 0; r < world_size; ++r) {
if (r == rank) continue;
semaphoreIds.push_back(proxyService.buildAndAddSemaphore(comm, connections[r].get()));
}
comm.setup();
std::vector<DeviceHandle<mscclpp::SimpleProxyChannel>> proxyChannels;
for (size_t i = 0; i < semaphoreIds.size(); ++i) {
proxyChannels.push_back(mscclpp::deviceHandle(mscclpp::SimpleProxyChannel(
proxyService.proxyChannel(semaphoreIds[i]), proxyService.addMemory(remoteMemories[i].get()),
proxyService.addMemory(localMemories[i]))));
}
if (proxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle<mscclpp::SimpleProxyChannel>)) {
std::runtime_error("unexpected error");
}
CUDACHECK(cudaMemcpyToSymbol(constProxyChans, proxyChannels.data(),
sizeof(DeviceHandle<mscclpp::SimpleProxyChannel>) * proxyChannels.size()));
}
```

View File

@@ -0,0 +1 @@
# Packet API for latency-sensitive applications

View File

@@ -0,0 +1,3 @@
# Offload communication to CPU with ProxyChannel
TBU

View File

@@ -0,0 +1,92 @@
# Working with Python API
We provide a Python API that helps you initialize and set up channels easily.
In this tutorial, you will write a simple program to initialize communication between eight GPUs using MSCCL++ Python API.
## Setup Channel with Python API
We will set up a mesh topology with eight GPUs, in which each GPU is connected to every other GPU. The following code shows how to initialize communication with the MSCCL++ Python API.
```python
from mpi4py import MPI
import cupy as cp
from mscclpp import (
ProxyService,
Transport,
)
import mscclpp.comm as mscclpp_comm
def create_connection(group: mscclpp_comm.CommGroup, transport: str):
    """Connect this rank to every other rank in ``group``.

    Args:
        group: communication group providing ``nranks``, ``my_rank``,
            ``my_ib_device`` and ``make_connection``.
        transport: ``"NVLink"`` to use ``Transport.CudaIpc``, or ``"IB"`` to use
            the device returned by ``group.my_ib_device``.

    Returns:
        Whatever ``group.make_connection`` returns for the remote ranks.

    Raises:
        AssertionError: if ``transport`` is neither ``"NVLink"`` nor ``"IB"``.
    """
    # Every rank except our own.
    remote_nghrs = list(range(group.nranks))
    remote_nghrs.remove(group.my_rank)

    if transport == "NVLink":
        tran = Transport.CudaIpc
    elif transport == "IB":
        # NOTE(review): the modulo presumably maps the rank to a local device
        # index on an 8-GPU node -- confirm.
        tran = group.my_ib_device(group.my_rank % 8)
    else:
        # Raised explicitly: a bare `assert False` is removed under `python -O`,
        # which would leave `tran` unbound below.
        raise AssertionError(f"unsupported transport: {transport!r}")

    return group.make_connection(remote_nghrs, tran)
if __name__ == "__main__":
    # Build the communication group on top of MPI and connect to all peers.
    mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
    connections = create_connection(mscclpp_group, "NVLink")

    # Buffer the channels operate on.
    nelems = 1024
    memory = cp.zeros(nelems, dtype=cp.int32)

    proxy_service = ProxyService()
    simple_channels = mscclpp_group.make_proxy_channels(proxy_service, memory, connections)
    proxy_service.start_proxy()

    # Barrier before the launch and again after the device has finished.
    mscclpp_group.barrier()
    launch_kernel(mscclpp_group.my_rank, mscclpp_group.nranks, simple_channels, memory)
    cp.cuda.runtime.deviceSynchronize()
    mscclpp_group.barrier()
```
### Launch Kernel with Python API
We provide some Python utilities to help you launch a kernel from Python. Here is an example.
```python
from mscclpp.utils import KernelBuilder, pack
def launch_kernel(my_rank: int, nranks: int, simple_channels: "dict[int, SimpleProxyChannel]", memory: cp.ndarray):
    """Compile the ``simple_proxy_channel`` kernel from ``test.cu`` and launch it.

    Args:
        my_rank: rank of the calling process.
        nranks: total number of ranks.
        simple_channels: mapping from remote rank to its proxy channel; it has
            no entry for ``my_rank``.
        memory: array whose ``size`` is passed to the kernel as the element count.
    """
    import os  # local import: this snippet does not import it at module level

    file_dir = os.path.dirname(os.path.abspath(__file__))
    # kernel_name must match the extern "C" function defined in test.cu.
    kernel = KernelBuilder(
        file="test.cu", kernel_name="simple_proxy_channel", file_dir=file_dir
    ).get_compiled_kernel()

    # All device handles have the same byte size; use any one to size the placeholder.
    first_arg = next(iter(simple_channels.values()))
    size_of_channels = len(first_arg.device_handle().raw)

    device_handles = []
    for rank in range(nranks):
        if rank == my_rank:
            device_handles.append(
                bytes(size_of_channels)
            )  # just zeros for semaphores that do not exist
        else:
            device_handles.append(simple_channels[rank].device_handle().raw)

    # keep a reference to the device handles so that they don't get garbage collected
    d_channels = cp.asarray(memoryview(b"".join(device_handles)), dtype=cp.uint8)
    params = pack(d_channels, my_rank, nranks, memory.size)

    nblocks = 1
    nthreads = 512
    kernel.launch_kernel(params, nblocks, nthreads, 0, None)
```
The test kernel is defined in `test.cu` as follows:
```cuda
#include <mscclpp/packet_device.hpp>
#include <mscclpp/proxy_channel_device.hpp>
// Each thread whose index is a valid remote rank puts this rank's slice of the buffer to that
// peer through its proxy channel, signals and flushes, then waits on the same channel.
//
// channels     - one device handle per rank, indexed by rank.
// my_rank      - rank of the calling process.
// nranks       - total number of ranks.
// num_elements - number of int elements in the buffer; it is split evenly across ranks.
//
// Be careful about using channels[my_rank]: it is invalid and is there only to keep indexing simple.
extern "C" __global__ void __launch_bounds__(1024, 1)
    simple_proxy_channel(mscclpp::SimpleProxyChannelDeviceHandle* channels, int my_rank, int nranks,
                         int num_elements) {
  int tid = threadIdx.x;
  // Byte size and byte offset of the slice owned by this rank.
  uint64_t size_per_rank = (num_elements * sizeof(int)) / nranks;
  uint64_t my_offset = size_per_rank * my_rank;
  __syncthreads();
  if (tid < nranks && tid != my_rank) {
    channels[tid].putWithSignalAndFlush(my_offset, my_offset, size_per_rank);
    channels[tid].wait();
  }
}
```

View File

@@ -0,0 +1,3 @@
# Using SmChannel for Intra-Node Communication
TBU