mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-20 06:49:29 +00:00
10
README.md
10
README.md
@@ -5,7 +5,7 @@
|
||||
[](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml)
|
||||
[](https://microsoft.github.io/mscclpp/)
|
||||
|
||||
| Pipelines | Build Status |
|
||||
| Testing Pipelines | Build Status |
|
||||
|--------------------------|-------------------|
|
||||
| Unit Tests (CUDA) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
|
||||
| Integration Tests (CUDA) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
|
||||
@@ -13,9 +13,11 @@
|
||||
|
||||
A GPU-driven communication stack for scalable AI applications.
|
||||
|
||||
See [Quick Start](https://microsoft.github.io/mscclpp/getting-started/quickstart.html) to quickly get started.
|
||||
|
||||
Check our [paper](https://arxiv.org/abs/2504.09014) to know more about MSCCL++.
|
||||
| [Quick Start](https://microsoft.github.io/mscclpp/quickstart.html)
|
||||
| [Tutorials](https://microsoft.github.io/mscclpp/tutorials.html)
|
||||
| [API Reference](https://microsoft.github.io/mscclpp/cpp_api.html)
|
||||
| [Paper](https://arxiv.org/abs/2504.09014)
|
||||
|
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ PROJECT_NUMBER =
|
||||
# for a project that appears at the top of each page and should give viewer a
|
||||
# quick idea about the purpose of the project. Keep the description short.
|
||||
|
||||
PROJECT_BRIEF = "GPU-driven computation & communication stack"
|
||||
PROJECT_BRIEF = "GPU-driven communication stack"
|
||||
|
||||
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
|
||||
# in the documentation. The maximum height of the logo should not exceed 55
|
||||
@@ -2163,7 +2163,7 @@ ENABLE_PREPROCESSING = YES
|
||||
# The default value is: NO.
|
||||
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
|
||||
|
||||
MACRO_EXPANSION = NO
|
||||
MACRO_EXPANSION = YES
|
||||
|
||||
# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
|
||||
# the macro expansion is limited to the macros specified with the PREDEFINED and
|
||||
@@ -2203,7 +2203,17 @@ INCLUDE_FILE_PATTERNS =
|
||||
# recursively expanded use the := operator instead of the = operator.
|
||||
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
|
||||
|
||||
PREDEFINED = __CUDACC__
|
||||
PREDEFINED = __CUDACC__ \
|
||||
MSCCLPP_DEVICE_COMPILE \
|
||||
MSCCLPP_DEVICE_CUDA \
|
||||
MSCCLPP_DEVICE_HIP \
|
||||
MSCCLPP_DEVICE_INLINE= \
|
||||
MSCCLPP_HOST_DEVICE_INLINE= \
|
||||
MSCCLPP_INLINE= \
|
||||
__forceinline__= \
|
||||
__device__= \
|
||||
__host__= \
|
||||
__global__=
|
||||
|
||||
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
|
||||
# tag can be used to specify a list of macro names that should be expanded. The
|
||||
|
||||
0
docs/_static/.gitkeep
vendored
Normal file
0
docs/_static/.gitkeep
vendored
Normal file
@@ -1,5 +0,0 @@
|
||||
API Reference
|
||||
=============
|
||||
|
||||
.. doxygennamespace:: mscclpp
|
||||
:members:
|
||||
@@ -7,14 +7,14 @@
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
||||
|
||||
project = "mscclpp"
|
||||
copyright = "2024, MSCCL++ Team"
|
||||
copyright = "2025, MSCCL++ Team"
|
||||
author = "MSCCL++ Team"
|
||||
release = "v0.7.0"
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
||||
|
||||
extensions = ["breathe", "myst_parser"]
|
||||
extensions = ["breathe", "myst_parser", "sphinxcontrib.mermaid"]
|
||||
|
||||
templates_path = ["_templates"]
|
||||
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
||||
@@ -23,6 +23,10 @@ exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
||||
breathe_projects = {"mscclpp": "./doxygen/xml"}
|
||||
breathe_default_project = "mscclpp"
|
||||
|
||||
# Mermaid configuration
|
||||
mermaid_version = "11.0.0"
|
||||
mermaid_init_js = "mermaid.initialize({startOnLoad:true});"
|
||||
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
||||
|
||||
|
||||
378
docs/cpp_api.rst
Normal file
378
docs/cpp_api.rst
Normal file
@@ -0,0 +1,378 @@
|
||||
C++ API Reference
|
||||
=================
|
||||
|
||||
This reference organizes the MSCCL++ C++ API into two main categories: :ref:`host-side-interfaces` for CPU code and :ref:`device-side-interfaces` for GPU kernels. Components that are used in both host and device code are documented in the Device-Side Interfaces section.
|
||||
|
||||
.. _host-side-interfaces:
|
||||
|
||||
Host-Side Interfaces
|
||||
--------------------
|
||||
|
||||
These are the interfaces used in CPU code to set up connections, manage memory, and coordinate operations.
|
||||
|
||||
Bootstrap and Process Coordination
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. doxygenclass:: mscclpp::Bootstrap
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::TcpBootstrap
|
||||
:members:
|
||||
|
||||
.. doxygentypedef:: mscclpp::UniqueId
|
||||
|
||||
.. doxygenvariable:: mscclpp::UniqueIdBytes
|
||||
|
||||
Connection Setup and Memory Management
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. doxygenclass:: mscclpp::Connection
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::Context
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::Communicator
|
||||
:members:
|
||||
|
||||
.. doxygenstruct:: mscclpp::Device
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::Endpoint
|
||||
:members:
|
||||
|
||||
.. doxygenstruct:: mscclpp::EndpointConfig
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::NvlsConnection
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::RegisteredMemory
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::TransportFlags
|
||||
:members:
|
||||
|
||||
.. doxygenenum:: mscclpp::DeviceType
|
||||
|
||||
.. doxygenenum:: mscclpp::Transport
|
||||
|
||||
.. doxygenfunction:: mscclpp::connectNvlsCollective
|
||||
|
||||
Semaphores
|
||||
~~~~~~~~~~
|
||||
|
||||
.. doxygenclass:: mscclpp::Host2DeviceSemaphore
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::Host2HostSemaphore
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::MemoryDevice2DeviceSemaphore
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::Semaphore
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::SemaphoreStub
|
||||
:members:
|
||||
|
||||
Channels
|
||||
~~~~~~~~
|
||||
|
||||
.. doxygenstruct:: mscclpp::BaseMemoryChannel
|
||||
:members:
|
||||
|
||||
.. doxygenstruct:: mscclpp::BasePortChannel
|
||||
:members:
|
||||
|
||||
.. doxygenstruct:: mscclpp::MemoryChannel
|
||||
:members:
|
||||
|
||||
.. doxygenstruct:: mscclpp::PortChannel
|
||||
:members:
|
||||
|
||||
.. doxygenstruct:: mscclpp::SwitchChannel
|
||||
:members:
|
||||
|
||||
Proxy Service and FIFO Management
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. doxygenclass:: mscclpp::BaseProxyService
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::Fifo
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::Proxy
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::ProxyService
|
||||
:members:
|
||||
|
||||
.. doxygentypedef:: mscclpp::ProxyHandler
|
||||
|
||||
.. doxygenenum:: mscclpp::ProxyHandlerResult
|
||||
|
||||
.. doxygenvariable:: mscclpp::DEFAULT_FIFO_SIZE
|
||||
|
||||
Utilities
|
||||
~~~~~~~~~
|
||||
|
||||
.. doxygenstruct:: mscclpp::AvoidCudaGraphCaptureGuard
|
||||
:members:
|
||||
|
||||
.. doxygenstruct:: mscclpp::CudaStreamWithFlags
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::GpuBuffer
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::GpuStream
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::GpuStreamPool
|
||||
:members:
|
||||
|
||||
.. doxygenfunction:: mscclpp::getDeviceNumaNode
|
||||
|
||||
.. doxygenfunction:: mscclpp::getHostName
|
||||
|
||||
.. doxygenfunction:: mscclpp::getIBDeviceCount
|
||||
|
||||
.. doxygenfunction:: mscclpp::getIBDeviceName
|
||||
|
||||
.. doxygenfunction:: mscclpp::getIBTransportByDeviceName
|
||||
|
||||
.. doxygenfunction:: mscclpp::gpuMemcpy
|
||||
|
||||
.. doxygenfunction:: mscclpp::gpuMemcpyAsync
|
||||
|
||||
.. doxygenfunction:: mscclpp::gpuStreamPool
|
||||
|
||||
.. doxygenfunction:: mscclpp::isCuMemMapAllocated
|
||||
|
||||
.. doxygenfunction:: mscclpp::isNvlsSupported
|
||||
|
||||
.. doxygenfunction:: mscclpp::numaBind
|
||||
|
||||
Executor Interface
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. doxygenclass:: mscclpp::ExecutionPlan
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::Executor
|
||||
:members:
|
||||
|
||||
.. doxygenenum:: mscclpp::DataType
|
||||
|
||||
.. doxygenenum:: mscclpp::PacketType
|
||||
|
||||
Environment and Configuration
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. doxygenclass:: mscclpp::Env
|
||||
:members:
|
||||
|
||||
.. doxygenfunction:: mscclpp::env
|
||||
|
||||
Error Handling
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. doxygenclass:: mscclpp::BaseError
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::CudaError
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::CuError
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::Error
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::IbError
|
||||
:members:
|
||||
|
||||
.. doxygenclass:: mscclpp::SysError
|
||||
:members:
|
||||
|
||||
.. doxygenenum:: mscclpp::ErrorCode
|
||||
|
||||
.. doxygenfunction:: mscclpp::errorToString
|
||||
|
||||
Version
|
||||
~~~~~~~
|
||||
|
||||
.. doxygenfunction:: mscclpp::version
|
||||
|
||||
Macro Functions
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
.. doxygendefine:: MSCCLPP_CUDATHROW
|
||||
|
||||
.. doxygendefine:: MSCCLPP_CUTHROW
|
||||
|
||||
|
||||
.. _device-side-interfaces:
|
||||
|
||||
Device-Side Interfaces
|
||||
----------------------
|
||||
|
||||
These device-side handle structures provide GPU kernel interfaces for MSCCL++ communication primitives. They are designed to be used directly in CUDA/HIP device code.
|
||||
|
||||
Channel Device Interfaces
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. doxygenstruct:: mscclpp::BaseMemoryChannelDeviceHandle
|
||||
:members:
|
||||
|
||||
.. doxygenstruct:: mscclpp::BasePortChannelDeviceHandle
|
||||
:members:
|
||||
|
||||
.. doxygenunion:: mscclpp::ChannelTrigger
|
||||
|
||||
.. doxygenunion:: mscclpp::LL16Packet
|
||||
|
||||
.. doxygenunion:: mscclpp::LL8Packet
|
||||
|
||||
.. doxygenstruct:: mscclpp::MemoryChannelDeviceHandle
|
||||
:members:
|
||||
|
||||
.. doxygenstruct:: mscclpp::PortChannelDeviceHandle
|
||||
:members:
|
||||
|
||||
.. doxygenstruct:: mscclpp::SwitchChannelDeviceHandle
|
||||
:members:
|
||||
|
||||
.. doxygentypedef:: mscclpp::LLPacket
|
||||
|
||||
.. doxygentypedef:: mscclpp::MemoryId
|
||||
|
||||
.. doxygentypedef:: mscclpp::SemaphoreId
|
||||
|
||||
Semaphore Device Interfaces
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. doxygenstruct:: mscclpp::Host2DeviceSemaphoreDeviceHandle
|
||||
:members:
|
||||
|
||||
.. doxygenstruct:: mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle
|
||||
:members:
|
||||
|
||||
FIFO Device Interfaces
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. doxygenstruct:: mscclpp::FifoDeviceHandle
|
||||
:members:
|
||||
|
||||
.. doxygenstruct:: mscclpp::ProxyTrigger
|
||||
:members:
|
||||
|
||||
.. doxygenvariable:: mscclpp::TriggerBitsFifoReserved
|
||||
|
||||
.. doxygenvariable:: mscclpp::TriggerBitsMemoryId
|
||||
|
||||
.. doxygenvariable:: mscclpp::TriggerBitsOffset
|
||||
|
||||
.. doxygenvariable:: mscclpp::TriggerBitsSemaphoreId
|
||||
|
||||
.. doxygenvariable:: mscclpp::TriggerBitsSize
|
||||
|
||||
.. doxygenvariable:: mscclpp::TriggerBitsType
|
||||
|
||||
.. doxygentypedef:: mscclpp::TriggerType
|
||||
|
||||
.. doxygenvariable:: mscclpp::TriggerData
|
||||
|
||||
.. doxygenvariable:: mscclpp::TriggerFlag
|
||||
|
||||
.. doxygenvariable:: mscclpp::TriggerSync
|
||||
|
||||
Device Utilities
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
.. doxygenstruct:: mscclpp::DeviceSemaphore
|
||||
:members:
|
||||
|
||||
.. doxygenstruct:: mscclpp::DeviceSyncer
|
||||
:members:
|
||||
|
||||
.. doxygenunion:: mscclpp::VectorType
|
||||
|
||||
.. doxygenstruct:: mscclpp::Words
|
||||
:members:
|
||||
|
||||
.. doxygenfunction:: mscclpp::copy
|
||||
|
||||
.. doxygenfunction:: mscclpp::copyFromPackets
|
||||
|
||||
.. doxygenfunction:: mscclpp::copyToPackets
|
||||
|
||||
Atomics
|
||||
~~~~~~~
|
||||
|
||||
.. doxygenvariable:: mscclpp::memoryOrderAcqRel
|
||||
|
||||
.. doxygenvariable:: mscclpp::memoryOrderAcquire
|
||||
|
||||
.. doxygenvariable:: mscclpp::memoryOrderRelaxed
|
||||
|
||||
.. doxygenvariable:: mscclpp::memoryOrderRelease
|
||||
|
||||
.. doxygenvariable:: mscclpp::memoryOrderSeqCst
|
||||
|
||||
.. doxygenvariable:: mscclpp::scopeDevice
|
||||
|
||||
.. doxygenvariable:: mscclpp::scopeSystem
|
||||
|
||||
.. doxygenfunction:: mscclpp::atomicFetchAdd
|
||||
|
||||
.. doxygenfunction:: mscclpp::atomicLoad
|
||||
|
||||
.. doxygenfunction:: mscclpp::atomicStore
|
||||
|
||||
Vector Data Types
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. doxygentypedef:: mscclpp::bf16x2
|
||||
|
||||
.. doxygentypedef:: mscclpp::bf16x4
|
||||
|
||||
.. doxygentypedef:: mscclpp::bf16x8
|
||||
|
||||
.. doxygentypedef:: mscclpp::f16x2
|
||||
|
||||
.. doxygentypedef:: mscclpp::f16x4
|
||||
|
||||
.. doxygentypedef:: mscclpp::f16x8
|
||||
|
||||
.. doxygentypedef:: mscclpp::f32x1
|
||||
|
||||
.. doxygentypedef:: mscclpp::f32x2
|
||||
|
||||
.. doxygentypedef:: mscclpp::f32x4
|
||||
|
||||
.. doxygentypedef:: mscclpp::f64x1
|
||||
|
||||
.. doxygentypedef:: mscclpp::i32x1
|
||||
|
||||
.. doxygentypedef:: mscclpp::i32x2
|
||||
|
||||
.. doxygentypedef:: mscclpp::i32x4
|
||||
|
||||
.. doxygentypedef:: mscclpp::u32x1
|
||||
|
||||
.. doxygentypedef:: mscclpp::u32x2
|
||||
|
||||
.. doxygentypedef:: mscclpp::u32x4
|
||||
|
||||
Macro Functions
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
.. doxygendefine:: MSCCLPP_ASSERT_DEVICE
|
||||
|
||||
.. doxygendefine:: OR_POLL_MAYBE_JAILBREAK
|
||||
|
||||
.. doxygendefine:: POLL_MAYBE_JAILBREAK
|
||||
@@ -1,157 +0,0 @@
|
||||
# MSCCL++ Design Document
|
||||
## Introduction
|
||||
MSCCL++ redefines inter-GPU communication interfaces, thereby delivering a highly efficient and customizable communication stack for distributed GPU applications. Its design is specifically tailored to accommodate diverse performance optimization scenarios often encountered in state-of-the-art AI applications. The figure below provides a high-level overview of MSCCL++ abstractions in CUDA, C, and Python.
|
||||
|
||||
|
||||
```{figure} ../figs/abstractions.png
|
||||
:name: MSCCL++ Abstractions
|
||||
:alt: MSCCL++ Abstractions
|
||||
:align: center
|
||||
|
||||
MSCCL++ Abstractions Overview
|
||||
```
|
||||
|
||||
The following highlight the key features of MSCCL++.
|
||||
* **Light-weight and multi-layer abstractions.** MSCCL++ provides communication abstractions at the lowest level close to hardware and at the highest level close to the application API. The lowest level of abstraction is ultra lightweight, which enables a user to implement the logic of data movement for a collective operation such as AllReduce inside a GPU kernel extremely efficiently without worrying about memory ordering of different ops. The modularity of MSCCL++ enables a user to construct the building blocks of MSCCL++ in a high-level abstraction in Python and feed them to a CUDA kernel in order to facilitate the user's productivity.
|
||||
|
||||
* **1-sided 0-copy synchronous and asynchronous abstractions.** MSCCL++ provides fine-grained synchronous and asynchronous 0-copy 1-sided abstractions for communication primitives such as `put()`, `get()`, `signal()`, `flush()`, and `wait()`. The 1-sided abstractions allow a user to asynchronously `put()` their data on the remote GPU as soon as it is ready without requiring the remote side to issue any receive instruction. This enables users to easily implement flexible communication logic, such as overlapping communication with computation, or implementing customized collective communication algorithms without worrying about potential deadlocks. Additionally, the 0-copy capability enables MSCCL++ to directly transfer data between users' buffers without using intermediate internal buffers, which saves GPU bandwidth and memory capacity.
|
||||
|
||||
* **Unified abstractions for different interconnection hardware.** MSCCL++ provides consistent abstractions regardless of the location of the remote GPU (either on the local node or on a remote node) or the underlying link (either NVLink/xGMI or InfiniBand). This simplifies the code for inter-GPU communication, which is often complex due to memory ordering of GPU/CPU read/writes and therefore, is error-prone.
|
||||
|
||||
## Concepts
|
||||
|
||||
To implement the list of features above, some concepts are introduced.
|
||||
### Channel
|
||||
MSCCL++ provides peer-to-peer communication methods between GPUs. A peer-to-peer connection between two GPUs is called a *Channel*. Channels are constructed by MSCCL++ host-side interfaces and copied to GPUs during initialization. Channels provide *GPU-side interfaces*, which means that all communication methods are defined as device functions to be called from GPU kernel code. The following code shows the basic usage of a channel; the `put()` method in the code below copies 1KB of data from the local GPU to a remote GPU.
|
||||
```cpp
|
||||
__global__ void gpuKernel() {
|
||||
...
|
||||
// Only one thread is needed for this method.
|
||||
channel.put(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024);
|
||||
...
|
||||
}
|
||||
```
|
||||
MSCCL++ also provides efficient synchronization methods, `signal()`, `flush()`, and `wait()`. We will discuss these methods in the following sections.
|
||||
|
||||
#### MemoryChannel & PortChannel
|
||||
MSCCL++ delivers two types of channels, **PortChannel** and **MemoryChannel**. `PortChannel` provides port-mapping-based data copy and synchronization methods. When called, these methods send/receive a signal to/from a host-side proxy, which will trigger (R)DMA (such as `cudaMemcpy*` or `ibv_post_send`) or issue synchronization methods (such as `cudaStreamSynchronize` or `ibv_poll_cq`). Since the key functionalities are run by the proxy, PortChannel requires only a single GPU thread to call its methods. See all `PortChannel` methods from [here](https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/port_channel_device.hpp).
|
||||
|
||||
On the other hand, `MemoryChannel` provides memory-mapping-based copy and synchronization methods. When called, these methods will directly use GPU threads to read/write from/to the remote GPU's memory space. Comparing against PortChannel, MemoryChannel is especially performant for low-latency scenarios, while it may need many GPU threads to call copying methods at the same time to achieve high copying bandwidth. See all MemoryChannel methods from [here](https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/memory_channel_device.hpp).
|
||||
|
||||
### Fifo & Trigger
|
||||
One of the key features of MSCCL++ is to offload the communication logic from the GPU to the CPU.
|
||||
To offload the communication logic from the GPU to the CPU, MSCCL++ introduces the concept of `Fifo` and `Trigger`. A `Fifo` is a circular buffer that is shared between the GPU and the CPU. It is used to store `Trigger`s. A `Trigger` is a signal that is sent from the GPU to the CPU to notify the CPU that there are commands in the `Fifo` that need to be processed. The CPU will then process the commands in the `Fifo` and send a signal back to the GPU to notify the GPU that the commands have been processed. The implementation details of `Fifo` and `Trigger` can be found in the following sections.
|
||||
|
||||
### ProxyService
|
||||
Proxy service is a persistent service that resides on the CPU side. It functions as a polling service that receives the message `Trigger` from the GPU side and then transfers data according to the command. When we use `PortChannel` for communication, a `Trigger` is sent from the GPU side to the `ProxyService`. Then `ProxyService` will invoke `cudaMemcpy*` or `IB verbs` to transfer data to the target device.
|
||||
|
||||
## Implementation
|
||||
|
||||
The core of MSCCL++ is implemented in C++ and CUDA. We offer both C++ and Python APIs for initializing communication channels. For interactions within the GPU kernel, we offer a collection of low-level device functions. Subsequent sections will delve into these interfaces and the methodology for transferring communication logic from the GPU to the CPU.
|
||||
|
||||
### Interfaces
|
||||
This section delivers a comprehensive overview of the MSCCL++ interfaces, encompassing both the setup and initialization of communication channels and the MSCCL++ kernel programming model.
|
||||
|
||||
#### Communication setup and initialization APIs
|
||||
MSCCL++ provides APIs in both C++ and Python for establishing communication channels, with further information available in the [Initialization](../getting-started/tutorials/initialization.md) section. Presently, it supports two types of transports: `cudaIPC` for `NVLink/xGMI`, and `IB` for `InfiniBand`. Users are empowered to select the connection type that best suits their hardware infrastructure.
|
||||
|
||||
#### MSCCL++ kernel programming model
|
||||
MSCCL++ offers one-sided communication methods directly callable from a GPU kernel, encompassing two primary API categories: data copy and synchronization. The data copy API features functions such as `put()`, `get()`, `read()`, and `write()`, while the synchronization API comprises `signal()`, `flush()`, and `wait()`. Demonstrated below, the basic utilization of the data copy API involves the `put()` method, which facilitates the transfer of 1KB of data from a local GPU to a remote GPU. It then sends a signal to the remote peer to notify that the data is ready to use. To receive the data, the remote peer can call the `wait()` method.
|
||||
This operation is executed within a kernel launched with a single block.
|
||||
```cpp
|
||||
// Running on rank 0
|
||||
__device__ void gpuKernel(mscclpp::MemoryChannelDeviceHandle* memoryChannel) {
|
||||
memoryChannel[0].put(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x);
|
||||
__syncthreads();
|
||||
if (threadIdx.x == 0) {
|
||||
memoryChannel[0].signal();
|
||||
}
|
||||
}
|
||||
|
||||
// Running on rank 1
|
||||
__device__ void gpuKernel(mscclpp::MemoryChannelDeviceHandle* memoryChannel) {
|
||||
if (threadIdx.x == 0) {
|
||||
memoryChannel[0].wait();
|
||||
}
|
||||
__syncthreads();
|
||||
// Data is ready to use
|
||||
}
|
||||
```
|
||||
|
||||
Similar to the LL protocol offered by NCCL, MSCCL++ introduces a `Packet` structure designed to facilitate the transfer of both data and flags within a single instruction, proving particularly beneficial for applications where latency is a critical concern. The following code shows the basic usage of the `Packet` structure. The flag should be same for sender and receiver side.
|
||||
```cpp
|
||||
// Running on rank 0
|
||||
__device__ void gpuKernel(mscclpp::MemoryChannelDeviceHandle* memChans, int flag) {
|
||||
memChans[0].putPackets(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x,
|
||||
/*flag=*/ flag);
|
||||
}
|
||||
|
||||
// Running on rank 1
|
||||
__device__ void gpuKernel(mscclpp::MemoryChannelDeviceHandle* memChans, int flag) {
|
||||
memChans[0].unpackPackets(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x,
|
||||
/*flag=*/ flag);
|
||||
// Data is ready to use
|
||||
}
|
||||
```
|
||||
|
||||
### The mechanism for offloading communication logic from the GPU to the CPU
|
||||
|
||||
As mentioned in the previous section, the offloading of communication logic from the GPU to the CPU is accomplished through the `Fifo` and `Trigger` mechanism.
|
||||
|
||||
The accompanying figure details the structure of `Trigger`, employing three bits to denote the operation type: `data transfer`, `signal`, and `flush`. The remaining fields specify the precise data locations for both local and remote buffers.
|
||||
|
||||
```
|
||||
|-------------------|-------------------|-------------------|-----------------|-----------------|---------|-------------------|---------------|
|
||||
| 32bit size | 32bit src offset | 32bit dst offset | 9bit src mem id | 9bit dst mem id | 3bit op | 10bit channel id | 1bit reserved |
|
||||
|-------------------|-------------------|-------------------|-----------------|-----------------|---------|-------------------|---------------|
|
||||
```
|
||||
<center>The proxy trigger format</center>
|
||||
|
||||
Page-locked memory is utilized for the `Fifo`, guaranteeing access by both the CPU and GPU. On the CPU side, a polling thread periodically checks the Fifo for new commands. Upon processing a command, it updates an incremented counter to signal to the GPU that the command has been executed. Users wishing to ensure a command has been processed can invoke `flush()`, which waits for the device-side counter to reflect this update.
|
||||
|
||||
## Use Cases
|
||||
|
||||
In this section, we will discuss several use cases that demonstrate the capabilities of MSCCL++.
|
||||
|
||||
### Overlapping communication with computation
|
||||
|
||||
MSCCL++ enables the offloading of communication logic from the GPU to the CPU, facilitating the overlapping of communication and computation processes. The code snippet provided illustrates this overlapping technique. In the depicted scenario, the GPU emits a signal to the CPU indicating readiness for data transfer. Subsequently, while the GPU continues to execute computation tasks, the CPU initiates the data transfer to the designated target device.
|
||||
```cpp
|
||||
__device__ void gpuKernel(mscclpp::PortChannelDeviceHandle* portChannel) {
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
// Send a trigger to the CPU
|
||||
if (tid == 0) {
|
||||
portChannel[0].putWithSignal(/*dstOffset*/ 0, /*srcOffset*/ 0, /*size*/ 1024);
|
||||
}
|
||||
// Continue computation
|
||||
matrixMul()
|
||||
// ...
|
||||
}
|
||||
```
|
||||
|
||||
### Fusion of communication and computation
|
||||
|
||||
Traditional communication libraries enforce a separation between communication and computation, creating a bottleneck where communication must await the completion of computation, especially when data dependencies exist. In contrast, MSCCL++ leverages its low-level primitives to facilitate the seamless integration of communication with computation. By segmenting the computation into tiles, MSCCL++ enables the simultaneous pipelining of computation and communication tasks. This approach not only mitigates the communication delay by overlapping processes but also significantly improves throughput by leveraging the low-level API for fine-grained control over the hardware, ensuring optimal efficiency.
|
||||
|
||||
### Implementing customized collective communication algorithms
|
||||
|
||||
MSCCL++ offers a low-level communication API, allowing users to design customized collective communication algorithms. The following code demonstrates how to implement a customized All2All algorithm using MSCCL++.
|
||||
```cpp
|
||||
using DeviceHandle = mscclpp::DeviceHandle<T>;
|
||||
__device__ void localAlltoall(DeviceHandle<mscclpp::PortChannel>* portChans, int rank,
|
||||
int nRanksPerNode, size_t nElements) {
|
||||
int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
|
||||
for (int i = 1; i < nRanksPerNode; i++) {
|
||||
DeviceHandle<mscclpp::PortChannel> portChan = portChans[blockIdx.x];
|
||||
if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank + i) % nRanksPerNode) {
|
||||
portChan.putWithSignalAndFlush(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
|
||||
nElements * sizeof(int));
|
||||
}
|
||||
// wait for the data from GPU (rank-i) % nranksPerNode to arrive
|
||||
if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank - i + nRanksPerNode) % nRanksPerNode) {
|
||||
portChan.wait();
|
||||
}
|
||||
deviceSyncer.sync(nRanksPerNode - 1);
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -1,114 +0,0 @@
|
||||
# MSCCL++ DSL
|
||||
## MSCCLPPLang Introduction
|
||||
MSCCLPPLang is a Python module for writing high-performance communication algorithms. It is designed to be easy to use and efficient, while providing a high-level interface for writing communication algorithms. A MSCCLPPLang program will be compiled to a JSON-based execution plan, which can be executed by the MSCCL++ executor.
|
||||
|
||||
## How to use MSCCLPPLang
|
||||
### Install mscclpp package
|
||||
```bash
|
||||
git clone https://github.com/microsoft/mscclpp.git
|
||||
cd mscclpp
|
||||
pip install .
|
||||
```
|
||||
|
||||
### Import mscclpp language module
|
||||
```python
|
||||
from mscclpp.language import *
|
||||
from mscclpp.language.types import ChannelType, ReplicationPolicy
|
||||
from mscclpp.language.collectives import AllGather
|
||||
|
||||
instances = 1
|
||||
size = gpus
|
||||
collective = AllGather(size, chunk_factor=1, inplace=True)
|
||||
with MSCCLPPProgram(
|
||||
"allgather",
|
||||
collective,
|
||||
size,
|
||||
instances,
|
||||
protocol="Simple",
|
||||
replication_policy=ReplicationPolicy.interleaved,
|
||||
):
|
||||
pass
|
||||
```
|
||||
|
||||
## How MSCCLPPLang Works
|
||||
MSCCLPPLang provides a high-level interface for writing communication algorithms. We treat the communication algorithm as a graph, where the nodes are the data and the edges are the communication operations. The graph is represented as a Python program, which is compiled to a json based execution plan.
|
||||
|
||||
### Core Concepts
|
||||
|
||||
#### MSCCLPPProgram
|
||||
A MSCCLPPProgram provides the context to write MSCCLPPLang program, which can be initialized with `with` statement in Python. Its parameters include:
|
||||
|
||||
- `name`: Name of this program.
|
||||
- `collective`: Collective type of this program, should be from `mscclpp.language.collectives`.
|
||||
- `instances`: Number of parallel instances of this program. Please see the [Instance](#instance) section for more details.
|
||||
- `protocol`: Data transmission protocol used in this program, can be `LL` or `Simple`. Optional, default is `Simple`.
|
||||
- `instr_fusion`: Whether low-level instruction fusion is enabled. Optional, default is `True`.
|
||||
- `replication_policy`: Data replication policy, should be from `mscclpp.language.types.ReplicationPolicy`. Optional, default is `duplicated`. Please see the [Instance](#instance) section for more details.
|
||||
- `num_threads_per_block`: Thread block size. Optional, default is `1024`.
|
||||
- `use_double_scratch_buffer`: Whether requires double scratch buffer during execution. Optional, default is `False`.
|
||||
|
||||
### Collective:
|
||||
A collective is a communication operation that involves multiple GPUs. We provide a set of collective operations for users to utilize. For example, the `AllGather` operation gathers data from all GPUs to all GPUs. To instantiate a collective, the user needs to specify the number of ranks, the chunk factor (how many chunks the input buffer will be split into), and whether the operation is in-place.
|
||||
|
||||
#### Chunk
|
||||
A chunk is a piece of data that is sent between GPUs. It is the basic unit of data in MSCCLPPLang. Chunk can be a piece of data from input buffer, output buffer or intermediate buffer.
|
||||
Example of creating a chunk:
|
||||
```python
|
||||
c = chunk(rank, Buffer.input, index, size)
|
||||
```
|
||||
- rank: the rank of the GPU that the chunk belongs to.
|
||||
- buffer: the buffer that the chunk belongs to. It can be Buffer.input, Buffer.output or Buffer.scratch.
|
||||
- index: the index of the chunk in the buffer.
|
||||
- size: the number of unit chunks.
|
||||
|
||||
Assume we split the input data in the buffer into 4 chunks. On GPU rank 0, we can retrieve the first two chunks (indices 0 and 1) using the following command:
|
||||
```python
|
||||
c = chunk(0, Buffer.input, 0, 2)
|
||||
```
|
||||
|
||||
#### Operation
|
||||
Operations can only be applied to chunks. We provide a set of communication operations for users to use. For example, the `put` operation is used to send data from one GPU to another GPU. The `get` operation is used to receive data from another GPU.
|
||||
|
||||
***Please notice***: MSCCLPPLang only provides one-sided communication operations. The user needs to make sure that the data is ready to be sent or received before calling the communication operations. We also provide `wait/signal` operations to synchronize communication across GPUs.
|
||||
|
||||
#### Channel
|
||||
A channel is a communication channel between two GPUs. It is used to send and receive data between GPUs. We support three types of channels: `ChannelType.memory`, `ChannelType.port` and `ChannelType.nvls`.
|
||||
|
||||
`ChannelType.memory` is used for communication between GPUs on the same node. This channel uses GPU processors to transfer data.
|
||||
|
||||
`ChannelType.port` is used for communication between GPUs, whether they are on different nodes or the same node. This channel will offload the data transfer to CPU processors, which can provide better throughput compared to `ChannelType.memory`. However, this comes at the cost of higher latency compared to `ChannelType.memory`.
|
||||
|
||||
`ChannelType.nvls` is used for communication between GPUs on the same node. This feature offloads the data processing task to the switch, requiring specific hardware support. Refer to the [NVIDIA documentation](https://www.nvidia.com/en-us/data-center/nvlink/) for more details.
|
||||
|
||||
#### Thread Block
|
||||
We can assign operations to a thread block. The thread block is a group of threads that are executed together on the GPU. In the operation function, we can specify the thread block that the operation belongs to via `sendtb` or `recvtb` parameter.
|
||||
|
||||
#### Instance
|
||||
An instance is a parallel execution of the program. For example, if a collective algorithm is designed to run on `n` chunks with `m` thread blocks, setting the instance to 2 will run the algorithm on `2n` chunks with `2m` thread blocks. Several replication policies are supported, including `duplicated` and `interleaved`.
|
||||
- `duplicated`: Each chunk is split into smaller parts based on the number of instances, duplicating the same instructions for all parts. For example, ChunkA is split into ChunkA0 and ChunkA1, while ChunkB is split into ChunkB0 and ChunkB1. Both ChunkA0 and ChunkA1 belong to Instance 0, and both ChunkB0 and ChunkB1 belong to Instance 1.
|
||||
- `interleaved`: Assign chunks to instances in an interleaved manner. For example, ChunkA and ChunkB are split into ChunkA0, ChunkA1, ChunkB0, and ChunkB1. ChunkA0 and ChunkB0 belong to Instance 0, while ChunkA1 and ChunkB1 belong to Instance 1.
|
||||
|
||||
#### Instruction Fusion
|
||||
MSCCLPPLang provides the instruction fusion mechanism to fuse multiple operations into a single kernel. This can reduce the overhead of launching multiple instructions. When users create the MSCCLPPLang program, they can specify the `instr_fusion` parameter to enable the instruction fusion. By default, the instruction fusion is enabled.
|
||||
|
||||
## MSCCLPPLang APIs
|
||||
|
||||
### Basic APIs
|
||||
- `chunk(rank, buffer, index, size)`: create a chunk.
|
||||
- `put(self, dst, buffer, index, sendtb, chan_type)`: send the data from one GPU to another GPU. User can specify the index of the chunk in the destination buffer, the sendtb and the channel type.
|
||||
- `get(self, src, buffer, index, recvtb, chan_type)`: receive the data from another GPU. User can specify the index of the chunk in the destination buffer, the recvtb and the channel type.
|
||||
- `signal(self, dst, buffer, index, sendtb, chan_type)`: send a signal to another GPU.
|
||||
- `wait(self, src, buffer, index, recvtb, chan_type)`: wait for a signal from another GPU.
|
||||
- `flush(self, dst, buffer, index, sendtb, chan_type)`: flush the data in the buffer to the destination GPU. This is used to make sure the data is sent to the destination GPU.
|
||||
- `copy(self, dst, buffer, index, sendtb)`: copy the data from one buffer to another buffer in the same GPU.
|
||||
- `reduce(self, other_chunkref, recvtb, channel_type)`: Reduces the chunk(s) referenced by other_chunkref into the chunk(s) referenced by this chunkref
|
||||
|
||||
### Packet APIs
|
||||
Packet APIs are used when the user wants to use the LL algorithm. The packet APIs are similar to the basic APIs, but they pack the data and flags into a packet and send the packet to the destination GPU. The destination GPU will unpack the packet to get the data and flags. Therefore, no synchronization is needed when using packet APIs. (`ChannelType.nvls` does not support packet APIs)
|
||||
- `packet_put(self, dst, buffer, index, sendtb, chan_type)`: send the data from one GPU to another GPU using packet.
|
||||
- `copy_packet(self, dst, buffer, index, sendtb)`: copy the data from one buffer to another buffer in the same GPU using packet.
|
||||
- `reduce_packet(self, other_chunkref, recvtb)`: Reduces the chunk(s) referenced by other_chunkref into the chunk(s) referenced by this chunkref using packet.
|
||||
|
||||
|
||||
### Examples
|
||||
We provide several examples demonstrating how to use the MSCCL++ DSL to write communication collective algorithms. For more details, please refer to the [examples](https://github.com/microsoft/mscclpp/tree/main/python/examples) folder.
|
||||
@@ -1,67 +0,0 @@
|
||||
# NCCL Over MSCCL++
|
||||
|
||||
(limitations)=
|
||||
## Limitations
|
||||
|
||||
Current NCCL over MSCCL++ has a few limitations.
|
||||
|
||||
* We do not cover all APIs yet. See the [API Support Table](#api-support-table) for details.
|
||||
* Multi-node communication is not supported yet.
|
||||
* Currently, collective communication functions may not work correctly if the buffer address differs from that of previous function calls while sharing the same base address (returned by [cuMemGetAddressRange](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g64fee5711274a2a0573a789c94d8299b)) as the previous address. This is because the current implementation performs zero-copy communication over user buffers, and it is difficult to efficiently inform all ranks if the buffer address dynamically changes.
|
||||
|
||||
(api-support-table)=
|
||||
## API Support Table
|
||||
|
||||
The table below lists all NCCL APIs (v2.21). We may cover more APIs in the future.
|
||||
|
||||
| API Name | Supported |
|
||||
| :----------------------- | :-------: |
|
||||
| ncclGetLastError | X |
|
||||
| ncclGetErrorString | O |
|
||||
| ncclGetVersion | O |
|
||||
| ncclGetUniqueId | O |
|
||||
| ncclCommInitRank | O |
|
||||
| ncclCommInitAll | X |
|
||||
| ncclCommInitRankConfig | X |
|
||||
| ncclCommSplit | X |
|
||||
| ncclCommFinalize | O |
|
||||
| ncclCommDestroy | O |
|
||||
| ncclCommAbort | X |
|
||||
| ncclCommGetAsyncError | O |
|
||||
| ncclCommCount | O |
|
||||
| ncclCommCuDevice | O |
|
||||
| ncclCommUserRank | O |
|
||||
| ncclCommRegister | X |
|
||||
| ncclCommDeregister | X |
|
||||
| ncclMemAlloc | X |
|
||||
| ncclMemFree | X |
|
||||
| ncclAllReduce | O |
|
||||
| ncclBroadcast | X |
|
||||
| ncclReduce | X |
|
||||
| ncclAllGather | O |
|
||||
| ncclReduceScatter | X |
|
||||
| ncclGroupStart | O |
|
||||
| ncclGroupEnd | O |
|
||||
| ncclSend | X |
|
||||
| ncclRecv | X |
|
||||
| ncclRedOpCreatePreMulSum | X |
|
||||
| ncclRedOpDestroy | X |
|
||||
|
||||
## Executor Support
|
||||
|
||||
The executor is a versatile tool designed to specify how mscclpp executes algorithms. Currently, only the allReduce operation allows for algorithm customization. The following environment variables can be managed:
|
||||
|
||||
- MSCCLPP_EXECUTION_PLAN_DIR: Specifies the directory where the executor will look for JSON files.
|
||||
|
||||
```{figure} ../figs/size_boundary_diagram.png
|
||||
:name: MSCCL++ Abstractions
|
||||
:alt: MSCCL++ Abstractions
|
||||
:align: center
|
||||
|
||||
Decision Flowchart for Message Size-Based Algorithm Execution
|
||||
```
|
||||
|
||||
This is an example of executing the interface with the executor:
|
||||
``` bash
|
||||
mpirun -np 8 -x MSCCLPP_EXECUTION_PLAN_DIR=/root/azure-mscclpp/nccl/test/execution-files ./apps/nccl/test/nccl_api_test
|
||||
```
|
||||
@@ -1,210 +0,0 @@
|
||||
# Quick Start
|
||||
|
||||
## Prerequisites
|
||||
|
||||
* Azure SKUs
|
||||
* [ND_A100_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/nda100-v4-series)
|
||||
* [NDm_A100_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series)
|
||||
* [ND_H100_v5](https://learn.microsoft.com/en-us/azure/virtual-machines/nd-h100-v5-series)
|
||||
* [NC_A100_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/nc-a100-v4-series) (TBD)
|
||||
* Non-Azure Systems
|
||||
* NVIDIA A100 GPUs + CUDA >= 11.8
|
||||
* NVIDIA H100 GPUs + CUDA >= 12.0
|
||||
* AMD MI250X GPUs + ROCm >= 5.7
|
||||
* AMD MI300X GPUs + ROCm >= 6.0
|
||||
* OS: tested over Ubuntu 18.04 and 20.04
|
||||
* Libraries
|
||||
* [libnuma](https://github.com/numactl/numactl)
|
||||
```bash
|
||||
sudo apt-get install libnuma-dev
|
||||
```
|
||||
* (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.8 and Python Development Package
|
||||
```bash
|
||||
sudo apt-get satisfy "python3 (>=3.8), python3-dev (>=3.8)"
|
||||
```
|
||||
If you don't want to build Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source (Libraries and Headers)](#install-from-source-libraries-and-headers)).
|
||||
* (Optional, for benchmarks) MPI
|
||||
* Others
|
||||
* For NVIDIA platforms, `nvidia_peermem` driver should be loaded on all nodes. Check it via:
|
||||
```
|
||||
lsmod | grep nvidia_peermem
|
||||
```
|
||||
* For GPU with nvls support, we require the kernel version to be 5.6 or above.
|
||||
|
||||
## Build with Docker Images
|
||||
|
||||
We provide docker images which package all prerequisites for MSCCL++. You can setup your dev environment with the following command.
|
||||
|
||||
```bash
|
||||
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 bash
|
||||
```
|
||||
|
||||
See all available images [here](https://github.com/microsoft/mscclpp/pkgs/container/mscclpp%2Fmscclpp).
|
||||
|
||||
(build-from-source)=
|
||||
## Build from Source
|
||||
|
||||
CMake 3.25 or later is required.
|
||||
|
||||
```bash
|
||||
$ git clone https://github.com/microsoft/mscclpp.git
|
||||
$ mkdir -p mscclpp/build && cd mscclpp/build
|
||||
```
|
||||
|
||||
For NVIDIA platforms, build MSCCL++ as follows.
|
||||
|
||||
```bash
|
||||
# For NVIDIA platforms
|
||||
$ cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
$ make -j
|
||||
```
|
||||
|
||||
For AMD platforms, use HIPCC instead of the default C++ compiler. Replace `/path/to/hipcc` in the command below with your HIPCC path.
|
||||
|
||||
```bash
|
||||
# For AMD platforms
|
||||
$ CXX=/path/to/hipcc cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
$ make -j
|
||||
```
|
||||
|
||||
(install-from-source-libraries-and-headers)=
|
||||
## Install from Source (Libraries and Headers)
|
||||
|
||||
```bash
|
||||
# Install the generated headers and binaries to /usr/local/mscclpp
|
||||
$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local/mscclpp -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF ..
|
||||
$ make -j mscclpp mscclpp_static
|
||||
$ sudo make install/fast
|
||||
```
|
||||
|
||||
(install-from-source-python-module)=
|
||||
## Install from Source (Python Module)
|
||||
|
||||
Python 3.8 or later is required.
|
||||
|
||||
```bash
|
||||
# For NVIDIA platforms
|
||||
$ python -m pip install .
|
||||
# For AMD platforms
|
||||
$ CXX=/path/to/hipcc python -m pip install .
|
||||
```
|
||||
|
||||
## Docker Images
|
||||
|
||||
Our base image installs all prerequisites for MSCCL++.
|
||||
|
||||
```bash
|
||||
$ docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.3
|
||||
```
|
||||
|
||||
See all available images [here](https://github.com/microsoft/mscclpp/pkgs/container/mscclpp%2Fmscclpp).
|
||||
|
||||
## Unit Tests
|
||||
|
||||
`unit_tests` require one GPU on the system. It only tests operation of basic components.
|
||||
|
||||
```bash
|
||||
$ make -j unit_tests
|
||||
$ ./test/unit_tests
|
||||
```
|
||||
|
||||
For thorough testing of MSCCL++ features, we need to use `mp_unit_tests` that require at least two GPUs on the system. `mp_unit_tests` also requires MPI to be installed on the system. For example, the following commands compile and run `mp_unit_tests` with two processes (two GPUs). The number of GPUs can be changed by changing the number of processes.
|
||||
|
||||
```bash
|
||||
$ make -j mp_unit_tests
|
||||
$ mpirun -np 2 ./test/mp_unit_tests
|
||||
```
|
||||
|
||||
To run `mp_unit_tests` with more than two nodes, you need to specify the `-ip_port` argument that is accessible from all nodes. For example:
|
||||
|
||||
```bash
|
||||
$ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.0.0.5:50000
|
||||
```
|
||||
|
||||
## Performance Benchmark
|
||||
|
||||
### Python Benchmark
|
||||
|
||||
[Install the MSCCL++ Python package](#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.
|
||||
|
||||
```bash
|
||||
# Choose `requirements_*.txt` according to your CUDA/ROCm version.
|
||||
$ python3 -m pip install -r ./python/requirements_cuda12.txt
|
||||
$ mpirun -tag-output -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
|
||||
```
|
||||
|
||||
## NCCL over MSCCL++
|
||||
|
||||
We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) APIs using MSCCL++. How to use:
|
||||
|
||||
1. [Build MSCCL++ from source](#build-from-source).
|
||||
2. Replace your `libnccl.so` library with `libmscclpp_nccl.so`, which is compiled under `./build/apps/nccl/` directory.
|
||||
|
||||
For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
|
||||
|
||||
```bash
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
|
||||
```
|
||||
|
||||
If MSCCL++ is built on AMD platforms, `libmscclpp_nccl.so` would replace the [RCCL](https://github.com/ROCm/rccl) library (i.e., `librccl.so`).
|
||||
|
||||
See limitations of the current NCCL over MSCCL++ from [here](../design/nccl-over-mscclpp.md#limitations).
|
||||
|
||||
MSCCL++ also supports fallback to NCCL/RCCL collectives by adding following environment variables.
|
||||
-x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE
|
||||
-x MSCCLPP_NCCL_LIB_PATH=/path_to_nccl_lib/libnccl.so or /path_to_rccl_lib/librccl.so (AMD platforms)
|
||||
-x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="list of collective name[s]"
|
||||
|
||||
The value "list of collective name[s]" can be a combination of collectives, such as "allgather," "allreduce," "broadcast," and "reducescatter." Alternatively, it can simply be set to "all" to enable fallback for all these collectives.
|
||||
By default, if the parameter MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION is not specified, "all" will be applied.
|
||||
|
||||
Example 1, Allreduce will fallback to NCCL ncclAllReduce since allreduce is in the fallback list.
|
||||
```bash
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
|
||||
```
|
||||
|
||||
Example 2, ReduceScatter will still use the MSCCL++ implementation since reducescatter is not in the fallback list.
|
||||
```bash
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/$PATH_TO_EXECUTION_PLANS/execution-files ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
|
||||
```
|
||||
|
||||
On AMD platforms, you need to add RCCL_MSCCL_ENABLE=0 to avoid conflicts with the fallback features.
|
||||
|
||||
### C++ Benchmark (mscclpp-test, *Deprecated*)
|
||||
|
||||
*NOTE: mscclpp-test is retired and maintained only as an example of C++ implementation. If you want to get the latest performance numbers, please use the Python benchmark or the NCCL APIs instead.*
|
||||
|
||||
mscclpp-test is a set of C++ performance benchmarks. It requires MPI on the system, and the path should be provided via `MPI_HOME` environment variable to the CMake build system.
|
||||
|
||||
```bash
|
||||
$ MPI_HOME=/path/to/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
$ make -j allgather_test_perf allreduce_test_perf
|
||||
```
|
||||
|
||||
For example, the following command runs the `allreduce5` algorithm with 8 GPUs starting from 3MB to 48MB messages, by doubling the message size in between. You can try different algorithms by changing the `-k 5` option to another value (e.g., `-k 3` runs `allreduce3`). Check all algorithms from the code: [allreduce_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allreduce_test.cu) and [allgather_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allgather_test.cu).
|
||||
|
||||
```bash
|
||||
$ mpirun --bind-to numa -np 8 ./test/mscclpp-test/allreduce_test_perf -b 3m -e 48m -G 100 -n 100 -w 20 -f 2 -k 5
|
||||
```
|
||||
|
||||
*NOTE: a few algorithms set a condition on the total data size, such as to be a multiple of 3. If the condition is unmet, the command will throw a corresponding error.*
|
||||
|
||||
Check the help message for more details.
|
||||
|
||||
```bash
|
||||
$ ./test/mscclpp-test/allreduce_test_perf --help
|
||||
USAGE: allreduce_test_perf
|
||||
[-b,--minbytes <min size in bytes>]
|
||||
[-e,--maxbytes <max size in bytes>]
|
||||
[-i,--stepbytes <increment size>]
|
||||
[-f,--stepfactor <increment factor>]
|
||||
[-n,--iters <iteration count>]
|
||||
[-w,--warmup_iters <warmup iteration count>]
|
||||
[-c,--check <0/1>]
|
||||
[-T,--timeout <time in seconds>]
|
||||
[-G,--cudagraph <num graph launches>]
|
||||
[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>]
|
||||
[-k,--kernel_num <kernel number of commnication primitive>]
|
||||
[-o, --output_file <output file name>]
|
||||
[-h,--help]
|
||||
```
|
||||
@@ -1 +0,0 @@
|
||||
# Customize the Proxy Service
|
||||
@@ -1,16 +0,0 @@
|
||||
Tutorials
|
||||
----------
|
||||
|
||||
This tutorial section provides a step-by-step guide to help you get started with the C++/Python API.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Tutorials
|
||||
:hidden:
|
||||
|
||||
initialization
|
||||
port-channel
|
||||
memory-channel
|
||||
packet-api
|
||||
customized-proxy-service
|
||||
python-api
|
||||
@@ -1,68 +0,0 @@
|
||||
# Communication initialization with the MSCCL++ API
|
||||
|
||||
In this tutorial, you will write a simple program to initialize communication between eight GPUs using MSCCL++ C++ API. You will also learn how to use the Python API to initialize communication.
|
||||
|
||||
## Prerequisites
|
||||
A system with eight GPUs is required to run this tutorial.
|
||||
|
||||
Also make sure that you have installed MSCCL++ on your system. If not, please follow the [quick start](../quickstart.md).
|
||||
|
||||
## Initialize Communication with C++ API
|
||||
We will set up a mesh topology with eight GPUs. Each GPU will be connected to its neighbors. The following code shows how to initialize communication with the MSCCL++ C++ API.
|
||||
|
||||
```cpp
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
#include <mscclpp/port_channel.hpp>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
template <class T>
|
||||
using DeviceHandle = mscclpp::DeviceHandle<T>;
|
||||
__constant__ DeviceHandle<mscclpp::PortChannel> constPortChans[8];
|
||||
|
||||
void setupMeshTopology(int rank, int worldsize, void* data, size_t dataSize) {
|
||||
std::string ip_port = "10.0.0.4:50000";
|
||||
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, worldsize);
|
||||
bootstrap->initialize(ip_port);
|
||||
mscclpp::Communicator comm(bootstrap);
|
||||
mscclpp::ProxyService proxyService;
|
||||
|
||||
std::vector<mscclpp::SemaphoreId> semaphoreIds;
|
||||
std::vector<mscclpp::RegisteredMemory> localMemories;
|
||||
std::vector<std::shared_future<std::shared_ptr<mscclpp::Connection>>> connections(world_size);
|
||||
std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteMemories;
|
||||
|
||||
for (int r = 0; r < world_size; ++r) {
|
||||
if (r == rank) continue;
|
||||
mscclpp::Transport transport = mscclpp::Transport::CudaIpc;
|
||||
// Connect with all other ranks
|
||||
connections[r] = comm.connect(transport, r);
|
||||
auto memory = comm.registerMemory(data, dataSize, mscclpp::Transport::CudaIpc | ibTransport);
|
||||
localMemories.push_back(memory);
|
||||
comm.sendMemory(memory, r);
|
||||
remoteMemories.push_back(comm.recvMemory(r));
|
||||
}
|
||||
|
||||
for (int r = 0; r < world_size; ++r) {
|
||||
if (r == rank) continue;
|
||||
auto sema = communicator->buildSemaphore(connections[r].get(), r).get();
|
||||
semaphoreIds.push_back(proxyService->addSemaphore(sema));
|
||||
}
|
||||
|
||||
std::vector<DeviceHandle<mscclpp::PortChannel>> portChannels;
|
||||
for (size_t i = 0; i < semaphoreIds.size(); ++i) {
|
||||
portChannels.push_back(mscclpp::deviceHandle(mscclpp::PortChannel(
|
||||
proxyService.portChannel(semaphoreIds[i]), proxyService.addMemory(remoteMemories[i].get()),
|
||||
proxyService.addMemory(localMemories[i]))));
|
||||
}
|
||||
|
||||
if (portChannels.size() > sizeof(constPortChans) / sizeof(DeviceHandle<mscclpp::PortChannel>)) {
|
||||
std::runtime_error("unexpected error");
|
||||
}
|
||||
CUDACHECK(cudaMemcpyToSymbol(constPortChans, portChannels.data(),
|
||||
sizeof(DeviceHandle<mscclpp::PortChannel>) * portChannels.size()));
|
||||
}
|
||||
```
|
||||
@@ -1,3 +0,0 @@
|
||||
# Using MemoryChannel for Intra-Node Communication
|
||||
|
||||
TBU
|
||||
@@ -1 +0,0 @@
|
||||
# Packet API for latency sensitive applications
|
||||
@@ -1,3 +0,0 @@
|
||||
# Offload communication to CPU with PortChannel
|
||||
|
||||
TBU
|
||||
3
docs/guide/advanced-connections.md
Normal file
3
docs/guide/advanced-connections.md
Normal file
@@ -0,0 +1,3 @@
|
||||
# Advanced Connections
|
||||
|
||||
TBU
|
||||
45
docs/guide/cpp-examples.md
Normal file
45
docs/guide/cpp-examples.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# C++ Examples
|
||||
|
||||
## Tutorials
|
||||
|
||||
Check out our [tutorials](../tutorials) for step-by-step guides on using MSCCL++.
|
||||
|
||||
(mscclpp-test)=
|
||||
## mscclpp-test
|
||||
|
||||
*NOTE: mscclpp-test is NOT a performance benchmark. If you want to get the latest performance numbers, please use the Python benchmark or the NCCL APIs instead.*
|
||||
|
||||
mscclpp-test is a set of C++ implementation examples. It requires MPI on the system, and the path should be provided via `MPI_HOME` environment variable to the CMake build system.
|
||||
|
||||
```bash
|
||||
$ MPI_HOME=/path/to/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
$ make -j allgather_test_perf allreduce_test_perf
|
||||
```
|
||||
|
||||
For example, the following command runs the `allreduce5` algorithm with 8 GPUs starting from 3MB to 48MB messages, by doubling the message size in between. You can try different algorithms by changing the `-k 5` option to another value (e.g., `-k 3` runs `allreduce3`). Check all algorithms from the code: [allreduce_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allreduce_test.cu) and [allgather_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allgather_test.cu).
|
||||
|
||||
```bash
|
||||
$ mpirun --bind-to numa -np 8 ./test/mscclpp-test/allreduce_test_perf -b 3m -e 48m -G 100 -n 100 -w 20 -f 2 -k 5
|
||||
```
|
||||
|
||||
*NOTE: a few algorithms set a condition on the total data size, such as to be a multiple of 3. If the condition is unmet, the command will throw a corresponding error.*
|
||||
|
||||
Check the help message for more details.
|
||||
|
||||
```bash
|
||||
$ ./test/mscclpp-test/allreduce_test_perf --help
|
||||
USAGE: allreduce_test_perf
|
||||
[-b,--minbytes <min size in bytes>]
|
||||
[-e,--maxbytes <max size in bytes>]
|
||||
[-i,--stepbytes <increment size>]
|
||||
[-f,--stepfactor <increment factor>]
|
||||
[-n,--iters <iteration count>]
|
||||
[-w,--warmup_iters <warmup iteration count>]
|
||||
[-c,--check <0/1>]
|
||||
[-T,--timeout <time in seconds>]
|
||||
[-G,--cudagraph <num graph launches>]
|
||||
[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>]
|
||||
[-k,--kernel_num <kernel number of commnication primitive>]
|
||||
[-o, --output_file <output file name>]
|
||||
[-h,--help]
|
||||
```
|
||||
12
docs/guide/memory-management.md
Normal file
12
docs/guide/memory-management.md
Normal file
@@ -0,0 +1,12 @@
|
||||
# Memory Management
|
||||
|
||||
The MSCCL++ stack handles most of the resource management automatically, so users don't need to explicitly store or destroy objects constructed by MSCCL++ APIs in most cases. For example:
|
||||
* The `Context` object will be alive iff itself or any Connections created by it are alive.
|
||||
* The `Connection` object will be alive iff itself or any SemaphoreStubs created from it are alive.
|
||||
* The `SemaphoreStub` object will be alive iff itself or any Semaphores created from it are alive.
|
||||
* The `Semaphore` object will be alive iff itself or any Channels created from it are alive.
|
||||
|
||||
However, there are still a few things put on **users' responsibility**:
|
||||
* `RegisteredMemory` does not own the memory region it represents. Users need to ensure that the memory region is valid and accessible for the lifetime of the `RegisteredMemory` object.
|
||||
* The objects that are serialized and sent to other processes (like `Endpoint`, `SemaphoreStub`, and `RegisteredMemory`) should be kept alive until the remote endpoint has finished using them.
|
||||
* The channel objects on the host should be kept alive until the GPU kernels are finished using the device handles.
|
||||
@@ -3,65 +3,23 @@
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
Welcome to MSCCL++'s documentation!
|
||||
===================================
|
||||
MSCCL++ Documentation
|
||||
=====================
|
||||
|
||||
MSCCL++ is a GPU-driven communication stack for scalable AI applications. It is designed to provide a high-performance, scalable, and customizable communication stack for distributed GPU applications.
|
||||
You can find the following in this documentation.
|
||||
|
||||
Getting Started
|
||||
---------------
|
||||
- Follow the :doc:`quick start <getting-started/quickstart>` for your platform of choice.
|
||||
- Take a look at the :doc:`tutorials <getting-started/tutorials/index>` to learn how to write your first mscclpp program.
|
||||
- **Overview:** An overview of MSCCL++ and its features. :doc:`🔗 <overview>`
|
||||
- **Quick Start:** A guide to build, install, and run MSCCL++. :doc:`🔗 <quickstart>`
|
||||
- **Tutorials:** A step-by-step guide for GPU communication using MSCCL++. :doc:`🔗 <tutorials>`
|
||||
- **Programming Guide:** Advanced topics and best practices for using MSCCL++. :doc:`🔗 <programming_guide>`
|
||||
- **C++ API Reference:** Detailed documentation of the MSCCL++ C++ API. :doc:`🔗 <cpp_api>`
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Getting Started
|
||||
:hidden:
|
||||
|
||||
getting-started/quickstart
|
||||
getting-started/tutorials/index
|
||||
|
||||
Design
|
||||
-------
|
||||
- :doc:`Design <design/design>` doc for those who want to understand the internals of MSCCL++.
|
||||
- :doc:`NCCL over MSCCL++ <design/nccl-over-mscclpp>` doc for those who want to understand how to use NCCL over MSCCL++.
|
||||
- :doc:`MSCCL++ DSL <design/mscclpp-dsl>` doc for those who want to understand the MSCCL++ DSL.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Design
|
||||
:hidden:
|
||||
|
||||
design/design
|
||||
design/nccl-over-mscclpp
|
||||
design/mscclpp-dsl
|
||||
|
||||
Performance
|
||||
---------------
|
||||
- We evaluate the performance of MSCCL++ in A100 and H100. Here are some :doc:`performance results <performance/performance-ndmv4>` for all-reduce operations.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Performance
|
||||
:hidden:
|
||||
|
||||
performance/performance-ndmv4
|
||||
|
||||
C++ API
|
||||
---------------
|
||||
- :doc:`mscclpp <api/index>`
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: C++ API
|
||||
:hidden:
|
||||
|
||||
api/index
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
* :ref:`genindex`
|
||||
* :ref:`modindex`
|
||||
* :ref:`search`
|
||||
overview
|
||||
quickstart
|
||||
tutorials
|
||||
programming_guide
|
||||
cpp_api
|
||||
|
||||
8
docs/overview.md
Normal file
8
docs/overview.md
Normal file
@@ -0,0 +1,8 @@
|
||||
# MSCCL++ Overview
|
||||
|
||||
MSCCL++ (Microsoft Collective Communication Library ++, pronounced *em-sickle-plus-plus*) is a GPU communication library that provides **multiple levels of abstraction** for writing high-performance distributed GPU applications.
|
||||
|
||||
- **Primitive API:** At the lowest level, MSCCL++ provides boilerplate-free C++ API (which we call *primitives*) for writing highly flexible GPU communication kernels.
|
||||
- **DSL API:** Over the primitive layer, MSCCL++ provides a Python-based domain-specific language (DSL) that helps users quickly develop large-scale collective communication algorithms.
|
||||
- **NCCL API:** At the highest level, MSCCL++ reimplements the NCCL API, allowing users to replace NCCL with MSCCL++ in their existing applications without any code changes.
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
# NDmv4 Performance
|
||||
|
||||
TBU
|
||||
13
docs/programming_guide.rst
Normal file
13
docs/programming_guide.rst
Normal file
@@ -0,0 +1,13 @@
|
||||
Programming Guide
|
||||
-----------------
|
||||
|
||||
This section provides advanced topics and best practices for using MSCCL++. It is designed for users who are already familiar with the basics of MSCCL++ and want to deepen their understanding or optimize their usage.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Programming Guide
|
||||
:hidden:
|
||||
|
||||
guide/memory-management
|
||||
guide/advanced-connections
|
||||
guide/cpp-examples
|
||||
202
docs/quickstart.md
Normal file
202
docs/quickstart.md
Normal file
@@ -0,0 +1,202 @@
|
||||
# Quick Start
|
||||
|
||||
(prerequisites)=
|
||||
## Prerequisites
|
||||
|
||||
* GPUs
|
||||
* NVIDIA CUDA architecture 7.0 (Volta) or later, or AMD CDNA 2 architecture (GFX90a) or later are required. Features are more thoroughly tested on CUDA architecture 8.0 (Ampere) or later and AMD CDNA 3 architecture (GFX942) or later.
|
||||
* A part of the features require GPUs to be connected peer-to-peer (through NVLink/xGMI or under the same PCIe switch).
|
||||
* On NVIDIA platforms, check the connectivity via `nvidia-smi topo -m`. If the output shows `NV#` or `PIX`, it means the GPUs are connected peer-to-peer.
|
||||
* On AMD platforms, check the connectivity via `rocm-smi --showtopohops`. If the output shows `1`, it means the GPUs are connected peer-to-peer.
|
||||
* Below are example systems that meet the requirements:
|
||||
* Azure SKUs
|
||||
* [ND_A100_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/nda100-v4-series)
|
||||
* [NDm_A100_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series)
|
||||
* [ND_H100_v5](https://learn.microsoft.com/en-us/azure/virtual-machines/nd-h100-v5-series)
|
||||
* Non-Azure Systems
|
||||
* NVIDIA A100 GPUs + CUDA >= 11.8
|
||||
* NVIDIA H100 GPUs + CUDA >= 12.0
|
||||
* AMD MI250X GPUs + ROCm >= 5.7
|
||||
* AMD MI300X GPUs + ROCm >= 6.0
|
||||
* OS
|
||||
* Tested on Ubuntu 18.04 and later
|
||||
* Libraries
|
||||
* [libnuma](https://github.com/numactl/numactl)
|
||||
```bash
|
||||
sudo apt-get install libnuma-dev
|
||||
```
|
||||
* (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.8 and Python Development Package
|
||||
```bash
|
||||
sudo apt-get satisfy "python3 (>=3.8), python3-dev (>=3.8)"
|
||||
```
|
||||
If you don't want to build Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)).
|
||||
* (Optional, for benchmarks) MPI
|
||||
* Others
|
||||
* For NVIDIA platforms, `nvidia_peermem` driver should be loaded on all nodes. Check it via:
|
||||
```bash
|
||||
lsmod | grep nvidia_peermem
|
||||
```
|
||||
* For NVLink SHARP (NVLS) support on NVIDIA platforms, the Linux kernel version should be 5.6 or above.
|
||||
|
||||
(docker-images)=
|
||||
## Docker Images
|
||||
|
||||
We provide docker images which package all prerequisites for MSCCL++. You can setup your dev environment with the following command. Note that our docker images don't contain MSCCL++ by default, so you need to build it from source inside the container (see [Install from Source](#install-from-source) below).
|
||||
|
||||
```bash
|
||||
# For NVIDIA platforms
|
||||
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8 bash
|
||||
# For AMD platforms
|
||||
$ docker run -it --privileged --net=host --ipc=host --security-opt=seccomp=unconfined --group-add=video --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 bash
|
||||
```
|
||||
|
||||
See all available images [here](https://github.com/microsoft/mscclpp/pkgs/container/mscclpp%2Fmscclpp).
|
||||
|
||||
(install-from-source)=
|
||||
## Install from Source
|
||||
|
||||
If you want to install only the Python module, you can skip this section and go to [Install from Source (Python Module)](#install-from-source-python-module).
|
||||
|
||||
CMake 3.25 or later is required.
|
||||
|
||||
```bash
|
||||
$ git clone https://github.com/microsoft/mscclpp.git
|
||||
$ mkdir -p mscclpp/build && cd mscclpp/build
|
||||
```
|
||||
|
||||
For NVIDIA platforms, build MSCCL++ as follows. Replace `/usr` with your desired installation path.
|
||||
|
||||
```bash
|
||||
# For NVIDIA platforms
|
||||
$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr ..
|
||||
$ make -j$(nproc)
|
||||
```
|
||||
|
||||
For AMD platforms, use HIPCC instead of the default C++ compiler. The HIPCC path is usually `/opt/rocm/bin/hipcc` in official ROCm installations. If the path is different in your environment, please change it accordingly.
|
||||
|
||||
```bash
|
||||
# For AMD platforms
|
||||
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr ..
|
||||
$ make -j$(nproc)
|
||||
```
|
||||
|
||||
After build succeeds, install the headers and binaries.
|
||||
|
||||
```bash
|
||||
$ sudo make install
|
||||
```
|
||||
|
||||
```{tip}
|
||||
There are a few optional CMake options you can set:
|
||||
- `-DMSCCLPP_GPU_ARCHS=<arch-list>`: Specify the GPU architectures to build for. For example, `-DMSCCLPP_GPU_ARCHS="80,90"` for NVIDIA A100 and H100 GPUs, `-DMSCCLPP_GPU_ARCHS=gfx942` for AMD MI300x GPU.
|
||||
- `-DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON`: If the build environment doesn't have GPUs and only has CUDA installed, you can set these options to bypass GPU checks and use CUDA APIs. This is useful for building on CI systems or environments without GPUs.
|
||||
- `-DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON`: If the build environment doesn't have GPUs and only has ROCm installed, you can set these options to bypass GPU checks and use ROCm APIs.
|
||||
- `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF`: Don't build the Python module.
|
||||
- `-DMSCCLPP_BUILD_TESTS=OFF`: Don't build the tests.
|
||||
- `-DMSCCLPP_BUILD_APPS_NCCL=OFF`: Don't build the NCCL API.
|
||||
```
|
||||
|
||||
(install-from-source-python-module)=
|
||||
## Install from Source (Python Module)
|
||||
|
||||
Python 3.8 or later is required.
|
||||
|
||||
```bash
|
||||
# For NVIDIA platforms
|
||||
$ python -m pip install .
|
||||
# For AMD platforms, set the C++ compiler to HIPCC
|
||||
$ CXX=/opt/rocm/bin/hipcc python -m pip install .
|
||||
```
|
||||
|
||||
(vscode-dev-container)=
|
||||
## VSCode Dev Container
|
||||
|
||||
If you are using VSCode, you can use our VSCode Dev Container that automatically launches a development environment and installs MSCCL++ in it. Steps to use our VSCode Dev Container:
|
||||
|
||||
1. Open the MSCCL++ repository in VSCode.
|
||||
2. Make sure your Docker is running.
|
||||
3. Make sure you have the [Dev Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) installed in VSCode.
|
||||
4. Open the command palette with `Ctrl`+`Shift`+`P` and select
|
||||
`Dev Containers: Rebuild and Reopen in Container`.
|
||||
5. Wait for the container to build and open (may take a few minutes).
|
||||
|
||||
```{note}
|
||||
- Our Dev Container is set up for NVIDIA GPUs by default. If you are using AMD GPUs, you need to copy [`devcontainer_amd.json`](https://github.com/microsoft/mscclpp/blob/main/.devcontainer/devcontainer_amd.json) to [`devcontainer.json`](https://github.com/microsoft/mscclpp/blob/main/.devcontainer/devcontainer.json).
|
||||
- Our Dev Container runs an SSH server over the host network and the port number is `22345` by default. You can change the port number by modifying the `SSH_PORT` argument in the [`devcontainer.json`](https://github.com/microsoft/mscclpp/blob/main/.devcontainer/devcontainer.json) file.
|
||||
- Our Dev Container uses a non-root user `devuser` by default, but note that you may need the root privileges to enable all hardware features of the GPUs inside the container. `devuser` is already configured to have `sudo` privileges without a password.
|
||||
```
|
||||
|
||||
For more details on how to use the Dev Container, see the [Dev Containers tutorial](https://code.visualstudio.com/docs/devcontainers/tutorial).
|
||||
|
||||
## Unit Tests
|
||||
|
||||
`unit_tests` require one GPU on the system. It only tests operation of basic components.
|
||||
|
||||
```bash
|
||||
$ make -j unit_tests
|
||||
$ ./test/unit_tests
|
||||
```
|
||||
|
||||
For thorough testing of MSCCL++ features, we need to use `mp_unit_tests` that require at least two GPUs on the system. `mp_unit_tests` also requires MPI to be installed on the system. For example, the following commands compile and run `mp_unit_tests` with two processes (two GPUs). The number of GPUs can be changed by changing the number of processes.
|
||||
|
||||
```bash
|
||||
$ make -j mp_unit_tests
|
||||
$ mpirun -np 2 ./test/mp_unit_tests
|
||||
```
|
||||
|
||||
To run `mp_unit_tests` with more than two nodes, you need to specify the `-ip_port` argument that is accessible from all nodes. For example:
|
||||
|
||||
```bash
|
||||
$ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.0.0.5:50000
|
||||
```
|
||||
|
||||
## Performance Benchmark
|
||||
|
||||
### Python Benchmark
|
||||
|
||||
[Install the MSCCL++ Python package](#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.
|
||||
|
||||
```bash
|
||||
# Choose `requirements_*.txt` according to your CUDA/ROCm version.
|
||||
$ python3 -m pip install -r ./python/requirements_cuda12.txt
|
||||
$ mpirun -tag-output -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
|
||||
```
|
||||
|
||||
### NCCL/RCCL Benchmark over MSCCL++
|
||||
|
||||
We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) APIs using MSCCL++. How to use:
|
||||
|
||||
1. [Build MSCCL++ from source](#install-from-source).
|
||||
2. Replace your `libnccl.so` library with `libmscclpp_nccl.so`, which is compiled under `./build/apps/nccl/` directory.
|
||||
|
||||
For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
|
||||
|
||||
```bash
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
|
||||
```
|
||||
|
||||
If MSCCL++ is built on AMD platforms, `libmscclpp_nccl.so` would replace the [RCCL](https://github.com/ROCm/rccl) library (i.e., `librccl.so`).
|
||||
|
||||
See limitations of the current NCCL over MSCCL++ from [here](design/nccl-over-mscclpp.md#limitations).
|
||||
|
||||
MSCCL++ also supports fallback to NCCL/RCCL collectives by adding following environment variables.
|
||||
```bash
|
||||
-x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE
|
||||
-x MSCCLPP_NCCL_LIB_PATH=/path_to_nccl_lib/libnccl.so (or /path_to_rccl_lib/librccl.so for AMD platforms)
|
||||
-x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="list of collective name[s]"
|
||||
```
|
||||
|
||||
The value `"list of collective name[s]"` can be a combination of collectives, such as `"allgather"`, `"allreduce"`, `"broadcast"`, and `"reducescatter"`. Alternatively, it can simply be set to `"all"` to enable fallback for all these collectives.
|
||||
By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not specified, `"all"` will be applied.
|
||||
|
||||
Example 1: AllReduce will fall back to NCCL's `ncclAllReduce` since `allreduce` is in the fallback list.
|
||||
```bash
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
|
||||
```
|
||||
|
||||
Example 2: ReduceScatter will still use the MSCCL++ implementation since `reducescatter` is not in the fallback list.
|
||||
```bash
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/$PATH_TO_EXECUTION_PLANS/execution-files ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
|
||||
```
|
||||
|
||||
On AMD platforms, you need to add `RCCL_MSCCL_ENABLE=0` to avoid conflicts with the fallback features.
|
||||
@@ -1,3 +1,4 @@
|
||||
breathe
|
||||
sphinx_rtd_theme
|
||||
myst_parser
|
||||
sphinxcontrib-mermaid
|
||||
|
||||
17
docs/tutorials.rst
Normal file
17
docs/tutorials.rst
Normal file
@@ -0,0 +1,17 @@
|
||||
Tutorials
|
||||
---------
|
||||
|
||||
This tutorial section provides a step-by-step guide to help you learn about GPU communication using MSCCL++.
|
||||
|
||||
Start with the :doc:`Before You Start <tutorials/00-before-you-start>`.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Tutorials
|
||||
:hidden:
|
||||
|
||||
tutorials/00-before-you-start
|
||||
tutorials/01-basic-concepts
|
||||
tutorials/02-bootstrap-and-communicator
|
||||
tutorials/03-memory-channel
|
||||
tutorials/04-port-channel
|
||||
20
docs/tutorials/00-before-you-start.md
Normal file
20
docs/tutorials/00-before-you-start.md
Normal file
@@ -0,0 +1,20 @@
|
||||
# Before You Start
|
||||
|
||||
This tutorial introduces how to use the MSCCL++ Primitive API to write highly flexible and optimized GPU communication kernels from the lowest level. If you are looking for the high-level APIs, please refer to the DSL API or the NCCL API.
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
To run example code in this tutorial, you may need a system with at least two NVIDIA or AMD GPUs. For multi-node examples, you will need RDMA Network Interface Cards (NICs) and a network setup that allows communication between nodes. See the {ref}`prerequisites` for details.
|
||||
|
||||
## Environment Setup
|
||||
|
||||
We provide {ref}`docker-images` and a {ref}`vscode-dev-container` to simplify the environment setup.
|
||||
|
||||
## Prior Knowledge
|
||||
|
||||
This tutorial assumes that readers have a basic understanding of C++ and GPU programming (CUDA). If you are unfamiliar with the following concepts, we recommend reviewing the relevant documentation or tutorials:
|
||||
- **C++ Basics:** STL containers, smart pointers, templates, futures, etc.
|
||||
- **CUDA Basics:** thread blocks, warps, shared memory, etc.
|
||||
- **(Optional) RDMA Basics:** If you are interested in multi-node communication, understanding RDMA concepts (`ibverbs` library) will be helpful.
|
||||
|
||||
In the next page, we will introduce a few basic concepts of the MSCCL++ Primitive API by a simple ping-pong example between two GPUs.
|
||||
215
docs/tutorials/01-basic-concepts.md
Normal file
215
docs/tutorials/01-basic-concepts.md
Normal file
@@ -0,0 +1,215 @@
|
||||
# Basic Concepts
|
||||
|
||||
In this tutorial, we explain a few basic concepts of the MSCCL++ Primitive API using a simple ping-pong example between two GPUs. The example demonstrates how to set up communication between GPUs.
|
||||
|
||||
## Build and Run the Example
|
||||
|
||||
The code of this tutorial is under [examples/tutorials/01-basic-concepts](https://github.com/microsoft/mscclpp/blob/main/examples/tutorials/01-basic-concepts).
|
||||
|
||||
Build the example with `make`:
|
||||
|
||||
```bash
|
||||
$ cd examples/tutorials/01-basic-concepts
|
||||
$ make
|
||||
```
|
||||
|
||||
Run the example with `./gpu_ping_pong`. If you are in a container, you may need to run with the root privileges. You should see output similar to the following:
|
||||
|
||||
```
|
||||
# ./gpu_ping_pong
|
||||
Creating endpoints ...
|
||||
GPU 0: Creating a connection and a semaphore stub ...
|
||||
GPU 1: Creating a connection and a semaphore stub ...
|
||||
GPU 0: Creating a semaphore and a memory channel ...
|
||||
GPU 1: Creating a semaphore and a memory channel ...
|
||||
GPU 0: Launching gpuKernel0 ...
|
||||
GPU 1: Launching gpuKernel1 ...
|
||||
Elapsed 4.77814 ms per iteration (100)
|
||||
Succeed!
|
||||
```
|
||||
|
||||
If you see error messages like "At least two GPUs are required" or "GPU 0 cannot access GPU 1", it means that your system does not meet the requirements for running the example. Make sure you have at least two GPUs installed and that they are connected peer-to-peer (through NVLink or under the same PCIe switch). See the {ref}`prerequisites` for more details.
|
||||
|
||||
## Code Overview
|
||||
|
||||
The example code constructs three key components for communication: **Connection**, **Semaphore**, and **Channel**. The following diagram illustrates the flow of how these components are created and used.
|
||||
|
||||
```{mermaid}
|
||||
sequenceDiagram
|
||||
participant ProcessA
|
||||
participant ProcessB
|
||||
|
||||
rect rgb(240, 240, 240)
|
||||
Note over ProcessA, ProcessB: Create an Endpoint
|
||||
|
||||
ProcessA<<->>ProcessB: Exchange the Endpoints
|
||||
|
||||
Note over ProcessA, ProcessB: Create a Connection using the two Endpoints
|
||||
end
|
||||
|
||||
rect rgb(240, 240, 240)
|
||||
Note over ProcessA, ProcessB: Construct a SemaphoreStub using the Connection
|
||||
|
||||
ProcessA<<->>ProcessB: Exchange the SemaphoreStubs
|
||||
|
||||
Note over ProcessA, ProcessB: Create a Semaphore using the two SemaphoreStubs
|
||||
end
|
||||
|
||||
Note over ProcessA, ProcessB: Create a Channel using the Semaphore and run applications
|
||||
```
|
||||
|
||||
```{note}
|
||||
Note that ProcessA and ProcessB are not necessarily different processes; they can be the same process running on the same host (like in the example code).
|
||||
The endpoints constructed by ProcessA and ProcessB are also not necessarily using different GPUs; they can be the same GPU, allowing for intra-GPU communication.
|
||||
```
|
||||
|
||||
## Endpoint and Connection
|
||||
|
||||
An **Endpoint** represents an entity that can communicate with another entity, such as a GPU. In this example, we create two endpoints, one for each GPU. A **Connection** is established between these endpoints, allowing them to communicate with each other. Construction of endpoints and connections is done by a **Context** object, which is responsible for managing communication resources.
|
||||
|
||||
The example code creates endpoints as follows:
|
||||
|
||||
```cpp
|
||||
// From gpu_ping_pong.cu, lines 70-71
|
||||
mscclpp::Endpoint ep0 = ctx->createEndpoint({transport, {mscclpp::DeviceType::GPU, 0}});
|
||||
mscclpp::Endpoint ep1 = ctx->createEndpoint({transport, {mscclpp::DeviceType::GPU, 1}});
|
||||
```
|
||||
|
||||
Both endpoints are created to use the same transport `mscclpp::Transport::CudaIpc`, which uses direct communication supported by CUDA/HIP IPC. The two endpoints must use the same transport type to establish a connection between them. We will introduce other transport types in later tutorials.
|
||||
|
||||
`mscclpp::DeviceType::GPU` indicates that these endpoints are for GPUs, and the numbers `0` and `1` specify the GPU IDs.
|
||||
|
||||
The connection is created by calling `connect` on the context object:
|
||||
|
||||
```cpp
|
||||
// From gpu_ping_pong.cu, lines 76 and 82
|
||||
std::shared_ptr<mscclpp::Connection> conn0 = ctx->connect(/*localEndpoint*/ ep0, /*remoteEndpoint*/ ep1);
|
||||
std::shared_ptr<mscclpp::Connection> conn1 = ctx->connect(/*localEndpoint*/ ep1, /*remoteEndpoint*/ ep0);
|
||||
```
|
||||
|
||||
The `localEndpoint` and `remoteEndpoint` parameters specify which endpoints are used for the connection. A connection is asymmetric by nature, meaning that we need to create one connection for each endpoint. In this case, `conn0` is created for `ep0` to communicate with `ep1`, and `conn1` is created for `ep1` to communicate with `ep0`.
|
||||
|
||||
This example creates both endpoints in a single process for simplicity, so the connections can be established directly using the two endpoints. However, in most real-world applications, the endpoints would be created in different processes. In that case, you can **serialize the endpoints** and send them over a network or through shared memory. For example:
|
||||
|
||||
```cpp
|
||||
// Process A
|
||||
mscclpp::Endpoint ep0 = ctx->createEndpoint({transport, {mscclpp::DeviceType::GPU, 0}});
|
||||
std::vector<char> serializedEp0 = ep0.serialize();
|
||||
sendToProcessB(serializedEp0); // send serializedEp0 to Process B using any IPC mechanism
|
||||
|
||||
// Process B
|
||||
mscclpp::Endpoint ep1 = ctx->createEndpoint({transport, {mscclpp::DeviceType::GPU, 1}});
|
||||
std::vector<char> serializedEp0 = recvFromProcessA(); // receive serializedEp0 from Process A
|
||||
mscclpp::Endpoint ep0 = mscclpp::Endpoint::deserialize(serializedEp0);
|
||||
std::shared_ptr<mscclpp::Connection> conn1 = ctx->connect(/*localEndpoint*/ ep1, /*remoteEndpoint*/ ep0);
|
||||
```
|
||||
|
||||
## SemaphoreStub and Semaphore
|
||||
|
||||
After a connection is established, both endpoints know how to communicate with each other. Now, we need a way to synchronize the communication. This is where **Semaphore** comes into play. A Semaphore provides a synchronization mechanism that allows one endpoint to signal the other or wait for a signal from the other endpoint.
|
||||
|
||||
To construct a Semaphore, we first need to create a **SemaphoreStub** using the connection from each endpoint. A SemaphoreStub holds one endpoint's resource for a Semaphore, and a Semaphore is constructed using two SemaphoreStubs, one from each endpoint.
|
||||
|
||||
```cpp
|
||||
// From gpu_ping_pong.cu, lines 77 and 83
|
||||
mscclpp::SemaphoreStub semaStub0(conn0);
|
||||
mscclpp::SemaphoreStub semaStub1(conn1);
|
||||
```
|
||||
|
||||
The SemaphoreStubs are created using the connections established earlier. They are then exchanged between the two endpoints to create a Semaphore:
|
||||
|
||||
```cpp
|
||||
// From gpu_ping_pong.cu, lines 88 and 98
|
||||
mscclpp::Semaphore sema0(/*localSemaphoreStub*/ semaStub0, /*remoteSemaphoreStub*/ semaStub1);
|
||||
mscclpp::Semaphore sema1(/*localSemaphoreStub*/ semaStub1, /*remoteSemaphoreStub*/ semaStub0);
|
||||
```
|
||||
|
||||
Like the connections, the Semaphore is also asymmetric. Each endpoint has its own Semaphore, which is constructed using its own SemaphoreStub and the other endpoint's SemaphoreStub. SemaphoreStubs can be serialized and sent to other processes in the same way as endpoints.
|
||||
|
||||
## Channel
|
||||
|
||||
Semaphores can be used to synchronize operations between endpoints, but they do not provide a way to transfer data. To facilitate data transfer, we introduce the concept of **Channel**. A Channel is built on top of a semaphore and allows for the transfer of data between endpoints.
|
||||
|
||||
However, since this ping-pong example doesn't need to transfer any data, we construct a `BaseMemoryChannel` that is a shallow wrapper around a Semaphore but does not associate with any memory region for data transfer. We will introduce more advanced channels in later tutorials.
|
||||
|
||||
```cpp
|
||||
// From gpu_ping_pong.cu, lines 89 and 99
|
||||
mscclpp::BaseMemoryChannel memChan0(sema0);
|
||||
mscclpp::BaseMemoryChannel memChan1(sema1);
|
||||
```
|
||||
|
||||
To let the application (GPU kernels) use the channels, we need to obtain the **device handles** for the channels. The device handle is a lightweight object that can be passed to GPU kernels to perform operations on the channel. Different types of channels have different device handle types.
|
||||
|
||||
```cpp
|
||||
// From gpu_ping_pong.cu, lines 90 and 100
|
||||
mscclpp::BaseMemoryChannelDeviceHandle memChanHandle0 = memChan0.deviceHandle();
|
||||
mscclpp::BaseMemoryChannelDeviceHandle memChanHandle1 = memChan1.deviceHandle();
|
||||
```
|
||||
|
||||
The device handles are then copied to the GPU memory and passed to the GPU kernels for execution. For example:
|
||||
|
||||
```cpp
|
||||
// From gpu_ping_pong.cu, lines 91-93
|
||||
void *devHandle0;
|
||||
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle0, sizeof(mscclpp::BaseMemoryChannelDeviceHandle)));
|
||||
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle0, &memChanHandle0, sizeof(memChanHandle0), cudaMemcpyHostToDevice));
|
||||
|
||||
// From gpu_ping_pong.cu, line 108
|
||||
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle0), iter);
|
||||
|
||||
// From gpu_ping_pong.cu, lines 26-35
|
||||
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
|
||||
if (threadIdx.x + blockIdx.x * gridDim.x == 0) {
|
||||
for (int i = 0; i < iter; ++i) {
|
||||
devHandle->relaxedWait();
|
||||
// spin for a few ms
|
||||
spin_cycles(1e7);
|
||||
devHandle->relaxedSignal();
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The kernel code will be explained in the next section.
|
||||
|
||||
```{tip}
|
||||
If your application (GPU kernel) needs to access the device handle very frequently with low latency requirements, you can consider either of the following approaches:
|
||||
|
||||
* Use constant memory to store the device handle, which allows faster access by storing the handle in a read-only memory region on the GPU. However, since constant memory is limited in size, this approach may not be suitable if other parts of the application also require constant memory.
|
||||
* Copy the device handle into the GPU's shared memory, which incurs a one-time cost of copying the handle from global memory to shared memory, but allows faster access thereafter.
|
||||
```
|
||||
|
||||
(channel-interfaces)=
|
||||
## Channel Interfaces in GPU Kernels
|
||||
|
||||
In the GPU kernels of this example, we use the `relaxedSignal()` and `relaxedWait()` methods of the `BaseMemoryChannelDeviceHandle` to synchronize operations between the two GPUs. The `relaxedWait()` method blocks the calling thread until it receives a signal from the other GPU, while `relaxedSignal()` sends a signal to the other GPU. To demonstrate the synchronization, we put a spin loop of 10 million clock cycles (which takes a few milliseconds) on one side of the ping-pong (GPU 0) and check if the elapsed time is greater than 1 millisecond on the other side (GPU 1).
|
||||
|
||||
```cpp
|
||||
// From gpu_ping_pong.cu, lines 26-44
|
||||
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
|
||||
if (threadIdx.x + blockIdx.x * gridDim.x == 0) {
|
||||
for (int i = 0; i < iter; ++i) {
|
||||
devHandle->relaxedWait();
|
||||
// spin for a few ms
|
||||
spin_cycles(1e7);
|
||||
devHandle->relaxedSignal();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
|
||||
if (threadIdx.x + blockIdx.x * gridDim.x == 0) {
|
||||
for (int i = 0; i < iter; ++i) {
|
||||
devHandle->relaxedSignal();
|
||||
devHandle->relaxedWait();
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`relaxedSignal()` and `relaxedWait()` are used to synchronize the execution flow, but they do not synchronize memory operations. This means that when `relaxedWait()` returns, it guarantees that the other GPU has executed `relaxedSignal()`, but it does not guarantee that the memory operations before `relaxedSignal()` have completed. This can happen because GPUs follow [weakly-ordered memory models](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memory-fence-functions). If synchronization of memory operations is needed, you can use `signal()` and `wait()` instead, which will ensure that all memory operations before the signal are visible to the other GPU when its `wait()` returns. In this example, we do not need to synchronize memory operations, so we use `relaxedSignal()` and `relaxedWait()` which are faster.
|
||||
|
||||
|
||||
## Summary and Next Steps
|
||||
|
||||
In this tutorial, you have learned the basic concepts of connections, semaphores, and channels in MSCCL++. In the next tutorial, we will introduce `Bootstrap` and `Communicator` interfaces, which provide a convenient way to set up connections across multiple processes.
|
||||
104
docs/tutorials/02-bootstrap-and-communicator.md
Normal file
104
docs/tutorials/02-bootstrap-and-communicator.md
Normal file
@@ -0,0 +1,104 @@
|
||||
# Bootstrap and Communicator
|
||||
|
||||
```{note}
|
||||
This tutorial follows the [Basic Concepts](./01-basic-concepts.md) tutorial.
|
||||
```
|
||||
|
||||
In this tutorial, we introduce `Bootstrap` and `Communicator` interfaces, which provide a convenient way to set up connections across multiple processes. The example code implements the same ping-pong example as in the [Basic Concepts](./01-basic-concepts.md) tutorial, but using one process per GPU and the `Bootstrap` and `Communicator` interfaces to establish connections.
|
||||
|
||||
## Build and Run the Example
|
||||
|
||||
The code of this tutorial is under [examples/tutorials/02-bootstrap](https://github.com/microsoft/mscclpp/blob/main/examples/tutorials/02-bootstrap).
|
||||
|
||||
Build the example with `make`:
|
||||
|
||||
```bash
|
||||
$ cd examples/tutorials/02-bootstrap
|
||||
$ make
|
||||
```
|
||||
|
||||
Run the example with `./gpu_ping_pong_mp`. If you are in a container, you may need to run with root privileges. You should see output similar to the following:
|
||||
|
||||
```
|
||||
# ./gpu_ping_pong_mp
|
||||
GPU 1: Initializing a bootstrap ...
|
||||
GPU 0: Initializing a bootstrap ...
|
||||
GPU 0: Creating a connection ...
|
||||
GPU 1: Creating a connection ...
|
||||
GPU 0: Creating a semaphore ...
|
||||
GPU 1: Creating a semaphore ...
|
||||
GPU 1: Creating a channel ...
|
||||
GPU 0: Creating a channel ...
|
||||
GPU 1: Launching a GPU kernel ...
|
||||
GPU 0: Launching a GPU kernel ...
|
||||
Elapsed 4.78082 ms per iteration (100)
|
||||
Succeed!
|
||||
```
|
||||
|
||||
The example code uses localhost port `50505` by default. If the port is already in use, you can change it by modifying the `PORT_NUMBER` macro in the code.
|
||||
|
||||
If you see error messages like "At least two GPUs are required" or "GPU 0 cannot access GPU 1", it means that your system does not meet the requirements for running the example. Make sure you have at least two GPUs installed and that they are connected peer-to-peer (through NVLink or under the same PCIe switch). See the {ref}`prerequisites` for more details.
|
||||
|
||||
## Code Overview
|
||||
|
||||
The example code is similar to that in the [Basic Concepts](./01-basic-concepts.md) tutorial, but uses `Bootstrap` and `Communicator` interfaces to establish connections between GPUs. The code spawns one child process per GPU. The parent process waits for the child processes to finish before exiting.
|
||||
|
||||
## Bootstrap
|
||||
|
||||
**Bootstrap** is an abstract class that defines common inter-process communication (IPC) interfaces such as `send()`, `recv()`, `allGather()`, and `barrier()`. Bootstrap is used to exchange serialized MSCCL++ objects between host processes, or to synchronize the processes. `TcpBootstrap` is a concrete implementation of the `Bootstrap` interface that uses TCP sockets for communication.
|
||||
|
||||
In the example code, two processes create and initialize a `TcpBootstrap` instance as follows:
|
||||
|
||||
```cpp
|
||||
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks);
|
||||
bootstrap->initialize("lo:127.0.0.1:" PORT_NUMBER);
|
||||
```
|
||||
|
||||
`myRank` is the rank of the current process, and `nRanks` is the total number of processes. The `initialize()` method sets up the bootstrap connection between all processes. In this example, we pass an `ifIpPortTrio` string, which has the format `if:ip:port`, where `if` is the network interface (e.g., `lo` for localhost), `ip` is the IP address, and `port` is the port number. The `TcpBootstrap` will listen on the specified port and accept connections from other processes.
|
||||
|
||||
````{note}
Alternatively, `TcpBootstrap` can be initialized with a **UniqueId**, which is a unique identifier for the bootstrap connection. This is similar to what NCCL does with its [`ncclGetUniqueId()`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclgetuniqueid) and [`ncclCommInitRank()`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcomminitrank) functions. The UniqueId should be shared between processes using an external mechanism, such as using MPI like the following:

```cpp
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks);
mscclpp::UniqueId id;
if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId();
MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
bootstrap->initialize(id);
```
````
|
||||
|
||||
## Communicator
|
||||
|
||||
While `Bootstrap` provides general IPC interfaces, `Communicator` is a wrapper around `Bootstrap` that provides more specific methods for building channels between GPUs.
|
||||
|
||||
In the example code, `Communicator` is constructed as follows:
|
||||
|
||||
```cpp
|
||||
mscclpp::Communicator comm(bootstrap);
|
||||
```
|
||||
|
||||
Then it creates a GPU endpoint that connects to the remote rank:
|
||||
|
||||
```cpp
|
||||
auto connFuture = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank);
|
||||
auto conn = connFuture.get();
|
||||
```
|
||||
|
||||
The `connect()` method builds a connection asynchronously; it returns a future of a connection object. The `get()` method is called later on the future to retrieve the connection object. In this example, we call `get()` immediately since we don't have other tasks in between.
|
||||
|
||||
After the connection is established, we create a semaphore for synchronization:
|
||||
|
||||
```cpp
|
||||
auto semaFuture = comm.buildSemaphore(conn, remoteRank);
|
||||
auto sema = semaFuture.get();
|
||||
```
|
||||
|
||||
Like `connect()`, `buildSemaphore()` is an asynchronous method that returns a future of a semaphore object.
|
||||
|
||||
We omit explaining the rest of the code, as it is similar to that in the [Basic Concepts](./01-basic-concepts.md) tutorial.
|
||||
|
||||
## Summary and Next Steps
|
||||
|
||||
In this tutorial, you have learned how to use `Bootstrap` and `Communicator` interfaces to establish connections between multiple processes. Note that `Bootstrap` and `Communicator` are still optional interfaces for convenience. As noted in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can still use your own IPC mechanisms to build connections and semaphores. For advanced examples that use Redis or `torch.distributed` for IPC, see the [Advanced Connections](../guide/advanced-connections.md) guide.
|
||||
|
||||
In the next tutorial, we will introduce more comprehensive usage of `MemoryChannel` including how to use it for efficient data transfer between GPUs. It will also cover how to create communication buffers and how to use them with channels.
|
||||
258
docs/tutorials/03-memory-channel.md
Normal file
258
docs/tutorials/03-memory-channel.md
Normal file
@@ -0,0 +1,258 @@
|
||||
# Memory Channel
|
||||
|
||||
```{note}
|
||||
This tutorial follows the [Bootstrap and Communicator](./02-bootstrap-and-communicator.md) tutorial.
|
||||
```
|
||||
|
||||
In this tutorial, we will introduce the comprehensive usage of `MemoryChannel`, which provides direct access to remote GPU memory for communication. We will cover how to create communication buffers, use them with `MemoryChannel`, and perform efficient data transfer between GPUs.
|
||||
|
||||
## Build and Run the Example
|
||||
|
||||
The code of this tutorial is under [examples/tutorials/03-memory-channel](https://github.com/microsoft/mscclpp/blob/main/examples/tutorials/03-memory-channel).
|
||||
|
||||
Build the example with `make`:
|
||||
|
||||
```bash
|
||||
$ cd examples/tutorials/03-memory-channel
|
||||
$ make
|
||||
```
|
||||
|
||||
Run the example with `./bidir_memory_channel`. If you are in a container, you may need to run with root privileges. You should see output similar to the following:
|
||||
|
||||
```
|
||||
# ./bidir_memory_channel
|
||||
GPU 1: Preparing for tests ...
|
||||
GPU 0: Preparing for tests ...
|
||||
GPU 0: [Bidir Put] bytes 1024, elapsed 0.0065079 ms/iter, BW 0.157347 GB/s
|
||||
GPU 0: [Bidir Put] bytes 1048576, elapsed 0.00926096 ms/iter, BW 113.225 GB/s
|
||||
GPU 0: [Bidir Put] bytes 134217728, elapsed 0.389238 ms/iter, BW 344.822 GB/s
|
||||
GPU 0: [Bidir Get] bytes 1024, elapsed 0.00437581 ms/iter, BW 0.234014 GB/s
|
||||
GPU 0: [Bidir Get] bytes 1048576, elapsed 0.00768634 ms/iter, BW 136.421 GB/s
|
||||
GPU 0: [Bidir Get] bytes 134217728, elapsed 0.417454 ms/iter, BW 321.515 GB/s
|
||||
GPU 0: [Bidir Put Packets] bytes 1024, elapsed 0.00407117 ms/iter, BW 0.251525 GB/s
|
||||
GPU 0: [Bidir Put Packets] bytes 1048576, elapsed 0.0104925 ms/iter, BW 99.936 GB/s
|
||||
GPU 0: [Bidir Put Packets] bytes 134217728, elapsed 1.0188 ms/iter, BW 131.741 GB/s
|
||||
Succeed!
|
||||
```
|
||||
|
||||
The example code uses localhost port `50505` by default. If the port is already in use, you can change it by modifying the `PORT_NUMBER` macro in the code.
|
||||
|
||||
```{caution}
|
||||
Note that this example is **NOT** a performance benchmark. The performance numbers are provided to give you an idea of the performance characteristics of `MemoryChannel`. For optimal performance, we need to tune the number of thread blocks and threads per block according to the copy size and hardware specifications. Additionally, synchronization can be further optimized depending on the application scenario and implementation.
|
||||
```
|
||||
|
||||
## Code Overview
|
||||
|
||||
The example code establishes a channel similarly to the [Bootstrap and Communicator](./02-bootstrap-and-communicator.md) tutorial, but creates a `MemoryChannel` instead of a `BaseMemoryChannel`. To create a `MemoryChannel`, we need to specify the local and remote `RegisteredMemory` objects, which represent the memory regions that the channel can transfer data to/from. The following diagram illustrates how `RegisteredMemory` objects are created and used to establish a `MemoryChannel`:
|
||||
|
||||
```{mermaid}
|
||||
sequenceDiagram
|
||||
participant ProcessA
|
||||
participant ProcessB
|
||||
|
||||
Note over ProcessA: Create RegisteredMemory A
|
||||
|
||||
Note over ProcessB: Create RegisteredMemory B
|
||||
|
||||
rect rgb(240, 240, 240)
|
||||
ProcessA->>ProcessB: Send and receive RegisteredMemory A
|
||||
Note over ProcessB: Create a MemoryChannel using a pre-built Semaphore<br>and RegisteredMemory B and A
|
||||
end
|
||||
|
||||
rect rgb(240, 240, 240)
|
||||
ProcessB->>ProcessA: Send and receive RegisteredMemory B
|
||||
Note over ProcessA: Create a MemoryChannel using a pre-built Semaphore<br>and RegisteredMemory A and B
|
||||
end
|
||||
```
|
||||
|
||||
The procedure for building a `Semaphore` is explained in the [Basic Concepts](./01-basic-concepts.md) tutorial.
|
||||
|
||||
The example code implements three GPU kernels that perform the same bidirectional data transfer operation using different methods: `put()`, `get()`, and `putPackets()`. The code examines the performance of these three methods.
|
||||
|
||||
## RegisteredMemory and GpuBuffer
|
||||
|
||||
**RegisteredMemory** represents a memory region that can be accessed by local or remote processes. It provides a way to register a memory region for communication, allowing remote memory access. In the example code, each process creates a local `RegisteredMemory` object as follows:
|
||||
|
||||
```cpp
|
||||
mscclpp::GpuBuffer buffer(bufferBytes);
|
||||
mscclpp::RegisteredMemory localRegMem = comm.registerMemory(buffer.data(), buffer.bytes(), transport);
|
||||
```
|
||||
|
||||
Here, we first allocate GPU device memory using `mscclpp::GpuBuffer` and then register its memory region with the `registerMemory()` method of the `Communicator`. If you are using the `Context` interface as shown in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can use `context.registerMemory()` instead. The `transport` parameter specifies the transport types that this memory region can be accessed with. In this example, we use only `mscclpp::Transport::CudaIpc`, which allows the memory to be accessed by other processes using CUDA/HIP IPC. The `CudaIpc` transport type is typically used for intra-node communication, but with certain hardware configurations, it can also be used for inter-node communication (such as [NVL72](https://www.nvidia.com/en-us/data-center/gb300-nvl72) on NVIDIA Grace Blackwell platforms). We will introduce other transport types in later tutorials.
|
||||
|
||||
**GpuBuffer** is NOT required for creating a `RegisteredMemory`; you can register any pre-allocated GPU memory region with `registerMemory()`. However, it is the user's responsibility to ensure that the memory region is suitable for their communication operations. Depending on the hardware platform, some communication methods may require specific memory allocation to ensure data consistency and correctness. `GpuBuffer` is a convenient way to allocate GPU memory that is compatible with the communication methods that MSCCL++ supports. It provides a simple interface for allocating GPU memory and automatically handles memory deallocation when it goes out of scope.
|
||||
|
||||
```{note}
|
||||
If you are an optimization expert, we recommend learning about the details of `GpuBuffer`. It is a thin wrapper around the CUDA/HIP memory allocation APIs with the following features:
|
||||
* If the GPU device is an NVIDIA GPU that supports [NVLink SHARP](https://docs.nvidia.com/networking/display/sharpv300), it automatically allocates [multimem-addressable](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#multimem-addresses) memory. The allocated memory can still be directly accessed by other peer GPUs' threads, and users can run computation kernels on the memory region directly without performance degradation.
|
||||
* If the GPU device is an AMD GPU that supports [Uncached Memory](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/conceptual/definitions.html#memory-type), it automatically allocates uncached memory. For such GPU devices, uncached memory must be used if (1) a remote device (CPU, GPU, or NIC) may directly access and update the memory region, and (2) the local GPU device may wait for the update without synchronizing the whole device (e.g., via `hipDeviceSynchronize()` or `hipStreamSynchronize()`). Therefore, you do NOT need to use uncached memory unless you will use the memory region for synchronization flags or counters (such as MSCCL++ [Packets](#packets)). However, in general, we recommend using uncached memory by default unless you understand the implications. As the name implies, uncached memory is not cached by the GPU, so it can be accessed without polluting the GPU cache (which is beneficial for other parallel computation kernels). For the same reason, running complex computation kernels (such as matrix multiplication) on uncached memory may lead to performance degradation, so it is recommended to use uncached memory only for communication purposes.
|
||||
```
|
||||
|
||||
## MemoryChannel
|
||||
|
||||
**MemoryChannel** is a specialized channel that allows direct access to remote GPU memory. In addition to the synchronization methods provided by `BaseMemoryChannel`, `MemoryChannel` provides methods for data access and transfer between local and remote memory regions. To construct a `MemoryChannel`, we need to specify the local and remote `RegisteredMemory` objects. `RegisteredMemory` provides `serialize()` and `deserialize()` methods to convert memory region metadata into a serialized format that can be sent over the network. While any IPC mechanism can be used to send the serialized data, MSCCL++ `Communicator` provides `sendMemory()` and `recvMemory()` methods to send and receive `RegisteredMemory` objects between processes. The following code shows an example:
|
||||
|
||||
```cpp
|
||||
comm.sendMemory(localRegMem, remoteRank);
|
||||
auto remoteRegMemFuture = comm.recvMemory(remoteRank);
|
||||
mscclpp::RegisteredMemory remoteRegMem = remoteRegMemFuture.get();
|
||||
```
|
||||
|
||||
After exchanging the `RegisteredMemory` objects, we can create a `MemoryChannel` as follows:
|
||||
|
||||
```cpp
|
||||
mscclpp::MemoryChannel memChan(sema, /*dst*/ remoteRegMem, /*src*/ localRegMem);
|
||||
```
|
||||
|
||||
Here, `sema` is a pre-built semaphore used for synchronization methods, which is introduced in the [Basic Concepts](./01-basic-concepts.md) tutorial. The `remoteRegMem` and `localRegMem` are the destination and source memory regions, respectively. The following diagram illustrates how the `memChan` channel uses these memory regions (A and B representing the two GPUs):
|
||||
|
||||
```{mermaid}
|
||||
flowchart TD
|
||||
RegMemA -->|"put() from A"| RegMemB
|
||||
RegMemB -->|"put() from B"| RegMemA
|
||||
RegMemB -->|"get() from A"| RegMemA
|
||||
RegMemA -->|"get() from B"| RegMemB
|
||||
```
|
||||
|
||||
### Copy with `put()`
|
||||
|
||||
The example code demonstrates a bidirectional data copy between two GPUs using `MemoryChannel` interfaces. Below is the GPU kernel code that performs the data copy using the `put()` method:
|
||||
|
||||
```cpp
|
||||
__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid == 0) {
|
||||
devHandle->relaxedSignal();
|
||||
devHandle->relaxedWait();
|
||||
}
|
||||
devSyncer.sync(gridDim.x);
|
||||
|
||||
const uint64_t srcOffset = myRank * copyBytes;
|
||||
const uint64_t dstOffset = srcOffset;
|
||||
devHandle->put(dstOffset, srcOffset, copyBytes, /*threadId*/ tid, /*numThreads*/ blockDim.x * gridDim.x);
|
||||
devSyncer.sync(gridDim.x);
|
||||
if (tid == 0) {
|
||||
devHandle->signal();
|
||||
devHandle->wait();
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Both GPUs run this kernel concurrently to copy data from their own memory regions to the other GPU's memory region. This code assumes no preceding synchronization between the two GPUs. Therefore, to ensure the other side is ready to receive data, each kernel needs to check if the other has started execution before proceeding with the data copy. This is done by a single thread (`tid == 0`) in each GPU signaling the other GPU (`relaxedSignal()`), and then waiting for the other GPU to signal that it is ready (`relaxedWait()`). We use the relaxed versions of signal and wait because the purpose here is execution control, not data synchronization (see {ref}`channel-interfaces` to recap). After one thread synchronizes with the other GPU, all threads in the GPU kernel synchronize with `devSyncer.sync(gridDim.x)`, which ensures that all threads in the GPU kernel start executing the data copy operation after the other GPU is ready.
|
||||
|
||||
The `put()` method copies data from the source offset in the local memory region to the destination offset in the remote memory region. The `threadId` and `numThreads` parameters map the data copy operation to the participating threads in the GPU kernel. Since the example code uses all threads in the GPU kernel to perform the data copy, we pass `tid` as the `threadId` and `blockDim.x * gridDim.x` as the `numThreads`. Users can also use a subset of threads to perform the data copy by passing the appropriate values for `threadId` and `numThreads`. This can be useful for optimizing the data copy, especially when there are multiple destinations or sources, or when following computation after `put()` needs to be pipelined with the data transfer.
|
||||
|
||||
The example code assumes there may be following computation that consumes the received data, so it performs another synchronization after the data copy. It first synchronizes all threads in the GPU kernel (`devSyncer.sync(gridDim.x)`) to ensure that all threads have completed the data copy operation, and then the first thread (`tid == 0`) signals the other GPU that the data copy is complete (`devHandle->signal()`) and waits for the other GPU to acknowledge it (`devHandle->wait()`). This ensures that the other GPU can safely access the copied data after the data copy operation is complete.
|
||||
|
||||
### Copy with `get()`
|
||||
|
||||
While `put()` writes to the remote memory, `get()` reads from the remote memory. The example code demonstrates a bidirectional data copy using the `get()` method, which is similar to `put()`, but reads data from the remote memory region and writes it to the local memory region. The following code shows how to use `get()` in a GPU kernel:
|
||||
|
||||
```cpp
|
||||
__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid == 0) {
|
||||
devHandle->relaxedSignal();
|
||||
devHandle->relaxedWait();
|
||||
}
|
||||
devSyncer.sync(gridDim.x);
|
||||
|
||||
const int remoteRank = myRank ^ 1;
|
||||
const uint64_t srcOffset = remoteRank * copyBytes;
|
||||
const uint64_t dstOffset = srcOffset;
|
||||
devHandle->get(srcOffset, dstOffset, copyBytes, /*threadId*/ tid, /*numThreads*/ blockDim.x * gridDim.x);
|
||||
}
|
||||
```
|
||||
|
||||
Note that the `get()` method doesn't need explicit data synchronization after the data copy because it is a read operation. This makes `get()` more efficient than `put()`, especially for small data transfers. However, `get()` may not be suitable for all scenarios, especially when the data can be modified by the remote GPU while it is being read. For large data transfers, `put()` is usually considered more efficient, but this highly depends on the hardware implementation, and we recommend benchmarking the performance of both methods for your specific use case.
|
||||
|
||||
(packets)=
|
||||
## Packets
|
||||
|
||||
In MSCCL++, **Packet** is a data structure that contains user data with metadata (which we call *flags*) that can validate the user data's integrity. This allows the receiver to safely retrieve the user data without explicit synchronization (signal and wait). Using packets is often faster than `put()` for small data transfers and more flexible than `get()` because both the sender and receiver can work at their own pace. However, the goodput of communication using packets is much smaller than that of `put()` or `get()` because packets require additional metadata to be sent along with the user data.
|
||||
|
||||
The example code creates one more `MemoryChannel` to demonstrate usage of packets. The channel is created as follows:
|
||||
|
||||
```cpp
|
||||
mscclpp::MemoryChannel memPktChan(sema, /*dst*/ remotePktRegMem, /*src*/ localRegMem,
|
||||
/*packetBuffer*/ localPktRegMem.data());
|
||||
```
|
||||
|
||||
Compared to the previous `memChan` channel, this `memPktChan` channel uses the same source (`localRegMem`) but a different destination (`remotePktRegMem`) and an additional packet buffer (`localPktRegMem.data()`). The following diagram illustrates how the `memPktChan` channel uses these memory regions (A and B representing the two GPUs):
|
||||
|
||||
```{mermaid}
|
||||
block-beta
|
||||
columns 6
|
||||
space:1
|
||||
RegMemA space:2 RegMemB
|
||||
space:8
|
||||
PktRegMemA space:2 PktRegMemB
|
||||
|
||||
RegMemA --"putPackets()"--> PktRegMemB
|
||||
RegMemB --"putPackets()"--> PktRegMemA
|
||||
|
||||
PktRegMemA --"unpackPackets()"--> RegMemA
|
||||
PktRegMemB --"unpackPackets()"--> RegMemB
|
||||
```
|
||||
|
||||
The `putPackets()` method reads data from the source memory region, converts it into packets, and writes the packets to the destination memory region. The `memPktChan` channel sets the destination memory region to the packet buffer of the remote GPU, so that the remote GPU can use the `unpackPackets()` method, which reads packets from the local packet buffer and writes the data to the source memory region locally. The example code demonstrates how to use `putPackets()` and `unpackPackets()` in a GPU kernel:
|
||||
|
||||
```cpp
|
||||
__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank,
|
||||
uint32_t flag) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid == 0) {
|
||||
devHandle->relaxedSignal();
|
||||
devHandle->relaxedWait();
|
||||
}
|
||||
devSyncer.sync(gridDim.x);
|
||||
|
||||
const uint64_t srcOffset = myRank * copyBytes;
|
||||
const uint64_t dstOffset = srcOffset;
|
||||
const uint64_t pktBufOffset = 0;
|
||||
devHandle->putPackets(pktBufOffset, srcOffset, copyBytes, tid, blockDim.x * gridDim.x, flag);
|
||||
devHandle->unpackPackets(pktBufOffset, dstOffset, copyBytes, tid, blockDim.x * gridDim.x, flag);
|
||||
}
|
||||
```
|
||||
|
||||
The `flag` parameter is used to construct the packets. It can be any non-zero 4-byte value. If `putPackets()` may directly overwrite previous packets without clearing the packet buffer (as in the example code), the flag value should be different from the previous packets' flags. The figure below illustrates how packets are constructed. `D0-3` are the user data (4 bytes each), and each packet consists of two user data and two flags. We call this packet format `mscclpp::LL16Packet`, which is the default format of `putPackets()` and `unpackPackets()`. The name `LL` stands for "low-latency" (borrowed term from the [LL protocol of NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-proto)) and `16` indicates the packet size.
|
||||
|
||||
```{mermaid}
|
||||
block-beta
|
||||
columns 2
|
||||
block:Data
|
||||
Data0["D0"]
|
||||
Data1["D1"]
|
||||
Data2["D2"]
|
||||
Data3["D3"]
|
||||
end
|
||||
space
|
||||
block:PacketA
|
||||
PacketA0["D0"]
|
||||
PacketA1["flag"]
|
||||
PacketA2["D1"]
|
||||
PacketA3["flag"]
|
||||
end
|
||||
block:PacketB
|
||||
PacketB0["D2"]
|
||||
PacketB1["flag"]
|
||||
PacketB2["D3"]
|
||||
PacketB3["flag"]
|
||||
end
|
||||
space
|
||||
Data0 --> PacketA0
|
||||
Data1 --> PacketA2
|
||||
Data2 --> PacketB0
|
||||
Data3 --> PacketB2
|
||||
style Data fill:#ffffff,stroke:#ffffff
|
||||
style PacketA fill:#f0f0f0,stroke:#ffffff
|
||||
style PacketB fill:#f0f0f0,stroke:#ffffff
|
||||
```
|
||||
|
||||
Since the flags take 50% of the packet size, the goodput of communication using packets is only 50% compared to transferring raw data. However, this doesn't matter because packets are designed for small data transfers. Packets transfer small data efficiently because the integrity of the user data is guaranteed by only waiting for the correct flags (done by `unpackPackets()`); explicit memory synchronization (signal and wait) is not needed.
|
||||
|
||||
## Summary and Next Steps
|
||||
|
||||
In this tutorial, you have learned how to use `MemoryChannel` for efficient data transfer between GPUs. You have also learned how to create communication buffers using `RegisteredMemory` and `GpuBuffer`, and how to use packets for small data transfers. You can find more complex usage of `MemoryChannel` in the {ref}`mscclpp-test`.
|
||||
|
||||
In the next tutorial, we will introduce `PortChannel`, which is another type of channel that provides port-based data transfer methods.
|
||||
38
docs/tutorials/04-port-channel.md
Normal file
38
docs/tutorials/04-port-channel.md
Normal file
@@ -0,0 +1,38 @@
|
||||
# Port Channel
|
||||
|
||||
```{note}
|
||||
This tutorial follows the [Memory Channel](./03-memory-channel.md) tutorial.
|
||||
```
|
||||
|
||||
## Build and Run the Example
|
||||
|
||||
The code of this tutorial is under [examples/tutorials/04-port-channel](https://github.com/microsoft/mscclpp/blob/main/examples/tutorials/04-port-channel).
|
||||
|
||||
Build the example with `make`:
|
||||
|
||||
```bash
|
||||
$ cd examples/tutorials/04-port-channel
|
||||
$ make
|
||||
```
|
||||
|
||||
Run the example with `./bidir_port_channel`. If you are in a container, you may need to run with root privileges. You should see output similar to the following:
|
||||
|
||||
```
|
||||
# ./bidir_port_channel
|
||||
GPU 0: Preparing for tests ...
|
||||
GPU 1: Preparing for tests ...
|
||||
GPU 0: [Bidir PutWithSignal] bytes 1024, elapsed 0.0204875 ms/iter, BW 0.0499818 GB/s
|
||||
GPU 0: [Bidir PutWithSignal] bytes 1048576, elapsed 0.0250319 ms/iter, BW 41.8896 GB/s
|
||||
GPU 0: [Bidir PutWithSignal] bytes 134217728, elapsed 0.365497 ms/iter, BW 367.219 GB/s
|
||||
Succeed!
|
||||
```
|
||||
|
||||
The example code uses localhost port `50505` by default. If the port is already in use, you can change it by modifying the `PORT_NUMBER` macro in the code.
|
||||
|
||||
```{caution}
|
||||
Note that this example is **NOT** a performance benchmark. The performance numbers are provided to give you an idea of the performance characteristics of `PortChannel`. For optimal performance, synchronization can be further optimized depending on the application scenario and implementation.
|
||||
```
|
||||
|
||||
## Code Overview
|
||||
|
||||
TBU
|
||||
1
examples/tutorials/02-bootstrap/.gitignore
vendored
Normal file
1
examples/tutorials/02-bootstrap/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
gpu_ping_pong_mp
|
||||
22
examples/tutorials/02-bootstrap/Makefile
Normal file
22
examples/tutorials/02-bootstrap/Makefile
Normal file
@@ -0,0 +1,22 @@
|
||||
# Toolkit locations; override via environment, e.g. `CUDA_HOME=/opt/cuda make`.
CUDA_HOME ?= /usr/local/cuda
ROCM_HOME ?= /opt/rocm

# Check if nvcc exists, otherwise use hipcc
ifeq ($(shell which $(CUDA_HOME)/bin/nvcc 2>/dev/null),)
# AMD path: hipcc with the HIP platform macro defined explicitly.
COMPILER := $(ROCM_HOME)/bin/hipcc
ARCH_FLAG := -D__HIP_PLATFORM_AMD__=1
else
# NVIDIA path: nvcc targeting the GPU architecture of the build machine.
COMPILER := $(CUDA_HOME)/bin/nvcc
ARCH_FLAG := -arch=native
endif

TARGET = gpu_ping_pong_mp
SRC = gpu_ping_pong_mp.cu

all: $(TARGET)

# Link against the installed MSCCL++ runtime library.
$(TARGET): $(SRC)
	$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp

clean:
	rm -f $(TARGET)
|
||||
171
examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu
Normal file
171
examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu
Normal file
@@ -0,0 +1,171 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
#include <mscclpp/memory_channel.hpp>
|
||||
#include <mscclpp/memory_channel_device.hpp>
|
||||
#include <sstream>
|
||||
|
||||
#define PORT_NUMBER "50505"
|
||||
|
||||
// Prints all arguments as one line to stdout. The full line is assembled in a
// local buffer first so a single insertion reaches std::cout, keeping output
// from concurrently running child processes unscrambled.
template <typename... Args>
void log(Args &&...args) {
  std::ostringstream line;
  (line << ... << std::forward<Args>(args));
  line << '\n';
  std::cout << line.str();
}
|
||||
|
||||
// Forks a child process that runs `func` and then terminates with status 0.
// Returns the child's pid in the parent, or -1 if fork() fails.
int spawn_process(std::function<void()> func) {
  const pid_t child = fork();
  if (child < 0) return -1;    // fork failed
  if (child > 0) return child; // parent: hand the pid back to the caller
  // Child: execute the task and exit without returning to the caller's code.
  func();
  exit(0);
}
|
||||
|
||||
// Blocks until process `pid` terminates. Returns its exit code when it exited
// normally; returns -1 if waitpid() fails or the process did not exit normally
// (e.g. it was killed by a signal).
int wait_process(int pid) {
  int status = 0;
  const bool exited = (waitpid(pid, &status, 0) >= 0) && WIFEXITED(status);
  return exited ? WEXITSTATUS(status) : -1;
}
|
||||
|
||||
// Busy-waits on the GPU for roughly `cycles` ticks of the clock64() counter,
// emulating work between ping-pong iterations.
__device__ void spin_cycles(unsigned long long cycles) {
  unsigned long long start = clock64();
  while (clock64() - start < cycles) {
    // spin
  }
}
|
||||
|
||||
// Ping-pong responder: the single global thread (id 0) waits for the peer's
// signal, spins to emulate work, then signals back. The relaxed variants
// synchronize execution flow only — no memory ordering is implied.
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
  if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
    for (int i = 0; i < iter; ++i) {
      devHandle->relaxedWait();
      // spin for a few ms
      spin_cycles(1e7);
      devHandle->relaxedSignal();
    }
  }
}
|
||||
|
||||
// Ping-pong initiator: the single global thread (id 0) signals first and then
// waits for the responder's reply, repeating `iter` times.
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
  if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
    for (int i = 0; i < iter; ++i) {
      devHandle->relaxedSignal();
      devHandle->relaxedWait();
    }
  }
}
|
||||
|
||||
// Per-process worker for rank/GPU `gpuId`: rendezvous over TCP loopback,
// connect to the peer GPU via CUDA IPC, build a semaphore-only memory
// channel, and run the signal/wait ping-pong kernels. GPU 1 additionally
// times the run and warns if it finishes suspiciously fast.
void worker(int gpuId) {
  // Optional: check if we have at least two GPUs
  int deviceCount;
  MSCCLPP_CUDATHROW(cudaGetDeviceCount(&deviceCount));
  if (deviceCount < 2) {
    log("Error: At least two GPUs are required.");
    std::exit(1);
  }

  // Optional: check if the two GPUs can peer-to-peer access each other
  int canAccessPeer;
  MSCCLPP_CUDATHROW(cudaDeviceCanAccessPeer(&canAccessPeer, 0, 1));
  if (!canAccessPeer) {
    log("Error: GPU 0 cannot access GPU 1. Make sure that the GPUs are connected peer-to-peer. You can check this "
        "by running `nvidia-smi topo -m` (the connection between GPU 0 and 1 should be either NV# or PIX).");
    std::exit(1);
  }

  MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
  const int myRank = gpuId;  // rank == GPU index in this example
  const int remoteRank = myRank == 0 ? 1 : 0;
  const int nRanks = 2;
  const int iter = 100;  // number of ping-pong round trips
  const mscclpp::Transport transport = mscclpp::Transport::CudaIpc;

  log("GPU ", gpuId, ": Initializing a bootstrap ...");

  // Rendezvous of the two processes over the loopback interface.
  auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks);
  bootstrap->initialize("lo:127.0.0.1:" PORT_NUMBER);
  mscclpp::Communicator comm(bootstrap);

  log("GPU ", gpuId, ": Creating a connection ...");

  auto connFuture = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank);
  auto conn = connFuture.get();

  log("GPU ", gpuId, ": Creating a semaphore ...");

  auto semaFuture = comm.buildSemaphore(conn, remoteRank);
  auto sema = semaFuture.get();

  log("GPU ", gpuId, ": Creating a channel ...");

  // A BaseMemoryChannel carries only signal/wait (no data buffers). Its
  // device handle is copied to GPU memory so the kernels can use it.
  mscclpp::BaseMemoryChannel memChan(sema);
  auto memChanHandle = memChan.deviceHandle();
  void *devHandle;
  MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle)));
  MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice));

  log("GPU ", gpuId, ": Launching a GPU kernel ...");

  if (gpuId == 0) {
    // Responder side: just run the kernel to completion.
    gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter);
    MSCCLPP_CUDATHROW(cudaGetLastError());
    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
  } else {
    // Initiator side: time the whole ping-pong with CUDA events.
    cudaEvent_t start, end;
    MSCCLPP_CUDATHROW(cudaEventCreate(&start));
    MSCCLPP_CUDATHROW(cudaEventCreate(&end));
    MSCCLPP_CUDATHROW(cudaEventRecord(start));
    gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter);
    MSCCLPP_CUDATHROW(cudaGetLastError());
    MSCCLPP_CUDATHROW(cudaEventRecord(end));
    MSCCLPP_CUDATHROW(cudaEventSynchronize(end));

    float elapsedMs;
    MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedMs, start, end));

    // Each round trip includes GPU 0's multi-ms spin, so anything below
    // ~1 ms/iter suggests the signal/wait pair did not synchronize.
    float msPerIter = elapsedMs / iter;
    log("Elapsed ", msPerIter, " ms per iteration (", iter, ")");
    if (msPerIter < 1.0f) {
      // NOTE(review): this only logs a warning; the process still exits 0,
      // so main() reports success — confirm whether a nonzero exit is intended.
      log("Failed: the elapsed time per iteration is less than 1 ms, which may indicate that the relaxedSignal "
          "and relaxedWait are not working as expected.");
    }
  }

  // Keep both processes alive until both kernels are done.
  bootstrap->barrier();
}
|
||||
|
||||
int main() {
|
||||
int pid0 = spawn_process([]() { worker(0); });
|
||||
int pid1 = spawn_process([]() { worker(1); });
|
||||
if (pid0 < 0 || pid1 < 0) {
|
||||
log("Failed to spawn processes.");
|
||||
return -1;
|
||||
}
|
||||
int status0 = wait_process(pid0);
|
||||
int status1 = wait_process(pid1);
|
||||
if (status0 < 0 || status1 < 0) {
|
||||
log("Failed to wait for processes.");
|
||||
return -1;
|
||||
}
|
||||
if (status0 != 0 || status1 != 0) {
|
||||
log("One of the processes failed.");
|
||||
return -1;
|
||||
}
|
||||
log("Succeed!");
|
||||
return 0;
|
||||
}
|
||||
1
examples/tutorials/03-memory-channel/.gitignore
vendored
Normal file
1
examples/tutorials/03-memory-channel/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
bidir_memory_channel
|
||||
22
examples/tutorials/03-memory-channel/Makefile
Normal file
22
examples/tutorials/03-memory-channel/Makefile
Normal file
@@ -0,0 +1,22 @@
|
||||
# Default toolkit locations; override from the environment,
# e.g. `make CUDA_HOME=/usr/local/cuda-12`.
CUDA_HOME ?= /usr/local/cuda
ROCM_HOME ?= /opt/rocm

# Check if nvcc exists, otherwise use hipcc
ifeq ($(shell which $(CUDA_HOME)/bin/nvcc 2>/dev/null),)
COMPILER := $(ROCM_HOME)/bin/hipcc
ARCH_FLAG := -D__HIP_PLATFORM_AMD__=1
else
COMPILER := $(CUDA_HOME)/bin/nvcc
ARCH_FLAG := -arch=native
endif

TARGET = bidir_memory_channel
SRC = bidir_memory_channel.cu

all: $(TARGET)

# Links against the installed MSCCL++ runtime library.
$(TARGET): $(SRC)
	$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp

clean:
	rm -f $(TARGET)
|
||||
238
examples/tutorials/03-memory-channel/bidir_memory_channel.cu
Normal file
238
examples/tutorials/03-memory-channel/bidir_memory_channel.cu
Normal file
@@ -0,0 +1,238 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <mscclpp/concurrency_device.hpp>
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
#include <mscclpp/memory_channel.hpp>
|
||||
#include <mscclpp/memory_channel_device.hpp>
|
||||
#include <sstream>
|
||||
|
||||
#define PORT_NUMBER "50505"
|
||||
|
||||
/// Print all arguments to stdout as one line. The line is assembled in a
/// local buffer first and emitted with a single insertion, so output from
/// the two worker processes is less likely to interleave mid-line.
template <typename... Args>
void log(Args &&...args) {
  std::ostringstream line;
  (line << ... << args);
  line << std::endl;
  std::cout << line.str();
}
|
||||
|
||||
/// Fork a child process that runs `func` and then terminates with status 0.
/// Returns the child's pid, or -1 if fork() failed.
int spawn_process(std::function<void()> func) {
  const pid_t child = fork();
  if (child == 0) {
    // Child: run the payload, then exit immediately.
    func();
    exit(0);
  }
  // Parent (child > 0) or fork failure (child < 0).
  return (child < 0) ? -1 : child;
}
|
||||
|
||||
/// Block until process `pid` terminates. Returns its exit code on normal
/// termination, or -1 if waitpid() failed or the child died abnormally
/// (e.g. killed by a signal).
int wait_process(int pid) {
  int status = 0;
  const bool exited_normally = (waitpid(pid, &status, 0) >= 0) && WIFEXITED(status);
  return exited_normally ? WEXITSTATUS(status) : -1;
}
|
||||
|
||||
// Grid-wide synchronizer shared by the benchmark kernels below.
__device__ mscclpp::DeviceSyncer devSyncer;

// Both ranks concurrently put `copyBytes` from their own half of the local
// buffer into the same offset of the remote buffer, then signal/wait so the
// transfer is complete on both sides before the kernel returns. The leading
// relaxedSignal/relaxedWait pair aligns the two ranks before the copy.
__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid == 0) {
    devHandle->relaxedSignal();
    devHandle->relaxedWait();
  }
  devSyncer.sync(gridDim.x);

  // Rank r copies [r*copyBytes, (r+1)*copyBytes) so the two directions
  // never touch the same region.
  const uint64_t srcOffset = myRank * copyBytes;
  const uint64_t dstOffset = srcOffset;
  devHandle->put(dstOffset, srcOffset, copyBytes, /*threadId*/ tid, /*numThreads*/ blockDim.x * gridDim.x);
  devSyncer.sync(gridDim.x);
  if (tid == 0) {
    devHandle->signal();
    devHandle->wait();
  }
}
|
||||
|
||||
// Both ranks concurrently read (get) the peer's half of the remote buffer
// into the same offset of the local buffer. Only a leading alignment
// handshake is issued; there is no trailing signal/wait after the get —
// presumably safe because the source region is not modified, but confirm
// against the MemoryChannel semantics.
__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid == 0) {
    devHandle->relaxedSignal();
    devHandle->relaxedWait();
  }
  devSyncer.sync(gridDim.x);

  // Read the peer's half: rank r fetches [(r^1)*copyBytes, ...).
  const int remoteRank = myRank ^ 1;
  const uint64_t srcOffset = remoteRank * copyBytes;
  const uint64_t dstOffset = srcOffset;
  devHandle->get(srcOffset, dstOffset, copyBytes, /*threadId*/ tid, /*numThreads*/ blockDim.x * gridDim.x);
}
|
||||
|
||||
// Packet-based variant: each rank writes its half of the buffer as packets
// tagged with `flag`, then unpacks the packets the peer delivered into the
// local buffer. The flag changes every launch so fresh packets can be told
// apart from stale ones; no signal/wait is needed after the copy because
// packet arrival itself is the synchronization.
__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank,
                                     uint32_t flag) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid == 0) {
    devHandle->relaxedSignal();
    devHandle->relaxedWait();
  }
  devSyncer.sync(gridDim.x);

  const uint64_t srcOffset = myRank * copyBytes;
  const uint64_t dstOffset = srcOffset;
  const uint64_t pktBufOffset = 0;
  devHandle->putPackets(pktBufOffset, srcOffset, copyBytes, tid, blockDim.x * gridDim.x, flag);
  devHandle->unpackPackets(pktBufOffset, dstOffset, copyBytes, tid, blockDim.x * gridDim.x, flag);
}
|
||||
|
||||
// Per-process benchmark worker for rank/GPU `gpuId`: sets up a TCP bootstrap,
// a CUDA-IPC connection and semaphore, registers a data buffer and a packet
// buffer on both sides, then benchmarks three bidirectional memory-channel
// kernels (put, get, put-packets) over several message sizes using CUDA
// graphs. Both ranks run every kernel; only GPU 0 records timings.
void worker(int gpuId) {
  MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
  const int myRank = gpuId;  // rank == GPU index in this example
  const int remoteRank = myRank == 0 ? 1 : 0;
  const int nRanks = 2;
  const int iter = 1000;  // kernel launches captured into each timed graph
  const mscclpp::Transport transport = mscclpp::Transport::CudaIpc;
  const size_t bufferBytes = 256 * 1024 * 1024;
  // NOTE(review): presumably sized to hold the packet-encoded form of the
  // largest test (128 MiB) — confirm against the packet format's overhead.
  const size_t pktBufferBytes = 256 * 1024 * 1024;

  log("GPU ", gpuId, ": Preparing for tests ...");

  // Build a connection and a semaphore
  auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks);
  bootstrap->initialize("lo:127.0.0.1:" PORT_NUMBER);
  mscclpp::Communicator comm(bootstrap);
  auto conn = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get();
  auto sema = comm.buildSemaphore(conn, remoteRank).get();

  // Allocate + register the data buffer and the packet staging buffer, then
  // exchange both registrations with the peer.
  mscclpp::GpuBuffer buffer(bufferBytes);
  mscclpp::GpuBuffer pktBuffer(pktBufferBytes);
  mscclpp::RegisteredMemory localRegMem = comm.registerMemory(buffer.data(), buffer.bytes(), transport);
  mscclpp::RegisteredMemory localPktRegMem = comm.registerMemory(pktBuffer.data(), pktBuffer.bytes(), transport);

  comm.sendMemory(localRegMem, remoteRank);
  comm.sendMemory(localPktRegMem, remoteRank);
  auto remoteRegMemFuture = comm.recvMemory(remoteRank);
  auto remotePktRegMemFuture = comm.recvMemory(remoteRank);
  mscclpp::RegisteredMemory remoteRegMem = remoteRegMemFuture.get();
  mscclpp::RegisteredMemory remotePktRegMem = remotePktRegMemFuture.get();

  // Plain channel for put/get, plus a packet channel whose destination is the
  // peer's packet buffer and whose local staging area is our packet buffer.
  mscclpp::MemoryChannel memChan(sema, /*dst*/ remoteRegMem, /*src*/ localRegMem);
  mscclpp::MemoryChannel memPktChan(sema, /*dst*/ remotePktRegMem, /*src*/ localRegMem,
                                    /*packetBuffer*/ localPktRegMem.data());

  auto memChanHandle = memChan.deviceHandle();
  auto memPktChanHandle = memPktChan.deviceHandle();

  // Copy both device handles to GPU memory so kernels can dereference them.
  void *devHandle;
  void *devPktHandle;
  MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle)));
  MSCCLPP_CUDATHROW(cudaMalloc(&devPktHandle, sizeof(memPktChanHandle)));
  MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice));
  MSCCLPP_CUDATHROW(cudaMemcpy(devPktHandle, &memPktChanHandle, sizeof(memPktChanHandle), cudaMemcpyHostToDevice));

  cudaStream_t stream;
  MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  // One launcher per benchmark variant; all use a 32-block x 1024-thread grid.
  std::function<void(size_t)> kernels[3];

  kernels[0] = [&](size_t copyBytes) {
    bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle),
                                            copyBytes, myRank);
  };

  kernels[1] = [&](size_t copyBytes) {
    bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle),
                                            copyBytes, myRank);
  };

  kernels[2] = [&](size_t copyBytes) {
    // A fresh flag per launch distinguishes this round's packets from stale ones.
    static uint32_t flag = 1;
    bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devPktHandle),
                                                  copyBytes, myRank, flag++);
  };

  // Only rank 0 measures; rank 1 just keeps in lock-step via the barriers.
  cudaEvent_t start, end;
  if (gpuId == 0) {
    MSCCLPP_CUDATHROW(cudaEventCreate(&start));
    MSCCLPP_CUDATHROW(cudaEventCreate(&end));
  }
  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
  bootstrap->barrier();

  for (int kernelId = 0; kernelId < 3; ++kernelId) {
    const std::string testName = (kernelId == 0) ? "Bidir Put" : (kernelId == 1) ? "Bidir Get" : "Bidir Put Packets";
    for (size_t copyBytes : {1024, 1024 * 1024, 128 * 1024 * 1024}) {
      cudaGraph_t graph;
      cudaGraphExec_t graphExec;

      // Capture `iter` launches into one CUDA graph to amortize launch overhead.
      MSCCLPP_CUDATHROW(cudaGraphCreate(&graph, 0));
      MSCCLPP_CUDATHROW(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));

      for (int i = 0; i < iter; ++i) {
        kernels[kernelId](copyBytes);
      }

      MSCCLPP_CUDATHROW(cudaStreamEndCapture(stream, &graph));
      MSCCLPP_CUDATHROW(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

      // Synchronize before timing
      MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
      bootstrap->barrier();

      if (gpuId == 0) {
        MSCCLPP_CUDATHROW(cudaEventRecord(start, stream));
      }

      MSCCLPP_CUDATHROW(cudaGraphLaunch(graphExec, stream));

      if (gpuId == 0) {
        MSCCLPP_CUDATHROW(cudaEventRecord(end, stream));
        MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
        float elapsedTime;
        float elapsedTimePerIter;
        float gbps;
        MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end));
        elapsedTimePerIter = elapsedTime / iter;
        // bytes and ms -> GB/s: bytes / (ms * 1e6).
        gbps = float(copyBytes) / elapsedTimePerIter * 1e-6f;
        log("GPU ", gpuId, ": [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ",
            gbps, " GB/s");
      }
      MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
      MSCCLPP_CUDATHROW(cudaGraphExecDestroy(graphExec));
      MSCCLPP_CUDATHROW(cudaGraphDestroy(graph));
    }
  }

  // Keep both processes alive until both have finished every test.
  bootstrap->barrier();
}
|
||||
|
||||
int main() {
|
||||
int pid0 = spawn_process([]() { worker(0); });
|
||||
int pid1 = spawn_process([]() { worker(1); });
|
||||
if (pid0 < 0 || pid1 < 0) {
|
||||
log("Failed to spawn processes.");
|
||||
return -1;
|
||||
}
|
||||
int status0 = wait_process(pid0);
|
||||
int status1 = wait_process(pid1);
|
||||
if (status0 < 0 || status1 < 0) {
|
||||
log("Failed to wait for processes.");
|
||||
return -1;
|
||||
}
|
||||
if (status0 != 0 || status1 != 0) {
|
||||
log("One of the processes failed.");
|
||||
return -1;
|
||||
}
|
||||
log("Succeed!");
|
||||
return 0;
|
||||
}
|
||||
1
examples/tutorials/04-port-channel/.gitignore
vendored
Normal file
1
examples/tutorials/04-port-channel/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
bidir_port_channel
|
||||
22
examples/tutorials/04-port-channel/Makefile
Normal file
22
examples/tutorials/04-port-channel/Makefile
Normal file
@@ -0,0 +1,22 @@
|
||||
# Default toolkit locations; override from the environment,
# e.g. `make CUDA_HOME=/usr/local/cuda-12`.
CUDA_HOME ?= /usr/local/cuda
ROCM_HOME ?= /opt/rocm

# Check if nvcc exists, otherwise use hipcc
ifeq ($(shell which $(CUDA_HOME)/bin/nvcc 2>/dev/null),)
COMPILER := $(ROCM_HOME)/bin/hipcc
ARCH_FLAG := -D__HIP_PLATFORM_AMD__=1
else
COMPILER := $(CUDA_HOME)/bin/nvcc
ARCH_FLAG := -arch=native
endif

TARGET = bidir_port_channel
SRC = bidir_port_channel.cu

all: $(TARGET)

# Links against the installed MSCCL++ runtime library.
$(TARGET): $(SRC)
	$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp

clean:
	rm -f $(TARGET)
|
||||
188
examples/tutorials/04-port-channel/bidir_port_channel.cu
Normal file
188
examples/tutorials/04-port-channel/bidir_port_channel.cu
Normal file
@@ -0,0 +1,188 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <mscclpp/concurrency_device.hpp>
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
#include <mscclpp/port_channel.hpp>
|
||||
#include <mscclpp/port_channel_device.hpp>
|
||||
#include <sstream>
|
||||
|
||||
#define PORT_NUMBER "50505"
|
||||
|
||||
/// Print all arguments to stdout as one line. The line is assembled in a
/// local buffer first and emitted with a single insertion, so output from
/// the two worker processes is less likely to interleave mid-line.
template <typename... Args>
void log(Args &&...args) {
  std::ostringstream line;
  (line << ... << args);
  line << std::endl;
  std::cout << line.str();
}
|
||||
|
||||
/// Fork a child process that runs `func` and then terminates with status 0.
/// Returns the child's pid, or -1 if fork() failed.
int spawn_process(std::function<void()> func) {
  const pid_t child = fork();
  if (child == 0) {
    // Child: run the payload, then exit immediately.
    func();
    exit(0);
  }
  // Parent (child > 0) or fork failure (child < 0).
  return (child < 0) ? -1 : child;
}
|
||||
|
||||
/// Block until process `pid` terminates. Returns its exit code on normal
/// termination, or -1 if waitpid() failed or the child died abnormally
/// (e.g. killed by a signal).
int wait_process(int pid) {
  int status = 0;
  const bool exited_normally = (waitpid(pid, &status, 0) >= 0) && WIFEXITED(status);
  return exited_normally ? WEXITSTATUS(status) : -1;
}
|
||||
|
||||
// Single-thread benchmark kernel: align with the peer via signal/wait, then
// issue a proxy putWithSignal of `copyBytes` from this rank's half of the
// buffer and wait for the peer's matching signal before returning.
__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid == 0) {
    devHandle->signal();
    devHandle->wait();

    // Rank r copies [r*copyBytes, (r+1)*copyBytes) so the two directions
    // never touch the same region.
    const uint64_t srcOffset = myRank * copyBytes;
    const uint64_t dstOffset = srcOffset;
    devHandle->putWithSignal(dstOffset, srcOffset, copyBytes);
    devHandle->wait();
  }
}
|
||||
|
||||
// Per-process benchmark worker for rank/GPU `gpuId`: bootstraps over TCP,
// connects via CUDA IPC, registers one data buffer per rank with a
// ProxyService, and benchmarks a bidirectional putWithSignal port channel
// over several message sizes using CUDA graphs. Only GPU 0 records timings.
void worker(int gpuId) {
  MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
  const int myRank = gpuId;  // rank == GPU index in this example
  const int remoteRank = myRank == 0 ? 1 : 0;
  const int nRanks = 2;
  const int iter = 1000;  // kernel launches captured into each timed graph
  const mscclpp::Transport transport = mscclpp::Transport::CudaIpc;
  const size_t bufferBytes = 256 * 1024 * 1024;

  log("GPU ", gpuId, ": Preparing for tests ...");

  // Build a connection and a semaphore
  auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks);
  bootstrap->initialize("lo:127.0.0.1:" PORT_NUMBER);
  mscclpp::Communicator comm(bootstrap);
  auto conn = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get();
  auto sema = comm.buildSemaphore(conn, remoteRank).get();

  // Register the local buffer and exchange registrations with the peer.
  mscclpp::GpuBuffer buffer(bufferBytes);
  mscclpp::RegisteredMemory localRegMem = comm.registerMemory(buffer.data(), buffer.bytes(), transport);

  comm.sendMemory(localRegMem, remoteRank);
  auto remoteRegMemFuture = comm.recvMemory(remoteRank);
  mscclpp::RegisteredMemory remoteRegMem = remoteRegMemFuture.get();

  // Register the semaphore and both memories with the host-side proxy; the
  // proxy performs the data movement that the port channel requests.
  mscclpp::ProxyService proxyService;
  mscclpp::SemaphoreId semaId = proxyService.addSemaphore(sema);
  mscclpp::MemoryId localMemId = proxyService.addMemory(localRegMem);
  mscclpp::MemoryId remoteMemId = proxyService.addMemory(remoteRegMem);
  mscclpp::PortChannel portChan = proxyService.portChannel(semaId, remoteMemId, localMemId);

  auto portChanHandle = portChan.deviceHandle();

  // Copy the device handle to GPU memory so the kernel can use it.
  void *devHandle;
  MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(portChanHandle)));
  MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &portChanHandle, sizeof(portChanHandle), cudaMemcpyHostToDevice));

  cudaStream_t stream;
  MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  std::function<void(size_t)> kernels[1];

  // Single-thread launch: the proxy does the copy, not the GPU threads.
  kernels[0] = [&](size_t copyBytes) {
    bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast<mscclpp::PortChannelDeviceHandle *>(devHandle), copyBytes, myRank);
  };

  // Only rank 0 measures; rank 1 just keeps in lock-step via the barriers.
  cudaEvent_t start, end;
  if (gpuId == 0) {
    MSCCLPP_CUDATHROW(cudaEventCreate(&start));
    MSCCLPP_CUDATHROW(cudaEventCreate(&end));
  }
  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
  bootstrap->barrier();

  for (int kernelId = 0; kernelId < 1; ++kernelId) {
    const std::string testName = "Bidir PutWithSignal";
    for (size_t copyBytes : {1024, 1024 * 1024, 128 * 1024 * 1024}) {
      cudaGraph_t graph;
      cudaGraphExec_t graphExec;

      // The proxy must be running while kernels that talk to it are captured
      // and while the graph executes; it is stopped in between.
      proxyService.startProxy();

      MSCCLPP_CUDATHROW(cudaGraphCreate(&graph, 0));
      MSCCLPP_CUDATHROW(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));

      for (int i = 0; i < iter; ++i) {
        kernels[kernelId](copyBytes);
      }

      MSCCLPP_CUDATHROW(cudaStreamEndCapture(stream, &graph));
      MSCCLPP_CUDATHROW(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

      proxyService.stopProxy();

      // Synchronize before timing
      MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
      proxyService.startProxy();
      bootstrap->barrier();

      if (gpuId == 0) {
        MSCCLPP_CUDATHROW(cudaEventRecord(start, stream));
      }

      MSCCLPP_CUDATHROW(cudaGraphLaunch(graphExec, stream));

      if (gpuId == 0) {
        MSCCLPP_CUDATHROW(cudaEventRecord(end, stream));
        MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
        float elapsedTime;
        float elapsedTimePerIter;
        float gbps;
        MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end));
        elapsedTimePerIter = elapsedTime / iter;
        // bytes and ms -> GB/s: bytes / (ms * 1e6).
        gbps = float(copyBytes) / elapsedTimePerIter * 1e-6f;
        log("GPU ", gpuId, ": [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ",
            gbps, " GB/s");
      }
      MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
      proxyService.stopProxy();

      MSCCLPP_CUDATHROW(cudaGraphExecDestroy(graphExec));
      MSCCLPP_CUDATHROW(cudaGraphDestroy(graph));
    }
  }

  // Keep both processes alive until both have finished every test.
  bootstrap->barrier();
}
|
||||
|
||||
int main() {
|
||||
int pid0 = spawn_process([]() { worker(0); });
|
||||
int pid1 = spawn_process([]() { worker(1); });
|
||||
if (pid0 < 0 || pid1 < 0) {
|
||||
log("Failed to spawn processes.");
|
||||
return -1;
|
||||
}
|
||||
int status0 = wait_process(pid0);
|
||||
int status1 = wait_process(pid1);
|
||||
if (status0 < 0 || status1 < 0) {
|
||||
log("Failed to wait for processes.");
|
||||
return -1;
|
||||
}
|
||||
if (status0 != 0 || status1 != 0) {
|
||||
log("One of the processes failed.");
|
||||
return -1;
|
||||
}
|
||||
log("Succeed!");
|
||||
return 0;
|
||||
}
|
||||
17
examples/tutorials/Makefile
Normal file
17
examples/tutorials/Makefile
Normal file
@@ -0,0 +1,17 @@
|
||||
# Tutorial subdirectories built by this top-level Makefile.
TUTORIAL_DIRS := 01-basic-concepts 02-bootstrap 03-memory-channel 04-port-channel

.PHONY: all clean help $(TUTORIAL_DIRS) clean-%

all: $(TUTORIAL_DIRS)
	@echo "All tutorials built successfully!"

# Each directory name is itself a target that delegates to its own Makefile.
$(TUTORIAL_DIRS):
	@echo "Building tutorial: $@"
	@$(MAKE) -C $@ all

clean: $(addprefix clean-, $(TUTORIAL_DIRS))
	@echo "All tutorials cleaned!"

# clean-<dir>: delegate "clean" into a single tutorial directory.
clean-%:
	@echo "Cleaning tutorial: $*"
	@$(MAKE) -C $* clean
|
||||
@@ -79,6 +79,7 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
|
||||
#define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__)
|
||||
#define cudaStreamEndCapture(...) hipStreamEndCapture(__VA_ARGS__)
|
||||
#define cudaStreamDestroy(...) hipStreamDestroy(__VA_ARGS__)
|
||||
#define cudaGraphCreate(...) hipGraphCreate(__VA_ARGS__)
|
||||
#define cudaGraphInstantiate(...) hipGraphInstantiate(__VA_ARGS__)
|
||||
#define cudaGraphLaunch(...) hipGraphLaunch(__VA_ARGS__)
|
||||
#define cudaGraphDestroy(...) hipGraphDestroy(__VA_ARGS__)
|
||||
|
||||
Reference in New Issue
Block a user