From 41bf96abc2d1640f4d7c4a704081acc7362672c4 Mon Sep 17 00:00:00 2001 From: Qinghua Zhou Date: Tue, 3 Feb 2026 05:16:11 +0800 Subject: [PATCH 01/52] Fix the relative path extraction on github page (#739) Fix missing 'mscclpp' base directory during version switching on GitHub Pages. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Binyang Li --- docs/_static/version-selector.js | 62 ++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/docs/_static/version-selector.js b/docs/_static/version-selector.js index 0efc47fe..7622aefd 100644 --- a/docs/_static/version-selector.js +++ b/docs/_static/version-selector.js @@ -26,27 +26,53 @@ * @returns {string} The base path (e.g., '/mscclpp' or '') */ function detectBasePath() { - const path = window.location.pathname; - // Match pattern: /base-path/vX.Y.Z/... or /base-path/main/... - // The base path is everything before the version or main directory - const match = path.match(/^(\/[^\/]+)?(?=\/(v\d+\.\d+\.\d+|main)\/)/); - if (match && match[1]) { - return match[1]; - } - // Check if we're at a root that's actually a project site - // Look for common indicators like the repository name in the path - const projectMatch = path.match(/^(\/[^\/]+)(?=\/)/); - if (projectMatch) { - // Verify this isn't a version path at root - const potentialBase = projectMatch[1]; - if (!potentialBase.match(/^\/v\d+\.\d+\.\d+$/) && potentialBase !== '/main') { - // Check if the remaining path contains version info - const remainingPath = path.substring(potentialBase.length); - if (remainingPath.match(/^\/(v\d+\.\d+\.\d+|main)\//)) { - return potentialBase; + // Most reliable method: detect from this script's own URL + // The script is always at {base}/_static/version-selector.js or {base}/vX.Y.Z/_static/version-selector.js + const scripts = document.getElementsByTagName('script'); + for (let i = 0; i < scripts.length; i++) { + const src = scripts[i].src; + if (src && (src.includes('/_static/version-selector.js') || src.endsWith('version-selector.js'))) { + try { + const url = new URL(src); + const scriptPath = url.pathname; + // Extract base path: everything before /_static/version-selector.js + // But also strip version directories like /v0.8.0/ or /main/ + const match = scriptPath.match(/^(.*?)\/_static\/version-selector\.js$/); + if (match) { + let basePath = match[1] || ''; + // Remove version suffix if present (e.g., /mscclpp/v0.8.0 -> /mscclpp) + basePath = basePath.replace(/\/(v\d+\.\d+\.\d+|main)$/, ''); + return basePath; + } + } catch (e) { + // URL parsing failed, continue to fallback + // Log a warning to aid debugging when the primary detection method fails. + if (typeof console !== 'undefined' && typeof console.warn === 'function') { + console.warn('version-selector: Failed to parse script URL for base path detection; falling back to location-based detection.', src, e); + } } } } + + // Fallback: try to detect from URL path + const path = window.location.pathname; + const segments = path.split('/').filter(s => s.length > 0); + + if (segments.length >= 1) { + const firstSegment = segments[0]; + // If first segment is not a version tag (vX.Y.Z), not 'main', and + // does not look like a file name (no '.' in the segment), then it's + // the GitHub Pages project base path (e.g., 'mscclpp'). 
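For reference, the two-stage detection in this hunk can be cross-checked with the following Python sketch (an illustrative re-expression only, not part of the patch; the function name is ours, and the sample inputs come from the mappings listed in the surrounding comments):

```python
import re

def detect_base_path(script_path: str, location_path: str) -> str:
    # Primary: everything before /_static/version-selector.js, minus a
    # trailing version directory such as /v0.8.0 or /main.
    m = re.match(r"^(.*?)/_static/version-selector\.js$", script_path)
    if m:
        return re.sub(r"/(v\d+\.\d+\.\d+|main)$", "", m.group(1) or "")
    # Fallback: first segment of the page path, unless it is a version tag,
    # 'main', or looks like a file name (contains a '.').
    segments = [s for s in location_path.split("/") if s]
    if segments:
        first = segments[0]
        if not re.match(r"^v\d+\.\d+\.\d+$", first) and first != "main" and "." not in first:
            return "/" + first
    return ""

assert detect_base_path("/mscclpp/v0.8.0/_static/version-selector.js", "") == "/mscclpp"
assert detect_base_path("", "/mscclpp/index.html") == "/mscclpp"
assert detect_base_path("", "/index.html") == ""
```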
+ // This handles both: + // /mscclpp/v0.8.0/index.html -> base is /mscclpp + // /mscclpp/index.html -> base is /mscclpp + // while avoiding treating root files like /index.html as a base path. + if (!firstSegment.match(/^v\d+\.\d+\.\d+$/) && firstSegment !== 'main' && !firstSegment.includes('.')) { + return '/' + firstSegment; + } + } + + // No base path (root site or local development) return ''; } From 03b1936ddb5d56275b6257164a2c22a40b399c0a Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 3 Feb 2026 08:50:45 +0900 Subject: [PATCH 02/52] Support multi-node in `MemoryChannel` tutorial (#726) Co-authored-by: mahdiehghazim Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/tutorials/03-memory-channel.md | 33 +++++++- .../03-memory-channel/bidir_memory_channel.cu | 75 ++++++++++++------- 2 files changed, 81 insertions(+), 27 deletions(-) diff --git a/docs/tutorials/03-memory-channel.md b/docs/tutorials/03-memory-channel.md index 00e2192b..c6a8b9e1 100644 --- a/docs/tutorials/03-memory-channel.md +++ b/docs/tutorials/03-memory-channel.md @@ -78,7 +78,7 @@ mscclpp::GpuBuffer buffer(bufferBytes); mscclpp::RegisteredMemory localRegMem = comm.registerMemory(buffer.data(), buffer.bytes(), transport); ``` -Here, we first allocate GPU device memory using `mscclpp::GpuBuffer` and then register its memory region with the `registerMemory()` method of the `Communicator`. If you are using the `Context` interface as shown in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can use `context.registerMemory()` instead. The `transport` parameter specifies the transport types that this memory region can be accessed with. In this example, we use only `mscclpp::Transport::CudaIpc`, which allows the memory to be accessed by other processes using CUDA/HIP IPC. The `CudaIpc` transport type is typically used for intra-node communication, but with certain hardware configurations, it can also be used for inter-node communication (such as [NVL72](https://www.nvidia.com/en-us/data-center/gb300-nvl72) on NVIDIA Grace Blackwell platforms). We will introduce other transport types in later tutorials. +Here, we first allocate GPU device memory using `mscclpp::GpuBuffer` and then register its memory region with the `registerMemory()` method of the `Communicator`. If you are using the `Context` interface as shown in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can use `context.registerMemory()` instead. The `transport` parameter specifies the transport types that this memory region can be accessed with. In this example, we use only `mscclpp::Transport::CudaIpc`, which allows the memory to be accessed by other processes using CUDA/HIP IPC. The `CudaIpc` transport type is typically used for intra-node communication, but with certain hardware configurations, it can also be used for inter-node communication (will be explained in a later section: {ref}`mc-cross-node`). We will introduce other transport types in later tutorials. **GpuBuffer** is NOT required for creating a `RegisteredMemory`; you can register any pre-allocated GPU memory region with `registerMemory()`. However, it is the user's responsibility to ensure that the memory region is suitable for their communication operations. Depending on the hardware platform, some communication methods may require specific memory allocation to ensure data consistency and correctness. `GpuBuffer` is a convenient way to allocate GPU memory that is compatible with the communication methods that MSCCL++ supports. 
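For readers following this tutorial from Python rather than C++, the same allocate-and-register step looks roughly like the sketch below, using the Python bindings that appear later in this patch series (`communicator` is an assumed, already-constructed `mscclpp.Communicator`; the buffer shape is illustrative):

```python
import numpy as np
from mscclpp import GpuBuffer, Transport, TransportFlags

# Allocate device memory compatible with MSCCL++'s transports, then
# register it for CUDA/HIP IPC access, mirroring the C++ snippet above.
buffer = GpuBuffer((1 << 20,), dtype=np.float32)
local_reg_mem = communicator.register_memory(
    buffer.data.ptr, buffer.nbytes, TransportFlags(Transport.CudaIpc)
)
```

Returning to `GpuBuffer` itself: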
It provides a simple interface for allocating GPU memory and automatically handles memory deallocation when it goes out of scope. @@ -251,6 +251,37 @@ columns 2 Since the flags take 50% of the packet size, the goodput of communication using packets is only 50% compared to transferring raw data. However, this doesn't matter because packets are designed for small data transfers. Packets transfer small data efficiently because the integrity of the user data is guaranteed by only waiting for the correct flags (done by `unpackPackets()`); explicit memory synchronization (signal and wait) is not needed. +(mc-cross-node)= +## Cross-node Execution + +For **inter-node** communication, using `PortChannel` (will be explained in the following tutorial) is usually a more accessible option that leverages more widely-used networking interfaces. However, `MemoryChannel` can still be used as long as the underlying hardware allows memory mapping between the two GPUs, such as [Multi-Node NVLink (MNNVL)](https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/overview.html) on NVIDIA Grace Blackwell platforms. + +We can use the same example code to test inter-node `MemoryChannel`. Users can consult the [NVIDIA MNNVL verification guide](https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/verifying.html) for verification steps and detailed environment requirements for MNNVL. + +Run the program on two nodes with command line arguments: + +``` +./bidir_memory_channel [ ] +``` + +For example, assume we use `192.168.0.1:50000` as the bootstrap IP address and port, and both nodes use GPU 0 locally. + +On Node 0 (Rank 0): +```bash +$ ./bidir_memory_channel 192.168.0.1:50000 0 0 +``` + +On Node 1 (Rank 1): +```bash +$ ./bidir_memory_channel 192.168.0.1:50000 1 0 +``` + +You should see output indicating successful data transfer. + +```{tip} +If your bootstrap IP address is not on the default network interface of your node, you can specify the network interface by passing `interface_name:ip:port` as the first argument (such as `eth1:192.168.0.1:50000`). +``` + ## Summary and Next Steps In this tutorial, you have learned how to use `MemoryChannel` for efficient data transfer between GPUs. You have also learned how to create communication buffers using `RegisteredMemory` and `GpuBuffer`, and how to use packets for small data transfers. You can find more complex usage of `MemoryChannel` in the {ref}`mscclpp-test`. diff --git a/examples/tutorials/03-memory-channel/bidir_memory_channel.cu b/examples/tutorials/03-memory-channel/bidir_memory_channel.cu index e9007612..cfbf12d7 100644 --- a/examples/tutorials/03-memory-channel/bidir_memory_channel.cu +++ b/examples/tutorials/03-memory-channel/bidir_memory_channel.cu @@ -95,9 +95,8 @@ __global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHand devHandle->unpackPackets(pktBufOffset, dstOffset, copyBytes, tid, blockDim.x * gridDim.x, flag); } -void worker(int gpuId) { +void worker(int myRank, int gpuId, const std::string &ipPort) { MSCCLPP_CUDATHROW(cudaSetDevice(gpuId)); - const int myRank = gpuId; const int remoteRank = myRank == 0 ? 
1 : 0; const int nRanks = 2; const int iter = 1000; @@ -105,11 +104,11 @@ void worker(int gpuId) { const size_t bufferBytes = 256 * 1024 * 1024; const size_t pktBufferBytes = 256 * 1024 * 1024; - log("GPU ", gpuId, ": Preparing for tests ..."); + log("Rank ", myRank, " (GPU ", gpuId, "): Preparing for tests ..."); // Build a connection and a semaphore auto bootstrap = std::make_shared(myRank, nRanks); - bootstrap->initialize("lo:127.0.0.1:" PORT_NUMBER); + bootstrap->initialize(ipPort); mscclpp::Communicator comm(bootstrap); auto conn = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get(); auto sema = comm.buildSemaphore(conn, remoteRank).get(); @@ -162,7 +161,7 @@ void worker(int gpuId) { }; cudaEvent_t start, end; - if (gpuId == 0) { + if (myRank == 0) { MSCCLPP_CUDATHROW(cudaEventCreate(&start)); MSCCLPP_CUDATHROW(cudaEventCreate(&end)); } @@ -189,13 +188,13 @@ void worker(int gpuId) { MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); bootstrap->barrier(); - if (gpuId == 0) { + if (myRank == 0) { MSCCLPP_CUDATHROW(cudaEventRecord(start, stream)); } MSCCLPP_CUDATHROW(cudaGraphLaunch(graphExec, stream)); - if (gpuId == 0) { + if (myRank == 0) { MSCCLPP_CUDATHROW(cudaEventRecord(end, stream)); MSCCLPP_CUDATHROW(cudaEventSynchronize(end)); float elapsedTime; @@ -204,8 +203,8 @@ void worker(int gpuId) { MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end)); elapsedTimePerIter = elapsedTime / iter; gbps = float(copyBytes) / elapsedTimePerIter * 1e-6f; - log("GPU ", gpuId, ": [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ", - gbps, " GB/s"); + log("Rank ", myRank, " (GPU ", gpuId, "): [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter, + " ms/iter, BW ", gbps, " GB/s"); } MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); MSCCLPP_CUDATHROW(cudaGraphExecDestroy(graphExec)); @@ -216,23 +215,47 @@ void worker(int gpuId) { bootstrap->barrier(); } -int main() { - int pid0 = spawn_process([]() { worker(0); }); - int pid1 = spawn_process([]() { worker(1); }); - if (pid0 < 0 || pid1 < 0) { - log("Failed to spawn processes."); +int main(int argc, char **argv) { + if (argc == 1) { + int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); }); + int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); }); + if (pid0 < 0 || pid1 < 0) { + log("Failed to spawn processes."); + return -1; + } + int status0 = wait_process(pid0); + int status1 = wait_process(pid1); + if (status0 < 0 || status1 < 0) { + log("Failed to wait for processes."); + return -1; + } + if (status0 != 0 || status1 != 0) { + log("One of the processes failed."); + return -1; + } + log("Succeed!"); + return 0; + } else if (argc == 4) { + std::string ipPort = argv[1]; + int rank, gpuId; + try { + rank = std::stoi(argv[2]); + gpuId = std::stoi(argv[3]); + } catch (const std::exception &) { + log("Error: rank and gpu_id must be valid integers."); + return -1; + } + if (rank < 0 || rank > 2 || gpuId < 0) { + log("Error: rank must be between 0 and 1 and gpu_id must be non-negative."); + return -1; + } + worker(rank, gpuId, ipPort); + log("Rank ", rank, ": Succeed!"); + return 0; + } else { + std::cerr << "Usage:\n" + << " " << argv[0] << " Run in intra-node mode\n" + << " " << argv[0] << " Run in inter-node mode\n"; return -1; } - int status0 = wait_process(pid0); - int status1 = wait_process(pid1); - if (status0 < 0 || status1 < 0) { - log("Failed to wait for processes."); - return -1; - } - if (status0 != 0 || status1 != 
0) { - log("One of the processes failed."); - return -1; - } - log("Succeed!"); - return 0; } From e21513791a79f62768a9f8f9b8517ebf803d2eed Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 3 Feb 2026 10:13:20 -0800 Subject: [PATCH 03/52] Address comments for PR #692 (#733) Rename nanobind-exposed C++ types to Cpp* Replace MSCCLPP_EXECUTION_PLAN_DIR / MSCCLPP_NATIVE_CACHE_DIR with MSCCLPP_CACHE_DIR across C++ and Python. --- docs/conf.py | 18 ++- docs/dsl/results.md | 2 +- docs/py_api.rst | 4 +- docs/quickstart.md | 2 +- include/mscclpp/env.hpp | 6 +- python/csrc/algorithm.cpp | 20 ++-- python/csrc/core_py.cpp | 36 +++--- python/csrc/env_py.cpp | 4 +- python/csrc/error_py.cpp | 2 +- python/csrc/executor_py.cpp | 6 +- .../ext/algorithm_collection_builder_py.cpp | 2 +- python/csrc/fifo_py.cpp | 6 +- python/csrc/gpu_utils_py.cpp | 2 +- python/csrc/memory_channel_py.cpp | 8 +- python/csrc/npkit_py.cpp | 2 +- python/csrc/numa_py.cpp | 2 +- python/csrc/port_channel_py.cpp | 12 +- python/csrc/semaphore_py.cpp | 6 +- python/csrc/switch_channel_py.cpp | 6 +- python/mscclpp/__init__.py | 52 ++++----- python/mscclpp/__main__.py | 4 +- python/mscclpp/_core/__init__.py | 6 - python/mscclpp/_core/algorithm.py | 52 +++++---- python/mscclpp/_core/buffer.py | 4 +- python/mscclpp/_core/comm.py | 110 +++++++++--------- python/mscclpp/_core/compiler.py | 19 ++- python/mscclpp/ext/__init__.py | 2 - .../ext/algorithm_collection_builder.py | 8 +- python/mscclpp/utils.py | 2 +- src/core/env.cpp | 5 +- .../algorithm_collection_builder.cc | 6 +- 31 files changed, 211 insertions(+), 205 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index fdfb8d66..52321465 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,6 +11,18 @@ import sys import importlib.util from pathlib import Path +from unittest.mock import MagicMock + + +class NamedMock(MagicMock): + def __getattr__(self, name): + attr = super().__getattr__(name) + if isinstance(attr, MagicMock): + # Assigns __name__ and __qualname__ to satisfy Sphinx autodoc inspection. + attr.__name__ = name + attr.__qualname__ = name + return attr + # Add the python package to sys.path so Sphinx can find it project_root = Path(__file__).parent.parent @@ -63,7 +75,7 @@ autodoc_default_options = { "show-inheritance": True, } # only mock the C-extension when using the source tree -autodoc_mock_imports = ["mscclpp._version", "mscclpp._mscclpp", "blake3", "cupy", "mpi4py", "numpy", "sortedcontainers"] +autodoc_mock_imports = ["mscclpp._version", "blake3", "cupy", "mpi4py", "numpy", "sortedcontainers"] autodoc_typehints = "description" napoleon_google_docstring = True napoleon_numpy_docstring = True @@ -71,6 +83,10 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "numpy": ("https://numpy.org/doc/stable/", None), } +mock_mscclpp = NamedMock() +# Set attributes to satisfy Sphinx autodoc inspection. 
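+# Downstream code (e.g. mscclpp._core.compiler) builds filesystem paths from
+# env().cache_dir, so the mock supplies a plain string rather than a MagicMock.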
+mock_mscclpp.env.return_value.cache_dir = "_mscclpp" +sys.modules["mscclpp._mscclpp"] = mock_mscclpp templates_path = ["_templates"] exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] diff --git a/docs/dsl/results.md b/docs/dsl/results.md index a34eae5b..99f19476 100644 --- a/docs/dsl/results.md +++ b/docs/dsl/results.md @@ -56,7 +56,7 @@ python3 -m mscclpp --install After installation, the generated JSON execution plan can be found at: ``` -~/.cache/mscclpp_default/ +~/.cache/mscclpp/default/ ``` **Performance Results:** diff --git a/docs/py_api.rst b/docs/py_api.rst index 5ea39bc3..7acc9273 100644 --- a/docs/py_api.rst +++ b/docs/py_api.rst @@ -7,6 +7,4 @@ This reference organizes the MSCCL++ Python API. :toctree: py_api :recursive: - mscclpp.comm - mscclpp.utils - mscclpp.language + mscclpp diff --git a/docs/quickstart.md b/docs/quickstart.md index 04a26466..ac1b7d6b 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -196,7 +196,7 @@ mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib Example 2, ReduceScatter will still use msccl++ implementation since reducescatter is not in the fallbacklist. ```bash export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; -mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/$PATH_TO_EXECUTION_PLANS/execution-files ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 +mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` On AMD platforms, you need to add `RCCL_MSCCL_ENABLE=0` to avoid conflicts with the fallback features. diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index 5972234b..9d78cd1a 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -70,9 +70,9 @@ class Env { /// Env name: `MSCCLPP_COMM_ID`. To be deprecated; don't use this. const std::string commId; - /// Env name: `MSCCLPP_EXECUTION_PLAN_DIR`. The directory to find execution plans from. This should be set to - /// use execution plans for the NCCL API. Unset by default. - const std::string executionPlanDir; + /// Env name: `MSCCLPP_CACHE_DIR`. The directory to use for caching execution plans and other temporary files. + /// If unset, it defaults to `~/.cache/mscclpp`. + const std::string cacheDir; /// Env name: `MSCCLPP_NPKIT_DUMP_DIR`. The directory to dump NPKIT traces to. If this is set, NPKIT will be /// enabled and will dump traces to this directory. Unset by default. 
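Taken together with the `env.hpp` comment above, the new cache layout resolves with a single rule. The sketch below restates it in Python for reference (the helper name is ours; the default matches `src/core/env.cpp` further down in this patch):

```python
import os
from pathlib import Path

def mscclpp_cache_dir() -> Path:
    """MSCCLPP_CACHE_DIR wins; otherwise fall back to $HOME/.cache/mscclpp."""
    default = os.path.join(os.environ.get("HOME", "~"), ".cache", "mscclpp")
    return Path(os.environ.get("MSCCLPP_CACHE_DIR", default))

# Derived locations referenced elsewhere in this patch:
#   default DSL plans       -> mscclpp_cache_dir() / "default"
#   native build artifacts  -> mscclpp_cache_dir() / "native"
```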
diff --git a/python/csrc/algorithm.cpp b/python/csrc/algorithm.cpp index 28edfe2d..3553256a 100644 --- a/python/csrc/algorithm.cpp +++ b/python/csrc/algorithm.cpp @@ -16,14 +16,16 @@ namespace nb = nanobind; using namespace mscclpp; void register_algorithm(nb::module_& m) { - nb::enum_(m, "CollectiveBufferMode") + nb::enum_(m, "CppCollectiveBufferMode") .value("ANY", CollectiveBufferMode::Any) .value("IN_PLACE", CollectiveBufferMode::InPlace) .value("OUT_OF_PLACE", CollectiveBufferMode::OutOfPlace); - nb::enum_(m, "AlgorithmType").value("NATIVE", AlgorithmType::Native).value("DSL", AlgorithmType::DSL); + nb::enum_(m, "CppAlgorithmType") + .value("NATIVE", AlgorithmType::Native) + .value("DSL", AlgorithmType::DSL); - nb::enum_(m, "CommResult") + nb::enum_(m, "CppCommResult") .value("COMM_SUCCESS", CommResult::CommSuccess) .value("COMM_UNHANDLED_CUDA_ERROR", CommResult::CommUnhandledCudaError) .value("COMM_SYSTEM_ERROR", CommResult::CommSystemError) @@ -34,13 +36,13 @@ void register_algorithm(nb::module_& m) { .value("COMM_IN_PROGRESS", CommResult::CommInProgress) .value("COMM_NUM_RESULTS", CommResult::CommNumResults); - nb::enum_(m, "ReduceOp") + nb::enum_(m, "CppReduceOp") .value("SUM", ReduceOp::SUM) .value("MIN", ReduceOp::MIN) .value("NOP", ReduceOp::NOP); auto algorithmClass = - nb::class_(m, "Algorithm") + nb::class_(m, "CppAlgorithm") .def_static( "from_native_capsule", [](nb::capsule cap) { @@ -83,21 +85,21 @@ void register_algorithm(nb::module_& m) { .def_rw("world_size", &Algorithm::Constraint::worldSize) .def_rw("n_ranks_per_node", &Algorithm::Constraint::nRanksPerNode); - nb::class_(m, "AlgorithmBuilder").def("build", &AlgorithmBuilder::build); + nb::class_(m, "CppAlgorithmBuilder").def("build", &AlgorithmBuilder::build); - nb::class_(m, "DslAlgorithm") + nb::class_(m, "CppDslAlgorithm") .def(nb::init, Algorithm::Constraint>(), nb::arg("id"), nb::arg("plan"), nb::arg("tags") = std::unordered_map(), nb::arg("constraint") = Algorithm::Constraint()) .def("build", &DslAlgorithm::build); - nb::class_(m, "AlgorithmCollection") + nb::class_(m, "CppAlgorithmCollection") .def("register_algorithm", &AlgorithmCollection::registerAlgorithm, nb::arg("collective"), nb::arg("algo_name"), nb::arg("algorithm")) .def("get_algorithms_by_collective", &AlgorithmCollection::getAlgorithmsByCollective, nb::arg("collective")) .def("to_list", &AlgorithmCollection::getAllAlgorithms); - nb::class_(m, "CollectiveRequest") + nb::class_(m, "CppCollectiveRequest") .def_ro("world_size", &CollectiveRequest::worldSize) .def_ro("n_ranks_per_node", &CollectiveRequest::nRanksPerNode) .def_ro("rank", &CollectiveRequest::rank) diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp index ba6af1dd..9f085675 100644 --- a/python/csrc/core_py.cpp +++ b/python/csrc/core_py.cpp @@ -32,21 +32,21 @@ extern void register_algorithm_collection_builder(nb::module_& m); template void def_shared_future(nb::handle& m, const std::string& typestr) { - std::string pyclass_name = std::string("shared_future_") + typestr; + std::string pyclass_name = std::string("CppSharedFuture_") + typestr; nb::class_>(m, pyclass_name.c_str()).def("get", &std::shared_future::get); } void register_core(nb::module_& m) { m.def("version", &version); - nb::enum_(m, "DataType") + nb::enum_(m, "CppDataType") .value("int32", DataType::INT32) .value("uint32", DataType::UINT32) .value("float16", DataType::FLOAT16) .value("float32", DataType::FLOAT32) .value("bfloat16", DataType::BFLOAT16); - nb::class_(m, "Bootstrap") + nb::class_(m, "CppBootstrap") 
.def("get_rank", &Bootstrap::getRank) .def("get_n_ranks", &Bootstrap::getNranks) .def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode) @@ -71,7 +71,7 @@ void register_core(nb::module_& m) { .def("recv", static_cast&, int, int)>(&Bootstrap::recv), nb::arg("data"), nb::arg("peer"), nb::arg("tag")); - nb::class_(m, "UniqueId") + nb::class_(m, "CppUniqueId") .def(nb::init<>()) .def("__setstate__", [](UniqueId& self, nb::bytes b) { @@ -81,7 +81,7 @@ void register_core(nb::module_& m) { .def("__getstate__", [](const UniqueId& self) { return nb::bytes(reinterpret_cast(self.data()), UniqueIdBytes); }); - nb::class_(m, "TcpBootstrap") + nb::class_(m, "CppTcpBootstrap") .def(nb::init(), "Do not use this constructor. Use create instead.") .def_static( "create", [](int rank, int nRanks) { return std::make_shared(rank, nRanks); }, nb::arg("rank"), @@ -93,7 +93,7 @@ void register_core(nb::module_& m) { .def("initialize", static_cast(&TcpBootstrap::initialize), nb::call_guard(), nb::arg("if_ip_port_trio"), nb::arg("timeout_sec") = 30); - nb::enum_(m, "Transport") + nb::enum_(m, "CppTransport") .value("Unknown", Transport::Unknown) .value("CudaIpc", Transport::CudaIpc) .value("IB0", Transport::IB0) @@ -106,7 +106,7 @@ void register_core(nb::module_& m) { .value("IB7", Transport::IB7) .value("NumTransports", Transport::NumTransports); - nb::class_(m, "TransportFlags") + nb::class_(m, "CppTransportFlags") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("transport")) .def("has", &TransportFlags::has, nb::arg("transport")) @@ -130,12 +130,12 @@ void register_core(nb::module_& m) { .def(nb::self == nb::self) .def(nb::self != nb::self); - nb::enum_(m, "DeviceType") + nb::enum_(m, "CppDeviceType") .value("Unknown", DeviceType::Unknown) .value("CPU", DeviceType::CPU) .value("GPU", DeviceType::GPU); - nb::class_(m, "Device") + nb::class_(m, "CppDevice") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("type")) .def(nb::init(), nb::arg("type"), nb::arg("id") = -1) @@ -147,7 +147,7 @@ void register_core(nb::module_& m) { return ss.str(); }); - nb::class_(m, "EndpointConfigIb") + nb::class_(m, "CppEndpointConfigIb") .def(nb::init<>()) .def(nb::init(), nb::arg("device_index") = -1, nb::arg("port") = EndpointConfig::Ib::DefaultPort, @@ -164,7 +164,7 @@ void register_core(nb::module_& m) { .def_rw("max_send_wr", &EndpointConfig::Ib::maxSendWr) .def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend); - nb::class_(m, "RegisteredMemory") + nb::class_(m, "CppRegisteredMemory") .def(nb::init<>()) .def("data", [](RegisteredMemory& self) { return reinterpret_cast(self.data()); }) .def("size", &RegisteredMemory::size) @@ -172,7 +172,7 @@ void register_core(nb::module_& m) { .def("serialize", &RegisteredMemory::serialize) .def_static("deserialize", &RegisteredMemory::deserialize, nb::arg("data")); - nb::class_(m, "Endpoint") + nb::class_(m, "CppEndpoint") .def("config", &Endpoint::config) .def("transport", &Endpoint::transport) .def("device", &Endpoint::device) @@ -180,7 +180,7 @@ void register_core(nb::module_& m) { .def("serialize", &Endpoint::serialize) .def_static("deserialize", &Endpoint::deserialize, nb::arg("data")); - nb::class_(m, "Connection") + nb::class_(m, "CppConnection") .def("write", &Connection::write, nb::arg("dst"), nb::arg("dstOffset"), nb::arg("src"), nb::arg("srcOffset"), nb::arg("size")) .def( @@ -197,7 +197,7 @@ void register_core(nb::module_& m) { .def("local_device", &Connection::localDevice) .def("get_max_write_queue_size", &Connection::getMaxWriteQueueSize); - nb::class_(m, 
"EndpointConfig") + nb::class_(m, "CppEndpointConfig") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("transport")) .def(nb::init(), nb::arg("transport"), nb::arg("device"), @@ -228,7 +228,7 @@ void register_core(nb::module_& m) { [](EndpointConfig& self, int v) { self.ib.maxWrPerSend = v; }) .def_rw("max_write_queue_size", &EndpointConfig::maxWriteQueueSize); - nb::class_(m, "Context") + nb::class_(m, "CppContext") .def_static("create", &Context::create) .def( "register_memory", @@ -239,13 +239,13 @@ void register_core(nb::module_& m) { .def("create_endpoint", &Context::createEndpoint, nb::arg("config")) .def("connect", &Context::connect, nb::arg("local_endpoint"), nb::arg("remote_endpoint")); - nb::class_(m, "SemaphoreStub") + nb::class_(m, "CppSemaphoreStub") .def(nb::init(), nb::arg("connection")) .def("memory", &SemaphoreStub::memory) .def("serialize", &SemaphoreStub::serialize) .def_static("deserialize", &SemaphoreStub::deserialize, nb::arg("data")); - nb::class_(m, "Semaphore") + nb::class_(m, "CppSemaphore") .def(nb::init<>()) .def(nb::init(), nb::arg("local_stub"), nb::arg("remote_stub")) .def("connection", &Semaphore::connection) @@ -256,7 +256,7 @@ void register_core(nb::module_& m) { def_shared_future(m, "Connection"); def_shared_future(m, "Semaphore"); - nb::class_(m, "Communicator") + nb::class_(m, "CppCommunicator") .def(nb::init, std::shared_ptr>(), nb::arg("bootstrap"), nb::arg("context") = nullptr) .def("bootstrap", &Communicator::bootstrap) diff --git a/python/csrc/env_py.cpp b/python/csrc/env_py.cpp index a0ba4a4e..360acc6f 100644 --- a/python/csrc/env_py.cpp +++ b/python/csrc/env_py.cpp @@ -11,7 +11,7 @@ namespace nb = nanobind; using namespace mscclpp; void register_env(nb::module_& m) { - nb::class_(m, "Env") + nb::class_(m, "CppEnv") .def_ro("debug", &Env::debug) .def_ro("debug_subsys", &Env::debugSubsys) .def_ro("debug_file", &Env::debugFile) @@ -20,7 +20,7 @@ void register_env(nb::module_& m) { .def_ro("socket_family", &Env::socketFamily) .def_ro("socket_ifname", &Env::socketIfname) .def_ro("comm_id", &Env::commId) - .def_ro("execution_plan_dir", &Env::executionPlanDir) + .def_ro("cache_dir", &Env::cacheDir) .def_ro("npkit_dump_dir", &Env::npkitDumpDir) .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream); diff --git a/python/csrc/error_py.cpp b/python/csrc/error_py.cpp index ff532d10..208f4e84 100644 --- a/python/csrc/error_py.cpp +++ b/python/csrc/error_py.cpp @@ -22,7 +22,7 @@ using namespace mscclpp; m.attr(#name_).ptr()); void register_error(nb::module_ &m) { - nb::enum_(m, "ErrorCode") + nb::enum_(m, "CppErrorCode") .value("SystemError", ErrorCode::SystemError) .value("InternalError", ErrorCode::InternalError) .value("RemoteError", ErrorCode::RemoteError) diff --git a/python/csrc/executor_py.cpp b/python/csrc/executor_py.cpp index 0a196f37..350a1e7a 100644 --- a/python/csrc/executor_py.cpp +++ b/python/csrc/executor_py.cpp @@ -15,16 +15,16 @@ namespace nb = nanobind; using namespace mscclpp; void register_executor(nb::module_& m) { - nb::enum_(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16); + nb::enum_(m, "CppPacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16); - nb::class_(m, "ExecutionPlan") + nb::class_(m, "CppExecutionPlan") .def(nb::init(), nb::arg("planPath"), nb::arg("rank")) .def_prop_ro("name", [](const ExecutionPlan& self) -> std::string { return self.name(); }) .def_prop_ro("collective", [](const ExecutionPlan& self) -> std::string { return self.collective(); }) 
.def_prop_ro("min_message_size", [](const ExecutionPlan& self) -> size_t { return self.minMessageSize(); }) .def_prop_ro("max_message_size", [](const ExecutionPlan& self) -> size_t { return self.maxMessageSize(); }); - nb::class_(m, "Executor") + nb::class_(m, "CppExecutor") .def(nb::init>(), nb::arg("comm")) .def( "execute", diff --git a/python/csrc/ext/algorithm_collection_builder_py.cpp b/python/csrc/ext/algorithm_collection_builder_py.cpp index 2756edb7..1a912724 100644 --- a/python/csrc/ext/algorithm_collection_builder_py.cpp +++ b/python/csrc/ext/algorithm_collection_builder_py.cpp @@ -15,7 +15,7 @@ using namespace mscclpp; using namespace mscclpp::collective; void register_algorithm_collection_builder(nb::module_& m) { - nb::class_(m, "AlgorithmCollectionBuilder") + nb::class_(m, "CppAlgorithmCollectionBuilder") .def_static("get_instance", &AlgorithmCollectionBuilder::getInstance) .def("add_algorithm_builder", &AlgorithmCollectionBuilder::addAlgorithmBuilder, nb::arg("builder")) .def( diff --git a/python/csrc/fifo_py.cpp b/python/csrc/fifo_py.cpp index 63be4a33..e8b6a3e2 100644 --- a/python/csrc/fifo_py.cpp +++ b/python/csrc/fifo_py.cpp @@ -9,7 +9,7 @@ namespace nb = nanobind; using namespace mscclpp; void register_fifo(nb::module_& m) { - nb::class_(m, "ProxyTrigger") + nb::class_(m, "CppProxyTrigger") .def_prop_rw( "fst", [](const ProxyTrigger& self) { return self.fst; }, [](ProxyTrigger& self, uint64_t v) { self.fst = v; }) @@ -17,7 +17,7 @@ void register_fifo(nb::module_& m) { "snd", [](const ProxyTrigger& self) { return self.snd; }, [](ProxyTrigger& self, uint64_t v) { self.snd = v; }); - nb::class_(m, "FifoDeviceHandle") + nb::class_(m, "CppFifoDeviceHandle") .def_rw("triggers", &FifoDeviceHandle::triggers) .def_rw("tail", &FifoDeviceHandle::tail) .def_rw("head", &FifoDeviceHandle::head) @@ -26,7 +26,7 @@ void register_fifo(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "Fifo") + nb::class_(m, "CppFifo") .def(nb::init(), nb::arg("size") = DEFAULT_FIFO_SIZE) .def("poll", &Fifo::poll) .def("pop", &Fifo::pop) diff --git a/python/csrc/gpu_utils_py.cpp b/python/csrc/gpu_utils_py.cpp index 66f036e2..6995756b 100644 --- a/python/csrc/gpu_utils_py.cpp +++ b/python/csrc/gpu_utils_py.cpp @@ -101,7 +101,7 @@ static nb::capsule toDlpack(GpuBuffer buffer, std::string dataType, std::v void register_gpu_utils(nb::module_& m) { m.def("is_nvls_supported", &isNvlsSupported); - nb::class_>(m, "RawGpuBuffer") + nb::class_>(m, "CppRawGpuBuffer") .def(nb::init(), nb::arg("nelems")) .def("nelems", &GpuBuffer::nelems) .def("bytes", &GpuBuffer::bytes) diff --git a/python/csrc/memory_channel_py.cpp b/python/csrc/memory_channel_py.cpp index 4f9d90a0..ecccb1a0 100644 --- a/python/csrc/memory_channel_py.cpp +++ b/python/csrc/memory_channel_py.cpp @@ -11,20 +11,20 @@ namespace nb = nanobind; using namespace mscclpp; void register_memory_channel(nb::module_& m) { - nb::class_(m, "BaseMemoryChannel") + nb::class_(m, "CppBaseMemoryChannel") .def(nb::init<>()) .def(nb::init>(), nb::arg("semaphore")) .def(nb::init(), nb::arg("semaphore")) .def("device_handle", &BaseMemoryChannel::deviceHandle); - nb::class_(m, "BaseMemoryChannelDeviceHandle") + nb::class_(m, "CppBaseMemoryChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_", &BaseMemoryChannel::DeviceHandle::semaphore_) .def_prop_ro("raw", [](const BaseMemoryChannel::DeviceHandle& self) -> nb::bytes { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "MemoryChannel") + 
nb::class_(m, "CppMemoryChannel") .def(nb::init<>()) .def( "__init__", @@ -42,7 +42,7 @@ void register_memory_channel(nb::module_& m) { nb::arg("semaphore"), nb::arg("dst"), nb::arg("src"), nb::arg("packet_buffer") = 0) .def("device_handle", &MemoryChannel::deviceHandle); - nb::class_(m, "MemoryChannelDeviceHandle") + nb::class_(m, "CppMemoryChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_", &MemoryChannel::DeviceHandle::semaphore_) .def_rw("dst_", &MemoryChannel::DeviceHandle::dst_) diff --git a/python/csrc/npkit_py.cpp b/python/csrc/npkit_py.cpp index 0557b72d..8aaa8011 100644 --- a/python/csrc/npkit_py.cpp +++ b/python/csrc/npkit_py.cpp @@ -9,7 +9,7 @@ namespace nb = nanobind; void register_npkit(nb::module_ &m) { - nb::module_ sub_m = m.def_submodule("npkit", "NPKit functions"); + nb::module_ sub_m = m.def_submodule("cpp_npkit", "NPKit functions"); sub_m.def("init", &NpKit::Init); sub_m.def("dump", &NpKit::Dump); sub_m.def("shutdown", &NpKit::Shutdown); diff --git a/python/csrc/numa_py.cpp b/python/csrc/numa_py.cpp index 2489a479..4433ecc8 100644 --- a/python/csrc/numa_py.cpp +++ b/python/csrc/numa_py.cpp @@ -7,7 +7,7 @@ void numaBind(int node); }; // namespace mscclpp void register_numa(nb::module_ &m) { - nb::module_ sub_m = m.def_submodule("numa", "numa functions"); + nb::module_ sub_m = m.def_submodule("cpp_numa", "numa functions"); sub_m.def("get_device_numa_node", &mscclpp::getDeviceNumaNode); sub_m.def("numa_bind", &mscclpp::numaBind); } diff --git a/python/csrc/port_channel_py.cpp b/python/csrc/port_channel_py.cpp index 4b1aa289..e3dd98f1 100644 --- a/python/csrc/port_channel_py.cpp +++ b/python/csrc/port_channel_py.cpp @@ -11,11 +11,11 @@ namespace nb = nanobind; using namespace mscclpp; void register_port_channel(nb::module_& m) { - nb::class_(m, "BaseProxyService") + nb::class_(m, "CppBaseProxyService") .def("start_proxy", &BaseProxyService::startProxy, nb::arg("blocking") = false) .def("stop_proxy", &BaseProxyService::stopProxy); - nb::class_(m, "ProxyService") + nb::class_(m, "CppProxyService") .def(nb::init(), nb::arg("fifo_size") = DEFAULT_FIFO_SIZE) .def("start_proxy", &ProxyService::startProxy, nb::arg("blocking") = false) .def("stop_proxy", &ProxyService::stopProxy) @@ -31,13 +31,13 @@ void register_port_channel(nb::module_& m) { .def("base_port_channel", &ProxyService::basePortChannel, nb::arg("id")) .def("port_channel", &ProxyService::portChannel, nb::arg("id"), nb::arg("dst"), nb::arg("src")); - nb::class_(m, "BasePortChannel") + nb::class_(m, "CppBasePortChannel") .def(nb::init<>()) .def(nb::init, std::shared_ptr>(), nb::arg("semaphore_id"), nb::arg("semaphore"), nb::arg("proxy")) .def("device_handle", &BasePortChannel::deviceHandle); - nb::class_(m, "BasePortChannelDeviceHandle") + nb::class_(m, "CppBasePortChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_id_", &BasePortChannel::DeviceHandle::semaphoreId_) .def_rw("semaphore_", &BasePortChannel::DeviceHandle::semaphore_) @@ -46,13 +46,13 @@ void register_port_channel(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "PortChannel") + nb::class_(m, "CppPortChannel") .def(nb::init<>()) .def(nb::init, std::shared_ptr, MemoryId, MemoryId>(), nb::arg("semaphore_id"), nb::arg("semaphore"), nb::arg("proxy"), nb::arg("dst"), nb::arg("src")) .def("device_handle", &PortChannel::deviceHandle); - nb::class_(m, "PortChannelDeviceHandle") + nb::class_(m, "CppPortChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_id_", 
&PortChannel::DeviceHandle::semaphoreId_) .def_rw("semaphore_", &PortChannel::DeviceHandle::semaphore_) diff --git a/python/csrc/semaphore_py.cpp b/python/csrc/semaphore_py.cpp index 665d395e..36d559f2 100644 --- a/python/csrc/semaphore_py.cpp +++ b/python/csrc/semaphore_py.cpp @@ -10,7 +10,7 @@ namespace nb = nanobind; using namespace mscclpp; void register_semaphore(nb::module_& m) { - nb::class_ host2DeviceSemaphore(m, "Host2DeviceSemaphore"); + nb::class_ host2DeviceSemaphore(m, "CppHost2DeviceSemaphore"); host2DeviceSemaphore.def(nb::init(), nb::arg("semaphore")) .def(nb::init(), nb::arg("communicator"), nb::arg("connection")) .def("connection", &Host2DeviceSemaphore::connection) @@ -25,7 +25,7 @@ void register_semaphore(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "Host2HostSemaphore") + nb::class_(m, "CppHost2HostSemaphore") .def(nb::init(), nb::arg("semaphore")) .def(nb::init(), nb::arg("communicator"), nb::arg("connection")) .def("connection", &Host2HostSemaphore::connection) @@ -34,7 +34,7 @@ void register_semaphore(nb::module_& m) { .def("wait", &Host2HostSemaphore::wait, nb::call_guard(), nb::arg("max_spin_count") = 10000000); - nb::class_ memoryDevice2DeviceSemaphore(m, "MemoryDevice2DeviceSemaphore"); + nb::class_ memoryDevice2DeviceSemaphore(m, "CppMemoryDevice2DeviceSemaphore"); memoryDevice2DeviceSemaphore.def(nb::init(), nb::arg("semaphore")) .def(nb::init(), nb::arg("communicator"), nb::arg("connection")) .def("connection", &MemoryDevice2DeviceSemaphore::connection) diff --git a/python/csrc/switch_channel_py.cpp b/python/csrc/switch_channel_py.cpp index dd72c97e..2d0340dd 100644 --- a/python/csrc/switch_channel_py.cpp +++ b/python/csrc/switch_channel_py.cpp @@ -15,11 +15,11 @@ namespace nb = nanobind; using namespace mscclpp; void register_nvls(nb::module_& m) { - nb::class_(m, "SwitchChannel") + nb::class_(m, "CppSwitchChannel") .def("get_device_ptr", [](SwitchChannel* self) { return (uintptr_t)self->getDevicePtr(); }) .def("device_handle", &SwitchChannel::deviceHandle); - nb::class_(m, "DeviceHandle") + nb::class_(m, "CppSwitchChannelDeviceHandle") .def(nb::init<>()) .def_rw("device_ptr", &SwitchChannel::DeviceHandle::devicePtr) .def_rw("mc_ptr", &SwitchChannel::DeviceHandle::mcPtr) @@ -28,7 +28,7 @@ void register_nvls(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "NvlsConnection") + nb::class_(m, "CppNvlsConnection") .def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("device_ptr"), nb::arg("size")); m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("all_ranks"), diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 58233a7c..86923003 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -23,35 +23,35 @@ version = { from ._core import * from ._mscclpp import ( - Device, - DeviceType, - Communicator, - Connection, + CppDevice as Device, + CppDeviceType as DeviceType, + CppCommunicator as Communicator, + CppConnection as Connection, connect_nvls_collective, - EndpointConfig, - Fifo, - Semaphore, - Host2DeviceSemaphore, - Host2HostSemaphore, - numa, - ProxyService, - RegisteredMemory, - PortChannel, - MemoryChannel, - MemoryDevice2DeviceSemaphore, - TcpBootstrap, - Transport, - TransportFlags, - DataType, - ErrorCode, - Executor, - ExecutionPlan, - PacketType, - RawGpuBuffer, - ReduceOp, + CppEndpointConfig as EndpointConfig, + CppFifo as Fifo, + CppSemaphore as 
Semaphore, + CppHost2DeviceSemaphore as Host2DeviceSemaphore, + CppHost2HostSemaphore as Host2HostSemaphore, + cpp_numa as numa, + CppProxyService as ProxyService, + CppRegisteredMemory as RegisteredMemory, + CppPortChannel as PortChannel, + CppMemoryChannel as MemoryChannel, + CppMemoryDevice2DeviceSemaphore as MemoryDevice2DeviceSemaphore, + CppTcpBootstrap as TcpBootstrap, + CppTransport as Transport, + CppTransportFlags as TransportFlags, + CppDataType as DataType, + CppErrorCode as ErrorCode, + CppExecutor as Executor, + CppExecutionPlan as ExecutionPlan, + CppPacketType as PacketType, + CppRawGpuBuffer as RawGpuBuffer, + CppReduceOp as ReduceOp, env, is_nvls_supported, - npkit, + cpp_npkit as npkit, ) __all__ = [ diff --git a/python/mscclpp/__main__.py b/python/mscclpp/__main__.py index 6d0e0108..d57cb362 100644 --- a/python/mscclpp/__main__.py +++ b/python/mscclpp/__main__.py @@ -6,7 +6,7 @@ import shutil import argparse from pathlib import Path -from mscclpp.language import default_algos as def_algo +from mscclpp import default_algos as def_algo from mscclpp.language.collectives import * from mscclpp.language.utils import AlgoSpec @@ -57,7 +57,7 @@ default_algo_configs = [ def create_default_plans(): - plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp_default") + plan_dir = os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp/default") plan_path = Path(plan_dir) if plan_path.exists(): shutil.rmtree(plan_path) diff --git a/python/mscclpp/_core/__init__.py b/python/mscclpp/_core/__init__.py index e9d886f3..a97c91a0 100644 --- a/python/mscclpp/_core/__init__.py +++ b/python/mscclpp/_core/__init__.py @@ -5,9 +5,3 @@ from .algorithm import * from .comm import * from .compiler import * from .buffer import * - -__all__ = [] -__all__ += algorithm.__all__ -__all__ += comm.__all__ -__all__ += compiler.__all__ -__all__ += buffer.__all__ diff --git a/python/mscclpp/_core/algorithm.py b/python/mscclpp/_core/algorithm.py index abaac60c..6c4a3f20 100644 --- a/python/mscclpp/_core/algorithm.py +++ b/python/mscclpp/_core/algorithm.py @@ -7,15 +7,17 @@ from functools import cached_property from mscclpp._mscclpp import ( - Algorithm as _Algorithm, - DslAlgorithm as _DslAlgorithm, - AlgorithmType as _AlgorithmType, - Communicator, - CollectiveBufferMode, - DataType, - Executor, - ExecutionPlan, - ReduceOp, + CppAlgorithm, + CppDslAlgorithm, + CppAlgorithmType, + CppCommunicator, + CppCollectiveBufferMode, + CppDataType, + CppExecutor, + CppExecutionPlan, + CppReduceOp, + CppAlgorithmBuilder, + CppAlgorithmCollection, ) __all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"] @@ -45,7 +47,7 @@ class Algorithm: """ def __init__(self, world_size: int = 0, n_ranks_per_node: int = 0): - self._constraint = _Algorithm.Constraint(world_size, n_ranks_per_node) + self._constraint = CppAlgorithm.Constraint(world_size, n_ranks_per_node) @property def world_size(self) -> int: @@ -58,23 +60,23 @@ class Algorithm: def __init__( self, id: Optional[str] = None, - execution_plan: Optional[ExecutionPlan] = None, - native_handle: Optional[_Algorithm] = None, + execution_plan: Optional[CppExecutionPlan] = None, + native_handle: Optional[CppAlgorithm] = None, tags: Optional[Dict[str, int]] = None, constraint: Optional[Constraint] = None, ): if execution_plan is not None: - self._algorithm = _DslAlgorithm( + self._algorithm = CppDslAlgorithm( id, execution_plan, tags=tags if tags is not None else {}, - constraint=constraint._constraint if constraint is not None else 
_Algorithm.Constraint(), + constraint=constraint._constraint if constraint is not None else CppAlgorithm.Constraint(), ) elif native_handle is not None: self._algorithm = native_handle @classmethod - def create_from_native_handle(cls, handle: _Algorithm): + def create_from_native_handle(cls, handle: CppAlgorithm): """Create an Algorithm instance from a native C++ algorithm handle. Args: @@ -97,7 +99,7 @@ class Algorithm: Returns: A new Algorithm instance wrapping the algorithm from the capsule. """ - handle = _Algorithm.from_native_capsule(obj) + handle = CppAlgorithm.from_native_capsule(obj) return cls(native_handle=handle) @cached_property @@ -121,7 +123,7 @@ class Algorithm: return self._algorithm.tags @cached_property - def buffer_mode(self) -> CollectiveBufferMode: + def buffer_mode(self) -> CppCollectiveBufferMode: """The buffer mode supported by this algorithm (IN_PLACE, OUT_OF_PLACE, or ANY).""" return self._algorithm.buffer_mode @@ -131,7 +133,7 @@ class Algorithm: Returns: True if this algorithm is defined using DSL/execution plan, False otherwise. """ - if self._algorithm.type == _AlgorithmType.DSL: + if self._algorithm.type == CppAlgorithmType.DSL: return True return False @@ -141,21 +143,21 @@ class Algorithm: Returns: True if this algorithm is implemented natively, False otherwise. """ - if self._algorithm.type == _AlgorithmType.NATIVE: + if self._algorithm.type == CppAlgorithmType.NATIVE: return True return False def execute( self, - comm: Communicator, + comm: CppCommunicator, input_buffer: int, output_buffer: int, input_size: int, output_size: int, - dtype: DataType, - op: ReduceOp = ReduceOp.NOP, + dtype: CppDataType, + op: CppReduceOp = CppReduceOp.NOP, stream: int = 0, - executor: Optional[Executor] = None, + executor: Optional[CppExecutor] = None, nblocks=0, nthreads_per_block=0, extras: Optional[Dict[str, int]] = None, @@ -196,7 +198,7 @@ class Algorithm: class AlgorithmBuilder: - def __init__(self, algorithm_builder: _AlgorithmBuilder): + def __init__(self, algorithm_builder: CppAlgorithmBuilder): self._algorithm_builder = algorithm_builder def build(self) -> Algorithm: @@ -204,7 +206,7 @@ class AlgorithmBuilder: class AlgorithmCollection: - def __init__(self, native_collection: _AlgorithmCollection): + def __init__(self, native_collection: CppAlgorithmCollection): self._native_collection = native_collection self._algorithms = [Algorithm.create_from_native_handle(algo) for algo in self._native_collection.to_list()] diff --git a/python/mscclpp/_core/buffer.py b/python/mscclpp/_core/buffer.py index b54342ea..0575ca68 100644 --- a/python/mscclpp/_core/buffer.py +++ b/python/mscclpp/_core/buffer.py @@ -6,7 +6,7 @@ from typing import Union, Tuple import cupy as cp import numpy as np -from mscclpp._mscclpp import RawGpuBuffer +from mscclpp._mscclpp import CppRawGpuBuffer __all__ = ["GpuBuffer"] @@ -25,6 +25,6 @@ class GpuBuffer(cp.ndarray): if any(s <= 0 for s in shape): raise ValueError("Shape must be positive.") # Create the buffer - buffer = RawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize) + buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize) memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0) return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr) diff --git a/python/mscclpp/_core/comm.py b/python/mscclpp/_core/comm.py index 2b5a5f25..f0c5c219 100644 --- a/python/mscclpp/_core/comm.py +++ b/python/mscclpp/_core/comm.py @@ -6,18 +6,18 @@ from typing import Type import cupy as 
cp from mscclpp._mscclpp import ( - Communicator, - Connection, + CppCommunicator, + CppConnection, connect_nvls_collective, - EndpointConfig, - Semaphore, - ProxyService, - RegisteredMemory, - PortChannel, - MemoryChannel, - TcpBootstrap, - Transport, - TransportFlags, + CppEndpointConfig, + CppSemaphore, + CppProxyService, + CppRegisteredMemory, + CppPortChannel, + CppMemoryChannel, + CppTcpBootstrap, + CppTransport, + CppTransportFlags, ) import mpi4py import numpy as np @@ -32,7 +32,7 @@ class CommGroup: self, mpi_comm: mpi4py.MPI.Comm = None, interfaceIpPortTrio: str = "", rank: int = None, size: int = None ): if interfaceIpPortTrio == "": - self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size) + self.bootstrap = CppTcpBootstrap.create(mpi_comm.rank, mpi_comm.size) uniq_id = None if mpi_comm.rank == 0: # similar to NCCL's unique id @@ -41,15 +41,15 @@ class CommGroup: self.bootstrap.initialize(uniq_id_global) elif mpi_comm: # use this instead - self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size) + self.bootstrap = CppTcpBootstrap.create(mpi_comm.rank, mpi_comm.size) self.bootstrap.initialize(interfaceIpPortTrio) elif not interfaceIpPortTrio == "": assert rank >= 0 and size >= 1 - self.bootstrap = TcpBootstrap.create(rank, size) + self.bootstrap = CppTcpBootstrap.create(rank, size) self.bootstrap.initialize(interfaceIpPortTrio) else: raise RuntimeError("Either the interface or mpi_group need to be specified") - self.communicator = Communicator(self.bootstrap) + self.communicator = CppCommunicator(self.bootstrap) self.my_rank = self.bootstrap.get_rank() self.nranks = self.bootstrap.get_n_ranks() self.nranks_per_node = self.bootstrap.get_n_ranks_per_node() @@ -63,43 +63,43 @@ class CommGroup: def recv(self, tensor: np.ndarray, peer: int, tag: int): self.bootstrap.recv(tensor.ctypes.data, tensor.size * tensor.itemsize, peer, tag) - def my_ib_device(self, local_rank: int) -> Transport: + def my_ib_device(self, local_rank: int) -> CppTransport: if local_rank == 0: - return Transport.IB0 + return CppTransport.IB0 if local_rank == 1: - return Transport.IB1 + return CppTransport.IB1 if local_rank == 2: - return Transport.IB2 + return CppTransport.IB2 if local_rank == 3: - return Transport.IB3 + return CppTransport.IB3 if local_rank == 4: - return Transport.IB4 + return CppTransport.IB4 if local_rank == 5: - return Transport.IB5 + return CppTransport.IB5 if local_rank == 6: - return Transport.IB6 + return CppTransport.IB6 if local_rank == 7: - return Transport.IB7 + return CppTransport.IB7 else: assert False # only 8 IBs are supported def make_connection( self, all_ranks: list[int], - endpoints: EndpointConfig | Transport | dict[int, EndpointConfig] | dict[int, Transport], + endpoints: CppEndpointConfig | CppTransport | dict[int, CppEndpointConfig] | dict[int, CppTransport], use_switch: bool = False, - ) -> dict[int, Connection]: - if type(endpoints) is Transport: - endpoints = EndpointConfig(endpoints) + ) -> dict[int, CppConnection]: + if type(endpoints) is CppTransport: + endpoints = CppEndpointConfig(endpoints) elif type(endpoints) is dict: - endpoints = {k: EndpointConfig(v) if type(v) is Transport else v for k, v in endpoints.items()} + endpoints = {k: CppEndpointConfig(v) if type(v) is CppTransport else v for k, v in endpoints.items()} connections = {} for rank in all_ranks: if type(endpoints) is dict: endpoint = endpoints[rank] else: endpoint = endpoints - if endpoint.transport == Transport.CudaIpc and use_switch: + if endpoint.transport == 
CppTransport.CudaIpc and use_switch: return connect_nvls_collective(self.communicator, all_ranks, 2**30) else: connections[rank] = self.communicator.connect(endpoint, rank) @@ -107,8 +107,8 @@ class CommGroup: return connections def register_tensor_with_connections( - self, tensor: Type[cp.ndarray] | Type[np.ndarray], connections: dict[int, Connection] - ) -> dict[int, RegisteredMemory]: + self, tensor: Type[cp.ndarray] | Type[np.ndarray], connections: dict[int, CppConnection] + ) -> dict[int, CppRegisteredMemory]: local_reg_memory = self.register_local_memory(tensor, connections) all_registered_memories = {} all_registered_memories[self.my_rank] = local_reg_memory @@ -121,8 +121,8 @@ class CommGroup: return all_registered_memories def _register_memory_with_connections( - self, memory: RegisteredMemory, connections: dict[int, Connection] - ) -> dict[int, RegisteredMemory]: + self, memory: CppRegisteredMemory, connections: dict[int, CppConnection] + ) -> dict[int, CppRegisteredMemory]: all_registered_memories = {} all_registered_memories[self.my_rank] = memory future_memories = {} @@ -133,18 +133,20 @@ class CommGroup: all_registered_memories[rank] = future_memories[rank].get() return all_registered_memories - def make_semaphores(self, connections: dict[int, Connection]) -> dict[int, Semaphore]: + def make_semaphores(self, connections: dict[int, CppConnection]) -> dict[int, CppSemaphore]: future_semaphores = {} for rank in connections: future_semaphores[rank] = self.communicator.build_semaphore(connections[rank], rank) return {rank: future.get() for rank, future in future_semaphores.items()} - def make_memory_channels(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> dict[int, MemoryChannel]: + def make_memory_channels( + self, tensor: cp.ndarray, connections: dict[int, CppConnection] + ) -> dict[int, CppMemoryChannel]: semaphores = self.make_semaphores(connections) registered_memories = self.register_tensor_with_connections(tensor, connections) channels = {} for rank in connections: - channels[rank] = MemoryChannel( + channels[rank] = CppMemoryChannel( semaphores[rank], registered_memories[rank], registered_memories[self.my_rank] ) return channels @@ -152,9 +154,9 @@ class CommGroup: def make_memory_channels_with_scratch( self, tensor: cp.ndarray, - registeredScratchBuffer: RegisteredMemory, - connections: dict[int, Connection], - ) -> dict[int, MemoryChannel]: + registeredScratchBuffer: CppRegisteredMemory, + connections: dict[int, CppConnection], + ) -> dict[int, CppMemoryChannel]: semaphores = self.make_semaphores(connections) registered_memories = self._register_memory_with_connections(registeredScratchBuffer, connections) channels = {} @@ -162,17 +164,17 @@ class CommGroup: tensor_size = ( tensor.numel() * tensor.element_size() if is_torch_tensor(tensor) else tensor.size * tensor.itemsize ) - local_registered_memory = self.communicator.register_memory(tensor_data_ptr, tensor_size, TransportFlags()) + local_registered_memory = self.communicator.register_memory(tensor_data_ptr, tensor_size, CppTransportFlags()) scratch_data_ptr = registeredScratchBuffer.data() for rank in connections: - channels[rank] = MemoryChannel( + channels[rank] = CppMemoryChannel( semaphores[rank], registered_memories[rank], local_registered_memory, scratch_data_ptr ) return channels def make_port_channels( - self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection] - ) -> dict[int, PortChannel]: + self, proxy_service: CppProxyService, tensor: cp.ndarray, 
connections: dict[int, CppConnection] + ) -> dict[int, CppPortChannel]: semaphores = self.make_semaphores(connections) registered_memories = self.register_tensor_with_connections(tensor, connections) memory_ids = {} @@ -188,12 +190,12 @@ class CommGroup: def make_port_channels_with_scratch( self, - proxy_service: ProxyService, + proxy_service: CppProxyService, tensor: cp.ndarray, - registeredScratchBuffer: RegisteredMemory, - connections: dict[int, Connection], - ) -> dict[int, PortChannel]: - transport_flags = TransportFlags() + registeredScratchBuffer: CppRegisteredMemory, + connections: dict[int, CppConnection], + ) -> dict[int, CppPortChannel]: + transport_flags = CppTransportFlags() for rank in connections: transport_flags |= connections[rank].transport() data_ptr = ( @@ -223,8 +225,8 @@ class CommGroup: return channels def register_semaphore_with_proxy( - self, proxy_service: ProxyService, connections: dict[int, Connection] - ) -> dict[int, PortChannel]: + self, proxy_service: CppProxyService, connections: dict[int, CppConnection] + ) -> dict[int, CppPortChannel]: semaphores = self.make_semaphores(connections) semaphore_ids = {} for rank in semaphores: @@ -235,7 +237,7 @@ class CommGroup: return channels def register_memory_with_proxy( - self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection] + self, proxy_service: CppProxyService, tensor: cp.ndarray, connections: dict[int, CppConnection] ) -> dict[int, int]: registered_memories = self.register_tensor_with_connections(tensor, connections) memory_ids = {} @@ -243,8 +245,8 @@ class CommGroup: memory_ids[rank] = proxy_service.add_memory(registered_memories[rank]) return memory_ids - def register_local_memory(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> RegisteredMemory: - transport_flags = TransportFlags() + def register_local_memory(self, tensor: cp.ndarray, connections: dict[int, CppConnection]) -> CppRegisteredMemory: + transport_flags = CppTransportFlags() for rank in connections: transport_flags |= connections[rank].transport() data_ptr = ( diff --git a/python/mscclpp/_core/compiler.py b/python/mscclpp/_core/compiler.py index 82ae93a9..b2da976d 100644 --- a/python/mscclpp/_core/compiler.py +++ b/python/mscclpp/_core/compiler.py @@ -26,9 +26,7 @@ from mscclpp.language.program import CollectiveProgram from mscclpp.language.utils import AlgoSpec from mscclpp.utils import get_device_arch -from mscclpp._mscclpp import ( - ExecutionPlan, -) +from mscclpp._mscclpp import CppExecutionPlan, env logging.basicConfig(level=logging.INFO) @@ -51,7 +49,7 @@ class DslCompiler: into execution plans that can be run on GPUs. The compiled plans are cached to disk for reuse. - The cache location can be configured via the `MSCCLPP_EXECUTION_PLAN_DIR` + The cache location can be configured via the `MSCCLPP_CACHE_DIR` environment variable (defaults to `~/.cache/mscclpp`). 
Example: @@ -138,7 +136,7 @@ class DslCompiler: ) ).hexdigest() - plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp") + plan_dir = Path(env().cache_dir) os.makedirs(plan_dir, exist_ok=True) filename = f"{plan_id}.json" plan_path = os.path.join(plan_dir, filename) @@ -157,7 +155,7 @@ class DslCompiler: os.remove(tmp_path) except Exception: Path(plan_path).unlink(missing_ok=True) - execution_plan = ExecutionPlan(plan_path, rank) + execution_plan = CppExecutionPlan(plan_path, rank) return Algorithm( id=plan_id, execution_plan=execution_plan, @@ -179,8 +177,8 @@ class NativeCodeCompiler: based on the runtime environment. Compiled modules are cached to avoid recompilation. - The cache location can be configured via the `MSCCLPP_NATIVE_CACHE_DIR` - environment variable (defaults to `~/.cache/mscclpp/native`). + The cache location can be configured via the `MSCCLPP_CACHE_DIR` + environment variable (defaults to `~/.cache/mscclpp`). Attributes: _is_hip: True if running on AMD/ROCm, False for NVIDIA/CUDA. @@ -226,8 +224,7 @@ class NativeCodeCompiler: "-L" + os.path.join(self._lib_home, "lib"), "-lmscclpp", ] - cache_root = os.environ.get("MSCCLPP_NATIVE_CACHE_DIR", Path.home() / ".cache/mscclpp/native") - self._cache_dir = Path(cache_root) + self._cache_dir = Path(env().cache_dir) / "native" self._cache_dir.mkdir(parents=True, exist_ok=True) def _get_compiler(self) -> str: @@ -283,7 +280,7 @@ class NativeCodeCompiler: Note: - The source file should include pybind11 bindings to expose functions. - MSCCLPP headers are automatically included in the compilation. - - The module is cached in `MSCCLPP_NATIVE_CACHE_DIR` (default: ~/.cache/mscclpp/native). + - The module is cached in `MSCCLPP_CACHE_DIR` (default: ~/.cache/mscclpp). - File locking is used to prevent race conditions during parallel compilation. Example: diff --git a/python/mscclpp/ext/__init__.py b/python/mscclpp/ext/__init__.py index 5c73df3c..08a96ecd 100644 --- a/python/mscclpp/ext/__init__.py +++ b/python/mscclpp/ext/__init__.py @@ -2,5 +2,3 @@ # Licensed under the MIT license. 
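[Editor's note] One pattern worth noting in the plan-cache hunk above: the compiler stages each plan through a temporary file (the `tmp_path` handling) and unlinks the final path on failure, so a crashed or concurrent compile does not leave a truncated plan behind. A sketch of the same write-then-rename idea in C++ (names are illustrative, not the project's API; assumes POSIX rename semantics within one filesystem):

    #include <filesystem>
    #include <fstream>
    #include <string>
    #include <system_error>

    namespace fs = std::filesystem;

    // Illustrative: publish a cached artifact atomically. rename() within a
    // single filesystem is atomic on POSIX, so readers never see a partial file.
    void publishToCache(const fs::path& finalPath, const std::string& contents) {
      fs::path tmp = finalPath;
      tmp += ".tmp";  // hypothetical staging suffix
      {
        std::ofstream out(tmp, std::ios::binary);
        out << contents;
      }  // stream closed before renaming
      std::error_code ec;
      fs::rename(tmp, finalPath, ec);  // atomic publish
      if (ec) fs::remove(tmp, ec);     // best-effort cleanup on failure
    }
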
from .algorithm_collection_builder import * - -__all__ = algorithm_collection_builder.__all__ diff --git a/python/mscclpp/ext/algorithm_collection_builder.py b/python/mscclpp/ext/algorithm_collection_builder.py index 51a178fb..8361bd2f 100644 --- a/python/mscclpp/ext/algorithm_collection_builder.py +++ b/python/mscclpp/ext/algorithm_collection_builder.py @@ -6,9 +6,7 @@ from typing import Union from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection import atexit -from mscclpp._mscclpp import ( - AlgorithmCollectionBuilder as _AlgorithmCollectionBuilder, -) +from mscclpp._mscclpp import CppAlgorithmCollectionBuilder __all__ = ["AlgorithmCollectionBuilder"] @@ -24,12 +22,12 @@ class AlgorithmCollectionBuilder: @classmethod def reset(cls): if cls._instance is not None: - _AlgorithmCollectionBuilder.reset() + CppAlgorithmCollectionBuilder.reset() cls._instance = None def __init__(self): if not hasattr(self, "_initialized"): - self._builder = _AlgorithmCollectionBuilder.get_instance() + self._builder = CppAlgorithmCollectionBuilder.get_instance() self._initialized = True def add_algorithm_builder(self, algorithm_builder: Union[AlgorithmBuilder, Algorithm]): diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py index 783b0ca9..69dd7ce6 100644 --- a/python/mscclpp/utils.py +++ b/python/mscclpp/utils.py @@ -11,7 +11,7 @@ from typing import Any, Type, Union import cupy as cp import numpy as np -from mscclpp._mscclpp import DataType +from mscclpp._mscclpp import CppDataType as DataType try: import torch diff --git a/src/core/env.cpp b/src/core/env.cpp index 35a31f4c..508208e9 100644 --- a/src/core/env.cpp +++ b/src/core/env.cpp @@ -58,8 +58,7 @@ Env::Env() socketFamily(readEnv("MSCCLPP_SOCKET_FAMILY", "")), socketIfname(readEnv("MSCCLPP_SOCKET_IFNAME", "")), commId(readEnv("MSCCLPP_COMM_ID", "")), - executionPlanDir(readEnv("MSCCLPP_EXECUTION_PLAN_DIR", - readEnv("HOME", "~") + "/.cache/mscclpp_default")), + cacheDir(readEnv("MSCCLPP_CACHE_DIR", readEnv("HOME", "~") + "/.cache/mscclpp")), npkitDumpDir(readEnv("MSCCLPP_NPKIT_DUMP_DIR", "")), cudaIpcUseDefaultStream(readEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", false)), ncclSharedLibPath(readEnv("MSCCLPP_NCCL_LIB_PATH", "")), @@ -85,7 +84,7 @@ std::shared_ptr env() { logEnv("MSCCLPP_SOCKET_FAMILY", globalEnv->socketFamily); logEnv("MSCCLPP_SOCKET_IFNAME", globalEnv->socketIfname); logEnv("MSCCLPP_COMM_ID", globalEnv->commId); - logEnv("MSCCLPP_EXECUTION_PLAN_DIR", globalEnv->executionPlanDir); + logEnv("MSCCLPP_CACHE_DIR", globalEnv->cacheDir); logEnv("MSCCLPP_NPKIT_DUMP_DIR", globalEnv->npkitDumpDir); logEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", globalEnv->cudaIpcUseDefaultStream); logEnv("MSCCLPP_NCCL_LIB_PATH", globalEnv->ncclSharedLibPath); diff --git a/src/ext/collectives/algorithm_collection_builder.cc b/src/ext/collectives/algorithm_collection_builder.cc index 566c1852..67e616ae 100644 --- a/src/ext/collectives/algorithm_collection_builder.cc +++ b/src/ext/collectives/algorithm_collection_builder.cc @@ -105,13 +105,13 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultDslAlgorithms(int ra return oss.str(); }; - std::string planDir = env()->executionPlanDir; + auto planDir = std::filesystem::path(env()->cacheDir) / "default"; if (!std::filesystem::exists(planDir)) { - INFO(ALGO, "Plan directory does not exist: ", planDir); + INFO(ALGO, "Default plan directory does not exist: ", planDir); return collection; } for (const auto& config : defaultAlgoConfigs) { - std::string planPath = planDir + "/" + 
config.filename; + auto planPath = planDir / config.filename; INFO(ALGO, "Loading plan: ", planPath); if (!std::filesystem::exists(planPath)) { INFO(ALGO, "Plan file does not exist: ", planPath); From dc747b15222b7eab3ab710f1594e90aecafbadde Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 5 Feb 2026 09:23:43 -0800 Subject: [PATCH 04/52] Refactor reduce kernel (#738) - Put the common reduce kernel to reduce_kernel.hpp - Implement operator overloading for the vector type - Clean up the duplicated code at `executor_ kernel.hpp` and `allreduce/common.hpp` --- include/mscclpp/gpu_data_types.hpp | 456 ++++++++++++++++-- include/mscclpp/switch_channel_device.hpp | 24 +- src/core/include/execution_kernel.hpp | 379 +-------------- src/core/include/reduce_kernel.hpp | 81 ++++ .../allreduce/allreduce_allpair_packet.cu | 2 +- .../allreduce/allreduce_fullmesh.cu | 4 +- .../allreduce/allreduce_nvls_packet.cu | 2 +- .../collectives/allreduce/allreduce_packet.cu | 4 +- .../collectives/include/allreduce/common.hpp | 441 +---------------- 9 files changed, 538 insertions(+), 855 deletions(-) create mode 100644 src/core/include/reduce_kernel.hpp diff --git a/include/mscclpp/gpu_data_types.hpp b/include/mscclpp/gpu_data_types.hpp index 99b95d9a..9e7747a8 100644 --- a/include/mscclpp/gpu_data_types.hpp +++ b/include/mscclpp/gpu_data_types.hpp @@ -16,20 +16,27 @@ using __bfloat16 = __hip_bfloat16; using __bfloat162 = __hip_bfloat162; #define __CUDA_BF16_TYPES_EXIST__ -// AMD FP8 support - hip_fp8.h provides __hip_fp8_e4m3_fnuz and __hip_fp8_e5m2_fnuz -// Only available on gfx942 and newer architectures (ROCm 6.0+) +// AMD FP8 support - Use fnuz types for HIP 6.0 or when HIP_FP8_TYPE_FNUZ is enabled and HIP_FP8_TYPE_OCP is not +// enabled. Otherwise, use the standard FP8 types. #if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 6) #include // Create aliases matching CUDA naming convention for cross-platform compatibility +#if (HIP_VERSION_MAJOR == 6) || (HIP_VERSION_MAJOR > 6 && HIP_FP8_TYPE_FNUZ && !HIP_FP8_TYPE_OCP) using __fp8_e4m3 = __hip_fp8_e4m3_fnuz; using __fp8_e5m2 = __hip_fp8_e5m2_fnuz; - -// HIP FP8 vector types use storage types (from hip/amd_detail/amd_hip_fp8.h): -using __fp8x2_e4m3 = __hip_fp8x2_storage_t; // uint16_t -using __fp8x2_e5m2 = __hip_fp8x2_storage_t; // uint16_t -using __fp8x4_e4m3 = __hip_fp8x4_storage_t; // uint32_t -using __fp8x4_e5m2 = __hip_fp8x4_storage_t; // uint32_t +using __fp8x2_e4m3 = __hip_fp8x2_e4m3_fnuz; +using __fp8x2_e5m2 = __hip_fp8x2_e5m2_fnuz; +using __fp8x4_e4m3 = __hip_fp8x4_e4m3_fnuz; +using __fp8x4_e5m2 = __hip_fp8x4_e5m2_fnuz; +#else +using __fp8_e4m3 = __hip_fp8_e4m3; +using __fp8_e5m2 = __hip_fp8_e5m2; +using __fp8x2_e4m3 = __hip_fp8x2_e4m3; +using __fp8x2_e5m2 = __hip_fp8x2_e5m2; +using __fp8x4_e4m3 = __hip_fp8x4_e4m3; +using __fp8x4_e5m2 = __hip_fp8x4_e5m2; +#endif #define __FP8_TYPES_EXIST__ #endif // HIP_VERSION_MAJOR >= 6 @@ -71,10 +78,8 @@ enum class DataType { }; /// Word array. -template +template = 4 && Bytes % 4 == 0)> struct alignas(Bytes) Words { - static_assert(Bytes > 0, "Bytes must be greater than 0"); - static_assert(Bytes % 4 == 0, "Bytes must be multiple of 4"); uint32_t w[Bytes / 4]; MSCCLPP_HOST_DEVICE_INLINE Words() {} @@ -84,18 +89,33 @@ struct alignas(Bytes) Words { MSCCLPP_HOST_DEVICE_INLINE const uint32_t& operator[](int i) const { return w[i]; } }; -/// Vector type. -template -union alignas(sizeof(T) * N) VectorType { +template +struct alignas(Bytes) Words {}; + +/// Vector type implementation (internal). 
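/// The union below overlays three views of the same bytes — an element array
/// (data[]), a Words view for word-wise asm access, and a native storage word
/// — so a whole vector moves with a single hardware-friendly load or store.
/// A brief usage sketch (illustrative only; the alias names come from the
/// DEFINE_VEC list further down, e.g. f16x8 packs eight __half values into
/// one uint4 storage word):
///
///   mscclpp::f16x8 v;        // element view: v.data[0] .. v.data[7]
///   v[3] = __half(1.0f);     // per-element access via operator[]
///   uint4 raw = v;           // implicit conversion to the storage word
///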
+template +union alignas(sizeof(T) * N) VectorTypeImpl { static_assert(N > 0, "N must be greater than 0"); T data[N]; Words words; + StorageT storage; using ElementType = T; constexpr static int Size = N; - MSCCLPP_HOST_DEVICE_INLINE VectorType() {} + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl() {} + + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl(const StorageT& value) : storage(value) {} + + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl(const VectorTypeImpl& other) { storage = other.storage; } + + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl& operator=(const VectorTypeImpl& other) { + storage = other.storage; + return *this; + } + + MSCCLPP_HOST_DEVICE_INLINE operator StorageT() const { return storage; } MSCCLPP_HOST_DEVICE_INLINE operator T*() { return data; } @@ -106,38 +126,394 @@ union alignas(sizeof(T) * N) VectorType { MSCCLPP_HOST_DEVICE_INLINE const T& operator[](int i) const { return data[i]; } }; -using i32x1 = VectorType; -using u32x1 = VectorType; -using f64x1 = VectorType; -using f32x1 = VectorType; +// Helper template to get the appropriate vector type for a given element type and count +template +struct VectorTypeHelper { + using type = + VectorTypeImpl>>; +}; -using i32x2 = VectorType; -using u32x2 = VectorType; -using f32x2 = VectorType; -using f16x2 = VectorType<__half, 2>; -using bf16x2 = VectorType<__bfloat16, 2>; +/// Vector type - clean user interface (automatically selects appropriate storage type) +template +using VectorType = typename VectorTypeHelper::type; -using i32x4 = VectorType; -using u32x4 = VectorType; -using f32x4 = VectorType; -using f16x4 = VectorType<__half, 4>; -using bf16x4 = VectorType<__bfloat16, 4>; +// Macro to define specialization AND alias in one go +#define DEFINE_VEC(Alias, T, N, Storage) \ + template <> \ + struct VectorTypeHelper { \ + using type = VectorTypeImpl; \ + }; \ + using Alias = VectorType -using f16x8 = VectorType<__half, 8>; -using bf16x8 = VectorType<__bfloat16, 8>; +DEFINE_VEC(i32x1, int32_t, 1, int32_t); +DEFINE_VEC(u32x1, uint32_t, 1, uint32_t); +DEFINE_VEC(f32x1, float, 1, float); +DEFINE_VEC(f64x1, double, 1, double); + +DEFINE_VEC(i32x2, int32_t, 2, int2); +DEFINE_VEC(u32x2, uint32_t, 2, uint2); +DEFINE_VEC(f32x2, float, 2, float2); +DEFINE_VEC(f16x2, __half, 2, __half2); +DEFINE_VEC(bf16x2, __bfloat16, 2, __bfloat162); + +DEFINE_VEC(i32x4, int32_t, 4, int4); +DEFINE_VEC(u32x4, uint32_t, 4, uint4); +DEFINE_VEC(f32x4, float, 4, float4); +DEFINE_VEC(f16x4, __half, 4, uint2); +DEFINE_VEC(bf16x4, __bfloat16, 4, uint2); + +DEFINE_VEC(f16x8, __half, 8, uint4); +DEFINE_VEC(bf16x8, __bfloat16, 8, uint4); #if defined(__FP8_TYPES_EXIST__) -// FP8 vector types -using fp8_e4m3x2 = VectorType<__fp8_e4m3, 2>; -using fp8_e4m3x4 = VectorType<__fp8_e4m3, 4>; -using fp8_e4m3x8 = VectorType<__fp8_e4m3, 8>; -using fp8_e4m3x16 = VectorType<__fp8_e4m3, 16>; -using fp8_e5m2x2 = VectorType<__fp8_e5m2, 2>; -using fp8_e5m2x4 = VectorType<__fp8_e5m2, 4>; -using fp8_e5m2x8 = VectorType<__fp8_e5m2, 8>; -using fp8_e5m2x16 = VectorType<__fp8_e5m2, 16>; +DEFINE_VEC(f8_e4m3x2, __fp8_e4m3, 2, __fp8x2_e4m3); +DEFINE_VEC(f8_e4m3x4, __fp8_e4m3, 4, __fp8x4_e4m3); +DEFINE_VEC(f8_e4m3x8, __fp8_e4m3, 8, uint2); +DEFINE_VEC(f8_e4m3x16, __fp8_e4m3, 16, uint4); + +DEFINE_VEC(f8_e5m2x2, __fp8_e5m2, 2, __fp8x2_e5m2); +DEFINE_VEC(f8_e5m2x4, __fp8_e5m2, 4, __fp8x4_e5m2); +DEFINE_VEC(f8_e5m2x8, __fp8_e5m2, 8, uint2); +DEFINE_VEC(f8_e5m2x16, __fp8_e5m2, 16, uint4); +#endif +#undef DEFINE_VEC + +#if defined(MSCCLPP_DEVICE_COMPILE) +template +MSCCLPP_DEVICE_INLINE To bit_cast(const 
From& src) { + static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); + + union { + From f; + To t; + } u{.f = src}; + return u.t; +} + +template +MSCCLPP_DEVICE_INLINE T clip(T val) { + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __half clip(__half val) { + val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff)); + val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff)); + + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __half2 clip(__half2 val) { + val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff)); + val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff)); + val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff)); + val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff)); + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __bfloat16 clip(__bfloat16 val) { + val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80)); + val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80)); + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __bfloat162 clip(__bfloat162 val) { + val.x = __hmax(val.x, bit_cast<__bfloat16, unsigned short>(0xff80)); + val.x = __hmin(val.x, bit_cast<__bfloat16, unsigned short>(0x7f80)); + val.y = __hmax(val.y, bit_cast<__bfloat16, unsigned short>(0xff80)); + val.y = __hmin(val.y, bit_cast<__bfloat16, unsigned short>(0x7f80)); + return val; +} + +// FP8 E4M3 clipping function +#if defined(__FP8_TYPES_EXIST__) +template <> +MSCCLPP_DEVICE_INLINE __fp8_e4m3 clip(__fp8_e4m3 val) { + // FP8 E4M3 has range [-448, 448], no infinities + // Built-in saturation in FP8 arithmetic + return val; +} + +// FP8 E5M2 clipping function - prevent infinities by clamping to max finite value +template <> +MSCCLPP_DEVICE_INLINE __fp8_e5m2 clip(__fp8_e5m2 val) { + // FP8 E5M2 has infinities - clamp to max finite value to prevent overflow + // Max finite value for E5M2 is 57344.0f (0x7B), min is -57344.0f (0xFB) + float fval = float(val); + fval = fmaxf(fval, -57344.0f); + fval = fminf(fval, 57344.0f); + return __fp8_e5m2(fval); +} #endif +template +MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) { + __half2 result; + if constexpr (UseClip) { + result = clip(__hadd2(a, b)); + } else { + result = __hadd2(a, b); + } + return result; +} + +template +MSCCLPP_DEVICE_INLINE bf16x2 operator+(const bf16x2& a, const bf16x2& b) { + __bfloat162 result; + if constexpr (UseClip) { + result = clip(__hadd2(a, b)); + } else { + result = __hadd2(a, b); + } + return result; +} + +#if defined(__FP8_TYPES_EXIST__) +template +MSCCLPP_DEVICE_INLINE __fp8_e4m3 operator+(const __fp8_e4m3& a, const __fp8_e4m3& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + // Optimized assembly for gfx942 + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0))); + return static_cast<__hip_fp8_storage_t>(__builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false)); +#elif defined(MSCCLPP_DEVICE_CUDA) + // NVIDIA CUDA FP8 addition (CUDA 11.8+) + __fp8_e4m3 result = __fp8_e4m3(__hadd(__half(a), __half(b))); + return UseClip ? clip(result) : result; +#else + // Fallback for other devices + __fp8_e4m3 result = __fp8_e4m3(float(a) + float(b)); + return UseClip ? 
clip(result) : result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e4m3x2 operator+(const f8_e4m3x2& a, const f8_e4m3x2& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.storage.__x, 0)), + "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.storage.__x, 0))); + return bit_cast( + static_cast<__hip_fp8x2_storage_t>(__builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false))); +#elif defined(MSCCLPP_DEVICE_CUDA) + // CUDA: Convert to half2, add using optimized __hadd2, convert back + return __fp8x2_e4m3(__hadd2(__half2(static_cast<__fp8x2_e4m3>(a)), __half2(static_cast<__fp8x2_e4m3>(b)))); +#else + // Fallback for other devices: element-wise using single-element operations + f8_e4m3x2 result; + result.data[0] = a.data[0] + b.data[0]; + result.data[1] = a.data[1] + b.data[1]; + return result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e4m3x4 operator+(const f8_e4m3x4& a, const f8_e4m3x4& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float2 v_low, v_high; + // E4M3 using fp8 conversion - process low word (false) and high word (true) + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_low) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.storage.__x, false)), + "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.storage.__x, false))); + uint32_t result_packed = __builtin_amdgcn_cvt_pk_fp8_f32(v_low.x, v_low.y, 0, false); + + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_high) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.storage.__x, true)), + "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.storage.__x, true))); + result_packed = __builtin_amdgcn_cvt_pk_fp8_f32(v_high.x, v_high.y, result_packed, true); + return bit_cast(result_packed); +#else + // Process as two f8_e4m3x2 using operator+ for 2 elements + const f8_e4m3x2* a_pair = reinterpret_cast(&a); + const f8_e4m3x2* b_pair = reinterpret_cast(&b); + + f8_e4m3x2 result[2]; + result[0] = a_pair[0] + b_pair[0]; + result[1] = a_pair[1] + b_pair[1]; + + return *reinterpret_cast(result); +#endif +} + +template +MSCCLPP_DEVICE_INLINE __fp8_e5m2 operator+(const __fp8_e5m2& a, const __fp8_e5m2& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + // Optimized assembly for gfx942 (bfloat8) + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0))); + return static_cast<__hip_fp8_storage_t>(__builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false)); +#elif defined(MSCCLPP_DEVICE_CUDA) + // NVIDIA CUDA FP8 addition + __fp8_e5m2 result = __fp8_e5m2(__hadd(__half(a), __half(b))); + return UseClip ? clip(result) : result; +#else + __fp8_e5m2 result = __fp8_e5m2(float(a) + float(b)); + return UseClip ? 
clip(result) : result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e5m2x2 operator+(const f8_e5m2x2& a, const f8_e5m2x2& b) { +#if defined(MSCCLPP_DEVICE_CUDA) + // CUDA: Convert to half2, add using optimized __hadd2, convert back + f8_e5m2x2 result = + __fp8x2_e5m2(__hadd2(__half2(static_cast<__fp8x2_e5m2>(a)), __half2(static_cast<__fp8x2_e5m2>(b)))); + if constexpr (UseClip) { + result = clip(result); + } + return result; +#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + // HIP gfx942: Use BF8 assembly instructions + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.data[0].__x, 0)), + "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.data[0].__x, 0))); + return bit_cast( + static_cast<__hip_fp8x2_storage_t>(__builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, ival, false))); +#else + // Fallback: element-wise using single-element operations + f8_e5m2x2 result; + result.data[0] = a.data[0] + b.data[0]; + result.data[1] = a.data[1] + b.data[1]; + return result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e5m2x4 operator+(const f8_e5m2x4& a, const f8_e5m2x4& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float2 v_low, v_high; + // E5M2 using bf8 conversion - process low word (false) and high word (true) + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_low) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.storage.__x, false)), + "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.storage.__x, false))); + uint32_t result_packed = __builtin_amdgcn_cvt_pk_bf8_f32(v_low.x, v_low.y, 0, false); + + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_high) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.storage.__x, true)), + "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.storage.__x, true))); + result_packed = __builtin_amdgcn_cvt_pk_bf8_f32(v_high.x, v_high.y, result_packed, true); + return bit_cast(result_packed); +#else + // Process as two f8_e5m2x2 using operator+ for 2 elements + const f8_e5m2x2* a_pair = reinterpret_cast(&a); + const f8_e5m2x2* b_pair = reinterpret_cast(&b); + f8_e5m2x2 result[2]; + result[0] = a_pair[0] + b_pair[0]; + result[1] = a_pair[1] + b_pair[1]; + + return *reinterpret_cast(result); +#endif +} +#endif // defined(__FP8_TYPES_EXIST__) + +template +MSCCLPP_DEVICE_INLINE T min(const T& a, const T& b) { + return (a < b ? 
a : b); +} + +template <> +MSCCLPP_DEVICE_INLINE f16x2 min(const f16x2& a, const f16x2& b) { +#if defined(MSCCLPP_DEVICE_HIP) + f16x2 val; + val[0] = __hmin(a[0], b[0]); + val[1] = __hmin(a[1], b[1]); + return val; +#else + __half2 ret = __hmin2(a, b); + return ret; +#endif +} + +template <> +MSCCLPP_DEVICE_INLINE bf16x2 min(const bf16x2& a, const bf16x2& b) { + return __hmin2(a, b); +} + +#if defined(__FP8_TYPES_EXIST__) +template <> +MSCCLPP_DEVICE_INLINE __fp8_e4m3 min(const __fp8_e4m3& a, const __fp8_e4m3& b) { +#if defined(MSCCLPP_DEVICE_HIP) + return __fp8_e4m3(fminf(float(a), float(b))); +#else + return __fp8_e4m3(__hmin(__half(a), __half(b))); +#endif +} + +MSCCLPP_DEVICE_INLINE f8_e4m3x2 min(const f8_e4m3x2& a, const f8_e4m3x2& b) { + // Process element-wise using single-element operations + f8_e4m3x2 result; + result.data[0] = mscclpp::min(a.data[0], b.data[0]); + result.data[1] = mscclpp::min(a.data[1], b.data[1]); + return result; +} + +MSCCLPP_DEVICE_INLINE f8_e4m3x4 min(const f8_e4m3x4& a, const f8_e4m3x4& b) { + // Process as two f8_e4m3x2 using min for 2 elements + const f8_e4m3x2* a_ptr = reinterpret_cast(&a); + const f8_e4m3x2* b_ptr = reinterpret_cast(&b); + + f8_e4m3x4 result; + f8_e4m3x2* result_ptr = reinterpret_cast(&result); + + result_ptr[0] = mscclpp::min(a_ptr[0], b_ptr[0]); + result_ptr[1] = mscclpp::min(a_ptr[1], b_ptr[1]); + + return result; +} + +template <> +MSCCLPP_DEVICE_INLINE __fp8_e5m2 min(const __fp8_e5m2& a, const __fp8_e5m2& b) { +#if defined(MSCCLPP_DEVICE_HIP) + return __fp8_e5m2(fminf(float(a), float(b))); +#else + return __fp8_e5m2(__hmin(__half(a), __half(b))); +#endif +} + +MSCCLPP_DEVICE_INLINE f8_e5m2x2 min(const f8_e5m2x2& a, const f8_e5m2x2& b) { + // Process element-wise using single-element operations + f8_e5m2x2 result; + result.data[0] = mscclpp::min(a.data[0], b.data[0]); + result.data[1] = mscclpp::min(a.data[1], b.data[1]); + return result; +} + +MSCCLPP_DEVICE_INLINE f8_e5m2x4 min(const f8_e5m2x4& a, const f8_e5m2x4& b) { + // Process as two f8_e5m2x2 using min for 2 elements + const f8_e5m2x2* a_ptr = reinterpret_cast(&a); + const f8_e5m2x2* b_ptr = reinterpret_cast(&b); + + f8_e5m2x4 result; + f8_e5m2x2* result_ptr = reinterpret_cast(&result); + + result_ptr[0] = mscclpp::min(a_ptr[0], b_ptr[0]); + result_ptr[1] = mscclpp::min(a_ptr[1], b_ptr[1]); + + return result; +} +#endif // defined(__FP8_TYPES_EXIST__) +#endif // MSCCLPP_DEVICE_COMPILE } // namespace mscclpp #endif // MSCCLPP_GPU_DATA_TYPES_HPP_ diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp index 5f8a1608..b52b6572 100644 --- a/include/mscclpp/switch_channel_device.hpp +++ b/include/mscclpp/switch_channel_device.hpp @@ -80,26 +80,26 @@ struct SwitchChannelDeviceHandle { : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];" : "=r"(val.words[0]), "=r"(val.words[1]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];" : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : 
"l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];" : "=r"(val.words[0]), "=r"(val.words[1]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];" : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : "l"(ptr) @@ -148,23 +148,23 @@ struct SwitchChannelDeviceHandle { asm volatile("multimem.st.relaxed.sys.global.v4.bf16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.e4m3x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v2.e4m3x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v4.e4m3x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.e5m2x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v2.e5m2x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v4.e5m2x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp index fb6c436f..918bff61 100644 --- a/src/core/include/execution_kernel.hpp +++ b/src/core/include/execution_kernel.hpp @@ -17,356 +17,7 @@ #include #include "execution_common.hpp" - -namespace { -#if defined(MSCCLPP_DEVICE_COMPILE) -template -MSCCLPP_DEVICE_INLINE To bit_cast(const From& src) { - static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); - - union { - From f; - To t; - } u; - u.f = src; - return u.t; -} - -template -MSCCLPP_DEVICE_INLINE T add_elements(T a, T b) { - return a + b; -} - -template <> -MSCCLPP_DEVICE_INLINE __half2 add_elements(__half2 a, __half2 b) { - return __hadd2(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __bfloat16 add_elements(__bfloat16 a, __bfloat16 b) { - return __hadd(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __bfloat162 add_elements(__bfloat162 a, __bfloat162 b) { - return __hadd2(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -// FP8 E4M3 addition using __hadd (single element) -template <> -MSCCLPP_DEVICE_INLINE __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // Optimized assembly for gfx942 - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : 
"=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false); -#else - return __fp8_e4m3(__hadd(__half(a), __half(b))); -#endif -} - -// FP8 E5M2 addition using __hadd (single element) - must come before helper functions -template <> -MSCCLPP_DEVICE_INLINE __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // Optimized assembly for gfx942 (bfloat8) - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false); -#else - return __fp8_e5m2(__hadd(__half(a), __half(b))); -#endif -} - -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) -// HIP gfx942 platform: Helper functions for vectorized FP8 operations -// We use separate function names because __fp8x2_e4m3 and __fp8x2_e5m2 are both uint16_t - -// E4M3 vectorized addition for 2 elements -MSCCLPP_DEVICE_INLINE uint16_t add_fp8x2_e4m3(uint16_t a, uint16_t b) { - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false); -} - -// E4M3 vectorized addition for 4 elements -MSCCLPP_DEVICE_INLINE uint32_t add_fp8x4_e4m3(uint32_t a, uint32_t b) { - uint16_t a_low = a & 0xFFFF; - uint16_t a_high = (a >> 16) & 0xFFFF; - uint16_t b_low = b & 0xFFFF; - uint16_t b_high = (b >> 16) & 0xFFFF; - uint16_t result_low = add_fp8x2_e4m3(a_low, b_low); - uint16_t result_high = add_fp8x2_e4m3(a_high, b_high); - return (static_cast(result_high) << 16) | result_low; -} - -// E5M2 vectorized addition for 2 elements -MSCCLPP_DEVICE_INLINE uint16_t add_fp8x2_e5m2(uint16_t a, uint16_t b) { - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b, 0))); - return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, ival, false); -} - -// E5M2 vectorized addition for 4 elements -MSCCLPP_DEVICE_INLINE uint32_t add_fp8x4_e5m2(uint32_t a, uint32_t b) { - uint16_t a_low = a & 0xFFFF; - uint16_t a_high = (a >> 16) & 0xFFFF; - uint16_t b_low = b & 0xFFFF; - uint16_t b_high = (b >> 16) & 0xFFFF; - uint16_t result_low = add_fp8x2_e5m2(a_low, b_low); - uint16_t result_high = add_fp8x2_e5m2(a_high, b_high); - return (static_cast(result_high) << 16) | result_low; -} -#endif - -#if !defined(MSCCLPP_DEVICE_HIP) -// CUDA platform: Template specializations for vectorized FP8 operations - -// FP8 E4M3 vectorized addition using __hadd2 for 2 elements (CUDA only) -template <> -MSCCLPP_DEVICE_INLINE __fp8x2_e4m3 add_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) { - return __fp8x2_e4m3(__hadd2(__half2(a), __half2(b))); -} - -// FP8 E4M3 vectorized addition for 4 elements (CUDA only - via 2x __fp8x2_e4m3) -template <> -MSCCLPP_DEVICE_INLINE __fp8x4_e4m3 add_elements(__fp8x4_e4m3 a, __fp8x4_e4m3 b) { - __fp8x2_e4m3* a_pair = reinterpret_cast<__fp8x2_e4m3*>(&a); - __fp8x2_e4m3* b_pair = reinterpret_cast<__fp8x2_e4m3*>(&b); - - __fp8x2_e4m3 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e4m3*>(result); -} - -// FP8 E5M2 vectorized addition for 2 elements (CUDA 
only) -template <> -MSCCLPP_DEVICE_INLINE __fp8x2_e5m2 add_elements(__fp8x2_e5m2 a, __fp8x2_e5m2 b) { - return __fp8x2_e5m2(__hadd2(__half2(a), __half2(b))); -} - -// FP8 E5M2 vectorized addition for 4 elements (CUDA only - via 2x __fp8x2_e5m2) -template <> -MSCCLPP_DEVICE_INLINE __fp8x4_e5m2 add_elements(__fp8x4_e5m2 a, __fp8x4_e5m2 b) { - __fp8x2_e5m2* a_pair = reinterpret_cast<__fp8x2_e5m2*>(&a); - __fp8x2_e5m2* b_pair = reinterpret_cast<__fp8x2_e5m2*>(&b); - - __fp8x2_e5m2 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e5m2*>(result); -} -#endif -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE int4 add_vectors_helper(int4 a, int4 b) { - int4 ret; - ret.w = bit_cast(add_elements(bit_cast(a.w), bit_cast(b.w))); - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - ret.z = bit_cast(add_elements(bit_cast(a.z), bit_cast(b.z))); - return ret; -} - -template -MSCCLPP_DEVICE_INLINE int4 add_vectors(int4 a, int4 b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__half>(int4 a, int4 b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__bfloat16>(int4 a, int4 b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__fp8_e4m3>(int4 a, int4 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - int4 ret; - ret.w = add_fp8x4_e4m3(a.w, b.w); - ret.x = add_fp8x4_e4m3(a.x, b.x); - ret.y = add_fp8x4_e4m3(a.y, b.y); - ret.z = add_fp8x4_e4m3(a.z, b.z); - return ret; -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__fp8_e5m2>(int4 a, int4 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - int4 ret; - ret.w = add_fp8x4_e5m2(a.w, b.w); - ret.x = add_fp8x4_e5m2(a.x, b.x); - ret.y = add_fp8x4_e5m2(a.y, b.y); - ret.z = add_fp8x4_e5m2(a.z, b.z); - return ret; -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE uint2 add_vectors_helper(uint2 a, uint2 b) { - uint2 ret; - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - return ret; -} - -template -MSCCLPP_DEVICE_INLINE uint2 add_vectors(uint2 a, uint2 b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__half>(uint2 a, uint2 b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__bfloat16>(uint2 a, uint2 b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__fp8_e4m3>(uint2 a, uint2 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - uint2 ret; - ret.x = add_fp8x4_e4m3(a.x, b.x); - ret.y = add_fp8x4_e4m3(a.y, b.y); - return ret; -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 
add_vectors<__fp8_e5m2>(uint2 a, uint2 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - uint2 ret; - ret.x = add_fp8x4_e5m2(a.x, b.x); - ret.y = add_fp8x4_e5m2(a.y, b.y); - return ret; -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE int add_vectors_helper(int a, int b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -MSCCLPP_DEVICE_INLINE int add_vectors(int a, int b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__half>(int a, int b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__bfloat16>(int a, int b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__fp8_e4m3>(int a, int b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e4m3(a, b); -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__fp8_e5m2>(int a, int b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e5m2(a, b); -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE uint32_t add_vectors_helper(uint32_t a, uint32_t b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__bfloat16>(uint32_t a, uint32_t b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__fp8_e4m3>(uint32_t a, uint32_t b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e4m3(a, b); -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__fp8_e5m2>(uint32_t a, uint32_t b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e5m2(a, b); -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -#endif // MSCCLPP_DEVICE_COMPILE - -} // namespace - +#include "reduce_kernel.hpp" namespace mscclpp { #if defined(MSCCLPP_DEVICE_COMPILE) @@ -534,7 +185,7 @@ MSCCLPP_DEVICE_INLINE void handlePut(const Operation& op, void* input, void* out } } -template +template MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input, void* output, void* scratch, uint32_t offset, uint32_t unitSize) { const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); @@ -559,7 +210,7 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input sizeof(int4); void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.inputBufferRefs[index + 1].id]); val = mscclpp::read(remoteMemory, srcOffset + idx); - tmp = add_vectors(tmp, val); + tmp = cal_vector(tmp, val); } output4[outputOffset4 + idx] = tmp; if constexpr (SendToRemote) { @@ -587,7 +238,7 @@ MSCCLPP_DEVICE_INLINE void 
handleReadReduceSend(const Operation& op, void* input getOffset(memoryChannelBufferTypes_[op.inputBufferRefs[index + 1].id], offset)) / sizeof(T); void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.inputBufferRefs[index + 1].id]); - tmp = add_elements(tmp, mscclpp::read(remoteMemory, srcOffset + idx)); + tmp = tmp + mscclpp::read(remoteMemory, srcOffset + idx); } static_cast(output)[idx] = tmp; if constexpr (SendToRemote) { @@ -681,7 +332,7 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat } } -template +template MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* input, void* output, void* scratch) { uint32_t size = op.inputBufferSizes[0]; const uint32_t nSrcs = op.nInputs - 1; @@ -704,9 +355,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* in for (uint32_t index = 0; index < nSrcs; ++index) { PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]); PacketPayload val = pkt[idx].read(flag_); - data = add_vectors(data, val); + data = cal_vector(data, val); } - data = add_vectors(data, srcPacketPayload[idx]); + data = cal_vector(data, srcPacketPayload[idx]); dstPacketPayload[idx] = data; if constexpr (SendToRemote) { @@ -720,7 +371,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* in } } -template +template MSCCLPP_DEVICE_INLINE void handleReduceCopySendPackets(const Operation& op, void* input, void* output, void* scratch) { uint32_t size = op.inputBufferSizes[0]; const uint32_t nSrcs = op.nInputs - 1; @@ -745,9 +396,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceCopySendPackets(const Operation& op, void for (uint32_t index = 0; index < nSrcs; ++index) { PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]); PacketPayload val = pkt[idx].read(flag_); - data = add_vectors(data, val); + data = cal_vector(data, val); } - data = add_vectors(data, srcPacketPayload[idx]); + data = cal_vector(data, srcPacketPayload[idx]); dstPacketPayload[idx] = data; PacketType* dst_val = &dstPkt[idx]; dst_val->write(data, flag_); @@ -790,7 +441,7 @@ MSCCLPP_DEVICE_INLINE void handleCopyPackets(const Operation& op, void* input, v mscclpp::copyToPackets(dst, src, size, threadIdx.x, blockDim.x, flag_); } -template +template MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, void* output, void* scratch, uint32_t offset, uint32_t unitSize) { const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); @@ -815,7 +466,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, vo size_t buffOffset = (inputOffsets[index] + getOffset(outputBufferRefs[index].type, offset)) / sizeof(int4); int4 val = buff4[buffOffset + idx]; - tmp = add_vectors(tmp, val); + tmp = cal_vector(tmp, val); } dst4[dstOffset4 + idx] = tmp; if constexpr (SendToRemote) { @@ -840,7 +491,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, vo T* buff = static_cast(getBuffer(input, output, scratch, inputBufferRefs[index].type)); uint32_t buffOffset = (inputOffsets[index] + getOffset(inputBufferRefs[index].type, offset)) / sizeof(T); - tmp = add_elements(tmp, buff[buffOffset + idx]); + tmp = tmp + buff[buffOffset + idx]; } dst[idx] = tmp; if constexpr (SendToRemote) { @@ -897,7 +548,7 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(const Operation& op, uint3 } } else { // handle data in 16-byte unit - using Type16 = typename mscclpp::VectorType; + using 
Type16 = mscclpp::VectorType; const size_t nType16 = size / sizeof(Type16); const size_t srcOffset16 = srcOffset / sizeof(Type16); const size_t dstOffset16 = dstOffset / sizeof(Type16); @@ -909,7 +560,7 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(const Operation& op, uint3 } // handle rest of data constexpr int RedBytes = (sizeof(T) == 8) ? 8 : 4; - using TypeRest = typename mscclpp::VectorType; + using TypeRest = mscclpp::VectorType; const size_t processed = nType16 * sizeof(Type16); const size_t nRest = (size - processed) / sizeof(TypeRest); TypeRest* srcR = reinterpret_cast(src + srcOffset + processed); diff --git a/src/core/include/reduce_kernel.hpp b/src/core/include/reduce_kernel.hpp new file mode 100644 index 00000000..00dc7714 --- /dev/null +++ b/src/core/include/reduce_kernel.hpp @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_REDUCE_KERNEL_HPP_ +#define MSCCLPP_REDUCE_KERNEL_HPP_ + +#include +#include +#include + +namespace mscclpp { + +#if defined(MSCCLPP_DEVICE_COMPILE) + +// Generic element-wise calculation helper +template +MSCCLPP_DEVICE_INLINE T cal_elements(const T& a, const T& b) { + if constexpr (OpType == SUM) { + return a + b; + } else if constexpr (OpType == MIN) { + return mscclpp::min(a, b); + } + static_assert(OpType == SUM || OpType == MIN, "Unsupported ReduceOp"); +} + +// Generic vector reduction helpers +template +MSCCLPP_DEVICE_INLINE int4 cal_vector_helper(const int4& a, const int4& b) { + int4 ret; + ret.w = bit_cast(cal_elements(bit_cast(a.w), bit_cast(b.w))); + ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); + ret.z = bit_cast(cal_elements(bit_cast(a.z), bit_cast(b.z))); + return ret; +} + +template +MSCCLPP_DEVICE_INLINE uint2 cal_vector_helper(const uint2& a, const uint2& b) { + uint2 ret; + ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); + return ret; +} + +template +MSCCLPP_DEVICE_INLINE int cal_vector_helper(const int& a, const int& b) { + return bit_cast(cal_elements(bit_cast(a), bit_cast(b))); +} + +template +MSCCLPP_DEVICE_INLINE uint32_t cal_vector_helper(const uint32_t& a, const uint32_t& b) { + return bit_cast(cal_elements(bit_cast(a), bit_cast(b))); +} + +// cal_vector wrapper - converts scalar types to vector types and calls cal_vector_helper +template +MSCCLPP_DEVICE_INLINE DataType cal_vector(const DataType& a, const DataType& b) { + // Define the vectorized computation type based on the element type + static_assert(sizeof(DataType) % sizeof(T) == 0, "DataType size must be multiple of T size"); + static_assert(sizeof(DataType) >= 4, "DataType size must be at least 4 bytes"); + using CompType = typename std::conditional_t< + std::is_same_v, f16x2, + std::conditional_t, bf16x2, +#if defined(__FP8_TYPES_EXIST__) + std::conditional_t, f8_e4m3x4, + std::conditional_t, f8_e5m2x4, +#endif + T +#if defined(__FP8_TYPES_EXIST__) + >>>>; +#else + >>; +#endif + return cal_vector_helper(a, b); +} + +#endif // defined(MSCCLPP_DEVICE_COMPILE) + +} // namespace mscclpp + +#endif // MSCCLPP_REDUCE_KERNEL_HPP_ diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index a4881093..f6081043 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -58,7 +58,7 @@ __global__ void 
allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand const int remoteRank = index < rank ? index : index + 1; LL8Packet* dstPkt = (LL8Packet*)scratchBuff + remoteRank * nelems; uint32_t val = dstPkt[idx].read(flag, -1); - data = cal_vectors(val, data); + data = cal_vector(val, data); } dst[idx] = data; } diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index e8cd93bb..d04766c1 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -85,7 +85,7 @@ __global__ void __launch_bounds__(512, 1) for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; - data = cal_vectors(val, data); + data = cal_vector(val, data); } resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { @@ -125,7 +125,7 @@ __global__ void __launch_bounds__(512, 1) for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; - data = cal_vectors(val, data); + data = cal_vector(val, data); } resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index aafe7566..bc7d596a 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -47,7 +47,7 @@ __global__ void __launch_bounds__(1024, 1) continue; } uint val = scratchPkt[peer * worldSize * nPktPerRank + i].read(flag); - data = cal_vectors(data, val); + data = cal_vector(data, val); } dst[i] = data; } diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index d150c717..23ed5d09 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -102,8 +102,8 @@ __global__ void __launch_bounds__(1024, 1) const int remoteRank = index < rank ? 
index : index + 1; mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; uint2 val = dstPkt[idx].read(flag); - data.x = cal_vectors(val.x, data.x); - data.y = cal_vectors(val.y, data.y); + data.x = cal_vector(val.x, data.x); + data.y = cal_vector(val.y, data.y); } dst[idx].x = data.x; diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp index 10eecf7e..26b57dbf 100644 --- a/src/ext/collectives/include/allreduce/common.hpp +++ b/src/ext/collectives/include/allreduce/common.hpp @@ -10,6 +10,8 @@ #include #include +#include "reduce_kernel.hpp" + #if defined(ENABLE_NPKIT) #include #endif @@ -22,438 +24,6 @@ constexpr ReduceOp MIN = ReduceOp::MIN; #if defined(MSCCLPP_DEVICE_COMPILE) -template -__forceinline__ __device__ To bit_cast(const From& src) { - static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); - - union { - From f; - To t; - } u; - u.f = src; - return u.t; -} - -template -__forceinline__ __device__ T clip(T val) { - return val; -} - -template <> -__forceinline__ __device__ __half clip(__half val) { - val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff)); - val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff)); - - return val; -} - -template <> -__forceinline__ __device__ __half2 clip(__half2 val) { - val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff)); - val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff)); - val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff)); - val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff)); - return val; -} - -template <> -__forceinline__ __device__ __bfloat16 clip(__bfloat16 val) { - val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80)); - val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80)); - return val; -} - -template <> -__forceinline__ __device__ __bfloat162 clip(__bfloat162 val) { - val.x = __hmax(val.x, bit_cast<__bfloat16, unsigned short>(0xff80)); - val.x = __hmin(val.x, bit_cast<__bfloat16, unsigned short>(0x7f80)); - val.y = __hmax(val.y, bit_cast<__bfloat16, unsigned short>(0xff80)); - val.y = __hmin(val.y, bit_cast<__bfloat16, unsigned short>(0x7f80)); - return val; -} - -template -__forceinline__ __device__ T add_elements(T a, T b) { - if constexpr (UseClip) { - return clip(a + b); - } else { - return a + b; - } -} - -template -__forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) { - if constexpr (UseClip) { - return clip(__hadd2(a, b)); - } else { - return __hadd2(a, b); - } -} - -template -__forceinline__ __device__ __bfloat162 add_elements(__bfloat162 a, __bfloat162 b) { - if constexpr (UseClip) { - return clip(__hadd2(a, b)); - } else { - return __hadd2(a, b); - } -} - -template -__forceinline__ __device__ T min_elements(T a, T b) { - return (a < b ? 
a : b); -} - -template <> -__forceinline__ __device__ __half2 min_elements(__half2 a, __half2 b) { -#if defined(__HIP_PLATFORM_AMD__) - __half2 val; - val.x = __hmin(a.x, b.x); - val.y = __hmin(a.y, b.y); - return val; -#else - return __hmin2(a, b); -#endif -} - -template <> -__forceinline__ __device__ __bfloat162 min_elements(__bfloat162 a, __bfloat162 b) { - return __hmin2(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -// FP8 E4M3 clipping function -template <> -__forceinline__ __device__ __fp8_e4m3 clip(__fp8_e4m3 val) { - // FP8 E4M3 has range [-448, 448], no infinities - // Built-in saturation in FP8 arithmetic - return val; -} - -// FP8 E5M2 clipping function - prevent infinities by clamping to max finite value -template <> -__forceinline__ __device__ __fp8_e5m2 clip(__fp8_e5m2 val) { - // FP8 E5M2 has infinities - clamp to max finite value to prevent overflow - // Max finite value for E5M2 is 57344.0f (0x7B), min is -57344.0f (0xFB) - float fval = float(val); - fval = fmaxf(fval, -57344.0f); - fval = fminf(fval, 57344.0f); - return __fp8_e5m2(fval); -} - -// FP8 E4M3 addition using __hadd for efficiency (single element) -template -__forceinline__ __device__ __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__) - // Optimized assembly for gfx942 - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false); -#elif !defined(__HIP_PLATFORM_AMD__) - // NVIDIA CUDA FP8 addition (CUDA 11.8+) - __fp8_e4m3 result = __fp8_e4m3(__hadd(__half(a), __half(b))); - return UseClip ? clip(result) : result; -#else - // Fallback for non-gfx942 HIP platforms - __fp8_e4m3 result = __fp8_e4m3(float(a) + float(b)); - return UseClip ? 
clip(result) : result; -#endif -} - -// FP8 E4M3 vectorized addition for 2 elements -template -__forceinline__ __device__ __fp8x2_e4m3 add_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__) - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false); -#elif !defined(__HIP_PLATFORM_AMD__) - // CUDA: Convert to half2, add using optimized __hadd2, convert back - __fp8x2_e4m3 result = __fp8x2_e4m3(__hadd2(__half2(a), __half2(b))); - return result; -#else - // Fallback for non-gfx942 HIP: element-wise using single-element operations - union { - __fp8_e4m3 fp8[2]; - __fp8x2_e4m3 fp8x2; - } ua, ub, result; - ua.fp8x2 = a; - ub.fp8x2 = b; - result.fp8[0] = add_elements(ua.fp8[0], ub.fp8[0]); - result.fp8[1] = add_elements(ua.fp8[1], ub.fp8[1]); - return result.fp8x2; -#endif -} - -// FP8 E4M3 vectorized addition for 4 elements (via 2x __fp8x2_e4m3) -template -__forceinline__ __device__ __fp8x4_e4m3 add_elements(__fp8x4_e4m3 a, __fp8x4_e4m3 b) { - // Process as two __fp8x2_e4m3 using add_elements for 2 elements - __fp8x2_e4m3* a_pair = reinterpret_cast<__fp8x2_e4m3*>(&a); - __fp8x2_e4m3* b_pair = reinterpret_cast<__fp8x2_e4m3*>(&b); - - __fp8x2_e4m3 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e4m3*>(result); -} - -// FP8 E5M2 addition using __hadd for efficiency (single element) -template -__forceinline__ __device__ __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__) - // Optimized assembly for gfx942 (bfloat8) - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false); -#elif !defined(__HIP_PLATFORM_AMD__) - // NVIDIA CUDA FP8 addition - __fp8_e5m2 result = __fp8_e5m2(__hadd(__half(a), __half(b))); - return UseClip ? clip(result) : result; -#else - // Fallback for non-gfx942 HIP platforms - __fp8_e5m2 result = __fp8_e5m2(float(a) + float(b)); - return UseClip ? 
clip(result) : result; -#endif -} - -#if !defined(__HIP_PLATFORM_AMD__) -// FP8 E5M2 vectorized addition for 2 elements (CUDA only) -template -__forceinline__ __device__ __fp8x2_e5m2 add_elements(__fp8x2_e5m2 a, __fp8x2_e5m2 b) { - // CUDA: Convert to half2, add using optimized __hadd2, convert back - __fp8x2_e5m2 result = __fp8x2_e5m2(__hadd2(__half2(a), __half2(b))); - return result; -} - -// FP8 E5M2 vectorized addition for 4 elements (CUDA only - via 2x __fp8x2_e5m2) -template -__forceinline__ __device__ __fp8x4_e5m2 add_elements(__fp8x4_e5m2 a, __fp8x4_e5m2 b) { - // Process as two __fp8x2_e5m2 using add_elements for 2 elements - __fp8x2_e5m2* a_pair = reinterpret_cast<__fp8x2_e5m2*>(&a); - __fp8x2_e5m2* b_pair = reinterpret_cast<__fp8x2_e5m2*>(&b); - - __fp8x2_e5m2 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e5m2*>(result); -} -#endif // !defined(__HIP_PLATFORM_AMD__) - -// FP8 E4M3 min operation (single element) -template <> -__forceinline__ __device__ __fp8_e4m3 min_elements(__fp8_e4m3 a, __fp8_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) - return __fp8_e4m3(fminf(float(a), float(b))); -#else - return __fp8_e4m3(__hmin(__half(a), __half(b))); -#endif -} - -// FP8 E4M3 vectorized min for 2 elements -__forceinline__ __device__ __fp8x2_e4m3 min_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) - // HIP implementation: use union and process element-wise - union { - __fp8_e4m3 fp8[2]; - __fp8x2_e4m3 fp8x2; - } ua, ub, result; - ua.fp8x2 = a; - ub.fp8x2 = b; - result.fp8[0] = min_elements(ua.fp8[0], ub.fp8[0]); - result.fp8[1] = min_elements(ua.fp8[1], ub.fp8[1]); - return result.fp8x2; -#else - return __fp8x2_e4m3(__hmin2(__half2(a), __half2(b))); -#endif -} - -// FP8 E4M3 vectorized min for 4 elements -__forceinline__ __device__ __fp8x4_e4m3 min_elements(__fp8x4_e4m3 a, __fp8x4_e4m3 b) { - // Process as two __fp8x2_e4m3 using min_elements for 2 elements - union { - __fp8x4_e4m3 vec4; - __fp8x2_e4m3 vec2[2]; - } ua, ub, uresult; - ua.vec4 = a; - ub.vec4 = b; - - uresult.vec2[0] = min_elements(ua.vec2[0], ub.vec2[0]); - uresult.vec2[1] = min_elements(ua.vec2[1], ub.vec2[1]); - - return uresult.vec4; -} - -// FP8 E5M2 min operation (single element) -template <> -__forceinline__ __device__ __fp8_e5m2 min_elements(__fp8_e5m2 a, __fp8_e5m2 b) { -#if defined(__HIP_PLATFORM_AMD__) - return __fp8_e5m2(fminf(float(a), float(b))); -#else - return __fp8_e5m2(__hmin(__half(a), __half(b))); -#endif -} - -#if !defined(__HIP_PLATFORM_AMD__) -// FP8 E5M2 vectorized min for 2 elements (CUDA only) -__forceinline__ __device__ __fp8x2_e5m2 min_elements(__fp8x2_e5m2 a, __fp8x2_e5m2 b) { - return __fp8x2_e5m2(__hmin2(__half2(a), __half2(b))); -} - -// FP8 E5M2 vectorized min for 4 elements (CUDA only) -__forceinline__ __device__ __fp8x4_e5m2 min_elements(__fp8x4_e5m2 a, __fp8x4_e5m2 b) { - // Process as two __fp8x2_e5m2 using min_elements for 2 elements - union { - __fp8x4_e5m2 vec4; - __fp8x2_e5m2 vec2[2]; - } ua, ub, uresult; - ua.vec4 = a; - ub.vec4 = b; - - uresult.vec2[0] = min_elements(ua.vec2[0], ub.vec2[0]); - uresult.vec2[1] = min_elements(ua.vec2[1], ub.vec2[1]); - - return uresult.vec4; -} -#endif // !defined(__HIP_PLATFORM_AMD__) -#endif // __FP8_TYPES_EXIST__ - -template -__forceinline__ __device__ T cal_elements(T a, T b) { - if constexpr (OpType == SUM) { - return add_elements(a, b); - } else if constexpr (OpType == MIN) { - return min_elements(a, b); - } - // 
Should never reach here - return a; -} - -template -__forceinline__ __device__ int4 cal_vectors_helper(int4 a, int4 b) { - int4 ret; - ret.w = bit_cast(cal_elements(bit_cast(a.w), bit_cast(b.w))); - ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); - ret.z = bit_cast(cal_elements(bit_cast(a.z), bit_cast(b.z))); - return ret; -} - -template -__forceinline__ __device__ uint2 cal_vectors_helper(uint2 a, uint2 b) { - uint2 ret; - ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); - return ret; -} - -template -__forceinline__ __device__ int cal_vectors_helper(int a, int b) { - return bit_cast(cal_elements(bit_cast(a), bit_cast(b))); -} - -#if defined(__HIP_PLATFORM_AMD__) && defined(__FP8_TYPES_EXIST__) && defined(__gfx942__) -// Helper function to perform FP8 vector addition - dispatches based on scalar type -// Uses AMD builtins from hip/amd_detail/amd_hip_fp8.h: -// - __builtin_amdgcn_cvt_pk_f32_fp8/bf8: Convert 2 FP8 values to 2 floats -// - __builtin_amdgcn_cvt_pk_fp8/bf8_f32: Convert 2 floats to 2 FP8 values -// The 'word' parameter (false/true) selects low/high 16-bit word from uint32_t -template -__forceinline__ __device__ int add_fp8x4_hip(int a, int b) { - uint32_t a32 = static_cast(a); - uint32_t b32 = static_cast(b); - - float2 v_low, v_high; - uint32_t ival = 0; - - if constexpr (std::is_same_v) { - // E4M3 using fp8 conversion - process low word (false) and high word (true) - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_low) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a32, false)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b32, false))); - uint16_t result_low = __builtin_amdgcn_cvt_pk_fp8_f32(v_low.x, v_low.y, ival, false); - - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_high) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a32, true)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b32, true))); - uint16_t result_high = __builtin_amdgcn_cvt_pk_fp8_f32(v_high.x, v_high.y, ival, false); - - uint32_t result = (static_cast(result_high) << 16) | result_low; - return static_cast(result); - } else { // __fp8_e5m2 - // E5M2 using bf8 conversion - process low word (false) and high word (true) - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_low) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a32, false)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b32, false))); - uint16_t result_low = __builtin_amdgcn_cvt_pk_bf8_f32(v_low.x, v_low.y, ival, false); - - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_high) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a32, true)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b32, true))); - uint16_t result_high = __builtin_amdgcn_cvt_pk_bf8_f32(v_high.x, v_high.y, ival, false); - - uint32_t result = (static_cast(result_high) << 16) | result_low; - return static_cast(result); - } -} -#endif - -template -__forceinline__ __device__ DataType cal_vectors(DataType a, DataType b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__FP8_TYPES_EXIST__) && defined(__gfx942__) - // For FP8 types on HIP gfx942, use specialized helper that dispatches based on scalar type - if constexpr (std::is_same_v || std::is_same_v) { - if constexpr (OpType == SUM) { - if constexpr (std::is_same_v || std::is_same_v) { - // Handle int/uint32_t (4 FP8 elements) - return add_fp8x4_hip(a, b); - } else if constexpr (std::is_same_v) { - // Handle int4 (16 FP8 elements) - process as 4 ints - int4 ret; - ret.w = add_fp8x4_hip(a.w, b.w); - ret.x = add_fp8x4_hip(a.x, b.x); - ret.y = 
add_fp8x4_hip(a.y, b.y); - ret.z = add_fp8x4_hip(a.z, b.z); - return ret; - } else if constexpr (std::is_same_v) { - // Handle uint2 (8 FP8 elements) - process as 2 ints - uint2 ret; - ret.x = add_fp8x4_hip(a.x, b.x); - ret.y = add_fp8x4_hip(a.y, b.y); - return ret; - } - } - } -#endif - - // Define the vectorized computation type based on the element type - using CompType = typename std::conditional_t< - std::is_same_v, __half2, - std::conditional_t, __bfloat162, -#if defined(__FP8_TYPES_EXIST__) - std::conditional_t, __fp8x4_e4m3, - std::conditional_t, __fp8x4_e5m2, -#endif - T -#if defined(__FP8_TYPES_EXIST__) - >>>>; -#else - >>; -#endif - return cal_vectors_helper(a, b); -} - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 template MSCCLPP_DEVICE_INLINE constexpr std::size_t calcVectorSize() { @@ -472,7 +42,12 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src // nvls can only handle 4 bytes alignment MSCCLPP_ASSERT_DEVICE(size % 4 == 0, "size must be 4 bytes aligned"); constexpr size_t nElem = calcVectorSize(); - using vectorType = mscclpp::VectorType; + // For integer types, use 1-element vectors since multimem doesn't support vectorized integer operations + constexpr size_t vecSize = (std::is_same_v || std::is_same_v || std::is_same_v || + std::is_same_v) + ? 1 + : nElem; + using vectorType = mscclpp::VectorType; const size_t nVec = size / sizeof(vectorType); const size_t srcOffset4 = srcOffset / sizeof(vectorType); const size_t dstOffset4 = dstOffset / sizeof(vectorType); From 620378b4fb3c9180dc4259d918b1b769d04d6d73 Mon Sep 17 00:00:00 2001 From: Qinghua Zhou Date: Fri, 6 Feb 2026 01:25:12 +0800 Subject: [PATCH 05/52] Fix cpplint error in main branch (#740) Fix the legacy cpplint error in main branch. --------- Co-authored-by: Qinghua Zhou Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Binyang Li --- .../01-basic-concepts/gpu_ping_pong.cu | 14 +++++----- .../02-bootstrap/gpu_ping_pong_mp.cu | 12 ++++---- .../03-memory-channel/bidir_memory_channel.cu | 28 +++++++++---------- .../04-port-channel/bidir_port_channel.cu | 14 +++++----- include/mscclpp/assert_device.hpp | 8 +++--- python/csrc/error_py.cpp | 8 +++--- python/csrc/npkit_py.cpp | 2 +- python/csrc/numa_py.cpp | 2 +- src/core/context.cc | 10 +++---- src/core/include/context.hpp | 6 ++-- src/core/include/gpu_ipc_mem.hpp | 10 +++---- src/core/include/ibverbs_wrapper.hpp | 22 +++++++-------- src/ext/nccl/audit-shim/audit_nccl.cc | 8 +++--- test/unit/local_channel_tests.cu | 6 ++-- 14 files changed, 75 insertions(+), 75 deletions(-) diff --git a/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu b/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu index 0e2ab5ad..f3c69b72 100644 --- a/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu +++ b/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu @@ -9,7 +9,7 @@ #include template -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ... 
<< args); ss << std::endl; @@ -23,7 +23,7 @@ __device__ void spin_cycles(unsigned long long cycles) { } } -__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedWait(); @@ -34,7 +34,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in } } -__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedSignal(); @@ -88,7 +88,7 @@ int main() { mscclpp::Semaphore sema0(/*localSemaphoreStub*/ semaStub0, /*remoteSemaphoreStub*/ semaStub1); mscclpp::BaseMemoryChannel memChan0(sema0); mscclpp::BaseMemoryChannelDeviceHandle memChanHandle0 = memChan0.deviceHandle(); - void *devHandle0; + void* devHandle0; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle0, sizeof(mscclpp::BaseMemoryChannelDeviceHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle0, &memChanHandle0, sizeof(memChanHandle0), cudaMemcpyHostToDevice)); @@ -98,14 +98,14 @@ int main() { mscclpp::Semaphore sema1(/*localSemaphoreStub*/ semaStub1, /*remoteSemaphoreStub*/ semaStub0); mscclpp::BaseMemoryChannel memChan1(sema1); mscclpp::BaseMemoryChannelDeviceHandle memChanHandle1 = memChan1.deviceHandle(); - void *devHandle1; + void* devHandle1; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle1, sizeof(mscclpp::BaseMemoryChannelDeviceHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle1, &memChanHandle1, sizeof(memChanHandle1), cudaMemcpyHostToDevice)); log("GPU 0: Launching gpuKernel0 ..."); MSCCLPP_CUDATHROW(cudaSetDevice(0)); - gpuKernel0<<<1, 1>>>(reinterpret_cast(devHandle0), iter); + gpuKernel0<<<1, 1>>>(reinterpret_cast(devHandle0), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); log("GPU 1: Launching gpuKernel1 ..."); @@ -115,7 +115,7 @@ int main() { MSCCLPP_CUDATHROW(cudaEventCreate(&start)); MSCCLPP_CUDATHROW(cudaEventCreate(&end)); MSCCLPP_CUDATHROW(cudaEventRecord(start)); - gpuKernel1<<<1, 1>>>(reinterpret_cast(devHandle1), iter); + gpuKernel1<<<1, 1>>>(reinterpret_cast(devHandle1), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); MSCCLPP_CUDATHROW(cudaEventRecord(end)); MSCCLPP_CUDATHROW(cudaEventSynchronize(end)); diff --git a/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu b/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu index 05eb1b25..0526407e 100644 --- a/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu +++ b/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu @@ -14,7 +14,7 @@ #define PORT_NUMBER "50505" template -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ... 
<< args); ss << std::endl; @@ -50,7 +50,7 @@ __device__ void spin_cycles(unsigned long long cycles) { } } -__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedWait(); @@ -61,7 +61,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in } } -__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedSignal(); @@ -115,14 +115,14 @@ void worker(int gpuId) { mscclpp::BaseMemoryChannel memChan(sema); auto memChanHandle = memChan.deviceHandle(); - void *devHandle; + void* devHandle; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice)); log("GPU ", gpuId, ": Launching a GPU kernel ..."); if (gpuId == 0) { - gpuKernel0<<<1, 1>>>(reinterpret_cast(devHandle), iter); + gpuKernel0<<<1, 1>>>(reinterpret_cast(devHandle), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); } else { @@ -130,7 +130,7 @@ void worker(int gpuId) { MSCCLPP_CUDATHROW(cudaEventCreate(&start)); MSCCLPP_CUDATHROW(cudaEventCreate(&end)); MSCCLPP_CUDATHROW(cudaEventRecord(start)); - gpuKernel1<<<1, 1>>>(reinterpret_cast(devHandle), iter); + gpuKernel1<<<1, 1>>>(reinterpret_cast(devHandle), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); MSCCLPP_CUDATHROW(cudaEventRecord(end)); MSCCLPP_CUDATHROW(cudaEventSynchronize(end)); diff --git a/examples/tutorials/03-memory-channel/bidir_memory_channel.cu b/examples/tutorials/03-memory-channel/bidir_memory_channel.cu index cfbf12d7..a1be59f2 100644 --- a/examples/tutorials/03-memory-channel/bidir_memory_channel.cu +++ b/examples/tutorials/03-memory-channel/bidir_memory_channel.cu @@ -16,7 +16,7 @@ #define PORT_NUMBER "50505" template -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ... 
<< args); ss << std::endl; @@ -47,7 +47,7 @@ int wait_process(int pid) { __device__ mscclpp::DeviceSyncer devSyncer; -__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) { +__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { devHandle->relaxedSignal(); @@ -65,7 +65,7 @@ __global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si } } -__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) { +__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { devHandle->relaxedSignal(); @@ -79,7 +79,7 @@ __global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si devHandle->get(srcOffset, dstOffset, copyBytes, /*threadId*/ tid, /*numThreads*/ blockDim.x * gridDim.x); } -__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank, +__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank, uint32_t flag) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { @@ -95,7 +95,7 @@ __global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHand devHandle->unpackPackets(pktBufOffset, dstOffset, copyBytes, tid, blockDim.x * gridDim.x, flag); } -void worker(int myRank, int gpuId, const std::string &ipPort) { +void worker(int myRank, int gpuId, const std::string& ipPort) { MSCCLPP_CUDATHROW(cudaSetDevice(gpuId)); const int remoteRank = myRank == 0 ? 
1 : 0; const int nRanks = 2; @@ -132,8 +132,8 @@ void worker(int myRank, int gpuId, const std::string &ipPort) { auto memChanHandle = memChan.deviceHandle(); auto memPktChanHandle = memPktChan.deviceHandle(); - void *devHandle; - void *devPktHandle; + void* devHandle; + void* devPktHandle; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle))); MSCCLPP_CUDATHROW(cudaMalloc(&devPktHandle, sizeof(memPktChanHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice)); @@ -145,18 +145,18 @@ void worker(int myRank, int gpuId, const std::string &ipPort) { std::function kernels[3]; kernels[0] = [&](size_t copyBytes) { - bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast(devHandle), - copyBytes, myRank); + bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast(devHandle), copyBytes, + myRank); }; kernels[1] = [&](size_t copyBytes) { - bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast(devHandle), - copyBytes, myRank); + bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast(devHandle), copyBytes, + myRank); }; kernels[2] = [&](size_t copyBytes) { static uint32_t flag = 1; - bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast(devPktHandle), + bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast(devPktHandle), copyBytes, myRank, flag++); }; @@ -215,7 +215,7 @@ void worker(int myRank, int gpuId, const std::string &ipPort) { bootstrap->barrier(); } -int main(int argc, char **argv) { +int main(int argc, char** argv) { if (argc == 1) { int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); }); int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); }); @@ -241,7 +241,7 @@ int main(int argc, char **argv) { try { rank = std::stoi(argv[2]); gpuId = std::stoi(argv[3]); - } catch (const std::exception &) { + } catch (const std::exception&) { log("Error: rank and gpu_id must be valid integers."); return -1; } diff --git a/examples/tutorials/04-port-channel/bidir_port_channel.cu b/examples/tutorials/04-port-channel/bidir_port_channel.cu index 46064581..9e6d61dd 100644 --- a/examples/tutorials/04-port-channel/bidir_port_channel.cu +++ b/examples/tutorials/04-port-channel/bidir_port_channel.cu @@ -16,7 +16,7 @@ #define PORT_NUMBER "50505" template -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ... << args); ss << std::endl; @@ -45,7 +45,7 @@ int wait_process(int pid) { return -1; } -__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) { +__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { devHandle->signal(); @@ -58,7 +58,7 @@ __global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size } } -void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport transport) { +void worker(int rank, int gpuId, const std::string& ipPort, mscclpp::Transport transport) { MSCCLPP_CUDATHROW(cudaSetDevice(gpuId)); const int myRank = rank; const int remoteRank = myRank == 0 ? 
1 : 0; @@ -90,7 +90,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t auto portChanHandle = portChan.deviceHandle(); - void *devHandle; + void* devHandle; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(portChanHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &portChanHandle, sizeof(portChanHandle), cudaMemcpyHostToDevice)); @@ -100,7 +100,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t std::function kernels[1]; kernels[0] = [&](size_t copyBytes) { - bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast(devHandle), copyBytes, + bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast(devHandle), copyBytes, myRank); }; @@ -166,7 +166,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t bootstrap->barrier(); } -mscclpp::Transport parseTransport(const std::string &transportStr) { +mscclpp::Transport parseTransport(const std::string& transportStr) { if (transportStr == "CudaIpc") return mscclpp::Transport::CudaIpc; if (transportStr == "IB0") return mscclpp::Transport::IB0; if (transportStr == "IB1") return mscclpp::Transport::IB1; @@ -180,7 +180,7 @@ mscclpp::Transport parseTransport(const std::string &transportStr) { throw std::runtime_error("Unknown transport: " + transportStr); } -int main(int argc, char **argv) { +int main(int argc, char** argv) { if (argc == 1) { int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); }); int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); }); diff --git a/include/mscclpp/assert_device.hpp b/include/mscclpp/assert_device.hpp index bf982ba6..1b9cb611 100644 --- a/include/mscclpp/assert_device.hpp +++ b/include/mscclpp/assert_device.hpp @@ -19,11 +19,11 @@ #else // defined(DEBUG_BUILD) #if defined(MSCCLPP_DEVICE_HIP) -extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, - const char *__function); +extern "C" __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line, + const char* __function); #else // !defined(MSCCLPP_DEVICE_HIP) -extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, - const char *__function) __THROW; +extern "C" __host__ __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line, + const char* __function) __THROW; #endif // !defined(MSCCLPP_DEVICE_HIP) /// Assert a condition on the device and print a message if the condition is false. 
diff --git a/python/csrc/error_py.cpp b/python/csrc/error_py.cpp index 208f4e84..c19a3b15 100644 --- a/python/csrc/error_py.cpp +++ b/python/csrc/error_py.cpp @@ -11,17 +11,17 @@ using namespace mscclpp; #define REGISTER_EXCEPTION_TRANSLATOR(name_) \ nb::register_exception_translator( \ - [](const std::exception_ptr &p, void *payload) { \ + [](const std::exception_ptr& p, void* payload) { \ try { \ std::rethrow_exception(p); \ - } catch (const name_ &e) { \ - PyErr_SetObject(reinterpret_cast(payload), \ + } catch (const name_& e) { \ + PyErr_SetObject(reinterpret_cast(payload), \ PyTuple_Pack(2, PyLong_FromLong(long(e.getErrorCode())), PyUnicode_FromString(e.what()))); \ } \ }, \ m.attr(#name_).ptr()); -void register_error(nb::module_ &m) { +void register_error(nb::module_& m) { nb::enum_(m, "CppErrorCode") .value("SystemError", ErrorCode::SystemError) .value("InternalError", ErrorCode::InternalError) diff --git a/python/csrc/npkit_py.cpp b/python/csrc/npkit_py.cpp index 8aaa8011..8c158354 100644 --- a/python/csrc/npkit_py.cpp +++ b/python/csrc/npkit_py.cpp @@ -8,7 +8,7 @@ namespace nb = nanobind; -void register_npkit(nb::module_ &m) { +void register_npkit(nb::module_& m) { nb::module_ sub_m = m.def_submodule("cpp_npkit", "NPKit functions"); sub_m.def("init", &NpKit::Init); sub_m.def("dump", &NpKit::Dump); diff --git a/python/csrc/numa_py.cpp b/python/csrc/numa_py.cpp index 4433ecc8..fadc0f69 100644 --- a/python/csrc/numa_py.cpp +++ b/python/csrc/numa_py.cpp @@ -6,7 +6,7 @@ int getDeviceNumaNode(int cudaDev); void numaBind(int node); }; // namespace mscclpp -void register_numa(nb::module_ &m) { +void register_numa(nb::module_& m) { nb::module_ sub_m = m.def_submodule("cpp_numa", "numa functions"); sub_m.def("get_device_numa_node", &mscclpp::getDeviceNumaNode); sub_m.def("numa_bind", &mscclpp::numaBind); diff --git a/src/core/context.cc b/src/core/context.cc index 9bf299d3..a5cdffb2 100644 --- a/src/core/context.cc +++ b/src/core/context.cc @@ -23,14 +23,14 @@ void CudaIpcStream::setStreamIfNeeded() { } } -void CudaIpcStream::memcpyD2D(void *dst, const void *src, size_t nbytes) { +void CudaIpcStream::memcpyD2D(void* dst, const void* src, size_t nbytes) { CudaDeviceGuard deviceGuard(deviceId_); setStreamIfNeeded(); MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, *stream_)); dirty_ = true; } -void CudaIpcStream::memcpyH2D(void *dst, const void *src, size_t nbytes) { +void CudaIpcStream::memcpyH2D(void* dst, const void* src, size_t nbytes) { CudaDeviceGuard deviceGuard(deviceId_); setStreamIfNeeded(); MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, *stream_)); @@ -48,7 +48,7 @@ void CudaIpcStream::sync() { Context::Impl::Impl() {} -IbCtx *Context::Impl::getIbContext(Transport ibTransport) { +IbCtx* Context::Impl::getIbContext(Transport ibTransport) { // Find IB context or create it auto it = ibContexts_.find(ibTransport); if (it == ibContexts_.end()) { @@ -70,7 +70,7 @@ MSCCLPP_API_CPP Context::Context() : pimpl_(std::make_unique()) {} MSCCLPP_API_CPP Context::~Context() = default; -MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void *ptr, size_t size, TransportFlags transports) { +MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void* ptr, size_t size, TransportFlags transports) { return RegisteredMemory(std::make_shared(ptr, size, transports, *pimpl_)); } @@ -78,7 +78,7 @@ MSCCLPP_API_CPP Endpoint Context::createEndpoint(EndpointConfig config) { return Endpoint(std::make_shared(config, *pimpl_)); } -MSCCLPP_API_CPP 
Connection Context::connect(const Endpoint &localEndpoint, const Endpoint &remoteEndpoint) { +MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) { if (localEndpoint.device().type == DeviceType::GPU && localEndpoint.device().id < 0) { throw Error("No GPU device ID provided for local endpoint", ErrorCode::InvalidUsage); } diff --git a/src/core/include/context.hpp b/src/core/include/context.hpp index b53a2662..ee84d0f7 100644 --- a/src/core/include/context.hpp +++ b/src/core/include/context.hpp @@ -24,9 +24,9 @@ class CudaIpcStream { public: CudaIpcStream(int deviceId); - void memcpyD2D(void *dst, const void *src, size_t nbytes); + void memcpyD2D(void* dst, const void* src, size_t nbytes); - void memcpyH2D(void *dst, const void *src, size_t nbytes); + void memcpyH2D(void* dst, const void* src, size_t nbytes); void sync(); @@ -44,7 +44,7 @@ struct Context::Impl { Impl(); - IbCtx *getIbContext(Transport ibTransport); + IbCtx* getIbContext(Transport ibTransport); std::shared_ptr getToken(); }; diff --git a/src/core/include/gpu_ipc_mem.hpp b/src/core/include/gpu_ipc_mem.hpp index 98fa47f2..923e807d 100644 --- a/src/core/include/gpu_ipc_mem.hpp +++ b/src/core/include/gpu_ipc_mem.hpp @@ -46,7 +46,7 @@ struct GpuIpcMemHandle { char handle[64]; } fabric; - static void deleter(GpuIpcMemHandle *handle); + static void deleter(GpuIpcMemHandle* handle); // We make GpuIpcMemHandle trivially copyable for easy serialization, // and thus it cannot have explicit destructors. @@ -61,7 +61,7 @@ struct GpuIpcMemHandle { using Base::Base; // Allow implicit conversion from Base - UniquePtr(Base &&other) : Base(std::move(other)) {} + UniquePtr(Base&& other) : Base(std::move(other)) {} }; static UniquePtr create(const CUdeviceptr ptr); @@ -70,7 +70,7 @@ struct GpuIpcMemHandle { using UniqueGpuIpcMemHandle = GpuIpcMemHandle::UniquePtr; -std::ostream &operator<<(std::ostream &os, const GpuIpcMemHandle::TypeFlags &typeFlags); +std::ostream& operator<<(std::ostream& os, const GpuIpcMemHandle::TypeFlags& typeFlags); static_assert(std::is_trivially_copyable_v); @@ -82,7 +82,7 @@ class GpuIpcMem : public std::enable_shared_from_this { /// Create a GpuIpcMem instance from a GpuIpcMemHandle. /// @param handle The handle to import. /// @return A shared_ptr to the created GpuIpcMem instance. - static std::shared_ptr create(const GpuIpcMemHandle &handle); + static std::shared_ptr create(const GpuIpcMemHandle& handle); ~GpuIpcMem(); @@ -102,7 +102,7 @@ class GpuIpcMem : public std::enable_shared_from_this { std::shared_ptr mapMulticast(int numDevices, size_t mcOffset, CUdeviceptr bufferAddr, size_t bufferSize); private: - GpuIpcMem(const GpuIpcMemHandle &handle); + GpuIpcMem(const GpuIpcMemHandle& handle); GpuIpcMemHandle handle_; CUmemGenericAllocationHandle allocHandle_; diff --git a/src/core/include/ibverbs_wrapper.hpp b/src/core/include/ibverbs_wrapper.hpp index 45054ff3..b5ab2eff 100644 --- a/src/core/include/ibverbs_wrapper.hpp +++ b/src/core/include/ibverbs_wrapper.hpp @@ -12,12 +12,12 @@ namespace mscclpp { struct IBVerbs { private: - static void *dlsym(const std::string &symbol, bool allowReturnNull = false); + static void* dlsym(const std::string& symbol, bool allowReturnNull = false); public: #define REGISTER_IBV_FUNC_WITH_NAME(name__, func__) \ template \ - static inline auto(name__)(Args && ...args) { \ + static inline auto(name__)(Args && ... 
args) { \ static_assert(sizeof(&::func__) > 0, #func__ " is expected be a function, not a macro"); \ static decltype(&::func__) impl = nullptr; \ if (!impl) impl = reinterpret_cast(IBVerbs::dlsym(#func__)); \ @@ -46,7 +46,7 @@ struct IBVerbs { REGISTER_IBV_FUNC(ibv_wc_status_str) static bool isDmabufSupported(); - static struct ibv_mr *ibv_reg_dmabuf_mr(struct ibv_pd *, uint64_t, size_t, uint64_t, int, int); + static struct ibv_mr* ibv_reg_dmabuf_mr(struct ibv_pd*, uint64_t, size_t, uint64_t, int, int); /// /// Below is for cases where the API (may be / is) a macro. Refer to `infiniband/verbs.h`. @@ -57,8 +57,8 @@ struct IBVerbs { #else // defined(ibv_get_device_list) #undef ibv_get_device_list REGISTER_IBV_FUNC(ibv_static_providers) - static inline struct ibv_device **ibv_get_device_list(int *num_devices) { - using FuncType = struct ibv_device **(*)(int *); + static inline struct ibv_device** ibv_get_device_list(int* num_devices) { + using FuncType = struct ibv_device** (*)(int*); static FuncType impl = nullptr; if (!impl) impl = reinterpret_cast(IBVerbs::dlsym("ibv_get_device_list")); IBVerbs::ibv_static_providers(NULL, _RDMA_STATIC_PREFIX(RDMA_STATIC_PROVIDERS), NULL); @@ -67,21 +67,21 @@ struct IBVerbs { #endif // defined(ibv_get_device_list) #undef ibv_query_port - static inline int ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { + static inline int ibv_query_port(struct ibv_context* context, uint8_t port_num, struct ibv_port_attr* port_attr) { static decltype(&::ibv_query_port) impl = nullptr; if (!impl) impl = reinterpret_cast(IBVerbs::dlsym("ibv_query_port")); - struct verbs_context *vctx = verbs_get_ctx_op(context, query_port); + struct verbs_context* vctx = verbs_get_ctx_op(context, query_port); if (!vctx) { int rc; ::memset(port_attr, 0, sizeof(*port_attr)); - rc = impl(context, port_num, (struct _compat_ibv_port_attr *)port_attr); + rc = impl(context, port_num, (struct _compat_ibv_port_attr*)port_attr); return rc; } return vctx->query_port(context, port_num, port_attr, sizeof(*port_attr)); } #undef ibv_reg_mr - static inline struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { + static inline struct ibv_mr* ibv_reg_mr(struct ibv_pd* pd, void* addr, size_t length, int access) { static decltype(&::ibv_reg_mr) impl = nullptr; static decltype(&::ibv_reg_mr_iova2) impl_iova2 = nullptr; int is_access_const = __builtin_constant_p(((int)(access)&IBV_ACCESS_OPTIONAL_RANGE) == 0); @@ -98,11 +98,11 @@ struct IBVerbs { /// Below is for cases where the API (may be / is) a static function. Refer to `infiniband/verbs.h`. 
/// - static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { + static inline int ibv_post_send(struct ibv_qp* qp, struct ibv_send_wr* wr, struct ibv_send_wr** bad_wr) { return qp->context->ops.post_send(qp, wr, bad_wr); } - static inline int ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc) { + static inline int ibv_poll_cq(struct ibv_cq* cq, int num_entries, struct ibv_wc* wc) { return cq->context->ops.poll_cq(cq, num_entries, wc); } }; diff --git a/src/ext/nccl/audit-shim/audit_nccl.cc b/src/ext/nccl/audit-shim/audit_nccl.cc index 5e3ab6f2..7fdeb67b 100644 --- a/src/ext/nccl/audit-shim/audit_nccl.cc +++ b/src/ext/nccl/audit-shim/audit_nccl.cc @@ -8,11 +8,11 @@ extern "C" __attribute__((visibility("default"))) unsigned int la_version(unsigned int) { return LAV_CURRENT; } -extern "C" __attribute__((visibility("default"))) char *la_objsearch(const char *name, uintptr_t *, unsigned int) { - const char *library = "libmscclpp_nccl.so"; +extern "C" __attribute__((visibility("default"))) char* la_objsearch(const char* name, uintptr_t*, unsigned int) { + const char* library = "libmscclpp_nccl.so"; if (strcmp(name, "libnccl.so.2") && strcmp(name, "libnccl.so") && strcmp(name, "librccl.so") && strcmp(name, "librccl.so.1")) { - return (char *)name; + return (char*)name; } - return (char *)library; + return (char*)library; } \ No newline at end of file diff --git a/test/unit/local_channel_tests.cu b/test/unit/local_channel_tests.cu index 7414f6bb..50ffc9ea 100644 --- a/test/unit/local_channel_tests.cu +++ b/test/unit/local_channel_tests.cu @@ -12,10 +12,10 @@ __constant__ mscclpp::PortChannelDeviceHandle gPortChannel; -__global__ void kernelLocalPortChannelTest(void *dst, void *src, size_t bytes, int *ret) { +__global__ void kernelLocalPortChannelTest(void* dst, void* src, size_t bytes, int* ret) { if (blockIdx.x == 0) { // sender - int *ptr = reinterpret_cast(src); + int* ptr = reinterpret_cast(src); for (size_t idx = threadIdx.x; idx < bytes / sizeof(int); idx += blockDim.x) { ptr[idx] = MAGIC_CONST; } @@ -29,7 +29,7 @@ __global__ void kernelLocalPortChannelTest(void *dst, void *src, size_t bytes, i gPortChannel.wait(); } __syncthreads(); - int *ptr = reinterpret_cast(dst); + int* ptr = reinterpret_cast(dst); for (size_t idx = threadIdx.x; idx < bytes / sizeof(int); idx += blockDim.x) { if (ptr[idx] != MAGIC_CONST) { *ret = 1; // Error: value mismatch From d7925448f38e5e9236ca571b524f4bb01df6f02f Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 7 Feb 2026 04:27:01 +0900 Subject: [PATCH 06/52] Update `copilot-instructions.md` (#722) --- .github/copilot-instructions.md | 10 ++++++++-- .gitignore | 6 ++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 4cf9dbf8..4f13c557 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -25,7 +25,7 @@ For C/C++/CUDA source code: ``` ## Formatting -If you have modified any code in the project, run `./tools/lint.sh` to automatically format the entire source code before finishing iterations. Note that this script formats only staged files. +If you have modified any code in the project, run `./tools/lint.sh` to automatically format the entire source code before finishing iterations. Note that this script formats only files that are tracked by git, so if you have added new files, make sure to `git add` them first. 
## Building and Testing The following commands are commonly used for building and testing the project. See `docs/quickstart.md` for more detailed instructions. @@ -40,7 +40,7 @@ cd .. For testing after successful build: ```bash -# To run all tests +# To run tests with two GPUs - two is enough for most tests mpirun -np 2 ./build/bin/mp_unit_tests # To run tests excluding IB-related ones (when IB is not available) mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib* @@ -51,6 +51,12 @@ For building a Python package: python3 -m pip install -e . ``` +For Python tests after building the package: +```bash +# Run tests with 8 GPUs - adjust the number as needed +mpirun -np 8 python3 -m pytest ./python/test/test_mscclpp.py -vx +``` + For building documentation (see dependencies in `docs/requirements.txt`): ```bash cd docs diff --git a/.gitignore b/.gitignore index 9c4da143..ed3b94c4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,8 @@ .vscode/ -.hypothesis/ build/ -dist/ __pycache__ .*.swp -.idea/ *.so +.pytest_cache/ +_codeql_detected_source_root docs/_static/versions.js -_codeql_detected_source_root \ No newline at end of file From c12822a7af908c6aec7c07385d639f04dd329e35 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 9 Feb 2026 16:55:16 -0800 Subject: [PATCH 07/52] create CI pipeline for rocm (#718) Create CI pipeline for AMD GPU. --- .azure-pipelines/integration-test-rocm.yml | 114 --------------------- .azure-pipelines/templates/ut.yaml | 13 ++- .azure-pipelines/ut-rocm.yml | 50 +++++++++ README.md | 2 +- docker/base-dev-x.dockerfile | 14 ++- docker/base-x-rocm.dockerfile | 19 ---- docker/build.sh | 14 +-- python/requirements_cuda13.txt | 3 +- python/requirements_rocm6.txt | 10 ++ test/CMakeLists.txt | 3 + test/deploy/deploy.sh | 17 ++- test/deploy/setup.sh | 17 ++- 12 files changed, 118 insertions(+), 158 deletions(-) delete mode 100644 .azure-pipelines/integration-test-rocm.yml create mode 100644 .azure-pipelines/ut-rocm.yml delete mode 100644 docker/base-x-rocm.dockerfile diff --git a/.azure-pipelines/integration-test-rocm.yml b/.azure-pipelines/integration-test-rocm.yml deleted file mode 100644 index a4ffcfc3..00000000 --- a/.azure-pipelines/integration-test-rocm.yml +++ /dev/null @@ -1,114 +0,0 @@ -trigger: - branches: - include: - - main - - release/* - paths: - exclude: - - .devcontainer/** - - .github/** - - docker/** - - docs/** - - '**/*.md' - -pr: - branches: - include: - - main - - release/* - drafts: false - paths: - exclude: - - .devcontainer/** - - .github/** - - docker/** - - docs/** - - '**/*.md' - -jobs: -- job: IntegrationTestRocm - displayName: Integration test ROCm - strategy: - matrix: - rocm6.2: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 - - pool: - name: mscclpp-rocm - container: - image: $[ variables['containerImage'] ] - options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1 - - steps: - - task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: Bash@3 - name: InstallRcclTest - displayName: Install rccl-test - inputs: - targetType: 'inline' - script: | - git clone https://github.com/ROCm/rccl-tests.git - cd rccl-tests - make MPI=1 MPI_HOME=/usr/local/mpi HIP_HOME=/opt/rocm -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: Bash@3 - name: InstallDep - displayName: Install dependencies - inputs: - targetType: 'inline' - script: | - set -e - git clone https://github.com/Azure/msccl-tools.git - cd msccl-tools - pip3 install . - - - task: Bash@3 - name: GenerateExectionFiles - displayName: Generate execution files - inputs: - targetType: 'inline' - script: | - set -e - git clone https://$(GIT_USER):$(GIT_PAT)@msazure.visualstudio.com/DefaultCollection/One/_git/msccl-users - cd msccl-users - mkdir execution-files - python3 algos/allreduce_mi300_packet.py 8 8 > execution-files/allreduce_mi300_packet.json - python3 algos/allreduce_mi300_sm_mscclpp.py 8 8 > execution-files/allreduce_mi300_sm_mscclpp.json - - - task: Bash@3 - name: AllReduceTest - displayName: Run mscclpp allReduce test - inputs: - targetType: 'inline' - script: | - set -e - export PATH=/usr/local/mpi/bin:$PATH - sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/lib/libmscclpp_nccl.so" \ - -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100 - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: Bash@3 - name: AllReduceWithExecutionFileTest - displayName: Run mscclpp allReduce with execution file - inputs: - targetType: 'inline' - script: | - set -e - export PATH=/usr/local/mpi/bin:$PATH - sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \ - -x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \ - -x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \ - -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \ - -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100 - workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml index 093a6094..82ff4aac 100644 --- a/.azure-pipelines/templates/ut.yaml +++ b/.azure-pipelines/templates/ut.yaml @@ -5,6 +5,9 @@ parameters: type: string - name: sshKeySecureFile type: string +- name: platform + type: string + default: 'cuda' - name: gpuArch type: string @@ -16,7 +19,11 @@ steps: targetType: 'inline' script: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. + if [ "${{ parameters.platform }}" == "rocm" ]; then + CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. + else + cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. 
+ fi make -j workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -52,7 +59,7 @@ steps: inputs: targetType: filePath filePath: test/deploy/deploy.sh - arguments: "single-node-test" + arguments: "single-node-test true ${{ parameters.platform }}" workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -119,7 +126,7 @@ steps: export PATH=/usr/local/mpi/bin:\$PATH \ export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' kill $CHILD_PID workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/ut-rocm.yml b/.azure-pipelines/ut-rocm.yml new file mode 100644 index 00000000..8b0aed1a --- /dev/null +++ b/.azure-pipelines/ut-rocm.yml @@ -0,0 +1,50 @@ +trigger: + branches: + include: + - main + - release/* + paths: + exclude: + - .devcontainer/** + - .github/** + - apps/** + - docker/** + - docs/** + - '**/*.md' + +pr: + branches: + include: + - main + - release/* + drafts: false + paths: + exclude: + - .devcontainer/** + - .github/** + - apps/** + - docker/** + - docs/** + - '**/*.md' + +jobs: +- job: UnitTestMI300X + timeoutInMinutes: 40 + pool: + name: msccl-ci-mi300x + strategy: + matrix: + rocm6_2: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 + + container: + image: $(containerImage) + + steps: + - template: templates/ut.yaml + parameters: + subscription: mscclpp-ci-mi300x + vmssName: mscclpp-mi300x-ci + sshKeySecureFile: mscclpp.pem + platform: rocm + gpuArch: gfx942 diff --git a/README.md b/README.md index 69ae5add..8f300a2a 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ |--------------------------|-------------------| | Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | | Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) | -| Integration Tests (ROCm) | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test-rocm?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=399295&branchName=main) | +| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut-rocm?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=399295&branchName=main) | A GPU-driven communication stack for scalable AI applications. 
diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile index 04ba1f03..3aa81422 100644 --- a/docker/base-dev-x.dockerfile +++ b/docker/base-dev-x.dockerfile @@ -24,6 +24,16 @@ RUN OS_ARCH=$(uname -m) && \ rm -rf ${CMAKE_HOME}.tar.gz && \ ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/ +# Install ROCm-specific packages if building for ROCm +ARG TARGET="cuda13.0" +RUN if echo "$TARGET" | grep -q "^rocm"; then \ + apt-get update -y && \ + apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/*; \ + fi + # Create Python venv RUN python3 -m venv /root/venv && \ echo 'source /root/venv/bin/activate' >> /root/.bashrc @@ -32,8 +42,10 @@ ENV PATH="/root/venv/bin:${PATH}" # Install Python dependencies ADD . /tmp/mscclpp WORKDIR /tmp/mscclpp -ARG TARGET="cuda13.0" RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \ + if echo "$TARGET" | grep -q "^rocm"; then \ + export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \ + fi && \ pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir -r python/requirements_${target_type}.txt diff --git a/docker/base-x-rocm.dockerfile b/docker/base-x-rocm.dockerfile deleted file mode 100644 index 525ba1d4..00000000 --- a/docker/base-x-rocm.dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp - -ENV DEBIAN_FRONTEND=noninteractive - -ENV RCCL_VERSION=rocm-6.2.0 -ARG GPU_ARCH=gfx942 -ENV ARCH_TARGET=${GPU_ARCH} -RUN cd /tmp && \ - git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \ - cd rccl && \ - ./install.sh --prefix=/opt/rocm --amdgpu_targets ${ARCH_TARGET} && \ - cd .. && \ - rm -rf /tmp/rccl - -WORKDIR / diff --git a/docker/build.sh b/docker/build.sh index e9b10c3a..63552f74 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -12,7 +12,7 @@ baseImageTable=( ["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04" ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04" ["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04" - ["rocm6.2"]="rocm/rocm-terminal:6.2.1" + ["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2" ) declare -A extraLdPathTable @@ -29,6 +29,7 @@ ofedVersionTable=( ["cuda12.8"]="24.10-1.1.4.0" ["cuda12.9"]="24.10-1.1.4.0" ["cuda13.0"]="24.10-3.2.5.0" + ["rocm6.2"]="24.10-1.1.4.0" ) TARGET=${1} @@ -68,18 +69,11 @@ docker build -t ${TAG_TMP} \ if [[ ${TARGET} == rocm* ]]; then echo "Building ROCm base image..." - docker build -t ${TAG_BASE} \ - -f docker/base-x-rocm.dockerfile \ - --build-arg BASE_IMAGE=${TAG_TMP} \ - --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \ - --build-arg TARGET=${TARGET} \ - --build-arg GPU_ARCH="gfx942" . - docker rmi ${TAG_TMP} else echo "Building CUDA base image..." 
- docker tag ${TAG_TMP} ${TAG_BASE} - docker rmi --no-prune ${TAG_TMP} fi +docker tag ${TAG_TMP} ${TAG_BASE} +docker rmi --no-prune ${TAG_TMP} docker build -t ${TAG_BASE_DEV} \ -f docker/base-dev-x.dockerfile \ diff --git a/python/requirements_cuda13.txt b/python/requirements_cuda13.txt index b49a404c..49cf13bc 100644 --- a/python/requirements_cuda13.txt +++ b/python/requirements_cuda13.txt @@ -6,4 +6,5 @@ pytest numpy matplotlib sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed -blake3 \ No newline at end of file +blake3 +pybind11 \ No newline at end of file diff --git a/python/requirements_rocm6.txt b/python/requirements_rocm6.txt index e69de29b..d2a3389b 100644 --- a/python/requirements_rocm6.txt +++ b/python/requirements_rocm6.txt @@ -0,0 +1,10 @@ +mpi4py==4.1.1 +cupy==13.6.0 +prettytable +netifaces +pytest +numpy +matplotlib +sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +blake3 +pybind11 \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8e41aac5..6452ebf8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -14,6 +14,9 @@ set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/core/include) if(MSCCLPP_USE_ROCM) file(GLOB_RECURSE CU_SOURCES CONFIGURE_DEPENDS *.cu) set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX) + foreach(arch ${MSCCLPP_GPU_ARCHS}) + add_compile_options(--offload-arch=${arch}) + endforeach() endif() function(add_test_executable name sources) diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh index ccf85abd..b26ff1a8 100644 --- a/test/deploy/deploy.sh +++ b/test/deploy/deploy.sh @@ -1,8 +1,8 @@ set -e -# get parameter from $1 and $2 TEST_NAME=$1 IB_ENVIRONMENT="${2:-true}" +PLATFORM="${3:-cuda}" KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/" @@ -35,20 +35,29 @@ set -e parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}" parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR} +if [ "${PLATFORM}" == "rocm" ]; then + parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu" +fi + # force to pull the latest image parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ "sudo docker pull ${CONTAINERIMAGE}" + +LAUNCH_OPTION="--gpus=all" +if [ "${PLATFORM}" == "rocm" ]; then + LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video" +fi if [ "${IB_ENVIRONMENT}" == "true" ]; then parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \ + "sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \ -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \ --entrypoint /bin/bash ${CONTAINERIMAGE}" else parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker run --rm -itd --net=host --ipc=host --gpus=all --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \ + "sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \ -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \ --entrypoint /bin/bash ${CONTAINERIMAGE}" fi parallel-ssh -i -t 0 -h 
${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh'" + "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}" diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh index 4916d2eb..80cd10b1 100644 --- a/test/deploy/setup.sh +++ b/test/deploy/setup.sh @@ -1,5 +1,7 @@ set -e +PLATFORM="${1:-cuda}" + mkdir -p /root/.ssh mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys chown root:root /root/.ssh/authorized_keys @@ -8,10 +10,12 @@ chown root:root /root/.ssh/config chmod 400 /root/mscclpp/sshkey chown root:root /root/mscclpp/sshkey -nvidia-smi -pm 1 -for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i -done +if [ "${PLATFORM}" == "cuda" ]; then + nvidia-smi -pm 1 + for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do + nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i + done +fi make -C /root/mscclpp/tools/peer-access-test /root/mscclpp/tools/peer-access-test/peer_access_test @@ -19,10 +23,13 @@ make -C /root/mscclpp/tools/peer-access-test clean if [[ "${CUDA_VERSION}" == *"11."* ]]; then pip3 install -r /root/mscclpp/python/requirements_cuda11.txt -else +elif [[ "${CUDA_VERSION}" == *"12."* ]]; then pip3 install -r /root/mscclpp/python/requirements_cuda12.txt fi +if [ "${PLATFORM}" == "rocm" ]; then + export CXX=/opt/rocm/bin/hipcc +fi cd /root/mscclpp && pip3 install . pip3 install setuptools_scm python3 -m setuptools_scm --force-write-version-files From 42be3660e0db0279e02ed262edb03202d1570e74 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 10 Feb 2026 10:07:53 +0900 Subject: [PATCH 08/52] Add a new IB stack impl that doesn't use RDMA atomics (#728) * Added configurable InfiniBand (IB) signaling mode. `EndpointConfig::Ib::Mode` enum selects the mode (`Default`, `Host`, `HostNoAtomic`). `Default` is equivalent to `Host` unless specified different by envrionment `MSCCLPP_IBV_MODE`. `Host` corresponds to the previous implementation using RDMA atomics for signaling, while `HostNoAtomic` uses write-with-immediate instead. * Regarding updates in Python bindings and API. 
--- include/mscclpp/core.hpp | 30 +++- include/mscclpp/env.hpp | 6 + python/csrc/core_py.cpp | 21 ++- python/csrc/env_py.cpp | 1 + python/mscclpp/__init__.py | 4 + src/core/communicator.cc | 1 - src/core/connection.cc | 157 +++++++++++++++-- src/core/endpoint.cc | 28 ++- src/core/env.cpp | 2 + src/core/gpu_utils.cc | 43 +---- src/core/ib.cc | 220 +++++++++++++++++------- src/core/include/connection.hpp | 35 ++++ src/core/include/endpoint.hpp | 2 + src/core/include/gpu_utils_internal.hpp | 64 +++++++ src/core/include/ib.hpp | 95 ++++++---- src/core/include/ibverbs_wrapper.hpp | 4 + src/core/semaphore.cc | 5 +- test/mp_unit/ib_tests.cu | 41 ++--- test/mp_unit/mp_unit_tests.hpp | 18 +- test/mp_unit/port_channel_tests.cu | 93 +++++++--- 20 files changed, 648 insertions(+), 222 deletions(-) create mode 100644 src/core/include/gpu_utils_internal.hpp diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 38b05ccf..37bdbd51 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -381,11 +381,19 @@ struct EndpointConfig { /// These settings are only used when the transport is an InfiniBand type (IB0-IB7); they are ignored for other /// transports. struct Ib { + /// IB mode for signaling, used to select between different implementations. + enum class Mode { + Default, // Use the MSCCLPP_IBV_MODE environment variable (or "host" if unset). + Host, // Use the host stack with RDMA atomics. + HostNoAtomic // Use the host stack with write-with-immediate signaling (no RDMA atomics). + }; + static constexpr int DefaultPort = -1; static constexpr int DefaultGidIndex = 0; static constexpr int DefaultMaxCqSize = 1024; static constexpr int DefaultMaxCqPollNum = 1; static constexpr int DefaultMaxSendWr = 8192; + static constexpr int DefaultMaxRecvWr = 16; static constexpr int DefaultMaxWrPerSend = 64; /// Device index. Currently ignored; use transport type (IB0-IB7) to select device. @@ -394,32 +402,41 @@ struct EndpointConfig { int port; /// GID index. int gidIndex; - /// Maximum size of the completion queue. + /// Maximum size of the send completion queue. int maxCqSize; - /// Maximum number of completion queue polls per operation. + /// Maximum number of send completion queue polls per operation. int maxCqPollNum; /// Maximum number of outstanding send work requests. int maxSendWr; + /// Maximum number of outstanding receive work requests (used in HostNoAtomic mode for write-with-immediate). + int maxRecvWr; /// Maximum number of work requests per send operation. int maxWrPerSend; + /// IB mode for signaling. When set to Default, uses the MSCCLPP_IBV_MODE environment variable. + Mode mode; /// Constructor. /// @param deviceIndex Device index. /// @param port Port number. /// @param gidIndex GID index. - /// @param maxCqSize Maximum completion queue size. - /// @param maxCqPollNum Maximum completion queue poll count. + /// @param maxCqSize Maximum send completion queue size. + /// @param maxCqPollNum Maximum send completion queue poll count. /// @param maxSendWr Maximum outstanding send work requests. + /// @param maxRecvWr Maximum outstanding receive work requests (for HostNoAtomic mode). /// @param maxWrPerSend Maximum work requests per send operation. + /// @param mode IB mode for signaling (Default uses MSCCLPP_IBV_MODE env variable). 
Ib(int deviceIndex = -1, int port = DefaultPort, int gidIndex = DefaultGidIndex, int maxCqSize = DefaultMaxCqSize, - int maxCqPollNum = DefaultMaxCqPollNum, int maxSendWr = DefaultMaxSendWr, int maxWrPerSend = DefaultMaxWrPerSend) + int maxCqPollNum = DefaultMaxCqPollNum, int maxSendWr = DefaultMaxSendWr, int maxRecvWr = DefaultMaxRecvWr, + int maxWrPerSend = DefaultMaxWrPerSend, Mode mode = Mode::Default) : deviceIndex(deviceIndex), port(port), gidIndex(gidIndex), maxCqSize(maxCqSize), maxCqPollNum(maxCqPollNum), maxSendWr(maxSendWr), - maxWrPerSend(maxWrPerSend) {} + maxRecvWr(maxRecvWr), + maxWrPerSend(maxWrPerSend), + mode(mode) {} }; /// Communication transport type (e.g., CudaIpc, IB0-IB7, Ethernet). @@ -658,6 +675,7 @@ class Connection { friend class SemaphoreStub; friend class Semaphore; friend class ProxyService; + friend class BaseConnection; }; /// SemaphoreStub object only used for constructing Semaphore, not for direct use by the user. diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index 9d78cd1a..bd3983e9 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -54,6 +54,12 @@ class Env { /// default libibverbs library found in the system. const std::string ibvSo; + /// Env name: `MSCCLPP_IBV_MODE`. Selects the IB stack implementation for PortChannel. + /// Allowed values: + /// - "host": use the host stack with RDMA atomics (default). + /// - "host-no-atomic": use the host stack with write-with-immediate signaling (no RDMA atomics). + const std::string ibvMode; + /// Env name: `MSCCLPP_HOSTID`. A string that uniquely identifies the host. If unset, it will use the hostname. /// This is used to determine whether the host is the same across different processes. const std::string hostid; diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp index 9f085675..a862c7e5 100644 --- a/python/csrc/core_py.cpp +++ b/python/csrc/core_py.cpp @@ -147,22 +147,31 @@ void register_core(nb::module_& m) { return ss.str(); }); + nb::enum_(m, "CppIbMode") + .value("Default", EndpointConfig::Ib::Mode::Default) + .value("Host", EndpointConfig::Ib::Mode::Host) + .value("HostNoAtomic", EndpointConfig::Ib::Mode::HostNoAtomic); + nb::class_(m, "CppEndpointConfigIb") .def(nb::init<>()) - .def(nb::init(), nb::arg("device_index") = -1, + .def(nb::init(), nb::arg("device_index") = -1, nb::arg("port") = EndpointConfig::Ib::DefaultPort, nb::arg("gid_index") = EndpointConfig::Ib::DefaultGidIndex, nb::arg("max_cq_size") = EndpointConfig::Ib::DefaultMaxCqSize, nb::arg("max_cq_poll_num") = EndpointConfig::Ib::DefaultMaxCqPollNum, nb::arg("max_send_wr") = EndpointConfig::Ib::DefaultMaxSendWr, - nb::arg("max_wr_per_send") = EndpointConfig::Ib::DefaultMaxWrPerSend) + nb::arg("max_recv_wr") = EndpointConfig::Ib::DefaultMaxRecvWr, + nb::arg("max_wr_per_send") = EndpointConfig::Ib::DefaultMaxWrPerSend, + nb::arg("mode") = EndpointConfig::Ib::Mode::Default) .def_rw("device_index", &EndpointConfig::Ib::deviceIndex) .def_rw("port", &EndpointConfig::Ib::port) .def_rw("gid_index", &EndpointConfig::Ib::gidIndex) .def_rw("max_cq_size", &EndpointConfig::Ib::maxCqSize) .def_rw("max_cq_poll_num", &EndpointConfig::Ib::maxCqPollNum) .def_rw("max_send_wr", &EndpointConfig::Ib::maxSendWr) - .def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend); + .def_rw("max_recv_wr", &EndpointConfig::Ib::maxRecvWr) + .def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend) + .def_rw("mode", &EndpointConfig::Ib::mode); nb::class_(m, "CppRegisteredMemory") .def(nb::init<>()) @@ -223,9 +232,15 @@ 
void register_core(nb::module_& m) { .def_prop_rw( "ib_max_send_wr", [](EndpointConfig& self) { return self.ib.maxSendWr; }, [](EndpointConfig& self, int v) { self.ib.maxSendWr = v; }) + .def_prop_rw( + "ib_max_recv_wr", [](EndpointConfig& self) { return self.ib.maxRecvWr; }, + [](EndpointConfig& self, int v) { self.ib.maxRecvWr = v; }) .def_prop_rw( "ib_max_wr_per_send", [](EndpointConfig& self) { return self.ib.maxWrPerSend; }, [](EndpointConfig& self, int v) { self.ib.maxWrPerSend = v; }) + .def_prop_rw( + "ib_mode", [](EndpointConfig& self) { return self.ib.mode; }, + [](EndpointConfig& self, EndpointConfig::Ib::Mode v) { self.ib.mode = v; }) .def_rw("max_write_queue_size", &EndpointConfig::maxWriteQueueSize); nb::class_(m, "CppContext") diff --git a/python/csrc/env_py.cpp b/python/csrc/env_py.cpp index 360acc6f..ce89fd3d 100644 --- a/python/csrc/env_py.cpp +++ b/python/csrc/env_py.cpp @@ -20,6 +20,7 @@ void register_env(nb::module_& m) { .def_ro("socket_family", &Env::socketFamily) .def_ro("socket_ifname", &Env::socketIfname) .def_ro("comm_id", &Env::commId) + .def_ro("ibv_mode", &Env::ibvMode) .def_ro("cache_dir", &Env::cacheDir) .def_ro("npkit_dump_dir", &Env::npkitDumpDir) .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream); diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 86923003..5f3a2302 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -29,6 +29,8 @@ from ._mscclpp import ( CppConnection as Connection, connect_nvls_collective, CppEndpointConfig as EndpointConfig, + CppEndpointConfigIb as EndpointConfigIb, + CppIbMode as IbMode, CppFifo as Fifo, CppSemaphore as Semaphore, CppHost2DeviceSemaphore as Host2DeviceSemaphore, @@ -61,6 +63,8 @@ __all__ = [ "Connection", "connect_nvls_collective", "EndpointConfig", + "EndpointConfigIb", + "IbMode", "ErrorCode", "Fifo", "Semaphore", diff --git a/src/core/communicator.cc b/src/core/communicator.cc index a146f0de..c95ca421 100644 --- a/src/core/communicator.cc +++ b/src/core/communicator.cc @@ -4,7 +4,6 @@ #include "communicator.hpp" #include "api.h" -#include "debug.h" namespace mscclpp { diff --git a/src/core/connection.cc b/src/core/connection.cc index 10a43e88..6466ca2a 100644 --- a/src/core/connection.cc +++ b/src/core/connection.cc @@ -7,7 +7,7 @@ #include #endif -#include +#include #include #include #include @@ -15,6 +15,7 @@ #include "api.h" #include "context.hpp" #include "endpoint.hpp" +#include "gpu_utils_internal.hpp" #include "logger.hpp" namespace mscclpp { @@ -180,25 +181,131 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { // IBConnection +void IBConnection::recvThreadFunc() { + // Set the CUDA device context for this thread + if (localGpuDeviceId_ >= 0) { + cudaError_t err = cudaSetDevice(localGpuDeviceId_); + if (err != cudaSuccess) { + WARN(NET, "IBConnection recvThreadFunc: cudaSetDevice(", localGpuDeviceId_, + ") failed: ", cudaGetErrorString(err)); + return; + } + // Bind this thread to the NUMA node of the local GPU for optimal memory access + int deviceNumaNode = getDeviceNumaNode(localGpuDeviceId_); + if (deviceNumaNode >= 0) { + numaBind(deviceNumaNode); + } + } + + // Host-side buffer to receive newValue from imm_data (need 64-bit for cudaMemcpy) + uint64_t newValueHost = 0; + + while (!stopRecvThread_.load(std::memory_order_relaxed)) { + auto qp = qp_.lock(); + if (!qp) break; + + int wcNum = qp->pollRecvCq(); + if (wcNum < 0) { + WARN(NET, "IBConnection recvThreadFunc: pollRecvCq failed"); + break; + } + + for (int i = 0; i < wcNum; 
++i) { + int status = qp->getRecvWcStatus(i); + if (status != static_cast(WsStatus::Success)) { + WARN(NET, "IBConnection recvThreadFunc: recv work completion failed: ", qp->getRecvWcStatusString(i)); + // Post another recv to replace the failed one + qp->stageRecv(/*wrId=*/0); + qp->postRecv(); + continue; + } + + // The imm_data contains newValue (32-bit, extended to 64-bit) + // Note: getRecvWcImmData already converts from network byte order via ntohl + unsigned int immData = qp->getRecvWcImmData(i); + newValueHost = static_cast(immData); + + // Read dstGpuAddr from the local stored address (set by setRemoteUpdateDstAddr) + uint64_t dstGpuAddr = remoteUpdateDstAddr_; + if (dstGpuAddr != 0) { + uint64_t* dstPtr = reinterpret_cast(dstGpuAddr); + + // Use cudaMemcpyAsync with our dedicated stream to avoid blocking on the default stream + MSCCLPP_CUDATHROW( + cudaMemcpyAsync(dstPtr, &newValueHost, sizeof(uint64_t), cudaMemcpyHostToDevice, signalStream_)); + + INFO(CONN, "IBConnection recvThreadFunc: updated GPU ptr ", dstPtr, " to ", newValueHost, " (immData=", immData, + ")"); + } + + // Post another recv for future messages + qp->stageRecv(/*wrId=*/0); + qp->postRecv(); + } + } +} + IBConnection::IBConnection(std::shared_ptr context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) : BaseConnection(context, localEndpoint), transport_(localEndpoint.transport()), remoteTransport_(remoteEndpoint.transport()), - dummyAtomicSource_(std::make_unique(0)) { + dummyAtomicSource_(std::make_unique(0)), + ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_), + stopRecvThread_(false), + localGpuDeviceId_(localEndpoint.device().id), + signalStream_(nullptr), + remoteUpdateDstAddr_(0) { qp_ = getImpl(localEndpoint).ibQp_; qp_.lock()->rtr(getImpl(remoteEndpoint).ibQpInfo_); qp_.lock()->rts(); dummyAtomicSourceMem_ = context->registerMemory(dummyAtomicSource_.get(), sizeof(uint64_t), transport_); validateTransport(dummyAtomicSourceMem_, transport_); dstTransportInfo_ = getImpl(dummyAtomicSourceMem_).getTransportInfo(transport_); - INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created"); + + if (ibNoAtomic_) { + // Create a CUDA stream for async memory copies + MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&signalStream_, cudaStreamNonBlocking)); + + // Pre-post receive requests for incoming write-with-imm + auto qp = qp_.lock(); + int maxRecvWr = localEndpoint.config().ib.maxRecvWr; + for (int i = 0; i < maxRecvWr; ++i) { + qp->stageRecv(/*wrId=*/0); + } + qp->postRecv(); + // Start the background thread to poll recv CQ + recvThread_ = std::thread([this]() { this->recvThreadFunc(); }); + INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with no-atomic mode"); + } else { + INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with atomic mode"); + } +} + +IBConnection::~IBConnection() { + if (ibNoAtomic_) { + stopRecvThread_.store(true, std::memory_order_relaxed); + if (recvThread_.joinable()) { + recvThread_.join(); + } + if (signalStream_ != nullptr) { + // Synchronize stream to ensure all async copies are complete before destruction + // Ignore errors during teardown (CUDA context may already be destroyed) + MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamSynchronize(signalStream_)); + MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamDestroy(signalStream_)); + } + } } Transport IBConnection::transport() const { return transport_; } Transport IBConnection::remoteTransport() const { return remoteTransport_; } +void 
IBConnection::setRemoteUpdateDstAddr(uint64_t addr) { + remoteUpdateDstAddr_ = addr; + INFO(CONN, "IBConnection setRemoteUpdateDstAddr: ", (void*)addr); +} + void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_CONN_IB_WRITE_ENTRY) @@ -220,8 +327,8 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem auto dstMrInfo = dstTransportInfo.ibMrInfo; auto srcMr = srcTransportInfo.ibMr; - qp_.lock()->stageSend(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, - /*signaled=*/true); + qp_.lock()->stageSendWrite(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, + /*dstOffset=*/dstOffset, /*signaled=*/true); qp_.lock()->postSend(); INFO(CONN, "IBConnection write: from ", (uint8_t*)srcMr->getBuff() + srcOffset, " to ", @@ -248,12 +355,28 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6 uint64_t oldValue = *src; *src = newValue; - qp_.lock()->stageAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue, - /*signaled=*/true); + if (ibNoAtomic_) { + // Use RDMA write-with-imm instead of atomic operation + // Send only newValue in imm_data (0-byte write) + // The remote's recvThreadFunc will use its stored remoteUpdateDstAddr_ to write - qp_.lock()->postSend(); - INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue, - " -> ", newValue); + // Put newValue in imm_data (truncated to 32-bit; semaphore counters should fit) + unsigned int immData = static_cast(newValue); + + // Send 0-byte write-with-imm; use dstMrInfo as target (we don't actually write anything) + qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo, + /*size=*/0, /*wrId=*/0, + /*srcOffset=*/0, /*dstOffset=*/0, + /*signaled=*/true, /*immData=*/immData); + qp_.lock()->postSend(); + INFO(CONN, "IBConnection write-with-imm: value ", oldValue, " -> ", newValue); + } else { + qp_.lock()->stageSendAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue, + /*signaled=*/true); + qp_.lock()->postSend(); + INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue, + " -> ", newValue); + } #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_CONN_IB_UPDATE_AND_SYNC_EXIT) NpKit::CollectCpuEvent(NPKIT_EVENT_CONN_IB_UPDATE_AND_SYNC_EXIT, 0, 0, *NpKit::GetCpuTimestamp(), 0); @@ -266,21 +389,21 @@ void IBConnection::flush(int64_t timeoutUsec) { #endif Timer timer; - while (qp_.lock()->getNumCqItems()) { - int wcNum = qp_.lock()->pollCq(); + while (qp_.lock()->getNumSendCqItems()) { + int wcNum = qp_.lock()->pollSendCq(); if (wcNum < 0) { - THROW(NET, IbError, errno, "pollCq failed"); + THROW(NET, IbError, errno, "pollSendCq failed"); } else if (timeoutUsec >= 0) { auto elapsed = timer.elapsed(); if (elapsed > timeoutUsec) { - THROW(CONN, Error, ErrorCode::Timeout, "pollCq timed out: waited for ", elapsed / 1e6, " seconds. Expected ", - qp_.lock()->getNumCqItems(), " signals"); + THROW(CONN, Error, ErrorCode::Timeout, "pollSendCq timed out: waited for ", elapsed / 1e6, + " seconds. 
Expected ", qp_.lock()->getNumSendCqItems(), " signals"); } } for (int i = 0; i < wcNum; ++i) { - int status = qp_.lock()->getWcStatus(i); + int status = qp_.lock()->getSendWcStatus(i); if (status != static_cast(WsStatus::Success)) { - THROW(NET, Error, ErrorCode::SystemError, "an IB work item failed: ", qp_.lock()->getWcStatusString(i)); + THROW(NET, Error, ErrorCode::SystemError, "an IB work item failed: ", qp_.lock()->getSendWcStatusString(i)); } } } diff --git a/src/core/endpoint.cc b/src/core/endpoint.cc index 3833fdc4..4795aa62 100644 --- a/src/core/endpoint.cc +++ b/src/core/endpoint.cc @@ -4,9 +4,13 @@ #include "endpoint.hpp" #include +#include #include "api.h" #include "context.hpp" +#include "ib.hpp" +#include "logger.hpp" +#include "registered_memory.hpp" #include "serialization.hpp" #include "socket.h" #include "utils_internal.hpp" @@ -23,9 +27,31 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl) if (config_.maxWriteQueueSize <= 0) { config_.maxWriteQueueSize = config_.ib.maxCqSize; } + + // Determine if we should use no-atomics mode + ibNoAtomic_ = false; + if (config_.ib.mode == EndpointConfig::Ib::Mode::HostNoAtomic) { + ibNoAtomic_ = true; + } else if (config_.ib.mode == EndpointConfig::Ib::Mode::Default) { + // Use environment variable when mode is Default + ibNoAtomic_ = (env()->ibvMode == "host-no-atomic"); + } + + // If mode is Host (or Default resolved to host), check if atomics are supported + if (!ibNoAtomic_) { + IbCtx* ibCtx = contextImpl.getIbContext(config_.transport); + if (!ibCtx->supportsRdmaAtomics()) { + WARN(NET, "IB device ", ibCtx->getDevName(), + " does not support RDMA atomics. Falling back to write-with-immediate mode (HostNoAtomic)."); + ibNoAtomic_ = true; + } + } + + int maxRecvWr = ibNoAtomic_ ? 
config_.ib.maxRecvWr : 0; + ibQp_ = contextImpl.getIbContext(config_.transport) ->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum, - config_.ib.maxSendWr, 0, config_.ib.maxWrPerSend); + config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend); ibQpInfo_ = ibQp_->getInfo(); } else if (config_.transport == Transport::Ethernet) { // Configuring Ethernet Interfaces diff --git a/src/core/env.cpp b/src/core/env.cpp index 508208e9..a70e3d28 100644 --- a/src/core/env.cpp +++ b/src/core/env.cpp @@ -54,6 +54,7 @@ Env::Env() logFile(readEnv("MSCCLPP_LOG_FILE", "")), hcaDevices(readEnv("MSCCLPP_HCA_DEVICES", "")), ibvSo(readEnv("MSCCLPP_IBV_SO", "")), + ibvMode(readEnv("MSCCLPP_IBV_MODE", "host")), hostid(readEnv("MSCCLPP_HOSTID", "")), socketFamily(readEnv("MSCCLPP_SOCKET_FAMILY", "")), socketIfname(readEnv("MSCCLPP_SOCKET_IFNAME", "")), @@ -80,6 +81,7 @@ std::shared_ptr env() { logEnv("MSCCLPP_LOG_FILE", globalEnv->logFile); logEnv("MSCCLPP_HCA_DEVICES", globalEnv->hcaDevices); logEnv("MSCCLPP_IBV_SO", globalEnv->ibvSo); + logEnv("MSCCLPP_IBV_MODE", globalEnv->ibvMode); logEnv("MSCCLPP_HOSTID", globalEnv->hostid); logEnv("MSCCLPP_SOCKET_FAMILY", globalEnv->socketFamily); logEnv("MSCCLPP_SOCKET_IFNAME", globalEnv->socketIfname); diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc index 3aa6aa1c..628d2dcb 100644 --- a/src/core/gpu_utils.cc +++ b/src/core/gpu_utils.cc @@ -5,48 +5,7 @@ #include #include -#include "debug.h" - -static inline bool isCudaTeardownError(cudaError_t err) { -#if defined(MSCCLPP_USE_ROCM) - return err == cudaErrorContextIsDestroyed || err == cudaErrorInvalidDevice; -#else // !defined(MSCCLPP_USE_ROCM) - return err == cudaErrorCudartUnloading || err == cudaErrorContextIsDestroyed || err == cudaErrorInitializationError || - err == cudaErrorInvalidDevice || err == cudaErrorLaunchFailure || err == cudaErrorDeviceUninitialized; -#endif // !defined(MSCCLPP_USE_ROCM) -} - -[[maybe_unused]] static inline bool isCuTeardownError(CUresult r) { - return r == CUDA_ERROR_DEINITIALIZED || r == CUDA_ERROR_CONTEXT_IS_DESTROYED || r == CUDA_ERROR_LAUNCH_FAILED; -} - -#define MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cmd) \ - do { \ - cudaError_t __e = cmd; \ - if (isCudaTeardownError(__e)) { \ - (void)cudaGetLastError(); \ - } else { \ - MSCCLPP_CUDATHROW(__e); \ - } \ - } while (false) - -#define MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cmd) \ - do { \ - CUresult __e = cmd; \ - if (!isCuTeardownError(__e)) { \ - MSCCLPP_CUTHROW(__e); \ - } \ - } while (false) - -#define MSCCLPP_CUTHROW_IGNORE(cmd) \ - do { \ - CUresult __e = cmd; \ - if (__e != CUDA_SUCCESS) { \ - const char* errStr; \ - cuGetErrorString(__e, &errStr); \ - WARN("%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, __e, errStr); \ - } \ - } while (false) +#include "gpu_utils_internal.hpp" namespace mscclpp { diff --git a/src/core/ib.cc b/src/core/ib.cc index 9b86cdf1..2e7b867d 100644 --- a/src/core/ib.cc +++ b/src/core/ib.cc @@ -3,6 +3,7 @@ #include "ib.hpp" +#include #include #include @@ -129,30 +130,46 @@ const void* IbMr::getBuff() const { return buff_; } uint32_t IbMr::getLkey() const { return mr_->lkey; } -IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, - int maxRecvWr, int maxWrPerSend) +IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, + int maxSendWr, int maxRecvWr, int maxWrPerSend) : portNum_(portNum), gidIndex_(gidIndex), info_(), qp_(nullptr), - cq_(nullptr), - 
wcs_(), - wrs_(), - sges_(), - wrn_(0), - numSignaledPostedItems_(0), - numSignaledStagedItems_(0), - maxCqPollNum_(maxCqPollNum), - maxWrPerSend_(maxWrPerSend) { - cq_ = IBVerbs::ibv_create_cq(ctx, maxCqSize, nullptr, nullptr, 0); - if (cq_ == nullptr) { + sendCq_(nullptr), + recvCq_(nullptr), + sendWcs_(), + recvWcs_(), + sendWrs_(), + sendSges_(), + recvWrs_(), + recvSges_(), + numStagedSend_(0), + numStagedRecv_(0), + numPostedSignaledSend_(0), + numStagedSignaledSend_(0), + maxSendCqPollNum_(maxSendCqPollNum), + maxSendWr_(maxSendWr), + maxWrPerSend_(maxWrPerSend), + maxRecvWr_(maxRecvWr) { + sendCq_ = IBVerbs::ibv_create_cq(ctx, maxSendCqSize, nullptr, nullptr, 0); + if (sendCq_ == nullptr) { THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")"); } + // Only create recv CQ if maxRecvWr > 0 + if (maxRecvWr > 0) { + recvCq_ = IBVerbs::ibv_create_cq(ctx, maxRecvWr, nullptr, nullptr, 0); + if (recvCq_ == nullptr) { + THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")"); + } + } + struct ibv_qp_init_attr qpInitAttr = {}; qpInitAttr.sq_sig_all = 0; - qpInitAttr.send_cq = cq_; - qpInitAttr.recv_cq = cq_; + qpInitAttr.send_cq = sendCq_; + // Use separate recv CQ if created, otherwise use the send CQ + qpInitAttr.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_; qpInitAttr.qp_type = IBV_QPT_RC; qpInitAttr.cap.max_send_wr = maxSendWr; qpInitAttr.cap.max_recv_wr = maxRecvWr; @@ -173,9 +190,9 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSiz info_.linkLayer = portAttr.link_layer; info_.qpn = qp->qp_num; info_.mtu = portAttr.active_mtu; - info_.is_grh = (portAttr.flags & IBV_QPF_GRH_REQUIRED); + info_.isGrh = (portAttr.flags & IBV_QPF_GRH_REQUIRED); - if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND || info_.is_grh) { + if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND || info_.isGrh) { if (gidIndex_ >= portAttr.gid_tbl_len) { THROW(NET, Error, ErrorCode::InvalidUsage, "invalid GID index ", gidIndex_, " for port ", portNum_, " (max index is ", portAttr.gid_tbl_len - 1, ")"); @@ -199,14 +216,22 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSiz THROW(NET, IbError, errno, "ibv_modify_qp failed (errno ", errno, ")"); } qp_ = qp; - wrs_ = std::make_shared>(maxWrPerSend_); - sges_ = std::make_shared>(maxWrPerSend_); - wcs_ = std::make_shared>(maxCqPollNum_); + sendWrs_ = std::make_shared>(maxWrPerSend_); + sendSges_ = std::make_shared>(maxWrPerSend_); + sendWcs_ = std::make_shared>(maxSendCqPollNum_); + recvWcs_ = std::make_shared>(maxRecvWr_); + if (maxRecvWr_ > 0) { + recvWrs_ = std::make_shared>(maxRecvWr_); + recvSges_ = std::make_shared>(maxRecvWr_); + } } IbQp::~IbQp() { IBVerbs::ibv_destroy_qp(qp_); - IBVerbs::ibv_destroy_cq(cq_); + IBVerbs::ibv_destroy_cq(sendCq_); + if (recvCq_ != nullptr) { + IBVerbs::ibv_destroy_cq(recvCq_); + } } void IbQp::rtr(const IbQpInfo& info) { @@ -217,7 +242,7 @@ void IbQp::rtr(const IbQpInfo& info) { qp_attr.rq_psn = 0; qp_attr.max_dest_rd_atomic = 1; qp_attr.min_rnr_timer = 0x12; - if (info.linkLayer == IBV_LINK_LAYER_ETHERNET || info.is_grh) { + if (info.linkLayer == IBV_LINK_LAYER_ETHERNET || info.isGrh) { qp_attr.ah_attr.is_global = 1; qp_attr.ah_attr.grh.dgid.global.subnet_prefix = info.spn; qp_attr.ah_attr.grh.dgid.global.interface_id = info.iid; @@ -256,25 +281,25 @@ void IbQp::rts() { } } -IbQp::WrInfo IbQp::getNewWrInfo() { - if (wrn_ >= maxWrPerSend_) { - THROW(NET, Error, ErrorCode::InvalidUsage, "too many outstanding work requests. 
limit is ", maxWrPerSend_); +IbQp::SendWrInfo IbQp::getNewSendWrInfo() { + if (numStagedSend_ >= maxWrPerSend_) { + THROW(NET, Error, ErrorCode::InvalidUsage, "too many staged work requests. limit is ", maxWrPerSend_); } - ibv_send_wr* wr_ = &wrs_->data()[wrn_]; - ibv_sge* sge_ = &sges_->data()[wrn_]; + ibv_send_wr* wr_ = &sendWrs_->data()[numStagedSend_]; + ibv_sge* sge_ = &sendSges_->data()[numStagedSend_]; wr_->sg_list = sge_; wr_->num_sge = 1; wr_->next = nullptr; - if (wrn_ > 0) { - (*wrs_)[wrn_ - 1].next = wr_; + if (numStagedSend_ > 0) { + (*sendWrs_)[numStagedSend_ - 1].next = wr_; } - wrn_++; - return IbQp::WrInfo{wr_, sge_}; + numStagedSend_++; + return IbQp::SendWrInfo{wr_, sge_}; } -void IbQp::stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, - uint64_t dstOffset, bool signaled) { - auto wrInfo = this->getNewWrInfo(); +void IbQp::stageSendWrite(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled) { + auto wrInfo = this->getNewSendWrInfo(); wrInfo.wr->wr_id = wrId; wrInfo.wr->opcode = IBV_WR_RDMA_WRITE; wrInfo.wr->send_flags = signaled ? IBV_SEND_SIGNALED : 0; @@ -283,12 +308,12 @@ void IbQp::stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64 wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; wrInfo.sge->length = size; wrInfo.sge->lkey = mr->getLkey(); - if (signaled) numSignaledStagedItems_++; + if (signaled) numStagedSignaledSend_++; } -void IbQp::stageAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, uint64_t dstOffset, uint64_t addVal, - bool signaled) { - auto wrInfo = this->getNewWrInfo(); +void IbQp::stageSendAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, uint64_t dstOffset, uint64_t addVal, + bool signaled) { + auto wrInfo = this->getNewSendWrInfo(); wrInfo.wr->wr_id = wrId; wrInfo.wr->opcode = IBV_WR_ATOMIC_FETCH_AND_ADD; wrInfo.wr->send_flags = signaled ? IBV_SEND_SIGNALED : 0; @@ -298,57 +323,118 @@ void IbQp::stageAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, u wrInfo.sge->addr = (uint64_t)(mr->getBuff()); wrInfo.sge->length = sizeof(uint64_t); // atomic op is always on uint64_t wrInfo.sge->lkey = mr->getLkey(); - if (signaled) numSignaledStagedItems_++; + if (signaled) numStagedSignaledSend_++; } -void IbQp::stageSendWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, - uint64_t dstOffset, bool signaled, unsigned int immData) { - auto wrInfo = this->getNewWrInfo(); +void IbQp::stageSendWriteWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled, unsigned int immData) { + auto wrInfo = this->getNewSendWrInfo(); wrInfo.wr->wr_id = wrId; wrInfo.wr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM; wrInfo.wr->send_flags = signaled ? 
IBV_SEND_SIGNALED : 0; wrInfo.wr->wr.rdma.remote_addr = (uint64_t)(info.addr) + dstOffset; wrInfo.wr->wr.rdma.rkey = info.rkey; - wrInfo.wr->imm_data = immData; - wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; - wrInfo.sge->length = size; - wrInfo.sge->lkey = mr->getLkey(); - if (signaled) numSignaledStagedItems_++; + wrInfo.wr->imm_data = htonl(immData); + if (mr != nullptr) { + wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; + wrInfo.sge->length = size; + wrInfo.sge->lkey = mr->getLkey(); + } else { + // 0-byte write-with-imm: no source buffer needed + wrInfo.sge->addr = 0; + wrInfo.sge->length = 0; + wrInfo.sge->lkey = 0; + } + if (signaled) numStagedSignaledSend_++; } void IbQp::postSend() { - if (wrn_ == 0) { + if (numStagedSend_ == 0) { return; } struct ibv_send_wr* bad_wr; - int err = IBVerbs::ibv_post_send(qp_, wrs_->data(), &bad_wr); + int err = IBVerbs::ibv_post_send(qp_, sendWrs_->data(), &bad_wr); if (err != 0) { THROW(NET, IbError, err, "ibv_post_send failed (errno ", err, ")"); } - wrn_ = 0; - numSignaledPostedItems_ += numSignaledStagedItems_; - numSignaledStagedItems_ = 0; - if (numSignaledPostedItems_ + 4 > cq_->cqe) { - WARN(NET, "IB: CQ is almost full ( ", numSignaledPostedItems_, " / ", cq_->cqe, + numStagedSend_ = 0; + numPostedSignaledSend_ += numStagedSignaledSend_; + numStagedSignaledSend_ = 0; + if (numPostedSignaledSend_ + 4 > sendCq_->cqe) { + WARN(NET, "IB: CQ is almost full ( ", numPostedSignaledSend_, " / ", sendCq_->cqe, " ). The connection needs to be flushed to prevent timeout errors."); } } -int IbQp::pollCq() { - int wcNum = IBVerbs::ibv_poll_cq(cq_, maxCqPollNum_, wcs_->data()); +IbQp::RecvWrInfo IbQp::getNewRecvWrInfo() { + if (numStagedRecv_ >= maxRecvWr_) { + THROW(NET, Error, ErrorCode::InvalidUsage, "too many outstanding recv work requests. limit is ", maxRecvWr_); + } + ibv_recv_wr* wr = &recvWrs_->data()[numStagedRecv_]; + ibv_sge* sge = &recvSges_->data()[numStagedRecv_]; + wr->next = nullptr; + if (numStagedRecv_ > 0) { + (*recvWrs_)[numStagedRecv_ - 1].next = wr; + } + numStagedRecv_++; + return IbQp::RecvWrInfo{wr, sge}; +} + +void IbQp::stageRecv(uint64_t wrId) { + auto wrInfo = this->getNewRecvWrInfo(); + // For RDMA write-with-imm, data goes to remote_addr specified by sender. + // We only need the recv WR to get the completion notification with imm_data. 
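  // Receiver-side usage sketch (mirrors IBConnection::recvThreadFunc above):
  // arm a completion slot, then poll and read the immediate value, which
  // getRecvWcImmData() returns already converted to host byte order.
  //   qp->stageRecv(/*wrId=*/0);
  //   qp->postRecv();
  //   int n = qp->pollRecvCq();
  //   if (n > 0 && qp->getRecvWcStatus(0) == static_cast<int>(WsStatus::Success)) {
  //     unsigned int v = qp->getRecvWcImmData(0);
  //   }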
+ wrInfo.wr->wr_id = wrId; + wrInfo.wr->sg_list = nullptr; + wrInfo.wr->num_sge = 0; +} + +void IbQp::stageRecv(const IbMr* mr, uint64_t wrId, uint32_t size, uint64_t offset) { + auto wrInfo = this->getNewRecvWrInfo(); + wrInfo.wr->wr_id = wrId; + wrInfo.sge->addr = reinterpret_cast(mr->getBuff()) + offset; + wrInfo.sge->length = size; + wrInfo.sge->lkey = mr->getLkey(); + wrInfo.wr->sg_list = wrInfo.sge; + wrInfo.wr->num_sge = 1; +} + +void IbQp::postRecv() { + if (numStagedRecv_ == 0) return; + struct ibv_recv_wr* bad_wr; + int err = IBVerbs::ibv_post_recv(qp_, recvWrs_->data(), &bad_wr); + if (err != 0) { + THROW(NET, IbError, err, "ibv_post_recv failed (errno ", err, ")"); + } + numStagedRecv_ = 0; +} + +int IbQp::pollSendCq() { + int wcNum = IBVerbs::ibv_poll_cq(sendCq_, maxSendCqPollNum_, sendWcs_->data()); if (wcNum > 0) { - numSignaledPostedItems_ -= wcNum; + numPostedSignaledSend_ -= wcNum; } return wcNum; } -int IbQp::getWcStatus(int idx) const { return (*wcs_)[idx].status; } +int IbQp::pollRecvCq() { + int wcNum = IBVerbs::ibv_poll_cq(recvCq_, maxRecvWr_, recvWcs_->data()); + return wcNum; +} -std::string IbQp::getWcStatusString(int idx) const { return IBVerbs::ibv_wc_status_str((*wcs_)[idx].status); } +int IbQp::getSendWcStatus(int idx) const { return (*sendWcs_)[idx].status; } -int IbQp::getNumCqItems() const { return numSignaledPostedItems_; } +std::string IbQp::getSendWcStatusString(int idx) const { return IBVerbs::ibv_wc_status_str((*sendWcs_)[idx].status); } -IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_(nullptr) { +int IbQp::getNumSendCqItems() const { return numPostedSignaledSend_; } + +int IbQp::getRecvWcStatus(int idx) const { return (*recvWcs_)[idx].status; } + +std::string IbQp::getRecvWcStatusString(int idx) const { return IBVerbs::ibv_wc_status_str((*recvWcs_)[idx].status); } + +unsigned int IbQp::getRecvWcImmData(int idx) const { return ntohl((*recvWcs_)[idx].imm_data); } + +IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_(nullptr), supportsRdmaAtomics_(false) { int num; struct ibv_device** devices = IBVerbs::ibv_get_device_list(&num); for (int i = 0; i < num; ++i) { @@ -365,6 +451,12 @@ IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_ if (pd_ == nullptr) { THROW(NET, IbError, errno, "ibv_alloc_pd failed (errno ", errno, ")"); } + + // Query and cache RDMA atomics capability + struct ibv_device_attr attr = {}; + if (IBVerbs::ibv_query_device(ctx_, &attr) == 0) { + supportsRdmaAtomics_ = (attr.atomic_cap == IBV_ATOMIC_HCA || attr.atomic_cap == IBV_ATOMIC_GLOB); + } } IbCtx::~IbCtx() { @@ -419,7 +511,7 @@ int IbCtx::getAnyUsablePort(int gidIndex) const { return -1; } -std::shared_ptr IbCtx::createQp(int port, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, +std::shared_ptr IbCtx::createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr, int maxRecvWr, int maxWrPerSend) { if (port == -1) { port = this->getAnyUsablePort(gidIndex); @@ -430,13 +522,15 @@ std::shared_ptr IbCtx::createQp(int port, int gidIndex, int maxCqSize, int THROW(NET, Error, ErrorCode::InvalidUsage, "invalid IB port: ", port); } return std::shared_ptr( - new IbQp(ctx_, pd_, port, gidIndex, maxCqSize, maxCqPollNum, maxSendWr, maxRecvWr, maxWrPerSend)); + new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr, maxRecvWr, maxWrPerSend)); } std::unique_ptr IbCtx::registerMr(void* buff, std::size_t size) { return std::unique_ptr(new 
IbMr(pd_, buff, size)); } +bool IbCtx::supportsRdmaAtomics() const { return supportsRdmaAtomics_; } + MSCCLPP_API_CPP int getIBDeviceCount() { int num; IBVerbs::ibv_get_device_list(&num); diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp index c9d81d41..06e733c7 100644 --- a/src/core/include/connection.hpp +++ b/src/core/include/connection.hpp @@ -4,11 +4,17 @@ #ifndef MSCCLPP_CONNECTION_HPP_ #define MSCCLPP_CONNECTION_HPP_ +#include #include #include +#include +#include +#include +#include #include "communicator.hpp" #include "context.hpp" +#include "endpoint.hpp" #include "ib.hpp" #include "registered_memory.hpp" #include "socket.h" @@ -29,6 +35,12 @@ class BaseConnection { virtual void flush(int64_t timeoutUsec = -1) = 0; + /// Set the local address where remote updateAndSync operations should write. + /// This is called by the receiver to specify where incoming signals should be written. + /// Default implementation is a no-op for connections that don't need it. + /// @param addr The local address for incoming writes. + virtual void setRemoteUpdateDstAddr(uint64_t /*addr*/) {} + virtual Transport transport() const = 0; virtual Transport remoteTransport() const = 0; @@ -39,6 +51,8 @@ class BaseConnection { int getMaxWriteQueueSize() const; + static std::shared_ptr& getImpl(Connection& conn) { return conn.impl_; } + protected: friend class Context; friend class CudaIpcConnection; @@ -81,8 +95,29 @@ class IBConnection : public BaseConnection { RegisteredMemory dummyAtomicSourceMem_; mscclpp::TransportInfo dstTransportInfo_; + // For write-with-imm mode (HostNoAtomic): uses RDMA write-with-imm to signal + // instead of atomic operations, with a host thread forwarding to GPU for memory consistency. + bool ibNoAtomic_; + std::thread recvThread_; + std::atomic stopRecvThread_; + int localGpuDeviceId_; // Local GPU device ID for setting CUDA context in recv thread + cudaStream_t signalStream_; + + // Write-with-imm design: + // - Sender: 0-byte RDMA write-with-imm to dst MR, newValue in imm_data (32-bit) + // - Receiver: uses remoteUpdateDstAddr_ (set via setRemoteUpdateDstAddr) to know where to write + uint64_t remoteUpdateDstAddr_; + + void recvThreadFunc(); + public: IBConnection(std::shared_ptr context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint); + ~IBConnection(); + + /// Set the local address where remote updateAndSync operations will write. + /// Must be called before the remote sends any updateAndSync in host-no-atomic mode. + /// @param addr The local address for incoming writes. + void setRemoteUpdateDstAddr(uint64_t addr) override; Transport transport() const override; diff --git a/src/core/include/endpoint.hpp b/src/core/include/endpoint.hpp index a3a5ad41..363faab1 100644 --- a/src/core/include/endpoint.hpp +++ b/src/core/include/endpoint.hpp @@ -4,6 +4,7 @@ #ifndef MSCCLPP_ENDPOINT_HPP_ #define MSCCLPP_ENDPOINT_HPP_ +#include #include #include @@ -24,6 +25,7 @@ struct Endpoint::Impl { // The following are only used for IB and are undefined for other transports. bool ibLocal_; + bool ibNoAtomic_; std::shared_ptr ibQp_; IbQpInfo ibQpInfo_; diff --git a/src/core/include/gpu_utils_internal.hpp b/src/core/include/gpu_utils_internal.hpp new file mode 100644 index 00000000..a7cea86b --- /dev/null +++ b/src/core/include/gpu_utils_internal.hpp @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
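// Signaling flow in HostNoAtomic mode, per the design notes in connection.hpp
// above (an illustrative sketch using this patch's APIs; flagPtr is a
// hypothetical local semaphore address):
//   receiver: conn->setRemoteUpdateDstAddr(reinterpret_cast<uint64_t>(flagPtr));
//   sender:   conn->updateAndSync(dstMem, /*dstOffset=*/0, &hostCounter, newValue);
//             // issues a 0-byte RDMA write-with-imm carrying newValue (32-bit)
//   receiver: the recv thread polls the recv CQ and cudaMemcpyAsync()s the
//             zero-extended 64-bit value into *flagPtr on a dedicated stream.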
+ +#ifndef MSCCLPP_GPU_UTILS_INTERNAL_HPP_ +#define MSCCLPP_GPU_UTILS_INTERNAL_HPP_ + +#include +#include + +#include "logger.hpp" + +namespace mscclpp { + +/// Check if a CUDA error indicates runtime teardown (safe to ignore in destructors). +inline bool isCudaTeardownError(cudaError_t err) { +#if defined(MSCCLPP_USE_ROCM) + return err == cudaErrorContextIsDestroyed || err == cudaErrorInvalidDevice; +#else // !defined(MSCCLPP_USE_ROCM) + return err == cudaErrorCudartUnloading || err == cudaErrorContextIsDestroyed || err == cudaErrorInitializationError || + err == cudaErrorInvalidDevice || err == cudaErrorLaunchFailure || err == cudaErrorDeviceUninitialized; +#endif // !defined(MSCCLPP_USE_ROCM) +} + +/// Check if a CUDA driver error indicates runtime teardown. +inline bool isCuTeardownError(CUresult r) { + return r == CUDA_ERROR_DEINITIALIZED || r == CUDA_ERROR_CONTEXT_IS_DESTROYED || r == CUDA_ERROR_LAUNCH_FAILED; +} + +} // namespace mscclpp + +/// Execute a CUDA runtime call and ignore teardown errors (useful in destructors). +/// Non-teardown errors will throw. +#define MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cmd) \ + do { \ + cudaError_t __e = cmd; \ + if (mscclpp::isCudaTeardownError(__e)) { \ + (void)cudaGetLastError(); \ + } else { \ + MSCCLPP_CUDATHROW(__e); \ + } \ + } while (false) + +/// Execute a CUDA driver call and ignore teardown errors (useful in destructors). +/// Non-teardown errors will throw. +#define MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cmd) \ + do { \ + CUresult __e = cmd; \ + if (!mscclpp::isCuTeardownError(__e)) { \ + MSCCLPP_CUTHROW(__e); \ + } \ + } while (false) + +/// Execute a CUDA driver call and log (but don't throw) on error. +#define MSCCLPP_CUTHROW_IGNORE(cmd) \ + do { \ + CUresult __e = cmd; \ + if (__e != CUDA_SUCCESS) { \ + const char* errStr; \ + cuGetErrorString(__e, &errStr); \ + WARN(GPU, __FILE__, ":", __LINE__, " Cuda failure ", static_cast(__e), " '", errStr, "'"); \ + } \ + } while (false) + +#endif // MSCCLPP_GPU_UTILS_INTERNAL_HPP_ diff --git a/src/core/include/ib.hpp b/src/core/include/ib.hpp index c6436dbf..e9363e9c 100644 --- a/src/core/include/ib.hpp +++ b/src/core/include/ib.hpp @@ -17,6 +17,7 @@ struct ibv_qp; struct ibv_cq; struct ibv_wc; struct ibv_send_wr; +struct ibv_recv_wr; struct ibv_sge; namespace mscclpp { @@ -28,11 +29,11 @@ struct IbMrInfo { class IbMr { public: - virtual ~IbMr(); + ~IbMr(); - virtual IbMrInfo getInfo() const; - virtual const void* getBuff() const; - virtual uint32_t getLkey() const; + IbMrInfo getInfo() const; + const void* getBuff() const; + uint32_t getLkey() const; private: IbMr(ibv_pd* pd, void* buff, std::size_t size); @@ -52,7 +53,7 @@ struct IbQpInfo { uint64_t spn; int mtu; uint64_t iid; - bool is_grh; + bool isGrh; }; enum class WsStatus { @@ -61,38 +62,48 @@ enum class WsStatus { class IbQp { public: - virtual ~IbQp(); + ~IbQp(); - virtual void rtr([[maybe_unused]] const IbQpInfo& info); - virtual void rts(); - virtual void stageSend([[maybe_unused]] const IbMr* mr, [[maybe_unused]] const IbMrInfo& info, - [[maybe_unused]] uint32_t size, [[maybe_unused]] uint64_t wrId, - [[maybe_unused]] uint64_t srcOffset, [[maybe_unused]] uint64_t dstOffset, - [[maybe_unused]] bool signaled); - virtual void stageAtomicAdd([[maybe_unused]] const IbMr* mr, [[maybe_unused]] const IbMrInfo& info, - [[maybe_unused]] uint64_t wrId, [[maybe_unused]] uint64_t dstOffset, - [[maybe_unused]] uint64_t addVal, [[maybe_unused]] bool signaled); - virtual void stageSendWithImm([[maybe_unused]] const IbMr* mr, [[maybe_unused]] const 
IbMrInfo& info, - [[maybe_unused]] uint32_t size, [[maybe_unused]] uint64_t wrId, - [[maybe_unused]] uint64_t srcOffset, [[maybe_unused]] uint64_t dstOffset, - [[maybe_unused]] bool signaled, [[maybe_unused]] unsigned int immData); - virtual void postSend(); - virtual int pollCq(); + void rtr(const IbQpInfo& info); + void rts(); + void stageSendWrite(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled); + void stageSendAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, uint64_t dstOffset, uint64_t addVal, + bool signaled); + void stageSendWriteWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled, unsigned int immData); + void postSend(); + + void stageRecv(uint64_t wrId); + void stageRecv(const IbMr* mr, uint64_t wrId, uint32_t size, uint64_t offset = 0); + void postRecv(); + + int pollSendCq(); + int pollRecvCq(); IbQpInfo& getInfo() { return info_; } - virtual int getWcStatus([[maybe_unused]] int idx) const; - virtual std::string getWcStatusString([[maybe_unused]] int idx) const; - virtual int getNumCqItems() const; + int getSendWcStatus(int idx) const; + std::string getSendWcStatusString(int idx) const; + int getNumSendCqItems() const; + int getRecvWcStatus(int idx) const; + std::string getRecvWcStatusString(int idx) const; + unsigned int getRecvWcImmData(int idx) const; private: - struct WrInfo { + struct SendWrInfo { ibv_send_wr* wr; ibv_sge* sge; }; - IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, + struct RecvWrInfo { + ibv_recv_wr* wr; + ibv_sge* sge; + }; + + IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr, int maxRecvWr, int maxWrPerSend); - WrInfo getNewWrInfo(); + SendWrInfo getNewSendWrInfo(); + RecvWrInfo getNewRecvWrInfo(); int portNum_; int gidIndex_; @@ -100,16 +111,23 @@ class IbQp { IbQpInfo info_; ibv_qp* qp_; - ibv_cq* cq_; - std::shared_ptr> wcs_; - std::shared_ptr> wrs_; - std::shared_ptr> sges_; - int wrn_; - int numSignaledPostedItems_; - int numSignaledStagedItems_; + ibv_cq* sendCq_; + ibv_cq* recvCq_; + std::shared_ptr> sendWcs_; + std::shared_ptr> recvWcs_; + std::shared_ptr> sendWrs_; + std::shared_ptr> sendSges_; + std::shared_ptr> recvWrs_; + std::shared_ptr> recvSges_; + int numStagedSend_; + int numStagedRecv_; + int numPostedSignaledSend_; + int numStagedSignaledSend_; - const int maxCqPollNum_; + const int maxSendCqPollNum_; + const int maxSendWr_; const int maxWrPerSend_; + const int maxRecvWr_; friend class IbCtx; }; @@ -120,9 +138,10 @@ class IbCtx { IbCtx(const std::string& devName); ~IbCtx(); - std::shared_ptr createQp(int port, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, int maxRecvWr, - int maxWrPerSend); + std::shared_ptr createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr, + int maxRecvWr, int maxWrPerSend); std::unique_ptr registerMr(void* buff, std::size_t size); + bool supportsRdmaAtomics() const; #else IbCtx([[maybe_unused]] const std::string& devName) {} ~IbCtx() {} @@ -131,6 +150,7 @@ class IbCtx { std::unique_ptr registerMr([[maybe_unused]] void* buff, [[maybe_unused]] std::size_t size) { return nullptr; } + bool supportsRdmaAtomics() const { return false; } #endif const std::string& getDevName() const { return devName_; }; @@ -142,6 +162,7 @@ class IbCtx { const std::string devName_; 
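  // Note: supportsRdmaAtomics_ (declared below) is cached once in the IbCtx
  // constructor from ibv_query_device(); Endpoint::Impl consults it so that
  // devices reporting neither IBV_ATOMIC_HCA nor IBV_ATOMIC_GLOB fall back to
  // the HostNoAtomic signaling mode.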
ibv_context* ctx_; ibv_pd* pd_; + bool supportsRdmaAtomics_; }; } // namespace mscclpp diff --git a/src/core/include/ibverbs_wrapper.hpp b/src/core/include/ibverbs_wrapper.hpp index b5ab2eff..5b0da8ba 100644 --- a/src/core/include/ibverbs_wrapper.hpp +++ b/src/core/include/ibverbs_wrapper.hpp @@ -102,6 +102,10 @@ struct IBVerbs { return qp->context->ops.post_send(qp, wr, bad_wr); } + static inline int ibv_post_recv(struct ibv_qp* qp, struct ibv_recv_wr* wr, struct ibv_recv_wr** bad_wr) { + return qp->context->ops.post_recv(qp, wr, bad_wr); + } + static inline int ibv_poll_cq(struct ibv_cq* cq, int num_entries, struct ibv_wc* wc) { return cq->context->ops.poll_cq(cq, num_entries, wc); } diff --git a/src/core/semaphore.cc b/src/core/semaphore.cc index 57ac5979..c6eb1e23 100644 --- a/src/core/semaphore.cc +++ b/src/core/semaphore.cc @@ -8,7 +8,6 @@ #include "atomic.hpp" #include "connection.hpp" #include "context.hpp" -#include "debug.h" #include "registered_memory.hpp" #include "serialization.hpp" @@ -122,6 +121,8 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& sema if (connection().localDevice().type != DeviceType::GPU) { throw Error("Local endpoint device type of Host2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage); } + BaseConnection::getImpl(connection()) + ->setRemoteUpdateDstAddr(reinterpret_cast(semaphore_.localMemory().data())); } MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator, const Connection& connection) @@ -150,6 +151,8 @@ MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(const Semaphore& semaphor if (connection().localDevice().type != DeviceType::CPU) { throw Error("Local endpoint device type of Host2HostSemaphore should be CPU", ErrorCode::InvalidUsage); } + BaseConnection::getImpl(connection()) + ->setRemoteUpdateDstAddr(reinterpret_cast(semaphore_.localMemory().data())); } MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(Communicator& communicator, const Connection& connection) diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu index 8475ccf9..051030ac 100644 --- a/test/mp_unit/ib_tests.cu +++ b/test/mp_unit/ib_tests.cu @@ -63,20 +63,21 @@ void IbPeerToPeerTest::registerBufferAndConnect(void* buf, size_t size) { bootstrap->barrier(); } -void IbPeerToPeerTest::stageSend(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled) { +void IbPeerToPeerTest::stageSendWrite(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, + bool signaled) { const mscclpp::IbMrInfo& remoteMrInfo = mrInfo[(gEnv->rank == 1) ? 0 : 1]; - qp->stageSend(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled); + qp->stageSendWrite(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled); } -void IbPeerToPeerTest::stageAtomicAdd(uint64_t wrId, uint64_t dstOffset, uint64_t addVal, bool signaled) { +void IbPeerToPeerTest::stageSendAtomicAdd(uint64_t wrId, uint64_t dstOffset, uint64_t addVal, bool signaled) { const mscclpp::IbMrInfo& remoteMrInfo = mrInfo[(gEnv->rank == 1) ? 
0 : 1]; - qp->stageAtomicAdd(mr.get(), remoteMrInfo, wrId, dstOffset, addVal, signaled); + qp->stageSendAtomicAdd(mr.get(), remoteMrInfo, wrId, dstOffset, addVal, signaled); } -void IbPeerToPeerTest::stageSendWithImm(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, - bool signaled, unsigned int immData) { +void IbPeerToPeerTest::stageSendWriteWithImm(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, + bool signaled, unsigned int immData) { const mscclpp::IbMrInfo& remoteMrInfo = mrInfo[(gEnv->rank == 1) ? 0 : 1]; - qp->stageSendWithImm(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled, immData); + qp->stageSendWriteWithImm(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled, immData); } TEST_F(IbPeerToPeerTest, SimpleSendRecv) { @@ -96,15 +97,15 @@ TEST_F(IbPeerToPeerTest, SimpleSendRecv) { if (gEnv->rank == 1) { mscclpp::Timer timer; for (int iter = 0; iter < maxIter; ++iter) { - stageSend(sizeof(uint64_t) * nelem, 0, 0, 0, true); + stageSendWrite(sizeof(uint64_t) * nelem, 0, 0, 0, true); qp->postSend(); bool waiting = true; int spin = 0; while (waiting) { - int wcNum = qp->pollCq(); + int wcNum = qp->pollSendCq(); ASSERT_GE(wcNum, 0); for (int i = 0; i < wcNum; ++i) { - int status = qp->getWcStatus(i); + int status = qp->getSendWcStatus(i); EXPECT_EQ(status, static_cast(mscclpp::WsStatus::Success)); waiting = false; break; @@ -261,26 +262,26 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) { bool signaled = (iter % signalPeriod == 0); // Send from the second element to the last - stageSend(sizeof(uint64_t) * (nelem - 1), 0, sizeof(uint64_t), sizeof(uint64_t), signaled); + stageSendWrite(sizeof(uint64_t) * (nelem - 1), 0, sizeof(uint64_t), sizeof(uint64_t), signaled); qp->postSend(); #if 0 // For reference: send the first element using a normal send. This should occasionally see a wrong result. - stageSend(sizeof(uint64_t), 0, 0, 0, false); + stageSendWrite(sizeof(uint64_t), 0, 0, 0, false); qp->postSend(); #else // Send the first element using AtomicAdd. This should see the correct result. 
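      // Rationale (sketch): on an RC queue pair, work requests execute in
      // posting order, and the fetch-and-add's remote effect becomes visible
      // only after the payload of the preceding RDMA write has been placed.
      // Polling the first element therefore also certifies the rest of the
      // buffer, which the plain write in the disabled branch above does not.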
- stageAtomicAdd(0, 0, 1, false); + stageSendAtomicAdd(0, 0, 1, false); qp->postSend(); #endif if (signaled) { - int wcNum = qp->pollCq(); + int wcNum = qp->pollSendCq(); while (wcNum == 0) { - wcNum = qp->pollCq(); + wcNum = qp->pollSendCq(); } ASSERT_EQ(wcNum, 1); - int status = qp->getWcStatus(0); + int status = qp->getSendWcStatus(0); ASSERT_EQ(status, static_cast(mscclpp::WsStatus::Success)); } @@ -319,17 +320,17 @@ TEST_F(IbPeerToPeerTest, SimpleAtomicAdd) { if (gEnv->rank == 1) { mscclpp::Timer timer; for (int iter = 0; iter < maxIter; ++iter) { - stageAtomicAdd(0, 0, 1, true); + stageSendAtomicAdd(0, 0, 1, true); qp->postSend(); bool waiting = true; int spin = 0; while (waiting) { - int wcNum = qp->pollCq(); + int wcNum = qp->pollSendCq(); ASSERT_GE(wcNum, 0); for (int i = 0; i < wcNum; ++i) { - int status = qp->getWcStatus(i); + int status = qp->getSendWcStatus(i); if (status != static_cast(mscclpp::WsStatus::Success)) { - FAIL() << "Work completion status error: " << qp->getWcStatusString(i); + FAIL() << "Work completion status error: " << qp->getSendWcStatusString(i); } waiting = false; break; diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp index bad80f0a..17046a57 100644 --- a/test/mp_unit/mp_unit_tests.hpp +++ b/test/mp_unit/mp_unit_tests.hpp @@ -71,12 +71,12 @@ class IbPeerToPeerTest : public IbTestBase { void registerBufferAndConnect(void* buf, size_t size); - void stageSend(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled); + void stageSendWrite(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled); - void stageAtomicAdd(uint64_t wrId, uint64_t dstOffset, uint64_t addVal, bool signaled); + void stageSendAtomicAdd(uint64_t wrId, uint64_t dstOffset, uint64_t addVal, bool signaled); - void stageSendWithImm(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled, - unsigned int immData); + void stageSendWriteWithImm(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled, + unsigned int immData); std::shared_ptr bootstrap; std::shared_ptr ibCtx; @@ -131,6 +131,8 @@ class CommunicatorTest : public CommunicatorTestBase { template using DeviceHandle = mscclpp::DeviceHandle; +using IbMode = mscclpp::EndpointConfig::Ib::Mode; + class PortChannelOneToOneTest : public CommunicatorTestBase { protected: struct PingPongTestParams { @@ -138,17 +140,19 @@ class PortChannelOneToOneTest : public CommunicatorTestBase { bool useIB; bool useEthernet; bool waitWithPoll; + IbMode ibMode; }; void SetUp() override; void TearDown() override; void setupMeshConnections(std::vector& portChannels, bool useIPC, bool useIb, bool useEthernet, - void* sendBuff, size_t sendBuffBytes, void* recvBuff = nullptr, size_t recvBuffBytes = 0); + void* sendBuff, size_t sendBuffBytes, void* recvBuff = nullptr, size_t recvBuffBytes = 0, + IbMode ibMode = IbMode::Default); void testPingPong(PingPongTestParams params); void testPingPongPerf(PingPongTestParams params); - void testPacketPingPong(bool useIbOnly); - void testPacketPingPongPerf(bool useIbOnly); + void testPacketPingPong(bool useIbOnly, IbMode ibMode = IbMode::Default); + void testPacketPingPongPerf(bool useIbOnly, IbMode ibMode = IbMode::Default); std::shared_ptr proxyService; }; diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu index cbd5cb6d..7cc5954a 100644 --- a/test/mp_unit/port_channel_tests.cu +++ b/test/mp_unit/port_channel_tests.cu @@ -18,7 +18,7 @@ void 
PortChannelOneToOneTest::TearDown() { CommunicatorTestBase::TearDown(); } void PortChannelOneToOneTest::setupMeshConnections(std::vector& portChannels, bool useIPC, bool useIb, bool useEthernet, void* sendBuff, size_t sendBuffBytes, - void* recvBuff, size_t recvBuffBytes) { + void* recvBuff, size_t recvBuffBytes, IbMode ibMode) { const int rank = communicator->bootstrap()->getRank(); const int worldSize = communicator->bootstrap()->getNranks(); const bool isInPlace = (recvBuff == nullptr); @@ -47,6 +47,7 @@ void PortChannelOneToOneTest::setupMeshConnections(std::vectorargs["ib_gid_index"]); + cfg.ib.mode = ibMode; } else if (useEthernet) { cfg.transport = mscclpp::Transport::Ethernet; } @@ -162,7 +163,8 @@ void PortChannelOneToOneTest::testPingPong(PingPongTestParams params) { std::vector portChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); - setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); + setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int), + nullptr, 0, params.ibMode); std::vector> portChannelHandles; for (auto& ch : portChannels) portChannelHandles.push_back(ch.deviceHandle()); @@ -207,7 +209,8 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { std::vector portChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); - setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); + setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int), + nullptr, 0, params.ibMode); std::vector> portChannelHandles; for (auto& ch : portChannels) portChannelHandles.push_back(ch.deviceHandle()); @@ -245,47 +248,64 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { } TEST_F(PortChannelOneToOneTest, PingPong) { - testPingPong(PingPongTestParams{.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false}); + testPingPong(PingPongTestParams{ + .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongIb) { +TEST_F(PortChannelOneToOneTest, PingPongIbHostMode) { #if defined(USE_IBVERBS) - testPingPong(PingPongTestParams{.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false}); + testPingPong(PingPongTestParams{ + .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host}); #else // !defined(USE_IBVERBS) GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; #endif // !defined(USE_IBVERBS) } TEST_F(PortChannelOneToOneTest, PingPongEthernet) { - testPingPong(PingPongTestParams{.useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false}); + testPingPong(PingPongTestParams{ + .useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default}); } TEST_F(PortChannelOneToOneTest, PingPongWithPoll) { - testPingPong(PingPongTestParams{.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true}); + testPingPong(PingPongTestParams{ + .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongIbWithPoll) { +TEST_F(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) { #if defined(USE_IBVERBS) - testPingPong(PingPongTestParams{.useIPC = 
false, .useIB = true, .useEthernet = false, .waitWithPoll = true}); + testPingPong(PingPongTestParams{ + .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Host}); #else // !defined(USE_IBVERBS) GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; #endif // !defined(USE_IBVERBS) } TEST_F(PortChannelOneToOneTest, PingPongPerf) { - testPingPongPerf(PingPongTestParams{.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false}); + testPingPongPerf(PingPongTestParams{ + .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongPerfIb) { +TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostMode) { #if defined(USE_IBVERBS) - testPingPongPerf(PingPongTestParams{.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false}); + testPingPongPerf(PingPongTestParams{ + .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host}); +#else // !defined(USE_IBVERBS) + GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; +#endif // !defined(USE_IBVERBS) +} + +TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) { +#if defined(USE_IBVERBS) + testPingPongPerf(PingPongTestParams{ + .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic}); #else // !defined(USE_IBVERBS) GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; #endif // !defined(USE_IBVERBS) } TEST_F(PortChannelOneToOneTest, PingPongPerfEthernet) { - testPingPongPerf(PingPongTestParams{.useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false}); + testPingPongPerf(PingPongTestParams{ + .useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default}); } __device__ mscclpp::DeviceSyncer gChannelOneToOneTestPortChansSyncer; @@ -354,7 +374,7 @@ __global__ void kernelProxyLLPingPong(int* buff, mscclpp::LLPacket* putPktBuf, m } } -void PortChannelOneToOneTest::testPacketPingPong(bool useIb) { +void PortChannelOneToOneTest::testPacketPingPong(bool useIb, IbMode ibMode) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; @@ -367,7 +387,7 @@ void PortChannelOneToOneTest::testPacketPingPong(bool useIb) { auto getPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); setupMeshConnections(portChannels, !useIb, useIb, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), - getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket)); + getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), ibMode); ASSERT_EQ(portChannels.size(), 1); @@ -421,7 +441,7 @@ void PortChannelOneToOneTest::testPacketPingPong(bool useIb) { proxyService->stopProxy(); } -void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb) { +void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; @@ -434,7 +454,7 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb) { auto getPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); setupMeshConnections(portChannels, !useIb, useIb, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), - getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket)); + getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), ibMode); ASSERT_EQ(portChannels.size(), 1); @@ -477,21 
+497,46 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb) { proxyService->stopProxy(); } -TEST_F(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false); } +TEST_F(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); } -TEST_F(PortChannelOneToOneTest, PacketPingPongIb) { +TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostMode) { #if defined(USE_IBVERBS) - testPacketPingPong(true); + testPacketPingPong(true, IbMode::Host); #else // !defined(USE_IBVERBS) GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; #endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false); } +TEST_F(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); } -TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIb) { +TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) { #if defined(USE_IBVERBS) - testPacketPingPongPerf(true); + testPacketPingPongPerf(true, IbMode::Host); +#else // !defined(USE_IBVERBS) + GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; +#endif // !defined(USE_IBVERBS) +} + +TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) { +#if defined(USE_IBVERBS) + testPacketPingPongPerf(true, IbMode::HostNoAtomic); +#else // !defined(USE_IBVERBS) + GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; +#endif // !defined(USE_IBVERBS) +} + +TEST_F(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) { +#if defined(USE_IBVERBS) + testPingPong(PingPongTestParams{ + .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic}); +#else // !defined(USE_IBVERBS) + GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; +#endif // !defined(USE_IBVERBS) +} + +TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) { +#if defined(USE_IBVERBS) + testPacketPingPong(true, IbMode::HostNoAtomic); #else // !defined(USE_IBVERBS) GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; #endif // !defined(USE_IBVERBS) From dff3bc7bbb4c38d71d918209513e513799fba69a Mon Sep 17 00:00:00 2001 From: Caio Rocha <164253795+caiomcbr@users.noreply.github.com> Date: Thu, 12 Feb 2026 17:27:20 -0800 Subject: [PATCH 09/52] Support Fusion for ReadPutPacket Operation at DSL (#742) Support is being added for fusing the ReadPutPacket operation on DSL, which reduces the overhead caused by reading packet data multiple times in the scratch buffer. 
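For illustration, a sketch of the DSL pattern this targets (buffer names are hypothetical; the call shape follows the new allgather_pkt_rppkt.py test, with the destination chunk passed first):

    ch0 = MemoryChannel(peer0, rank)
    ch1 = MemoryChannel(peer1, rank)
    # Consecutive rppkt operations in the same thread block, same source chunk:
    ch0.read_put_packets(scratch[peer0][i : i + 1], scratch[rank][j : j + 1], tb=0)
    ch1.read_put_packets(scratch[peer1][i : i + 1], scratch[rank][j : j + 1], tb=0)

After fusion, the executor reads each packet from the shared source once and writes it to both destinations.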
Fusion will occur when two rppkt operations are executed consecutively with the same src_buffer: rppkt(src, dst0) + rppkt(src, dst1) -> rppkt(src, [dst0, dst1])

Co-authored-by: Binyang Li
---
 .../mscclpp/language/internal/operations.py   | 20 +++++
 .../tests/single_node/allgather_pkt_rppkt.py  | 78 +++++++++++++++++++
 python/test/executor_test.py                  |  2 +-
 src/core/include/execution_kernel.hpp         | 14 ++--
 4 files changed, 105 insertions(+), 9 deletions(-)
 create mode 100644 python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py

diff --git a/python/mscclpp/language/internal/operations.py b/python/mscclpp/language/internal/operations.py
index 127f4a03..5f719c21 100644
--- a/python/mscclpp/language/internal/operations.py
+++ b/python/mscclpp/language/internal/operations.py
@@ -534,6 +534,7 @@ class PutOperation(BaseOperation):
         self.dst_buff = dst_buff
         self.channel_ids = channel_ids
         self.channel_type = channel_type
+        self.from_packet = from_packet
         self.to_packet = to_packet
         self.with_signal = with_signal
         self.with_signal_and_flush = with_signal_and_flush
@@ -579,6 +580,25 @@ class PutOperation(BaseOperation):
                 with_signal=self.with_signal,
                 with_signal_and_flush=self.with_signal_and_flush,
             )
+        elif (
+            isinstance(other, PutOperation)
+            and self.name == Instruction.read_put_packet
+            and self.name == other.name
+            and self.src_buff == other.src_buff
+            and self.channel_type == other.channel_type
+            and self.tbg_info == other.tbg_info
+        ):
+            fused_operation = PutOperation(
+                src_buff=self.src_buff,
+                dst_buff=self.dst_buff + other.dst_buff,
+                channel_ids=self.channel_ids + other.channel_ids,
+                channel_type=self.channel_type,
+                tbg_info=self.tbg_info,
+                from_packet=self.from_packet,
+                to_packet=self.to_packet,
+                with_signal=self.with_signal,
+                with_signal_and_flush=self.with_signal_and_flush,
+            )
 
         return fused_operation
 
diff --git a/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py b/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py
new file mode 100644
index 00000000..bda9e36c
--- /dev/null
+++ b/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py
@@ -0,0 +1,78 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
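+#
+# This test exercises the ReadPutPacket fusion described in the commit
+# message: consecutive read_put_packets calls that read the same source
+# chunk, e.g. (a rough sketch; the channel and chunk names are illustrative)
+#   ch0.read_put_packets(dst0_chunk, src_chunk, tb)
+#   ch1.read_put_packets(dst1_chunk, src_chunk, tb)
+# are lowered to a single fused rppkt(src, [dst0, dst1]) operation.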
+ +import argparse +from mscclpp.language.channel import * +from mscclpp.language.rank import * +from mscclpp.language.general import * +from mscclpp.language.program import * +from mscclpp.language.collectives import * + + +def allgather_example(name, gpu_size, num_threads_per_block, min_message_size, max_message_size): + chunksperloop = 1 + collective = AllGather(gpu_size, chunksperloop, True) + with CollectiveProgram( + name, + collective, + gpu_size, + protocol="LL", + num_threads_per_block=num_threads_per_block, + use_double_scratch_buffer=True, + min_message_size=min_message_size, + max_message_size=max_message_size, + ): + # Creating Scratch Buffers + scratch_buffer = [] + for gpu in range(gpu_size): + scratch_buffer.append(Buffer(gpu, 2 * gpu_size)) + + # Copying it to scratch buffer + for gpu in range(gpu_size): + rank = Rank(gpu) + scratch_offset = gpu_size + input_buffer = rank.get_input_buffer() + rank.copy_packets( + scratch_buffer[gpu][scratch_offset + gpu : scratch_offset + gpu + 1], input_buffer[0:1], tb=0 + ) + + # Putting packets in the remote scratch buffer + for gpu in range(gpu_size): + rank = Rank(gpu) + output_buffer = rank.get_output_buffer() + for peer in range(1, gpu_size): + dst_rank = (gpu + peer) % gpu_size + ch = MemoryChannel(dst_rank, gpu) + tb = 0 + ch.read_put_packets( + scratch_buffer[dst_rank][gpu : gpu + 1], + scratch_buffer[gpu][scratch_offset + gpu : scratch_offset + gpu + 1], + tb, + ) + + # Copying packets from local scratch buffer to local buffer + for gpu in range(gpu_size): + rank = Rank(gpu) + output_buffer = rank.get_output_buffer() + for peer in range(1, gpu_size): + dst_rank = (gpu + peer) % gpu_size + rank.unpack_packets( + output_buffer[dst_rank : dst_rank + 1], + scratch_buffer[gpu][dst_rank : dst_rank + 1], + tb=0, + ) + + print(JSON()) + + +parser = argparse.ArgumentParser() + +parser.add_argument("--name", type=str, help="name of the program") +parser.add_argument("--num_gpus", type=int, help="number of gpus") +parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block") +parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size") +parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size") + +args = parser.parse_args() + +allgather_example(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size) diff --git a/python/test/executor_test.py b/python/test/executor_test.py index 49e5166f..59bc1661 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -11,7 +11,7 @@ from mscclpp import ( env, ) from mscclpp import CommGroup, GpuBuffer -from mscclpp.utils import KernelBuilder, GpuBuffer, pack +from mscclpp.utils import KernelBuilder, pack import os import struct diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp index 918bff61..74283244 100644 --- a/src/core/include/execution_kernel.hpp +++ b/src/core/include/execution_kernel.hpp @@ -298,11 +298,11 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat ChannelType chType = op.channelType; if (chType == ChannelType::MEMORY) { size_t nPackets = size / sizeof(PacketPayload); + PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[0] << 1)); for (size_t pktIdx = threadIdx.x; pktIdx < nPackets; pktIdx += blockDim.x) { + PacketPayload data = pkts[pktIdx].read(flag_); + PacketType pkt(data, flag_); for (uint32_t idx = 0; idx 
< nOutput; ++idx) {
-        PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[idx] << 1));
-        PacketPayload data = pkts[pktIdx].read(flag_);
-        PacketType pkt(data, flag_);
         size_t offset = (scratchOffset_ + (dstOffsets[idx] << 1)) / sizeof(PacketType);
         void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.outputBufferRefs[idx].id]);
         mscclpp::write(remoteMemory, offset + pktIdx, pkt);
@@ -312,10 +312,8 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat
     // Ensuring Data Is Ready
     size_t nPackets = size / sizeof(PacketPayload);
     for (size_t pktIdx = threadIdx.x; pktIdx < nPackets; pktIdx += blockDim.x) {
-      for (uint32_t idx = 0; idx < nOutput; ++idx) {
-        PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[idx] << 1));
-        pkts[pktIdx].read(flag_);
-      }
+      PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[0] << 1));
+      pkts[pktIdx].read(flag_);
     }
 
     __syncthreads();
@@ -325,7 +323,7 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat
       return;
     }
     uint32_t dstOffset = (dstOffsets[chIdx] << 1) + scratchOffset_;
-    uint32_t srcOffset = (srcOffsets[chIdx] << 1) + scratchOffset_;
+    uint32_t srcOffset = (srcOffsets[0] << 1) + scratchOffset_;
     MemoryId dstMemoryId = portChannelBufferIds_[op.outputBufferRefs[chIdx].id];
     portChannels_[channelIndexes[chIdx]].put(
         dstMemoryId, dstOffset, static_cast(BufferType::SCRATCH) + localMemoryIdBegin_, srcOffset, size << 1);

From bd68319e3eabe5d5370042ce1234e047032e8731 Mon Sep 17 00:00:00 2001
From: Binyang Li
Date: Thu, 12 Feb 2026 19:06:18 -0800
Subject: [PATCH 10/52] Refactor algo selection logic and introduce symmetric_memory env (#741)

This PR refactors the algorithm selection logic in MSCCL++ and introduces
support for configuring symmetric memory through an environment variable.

1. Algorithm Selection Refactoring
   Use a separate class for algorithm selection. This makes it possible to
   introduce more complex selection logic based on message size, architecture,
   whether a CUDA graph is enabled, and the memory allocation method.
2. Symmetric Memory Support
   Introduce a symmetricMemory parameter in algorithm context key generation.
   Remove the disableChannelCache env, as it was ambiguous.
3. New args for build_default_algorithms
   Add flag_buffer and flag_buffer_size args when building the default
   algorithms. This lets different algorithms share a unified flag buffer,
   which avoids the application hanging when switching algorithms across
   message sizes (see the usage sketch below).
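A usage sketch from the Python side (the scratch size and rank value below
are illustrative only; the wrapper keeps the 3-argument signature and fills
in flag_buffer/flag_buffer_size from the shared default flag buffer):

    import torch
    import mscclpp

    rank = 0  # illustrative; normally the caller's rank
    scratch = torch.zeros(1 << 24, dtype=torch.uint8, device="cuda")
    builder = mscclpp.ext.AlgorithmCollectionBuilder()
    # flag_buffer and flag_buffer_size are supplied internally via
    # get_default_flag_buffer(), so all default algorithms share one buffer.
    algos = builder.build_default_algorithms(
        scratch_buffer=scratch.data_ptr(),
        scratch_buffer_size=scratch.nbytes,
        rank=rank,
    )

Applications that allocate buffers symmetrically across all ranks can
additionally set MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 to keep memory handles
cached.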
--------- Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com> Co-authored-by: Qinghua Zhou Co-authored-by: Caio Rocha --- docs/guide/mscclpp-torch-integration.md | 4 +- .../customized_allgather.cu | 7 +- .../torch-integration/customized_allgather.cu | 7 +- .../customized_comm_with_default_algo.py | 4 +- include/mscclpp/algorithm.hpp | 12 +- include/mscclpp/env.hpp | 11 +- .../algorithm_collection_builder.hpp | 6 +- include/mscclpp/gpu.hpp | 5 + python/csrc/algorithm.cpp | 18 +- python/csrc/core_py.cpp | 4 +- .../ext/algorithm_collection_builder_py.cpp | 2 +- python/mscclpp/_core/algorithm.py | 23 ++ .../ext/algorithm_collection_builder.py | 16 +- python/mscclpp/utils.py | 6 + src/core/algorithm.cc | 21 +- src/core/env.cpp | 4 +- .../algorithm_collection_builder.cc | 18 +- .../allgather/allgather_fullmesh.cu | 7 +- .../allgather/allgather_fullmesh_2.cu | 19 +- .../allreduce/allreduce_allpair_packet.cu | 83 ++---- .../allreduce/allreduce_fullmesh.cu | 29 +- .../collectives/allreduce/allreduce_nvls.cu | 45 +++- .../allreduce/allreduce_nvls_packet.cu | 82 ++---- .../allreduce/allreduce_nvls_with_copy.cu | 11 +- .../allreduce/allreduce_nvls_with_copy_2.cu | 11 +- .../collectives/allreduce/allreduce_packet.cu | 48 ++-- .../include/allgather/allgather_fullmesh.hpp | 2 +- .../allgather/allgather_fullmesh_2.hpp | 6 +- .../allreduce/allreduce_allpair_packet.hpp | 14 +- .../include/allreduce/allreduce_fullmesh.hpp | 3 +- .../include/allreduce/allreduce_nvls.hpp | 4 +- .../allreduce/allreduce_nvls_packet.hpp | 14 +- .../allreduce/allreduce_nvls_with_copy.hpp | 2 +- .../allreduce/allreduce_nvls_with_copy_2.hpp | 2 +- .../include/allreduce/allreduce_packet.hpp | 12 +- .../collectives/include/allreduce/common.hpp | 2 +- .../collectives/include/collective_utils.hpp | 1 - src/ext/nccl/algorithm_selector.cc | 172 ++++++++++++ src/ext/nccl/algorithm_selector.hpp | 48 ++++ src/ext/nccl/datatype_conversion.hpp | 6 +- src/ext/nccl/{nccl.cu => nccl.cc} | 249 ++++++++---------- test/torch/allreduce_temp_buff.py | 4 +- test/torch/memory_report.py | 2 +- 43 files changed, 657 insertions(+), 389 deletions(-) create mode 100644 src/ext/nccl/algorithm_selector.cc create mode 100644 src/ext/nccl/algorithm_selector.hpp rename src/ext/nccl/{nccl.cu => nccl.cc} (82%) diff --git a/docs/guide/mscclpp-torch-integration.md b/docs/guide/mscclpp-torch-integration.md index 236dd8ef..6e3dc20b 100644 --- a/docs/guide/mscclpp-torch-integration.md +++ b/docs/guide/mscclpp-torch-integration.md @@ -343,8 +343,8 @@ public: }, // Context key generation function [self](const void* input, void* output, - size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { - return self->generateContextKey(input, output, inputSize, outputSize, dtype); + size_t inputSize, size_t outputSize, mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory); } ); } diff --git a/examples/customized-collective-algorithm/customized_allgather.cu b/examples/customized-collective-algorithm/customized_allgather.cu index 436a6a94..e78c4777 100644 --- a/examples/customized-collective-algorithm/customized_allgather.cu +++ b/examples/customized-collective-algorithm/customized_allgather.cu @@ -107,9 +107,10 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, 
dtype); }, - [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { + [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, + bool symmetricMemory) { return self->generateAllgatherContextKey(input, output, inputSize, outputSize, - static_cast(dtype)); + static_cast(dtype), symmetricMemory); }); return allgatherAlgo; } @@ -191,7 +192,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { } mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize, - size_t outputSize, ncclDataType_t dtype) { + size_t outputSize, ncclDataType_t dtype, bool) { return {(void*)input, output, inputSize, outputSize, 0}; } }; diff --git a/examples/torch-integration/customized_allgather.cu b/examples/torch-integration/customized_allgather.cu index 10400ddc..d48c4410 100644 --- a/examples/torch-integration/customized_allgather.cu +++ b/examples/torch-integration/customized_allgather.cu @@ -75,8 +75,9 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { - return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype); + [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, + bool symmetricMemory) { + return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory); }); return allgatherAlgo; } @@ -159,7 +160,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { } mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize, - size_t outputSize, mscclpp::DataType dtype) { + size_t outputSize, mscclpp::DataType dtype, bool) { return {(void*)input, output, inputSize, outputSize, 0}; } }; diff --git a/examples/torch-integration/customized_comm_with_default_algo.py b/examples/torch-integration/customized_comm_with_default_algo.py index 78560f15..281169cc 100644 --- a/examples/torch-integration/customized_comm_with_default_algo.py +++ b/examples/torch-integration/customized_comm_with_default_algo.py @@ -15,7 +15,9 @@ import ipaddress def load_algorithms(scratch_buffer: torch.tensor, rank: int) -> mscclpp.AlgorithmCollection: collection_builder = mscclpp.ext.AlgorithmCollectionBuilder() return collection_builder.build_default_algorithms( - scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank + scratch_buffer=scratch_buffer.data_ptr(), + scratch_buffer_size=scratch_buffer.nbytes, + rank=rank, ) diff --git a/include/mscclpp/algorithm.hpp b/include/mscclpp/algorithm.hpp index 7acdb8b8..6cc05ad4 100644 --- a/include/mscclpp/algorithm.hpp +++ b/include/mscclpp/algorithm.hpp @@ -96,11 +96,13 @@ class Algorithm { /// @param executor The executor for DSL algorithms (may be nullptr for native). /// @param nBlocks Number of CUDA blocks (0 for auto-selection). /// @param nThreadsPerBlock Number of threads per block (0 for auto-selection). + /// @param symmetricMemory Whether to use symmetric memory optimization. /// @param extras Additional parameters for algorithm-specific customization. /// @return The result of the operation. 
virtual CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, + bool symmetricMemory = false, const std::unordered_map& extras = {}) = 0; /// Reset the algorithm state, clearing any cached contexts. @@ -201,9 +203,10 @@ class NativeAlgorithm : public Algorithm { /// @param inputSize Size of the input buffer. /// @param outputSize Size of the output buffer. /// @param dtype Data type of the elements. + /// @param symmetricMemory Whether symmetric memory is enabled. /// @return A key uniquely identifying this buffer configuration. using ContextKeyGenFunc = std::function; + size_t outputSize, DataType dtype, bool symmetricMemory)>; /// Construct a NativeAlgorithm. /// @param name Human-readable name of the algorithm. @@ -225,6 +228,7 @@ class NativeAlgorithm : public Algorithm { CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, + bool symmetricMemory = false, const std::unordered_map& extras = {}) override; const std::string& name() const override; const std::string& collective() const override; @@ -274,6 +278,7 @@ class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enab CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, + bool symmetricMemory = false, const std::unordered_map& extras = {}) override; AlgorithmType type() const override { return AlgorithmType::DSL; } Constraint constraint() const override; @@ -299,6 +304,7 @@ struct CollectiveRequest { const void* inputBuffer; void* outputBuffer; size_t messageSize; + cudaStream_t stream; const std::string& collective; const DataType dtype; const std::unordered_map>& hints; @@ -358,6 +364,10 @@ class AlgorithmCollection { AlgoSelectFunc fallbackAlgoSelector_ = nullptr; }; +/// Get a default GPU flag buffer (allocated once and reused). +/// @return A pair of (shared_ptr to the flag buffer, size in bytes). +std::pair, size_t> getDefaultFlagBuffer(); + } // namespace mscclpp #endif // MSCCLPP_ALGORITHM_HPP_ \ No newline at end of file diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index bd3983e9..39f73e8d 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -98,12 +98,13 @@ class Env { /// debugging purposes. Currently supports `all`, `broadcast`, `allreduce`, `reducescatter`, and `allgather`. const std::string forceNcclFallbackOperation; - /// Env name: `MSCCLPP_DISABLE_CHANNEL_CACHE`. If set to true, it will disable the channel cache for NCCL APIs. - /// Currently, this should be set to true if the application may call NCCL APIs on the same local buffer with - /// different remote buffers, e.g., in the case of a dynamic communicator. If CUDA/HIP graphs are used, disabling - /// the channel cache won't affect the performance, but otherwise it may lead to performance degradation. + /// Env name: `MSCCLPP_NCCL_SYMMETRIC_MEMORY`. If set to true, it indicates that the application uses symmetric memory + /// allocation across all ranks, making it safe to cache memory handles for all NCCL algorithms. 
If set to false, the + /// system will either use non-zero-copy algorithms (when CUDA/HIP graphs are not enabled) or set up new connections + /// every time (when CUDA/HIP graphs are enabled). This should be set to false if the application may call NCCL APIs + /// on the same local buffer with different remote buffers, e.g., in the case of a dynamic communicator. /// Default is false. - const bool disableChannelCache; + const bool ncclSymmetricMemory; /// Env name: `MSCCLPP_FORCE_DISABLE_NVLS`. If set to true, it will disable the NVLS support in MSCCL++. /// Default is false. diff --git a/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp b/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp index 201d7440..394e8014 100644 --- a/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp +++ b/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp @@ -47,7 +47,8 @@ class AlgorithmCollectionBuilder { /// @return The built AlgorithmCollection containing all registered algorithms. AlgorithmCollection build(); - AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, int rank); + AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, + size_t flagBufferSize, int rank); private: AlgorithmCollectionBuilder() = default; @@ -55,7 +56,8 @@ class AlgorithmCollectionBuilder { AlgoSelectFunc algoSelector_ = nullptr; AlgoSelectFunc fallbackAlgoSelector_ = nullptr; - AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize); + AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, + uintptr_t flagBuffer, size_t flagBufferSize); AlgorithmCollection buildDefaultDslAlgorithms(int rank); static std::shared_ptr gAlgorithmCollectionBuilder_; diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index 6a0929aa..b8d096e2 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -15,6 +15,7 @@ using cudaGraphExec_t = hipGraphExec_t; using cudaDeviceProp = hipDeviceProp_t; using cudaStream_t = hipStream_t; using cudaStreamCaptureMode = hipStreamCaptureMode; +using cudaStreamCaptureStatus = hipStreamCaptureStatus; using cudaMemcpyKind = hipMemcpyKind; using cudaIpcMemHandle_t = hipIpcMemHandle_t; @@ -35,6 +36,9 @@ constexpr auto cudaErrorNotSupported = hipErrorNotSupported; constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking; constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal; constexpr auto cudaStreamCaptureModeRelaxed = hipStreamCaptureModeRelaxed; +constexpr auto cudaStreamCaptureStatusNone = hipStreamCaptureStatusNone; +constexpr auto cudaStreamCaptureStatusActive = hipStreamCaptureStatusActive; +constexpr auto cudaStreamCaptureStatusInvalidated = hipStreamCaptureStatusInvalidated; constexpr auto cudaHostAllocMapped = hipHostMallocMapped; constexpr auto cudaHostAllocWriteCombined = hipHostMallocWriteCombined; constexpr auto cudaMemcpyDefault = hipMemcpyDefault; @@ -98,6 +102,7 @@ constexpr auto CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = HIP_POINTER_ATTRIBUTE_DEVIC #define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__) #define cudaStreamEndCapture(...) hipStreamEndCapture(__VA_ARGS__) #define cudaStreamDestroy(...) hipStreamDestroy(__VA_ARGS__) +#define cudaStreamIsCapturing(...) hipStreamIsCapturing(__VA_ARGS__) #define cudaGraphCreate(...) hipGraphCreate(__VA_ARGS__) #define cudaGraphInstantiate(...) 
hipGraphInstantiate(__VA_ARGS__) #define cudaGraphLaunch(...) hipGraphLaunch(__VA_ARGS__) diff --git a/python/csrc/algorithm.cpp b/python/csrc/algorithm.cpp index 3553256a..c8365566 100644 --- a/python/csrc/algorithm.cpp +++ b/python/csrc/algorithm.cpp @@ -68,16 +68,17 @@ void register_algorithm(nb::module_& m) { "execute", [](Algorithm& self, std::shared_ptr comm, uintptr_t input, uintptr_t output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream, - std::shared_ptr executor, int nBlocks, int nThreadsPerBlock, + std::shared_ptr executor, int nBlocks, int nThreadsPerBlock, bool symmetricMemory, std::unordered_map extras) { return self.execute(comm, reinterpret_cast(input), reinterpret_cast(output), inputSize, outputSize, dtype, op, reinterpret_cast(stream), executor, - nBlocks, nThreadsPerBlock, extras); + nBlocks, nThreadsPerBlock, symmetricMemory, extras); }, nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"), nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr, - nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, - nb::arg("extras") = std::unordered_map()); + nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, nb::arg("symmetric_memory") = false, + nb::arg("extras") = std::unordered_map()) + .def("reset", &Algorithm::reset); nb::class_(algorithmClass, "Constraint") .def(nb::init<>()) @@ -108,8 +109,17 @@ void register_algorithm(nb::module_& m) { .def_prop_ro("output_buffer", [](const CollectiveRequest& self) { return reinterpret_cast(self.outputBuffer); }) .def_ro("message_size", &CollectiveRequest::messageSize) + .def_prop_ro("stream", [](const CollectiveRequest& self) { return reinterpret_cast(self.stream); }) .def_prop_ro("collective", [](const CollectiveRequest& self) { return self.collective; }) .def_ro("dtype", &CollectiveRequest::dtype) .def_prop_ro("hints", [](const CollectiveRequest& self) { return self.hints; }) .def("buffer_mode", &CollectiveRequest::bufferMode); + + m.def( + "cpp_get_default_flag_buffer", + []() { + auto [buffer, size] = getDefaultFlagBuffer(); + return std::make_pair(reinterpret_cast(buffer.get()), size); + }, + "Get the default flag buffer. 
Returns a tuple of (buffer_ptr, buffer_size)."); } \ No newline at end of file diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp index a862c7e5..7d1e37ba 100644 --- a/python/csrc/core_py.cpp +++ b/python/csrc/core_py.cpp @@ -44,7 +44,9 @@ void register_core(nb::module_& m) { .value("uint32", DataType::UINT32) .value("float16", DataType::FLOAT16) .value("float32", DataType::FLOAT32) - .value("bfloat16", DataType::BFLOAT16); + .value("bfloat16", DataType::BFLOAT16) + .value("float8_e4m3", DataType::FP8_E4M3) + .value("float8_e5m2", DataType::FP8_E5M2); nb::class_(m, "CppBootstrap") .def("get_rank", &Bootstrap::getRank) diff --git a/python/csrc/ext/algorithm_collection_builder_py.cpp b/python/csrc/ext/algorithm_collection_builder_py.cpp index 1a912724..be7f944e 100644 --- a/python/csrc/ext/algorithm_collection_builder_py.cpp +++ b/python/csrc/ext/algorithm_collection_builder_py.cpp @@ -29,6 +29,6 @@ void register_algorithm_collection_builder(nb::module_& m) { nb::arg("selector")) .def("build", &AlgorithmCollectionBuilder::build) .def("build_default_algorithms", &AlgorithmCollectionBuilder::buildDefaultAlgorithms, nb::arg("scratch_buffer"), - nb::arg("scratch_buffer_size"), nb::arg("rank")) + nb::arg("scratch_buffer_size"), nb::arg("flag_buffer"), nb::arg("flag_buffer_size"), nb::arg("rank")) .def_static("reset", &AlgorithmCollectionBuilder::reset); } \ No newline at end of file diff --git a/python/mscclpp/_core/algorithm.py b/python/mscclpp/_core/algorithm.py index 6c4a3f20..c712bf88 100644 --- a/python/mscclpp/_core/algorithm.py +++ b/python/mscclpp/_core/algorithm.py @@ -4,6 +4,7 @@ from __future__ import annotations from typing import Optional, Tuple, Dict from functools import cached_property +import cupy as cp from mscclpp._mscclpp import ( @@ -18,6 +19,7 @@ from mscclpp._mscclpp import ( CppReduceOp, CppAlgorithmBuilder, CppAlgorithmCollection, + cpp_get_default_flag_buffer, ) __all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"] @@ -160,6 +162,7 @@ class Algorithm: executor: Optional[CppExecutor] = None, nblocks=0, nthreads_per_block=0, + symmetric_memory: bool = False, extras: Optional[Dict[str, int]] = None, ) -> int: """Execute the collective algorithm. @@ -176,6 +179,7 @@ class Algorithm: executor: The executor for DSL algorithms (required for DSL, optional for native). nblocks: Number of CUDA blocks (0 for auto-selection). nthreads_per_block: Number of threads per block (0 for auto-selection). + symmetric_memory: Whether to use symmetric memory optimization (default: False). extras: Additional algorithm-specific parameters. Returns: @@ -193,9 +197,14 @@ class Algorithm: executor, nblocks, nthreads_per_block, + symmetric_memory, extras if extras is not None else {}, ) + def reset(self): + """Reset the internal state of the algorithm, if applicable.""" + self._algorithm.reset() + class AlgorithmBuilder: def __init__(self, algorithm_builder: CppAlgorithmBuilder): @@ -230,3 +239,17 @@ class AlgorithmCollection: """Register an algorithm for a collective operation.""" self._native_collection.register_algorithm(collective, algo_name, algorithm._algorithm) self._algorithms.append(algorithm) + + +def get_default_flag_buffer() -> cp.ndarray: + """Get the default flag buffer for algorithm selection. + + This buffer is used internally by default algorithms to store selection flags. + It is allocated as a shared GPU buffer and can be accessed from Python. + + Returns: + A CuPy array representing the flag buffer on the GPU. 
+    """
+    buffer_ptr, buffer_size = cpp_get_default_flag_buffer()
+    memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer_ptr, buffer_size, None), 0)
+    return cp.ndarray((buffer_size // 4,), dtype=cp.uint32, memptr=memptr)
diff --git a/python/mscclpp/ext/algorithm_collection_builder.py b/python/mscclpp/ext/algorithm_collection_builder.py
index 8361bd2f..80c68909 100644
--- a/python/mscclpp/ext/algorithm_collection_builder.py
+++ b/python/mscclpp/ext/algorithm_collection_builder.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 
 from typing import Union
 
-from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection
+from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_default_flag_buffer
 import atexit
 
 from mscclpp._mscclpp import CppAlgorithmCollectionBuilder
@@ -29,6 +29,7 @@ class AlgorithmCollectionBuilder:
         if not hasattr(self, "_initialized"):
             self._builder = CppAlgorithmCollectionBuilder.get_instance()
             self._initialized = True
+            self._flag_buffer = None
 
     def add_algorithm_builder(self, algorithm_builder: Union[AlgorithmBuilder, Algorithm]):
         if isinstance(algorithm_builder, AlgorithmBuilder):
@@ -50,8 +51,17 @@ class AlgorithmCollectionBuilder:
         collection = self._builder.build()
         return AlgorithmCollection(collection)
 
-    def build_default_algorithms(self, scratch_buffer: int, scratch_buffer_size: int, rank: int) -> AlgorithmCollection:
-        native_collection = self._builder.build_default_algorithms(int(scratch_buffer), scratch_buffer_size, rank)
+    def build_default_algorithms(
+        self,
+        scratch_buffer: int,
+        scratch_buffer_size: int,
+        rank: int,
+    ) -> AlgorithmCollection:
+        if self._flag_buffer is None:
+            self._flag_buffer = get_default_flag_buffer()
+        native_collection = self._builder.build_default_algorithms(
+            int(scratch_buffer), scratch_buffer_size, self._flag_buffer.data.ptr, self._flag_buffer.nbytes, rank
+        )
         return AlgorithmCollection(native_collection)
 
diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py
index 69dd7ce6..e7b7381b 100644
--- a/python/mscclpp/utils.py
+++ b/python/mscclpp/utils.py
@@ -192,5 +192,11 @@ def torch_dtype_to_mscclpp_dtype(dtype: "torch.dtype") -> DataType:
         return DataType.int32
     elif dtype == torch.bfloat16:
         return DataType.bfloat16
+    # Hardware supports either the OCP format or the FNUZ format for float8;
+    # both map to the same MSCCL++ data type.
+ elif dtype == torch.float8_e5m2 or dtype == torch.float8_e5m2fnuz: + return DataType.float8_e5m2 + elif dtype == torch.float8_e4m3fn or dtype == torch.float8_e4m3fnuz: + return DataType.float8_e4m3 else: raise ValueError(f"Unknown data type: {dtype}") diff --git a/src/core/algorithm.cc b/src/core/algorithm.cc index 31c98f15..eaaeb4a1 100644 --- a/src/core/algorithm.cc +++ b/src/core/algorithm.cc @@ -3,6 +3,7 @@ #include #include +#include #include "logger.hpp" @@ -40,12 +41,12 @@ NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitF CommResult NativeAlgorithm::execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) { + bool symmetricMemory, const std::unordered_map& extras) { if (!initialized_) { initFunc_(comm); initialized_ = true; } - AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype); + AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype, symmetricMemory); auto it = contexts_.find(ctxKey); if (it == contexts_.end()) { auto ctx = contextInitFunc_(comm, input, output, inputSize, outputSize, dtype); @@ -155,7 +156,7 @@ Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; } CommResult DslAlgorithm::execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream, - std::shared_ptr executor, int, int, + std::shared_ptr executor, int, int, bool, const std::unordered_map&) { if (!executor) { THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute"); @@ -198,4 +199,18 @@ std::shared_ptr DslAlgorithm::build() { return shared_from_this(); } // TODO: implement this void DslAlgorithm::reset() {} +static std::weak_ptr gDefaultFlagBuffer; +static size_t gDefaultFlagCount = 128; + +std::pair, size_t> getDefaultFlagBuffer() { + std::shared_ptr flagBuffer = gDefaultFlagBuffer.lock(); + if (!flagBuffer) { + flagBuffer = mscclpp::detail::gpuCallocShared(gDefaultFlagCount); + std::vector initFlags(gDefaultFlagCount, 1); + mscclpp::gpuMemcpy(flagBuffer.get(), initFlags.data(), gDefaultFlagCount, cudaMemcpyHostToDevice); + gDefaultFlagBuffer = flagBuffer; + } + return {flagBuffer, gDefaultFlagCount * sizeof(uint32_t)}; +} + } // namespace mscclpp \ No newline at end of file diff --git a/src/core/env.cpp b/src/core/env.cpp index a70e3d28..484b40af 100644 --- a/src/core/env.cpp +++ b/src/core/env.cpp @@ -64,7 +64,7 @@ Env::Env() cudaIpcUseDefaultStream(readEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", false)), ncclSharedLibPath(readEnv("MSCCLPP_NCCL_LIB_PATH", "")), forceNcclFallbackOperation(readEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")), - disableChannelCache(readEnv("MSCCLPP_DISABLE_CHANNEL_CACHE", false)), + ncclSymmetricMemory(readEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)), forceDisableNvls(readEnv("MSCCLPP_FORCE_DISABLE_NVLS", false)) {} std::shared_ptr env() { @@ -91,7 +91,7 @@ std::shared_ptr env() { logEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", globalEnv->cudaIpcUseDefaultStream); logEnv("MSCCLPP_NCCL_LIB_PATH", globalEnv->ncclSharedLibPath); logEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", globalEnv->forceNcclFallbackOperation); - logEnv("MSCCLPP_DISABLE_CHANNEL_CACHE", globalEnv->disableChannelCache); + logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory); 
logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls); } return globalEnv; diff --git a/src/ext/collectives/algorithm_collection_builder.cc b/src/ext/collectives/algorithm_collection_builder.cc index 67e616ae..1ede7519 100644 --- a/src/ext/collectives/algorithm_collection_builder.cc +++ b/src/ext/collectives/algorithm_collection_builder.cc @@ -49,8 +49,9 @@ AlgorithmCollection AlgorithmCollectionBuilder::build() { void AlgorithmCollectionBuilder::reset() { gAlgorithmCollectionBuilder_.reset(); } AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultAlgorithms(uintptr_t scratchBuffer, - size_t scratchBufferSize, int rank) { - auto nativeCollection = buildDefaultNativeAlgorithms(scratchBuffer, scratchBufferSize); + size_t scratchBufferSize, uintptr_t flagBuffer, + size_t flagBufferSize, int rank) { + auto nativeCollection = buildDefaultNativeAlgorithms(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize); auto dslCollection = buildDefaultDslAlgorithms(rank); nativeCollection.extend(dslCollection); nativeCollection.setSelectors(algoSelector_, fallbackAlgoSelector_); @@ -58,11 +59,15 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultAlgorithms(uintptr_t } AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, - size_t scratchBufferSize) { + size_t scratchBufferSize, + uintptr_t flagBuffer, + size_t flagBufferSize) { AlgorithmCollection collection; - auto allreduceAllpairPkt = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + auto allreduceAllpairPkt = + std::make_shared(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build(); collection.registerAlgorithm(allreduceAllpairPkt->collective(), allreduceAllpairPkt->name(), allreduceAllpairPkt); - auto allreduceNvlsPacket = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + auto allreduceNvlsPacket = + std::make_shared(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build(); collection.registerAlgorithm(allreduceNvlsPacket->collective(), allreduceNvlsPacket->name(), allreduceNvlsPacket); auto allreduceNvlsWithCopy = std::make_shared(scratchBuffer, scratchBufferSize)->build(); collection.registerAlgorithm(allreduceNvlsWithCopy->collective(), allreduceNvlsWithCopy->name(), @@ -70,7 +75,8 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultNativeAlgorithms(uin auto allreduceNvlsWithCopy2 = std::make_shared(scratchBuffer, scratchBufferSize)->build(); collection.registerAlgorithm(allreduceNvlsWithCopy2->collective(), allreduceNvlsWithCopy2->name(), allreduceNvlsWithCopy2); - auto allreducePkt = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + auto allreducePkt = + std::make_shared(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build(); collection.registerAlgorithm(allreducePkt->collective(), allreducePkt->name(), allreducePkt); auto allreduceNvls = std::make_shared()->build(); collection.registerAlgorithm(allreduceNvls->collective(), allreduceNvls->name(), allreduceNvls); diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index 34f8d4e7..0b288b38 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -170,7 +170,7 @@ std::shared_ptr AllgatherFullmesh::initAllgatherContext(std::shared_ptr AllgatherFullmesh::build() { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType 
dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllgatherContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllgatherContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu index 84f14ca2..cf6027d9 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu @@ -107,12 +107,6 @@ __global__ void __launch_bounds__(1024, 1) } } -AllgatherFullmesh2::AllgatherFullmesh2() : disableChannelCache_(false) { - if (mscclpp::env()->disableChannelCache) { - disableChannelCache_ = true; - } -} - void AllgatherFullmesh2::initialize(std::shared_ptr comm) { this->conns_ = setupConnections(comm); this->memorySemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_); @@ -174,7 +168,7 @@ std::shared_ptr AllgatherFullmesh2::initAllgatherContext(std::shared_ptrbootstrap()->getNranks(); recvBasePtr = (CUdeviceptr)output; @@ -197,10 +191,11 @@ std::shared_ptr AllgatherFullmesh2::initAllgatherContext(std::shared_ptr AllgatherFullmesh2::build() { [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, - mscclpp::DataType dtype) { return self->generateAllgatherContextKey(input, output, inputSize, dtype); }); + mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateAllgatherContextKey(input, output, inputSize, dtype, symmetricMemory); + }); } } // namespace collective diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index f6081043..83950d7c 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -11,29 +11,18 @@ namespace mscclpp { namespace collective { -__device__ uint32_t deviceFlag = 1; - -template +template __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, - int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags) { + int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags, + uint32_t flagSize) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int); const int nPeers = nRanksPerNode - 1; - uint32_t flag = 0; - if constexpr (flagPerBlock) { - flag = ((uint32_t*)flags)[blockIdx.x]; - } else { - flag = deviceFlag; - __syncthreads(); - if (threadIdx.x == 0) { - ((LL8Packet*)flags)[blockIdx.x].write(0, flag); - } - } - + uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t scratchBaseOffset = (flag % numScratchBuff) ? 
(scratchBufferSize / numScratchBuff) : 0; size_t channelScratchOffset = scratchBaseOffset; @@ -62,22 +51,12 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand } dst[idx] = data; } - if constexpr (flagPerBlock) { - __syncthreads(); - if (threadIdx.x == 0) { - ((uint32_t*)flags)[blockIdx.x] = flag + 1; - } - } else { - // Make sure all threadblocks have finished reading before incrementing the flag - if (blockIdx.x == 0 && threadIdx.x < gridDim.x) { - ((LL8Packet*)flags)[threadIdx.x].read(flag, -1); - } - if (blockIdx.x == 0) { - __syncthreads(); - } - if (threadIdx.x == 0 && blockIdx.x == 0) { - deviceFlag++; - } + __syncthreads(); + if (threadIdx.x == 0) { + ((uint32_t*)flags)[blockIdx.x] = flag + 1; + } + if (blockIdx.x == 0 && threadIdx.x >= gridDim.x && threadIdx.x < flagSize / sizeof(uint32_t)) { + ((uint32_t*)flags)[threadIdx.x] = flag + 1; } } @@ -93,19 +72,13 @@ struct AllpairAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, - cudaStream_t stream, void* flags, uint32_t numScratchBuff, int nBlocks = 0, + cudaStream_t stream, void* flags, uint32_t flagSize, uint32_t numScratchBuff, int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; const size_t nelems = inputSize / sizeof(T); - if (nBlocks == 7 || nBlocks == 28) { - allreduceAllPairs<<>>( - (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, numScratchBuff, flags); - return cudaGetLastError(); - } allreduceAllPairs<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, numScratchBuff, flags); + nRanksPerNode, worldSize, nelems, numScratchBuff, flags, flagSize); return cudaGetLastError(); } }; @@ -116,12 +89,6 @@ void AllreduceAllpairPacket::initialize(std::shared_ptr comm) { RegisteredMemory scratchMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc); registeredMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMemory); registeredMemories_.push_back(scratchMemory); - flags_ = detail::gpuCallocShared(maxBlockNum_); - std::vector flags(28, 1); - flags7_ = detail::gpuCallocShared(7); - flags28_ = detail::gpuCallocShared(28); - gpuMemcpy(flags7_.get(), flags.data(), 7, cudaMemcpyHostToDevice); - gpuMemcpy(flags28_.get(), flags.data(), 28, cudaMemcpyHostToDevice); } CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, @@ -133,13 +100,6 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptrworkSize); } - void* flags = this->flags_.get(); - if (blockAndThreadNum.first == 7) { - flags = this->flags7_.get(); - } else if (blockAndThreadNum.first == 28) { - flags = this->flags28_.get(); - } - size_t sendBytes; CUdeviceptr sendBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); @@ -150,10 +110,11 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr(dtype)); return CommResult::CommInvalidArgument; } - cudaError_t error = allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, - nullptr, nullptr, channelInOffset, 0, 
this->scratchBufferSize_, algoCtx->rank, - algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, flags, - this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); + cudaError_t error = + allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, nullptr, + nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerNode, + algoCtx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, + this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -185,7 +146,7 @@ std::shared_ptr AllreduceAllpairPacket::initAllreduceContext(std::shared_p return ctx; } -AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType) { +AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType, bool) { size_t sendBytes; CUdeviceptr sendBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); @@ -193,7 +154,8 @@ AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* } std::shared_ptr AllreduceAllpairPacket::build() { - auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); + auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_, + flagBuffer_, flagBufferSize_); return std::make_shared( "default_allreduce_allpair_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, @@ -206,8 +168,9 @@ std::shared_ptr AllreduceAllpairPacket::build() { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index d04766c1..13c63ba1 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -149,7 +149,8 @@ struct AllreduceAllconnectAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels, DeviceHandle*, DeviceHandle*, size_t, size_t channelOutDataOffset, size_t, int rank, int nRanksPerNode, int worldSize, - size_t inputSize, cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { + size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, + int nThreadsPerBlock) { using ChannelType = DeviceHandle; size_t nelems = inputSize / sizeof(T); if (nBlocks == 0) nBlocks = 35; @@ -180,8 +181,11 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ct auto ctx = std::static_pointer_cast(ctx_void); size_t recvBytes; CUdeviceptr recvBasePtr; - MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, 
&recvBytes, (CUdeviceptr)output)); - size_t channelOutOffset = (char*)output - (char*)recvBasePtr; + size_t channelOutOffset = 0; + if (symmetricMemory_) { + MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output)); + channelOutOffset = (char*)output - (char*)recvBasePtr; + } std::shared_ptr> inputChannelHandles; if (this->memoryChannelsMap_.find(input) != this->memoryChannelsMap_.end()) { inputChannelHandles = this->memoryChannelsMap_[input].second; @@ -204,7 +208,7 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ct cudaError_t error = allreduce(input, this->scratchBuffer_, output, inputChannelHandles.get(), ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, - stream, nullptr, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN("AllreduceAllconnect failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -212,19 +216,21 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ct return CommResult::CommSuccess; } -AlgorithmCtxKey AllreduceFullmesh::generateAllreduceContextKey(const void*, void* output, size_t, DataType) { +AlgorithmCtxKey AllreduceFullmesh::generateAllreduceContextKey(const void*, void* output, size_t, DataType, + bool symmetricMemory) { static int tag = 0; size_t recvBytes; CUdeviceptr recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output)); - if (env()->disableChannelCache) { + symmetricMemory_ = symmetricMemory; + if (!symmetricMemory_) { return AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, tag++}; } return AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, 0}; } std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptr comm, const void*, - void* output, size_t, DataType) { + void* output, size_t size, DataType) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); @@ -236,6 +242,10 @@ std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptrregisterMemory((void*)recvBasePtr, recvBytes, Transport::CudaIpc); ctx->registeredMemories = setupRemoteMemories(comm, ctx->rank, localMemory); ctx->memoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory, @@ -258,8 +268,9 @@ std::shared_ptr AllreduceFullmesh::build() { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git a/src/ext/collectives/allreduce/allreduce_nvls.cu b/src/ext/collectives/allreduce/allreduce_nvls.cu index 98f884f8..b07993a0 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls.cu @@ -23,9 +23,18 @@ __global__ void 
__launch_bounds__(1024, 1) int nBlocks = gridDim.x; int bid = blockIdx.x; size_t sizePerRank = size / nRanksPerNode; - size_t sizePerBlock = sizePerRank / nBlocks; + const size_t minAlign = 16; + // Align sizePerBlock to 16 bytes to ensure aligned vector access in handleMultiLoadReduceStore + size_t sizePerBlock = (sizePerRank + nBlocks - 1) / nBlocks; + sizePerBlock = (sizePerBlock + minAlign - 1) / minAlign * minAlign; + size_t rankOffset = sizePerRank * rank; size_t blockOffset = sizePerBlock * bid + rankOffset; + size_t curBlockSize = 0; + if (sizePerBlock * bid < sizePerRank) { + curBlockSize = min(sizePerBlock, sizePerRank - sizePerBlock * bid); + } + mscclpp::DeviceHandle* multicastPtr = multicast + bid; mscclpp::DeviceHandle* multicastOutPtr = multicastOut + bid; @@ -44,8 +53,10 @@ __global__ void __launch_bounds__(1024, 1) __syncthreads(); T* src = (T*)multicastPtr->mcPtr; T* dst = (T*)multicastOutPtr->mcPtr; - handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, sizePerBlock, - threadIdx.x, blockDim.x); + if (curBlockSize > 0) { + handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, curBlockSize, + threadIdx.x, blockDim.x); + } __syncthreads(); if (threadIdx.x < nPeers) { channels[threadIdx.x].relaxedSignal(); @@ -60,7 +71,7 @@ struct NvlsAdapter { mscclpp::DeviceHandle* nvlsChannels, mscclpp::DeviceHandle* nvlsOutChannels, size_t channelInOffset, size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t inputSize, - cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { #if (!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000) if constexpr (std::is_same_v || std::is_same_v) { return cudaErrorNotSupported; @@ -77,7 +88,12 @@ struct NvlsAdapter { }; void AllreduceNvls::initialize(std::shared_ptr comm) { - nSwitchChannels_ = 8; + int device; + MSCCLPP_CUDATHROW(cudaGetDevice(&device)); + cudaDeviceProp deviceProp; + MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device)); + computeCapabilityMajor_ = deviceProp.major; + nSwitchChannels_ = 32; this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = @@ -91,6 +107,10 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, const std::unordered_map&) { + if (!symmetricMemory_) { + WARN("AllreduceNvls requires symmetric memory for now."); + return CommResult::CommInvalidArgument; + } auto ctx = std::static_pointer_cast(ctx_void); AllreduceFunc allreduce = dispatch(op, dtype); if (!allreduce) { @@ -110,12 +130,16 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) { - numBlocksAndThreads = {ctx->nRanksPerNode, 1024}; + numBlocksAndThreads = {::min(ctx->nRanksPerNode, nSwitchChannels_), 1024}; + // For GB200 devices, using more blocks to improve the performances when nRanksPerNode <= 8 + if (computeCapabilityMajor_ == 10 && ctx->nRanksPerNode <= 8) { + numBlocksAndThreads.first = ::min(32, nSwitchChannels_); + } } cudaError_t error = allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr, nvlsChannels, 
nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, - inputSize, stream, nullptr, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -124,7 +148,8 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void* input, void* output, size_t, - mscclpp::DataType) { + mscclpp::DataType, bool symmetricMemory) { + symmetricMemory_ = symmetricMemory; size_t sendBytes, recvBytes; CUdeviceptr sendBasePtr, recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); @@ -174,7 +199,9 @@ std::shared_ptr AllreduceNvls::build() { [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, - mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); }); + mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); } } // namespace collective } // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index bc7d596a..9f1371c2 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -9,25 +9,15 @@ namespace mscclpp { namespace collective { -__device__ uint32_t deviceFlag = 1; -template +template __global__ void __launch_bounds__(1024, 1) allreduceNvlsPacket([[maybe_unused]] const T* input, [[maybe_unused]] T* scratch, [[maybe_unused]] T* output, [[maybe_unused]] mscclpp::DeviceHandle* multicast, [[maybe_unused]] size_t nelems, [[maybe_unused]] size_t scratchBufferSize, - [[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] void* flags) { + [[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] void* flags, + [[maybe_unused]] uint32_t flagBufferSize) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 - uint32_t flag = 0; - if constexpr (flagPerBlock) { - flag = ((uint32_t*)flags)[blockIdx.x]; - } else { - flag = deviceFlag; - __syncthreads(); - if (threadIdx.x == 0) { - ((LL8Packet*)flags)[blockIdx.x].write(0, flag); - } - } - + uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t scratchBaseOffset = (flag % 2) ? 
scratchBufferSize / 2 : 0; uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t nPktPerRank = nelems / worldSize / (sizeof(mscclpp::LL8Packet::Payload) / sizeof(T)); @@ -51,21 +41,13 @@ __global__ void __launch_bounds__(1024, 1) } dst[i] = data; } - if constexpr (flagPerBlock) { - __syncthreads(); - if (threadIdx.x == 0) { - ((uint32_t*)flags)[blockIdx.x] = flag + 1; - } - } else { - if (blockIdx.x == 0 && threadIdx.x < gridDim.x) { - ((LL8Packet*)flags)[threadIdx.x].read(flag, -1); - } - if (blockIdx.x == 0) { - __syncthreads(); - } - if (threadIdx.x == 0 && blockIdx.x == 0) { - deviceFlag++; - } + __syncthreads(); + if (threadIdx.x == 0) { + ((uint32_t*)flags)[blockIdx.x] = flag + 1; + } + // Update the remaining flags in case the next launch uses a different number of blocks + if (blockIdx.x == 0 && (threadIdx.x > gridDim.x - 1) && (threadIdx.x < flagBufferSize / sizeof(uint32_t))) { + ((uint32_t*)flags)[threadIdx.x] = flag + 1; } #endif } @@ -85,30 +67,17 @@ struct AllreduceNvlsPacketAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void*, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int, int worldSize, size_t inputSize, cudaStream_t stream, - void* flags, uint32_t, int nBlocks, int nThreadsPerBlock) { - if (nBlocks == 4 || nBlocks == 8) { - allreduceNvlsPacket - <<>>((const T*)input, (T*)scratch, (T*)output, nvlsChannels, - inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags); - } else { - allreduceNvlsPacket - <<>>((const T*)input, (T*)scratch, (T*)output, nvlsChannels, - inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags); - } + void* flags, uint32_t flagBufferSize, uint32_t, int nBlocks, int nThreadsPerBlock) { + allreduceNvlsPacket<<>>( + (const T*)input, (T*)scratch, (T*)output, nvlsChannels, inputSize / sizeof(T), scratchBufferSize, rank, + worldSize, flags, flagBufferSize); return cudaGetLastError(); } }; -void AllreduceNvlsPacket::initialize(std::shared_ptr) { - std::vector flags(8, 1); - flags_ = detail::gpuCallocShared(16); - flags4_ = detail::gpuCallocShared(4); - flags8_ = detail::gpuCallocShared(8); - gpuMemcpy(flags4_.get(), flags.data(), 4, cudaMemcpyHostToDevice); - gpuMemcpy(flags8_.get(), flags.data(), 8, cudaMemcpyHostToDevice); -} +void AllreduceNvlsPacket::initialize(std::shared_ptr) {} -AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType) { +AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0}; } @@ -146,16 +115,10 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; } - void* flags = this->flags_.get(); - if (blockAndThreadNum.first == 4) { - flags = this->flags4_.get(); - } else if (blockAndThreadNum.first == 8) { - flags = this->flags8_.get(); - } + // Flags now come from the builder-wide flagBuffer_, replacing the removed per-block-count flags4_/flags8_ buffers. cudaError_t error = allreduce(input, this->scratchBuffer_, output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, - 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, flags, - 0, blockAndThreadNum.first, blockAndThreadNum.second); + 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, + (void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second); if 
(error != cudaSuccess) { WARN("AllreduceNvlsPacket failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -164,7 +127,8 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr } std::shared_ptr AllreduceNvlsPacket::build() { - auto self = std::make_shared((uintptr_t)scratchBuffer_, scratchBufferSize_); + auto self = std::make_shared((uintptr_t)scratchBuffer_, scratchBufferSize_, flagBuffer_, + flagBufferSize_); return std::make_shared( "default_allreduce_nvls_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, @@ -178,7 +142,9 @@ std::shared_ptr AllreduceNvlsPacket::build() { [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, - mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); }); + mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); } } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu index 113fdb7c..033f3311 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu @@ -113,7 +113,7 @@ struct NvlsWithCopyAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize, - cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { #if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS if constexpr (std::is_same_v || std::is_same_v) { return cudaErrorNotSupported; @@ -157,7 +157,7 @@ CommResult AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptrscratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, + ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(error)); @@ -166,7 +166,7 @@ CommResult AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptr AllreduceNvlsWithCopy::build() { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git 
a/src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu b/src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu index 2a109c6f..96aa9168 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu @@ -150,7 +150,7 @@ struct NvlsWithCopy2Adapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize, - cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { #if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS if constexpr (std::is_same_v || std::is_same_v) { return cudaErrorNotSupported; @@ -194,7 +194,7 @@ CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptrscratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, + ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(error)); @@ -203,7 +203,7 @@ CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptr AllreduceNvlsWithCopy2::build() { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index 23ed5d09..9ce67085 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -11,13 +11,11 @@ namespace mscclpp { namespace collective { -__device__ uint32_t deviceFlag = 1; - template __global__ void __launch_bounds__(1024, 1) allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, - size_t nelems, void* flags, uint32_t numScratchBuff + size_t nelems, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff #if defined(ENABLE_NPKIT) , NpKitEventCollectContext* npKitEventCollectContexts, uint64_t* cpuTimestamp) { @@ -60,11 +58,7 @@ __global__ void __launch_bounds__(1024, 1) const int nPeers = nRanksPerNode - 1; const size_t nPkts = nelems / 2; - uint32_t flag = deviceFlag; - __syncthreads(); - if (threadIdx.x == 0) { - ((LL8Packet*)flags)[blockIdx.x].write(0, flag); - } + uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t channelScratchOffset = (flag % numScratchBuff) ? 
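+ // Same launch-counting scheme as allreduceNvlsPacket: flag % numScratchBuff selects this launch's scratch segment (numScratchBuff is nSegmentsForScratchBuffer_ = 2, so successive launches alternate between the two halves).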
scratchBufferSize / numScratchBuff : 0; int nelemsPerRank = nelems / worldSize; @@ -129,15 +123,12 @@ __global__ void __launch_bounds__(1024, 1) result[idx].y = data.y; } - // Make sure all threadblocks have finished reading before incrementing the flag - if (blockIdx.x == 0 && threadIdx.x < gridDim.x) { - ((LL8Packet*)flags)[threadIdx.x].read(flag, -1); + __syncthreads(); + if (threadIdx.x == 0) { + ((uint32_t*)flags)[blockIdx.x] = flag + 1; } - if (blockIdx.x == 0) { - __syncthreads(); - } - if (threadIdx.x == 0 && blockIdx.x == 0) { - deviceFlag++; + // Update the remaining flags in case the next launch uses a different number of blocks (mirrors allreduceNvlsPacket) + if (blockIdx.x == 0 && (threadIdx.x > gridDim.x - 1) && (threadIdx.x < flagBufferSize / sizeof(uint32_t))) { + ((uint32_t*)flags)[threadIdx.x] = flag + 1; } #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && \ defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT) @@ -156,20 +147,22 @@ struct PacketAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, - cudaStream_t stream, void* flags, uint32_t numScratchBuff, int nBlocks = 0, - int nThreadsPerBlock = 0) { + cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff, + int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; const size_t nelems = inputSize / sizeof(T); + // Round the number of blocks down to a multiple of (worldSize - 1); assumes nBlocks >= worldSize - 1 + nBlocks = nBlocks / (worldSize - 1) * (worldSize - 1); #if defined(ENABLE_NPKIT) size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS; allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, flags, numScratchBuff, NpKit::GetGpuEventCollectContexts(), + nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, flags, numScratchBuff); + nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff); #endif return cudaGetLastError(); } @@ -215,7 +208,6 @@ void AllreducePacket::initialize(std::shared_ptr comm) { RegisteredMemory scratchMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc); registeredMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMemory); registeredMemories_.push_back(scratchMemory); - flags_ = detail::gpuCallocShared(maxBlockNum_); } CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, @@ -233,7 +225,6 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); size_t channelInOffset = (char*)input - (char*)sendBasePtr; - void* flags = this->flags_.get(); AllreduceFunc allreduce = dispatch(op, dtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast(dtype)); @@ -242,7 +233,8 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ cudaError_t error = allreduce(input, this->scratchBuffer_, output, ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, nullptr, channelInOffset, 0, 
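+ // The flag words now come from the flagBuffer_ member provided through the constructor, replacing the per-instance flags_ allocation that initialize() used to create.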
this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, - stream, flags, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); + stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, + blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -274,7 +266,7 @@ std::shared_ptr AllreducePacket::initAllreduceContext(std::shared_ptr AllreducePacket::build() { - auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); + auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_, + flagBuffer_, flagBufferSize_); return std::make_shared( "default_allreduce_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, @@ -294,8 +287,9 @@ std::shared_ptr AllreducePacket::build() { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } diff --git a/src/ext/collectives/include/allgather/allgather_fullmesh.hpp b/src/ext/collectives/include/allgather/allgather_fullmesh.hpp index 085f4ac4..d1a4bbcd 100644 --- a/src/ext/collectives/include/allgather/allgather_fullmesh.hpp +++ b/src/ext/collectives/include/allgather/allgather_fullmesh.hpp @@ -25,7 +25,7 @@ class AllgatherFullmesh : public AlgorithmBuilder { std::shared_ptr initAllgatherContext(std::shared_ptr comm, const void*, void* output, size_t, mscclpp::DataType); - mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType); + mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; diff --git a/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp b/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp index ea176ba1..56783e3b 100644 --- a/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp +++ b/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp @@ -11,11 +11,11 @@ namespace collective { class AllgatherFullmesh2 : public AlgorithmBuilder { public: - AllgatherFullmesh2(); + AllgatherFullmesh2() = default; std::shared_ptr build() override; private: - bool disableChannelCache_; + bool symmetricMemory_; std::vector conns_; std::vector> memorySemaphores_; const int nChannelsPerConnection_ = 35; @@ -27,7 +27,7 @@ class AllgatherFullmesh2 : public AlgorithmBuilder { std::shared_ptr initAllgatherContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, DataType, bool); }; } // namespace collective diff --git 
a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp index e995b940..bd402cfa 100644 --- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp @@ -9,8 +9,11 @@ namespace mscclpp { namespace collective { class AllreduceAllpairPacket : public AlgorithmBuilder { public: - AllreduceAllpairPacket(uintptr_t scratchBuffer, size_t scratchBufferSize) - : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + AllreduceAllpairPacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize) + : scratchBuffer_((void*)scratchBuffer), + scratchBufferSize_(scratchBufferSize), + flagBuffer_(flagBuffer), + flagBufferSize_(flagBufferSize){}; std::shared_ptr build() override; private: @@ -21,7 +24,7 @@ class AllreduceAllpairPacket : public AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; @@ -30,9 +33,8 @@ class AllreduceAllpairPacket : public AlgorithmBuilder { std::vector conns_; std::vector> memorySemaphores_; std::vector registeredMemories_; - std::shared_ptr flags_; - std::shared_ptr flags7_; - std::shared_ptr flags28_; + uintptr_t flagBuffer_; + size_t flagBufferSize_; }; } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp index 31a7f145..fa811b15 100644 --- a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp @@ -20,7 +20,7 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; std::shared_ptr comm_; @@ -32,6 +32,7 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder { RegisteredMemory localScratchMemory_; std::unordered_map, std::shared_ptr>>> memoryChannelsMap_; + bool symmetricMemory_ = false; }; } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls.hpp index 4591cb42..07074527 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls.hpp @@ -12,6 +12,7 @@ class AllreduceNvls : public AlgorithmBuilder { std::shared_ptr build() override; private: + bool symmetricMemory_ = false; void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, @@ -19,13 +20,14 @@ class AllreduceNvls : public AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - 
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); const size_t nvlsBufferSize_ = (1 << 30); uint32_t nSwitchChannels_; std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; std::vector conns_; + int computeCapabilityMajor_{0}; }; } // namespace collective diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp index 8761162a..1cfb5ffd 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp @@ -10,8 +10,11 @@ namespace mscclpp { namespace collective { class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder { public: - AllreduceNvlsPacket(uintptr_t scratchBuffer, size_t scratchBufferSize) - : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + AllreduceNvlsPacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize) + : scratchBuffer_((void*)scratchBuffer), + scratchBufferSize_(scratchBufferSize), + flagBuffer_(flagBuffer), + flagBufferSize_(flagBufferSize){}; std::shared_ptr build() override; private: @@ -22,15 +25,14 @@ class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, mscclpp::DataType); - mscclpp::AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, mscclpp::DataType); + mscclpp::AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, mscclpp::DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; const size_t nvlsBufferSize_ = (1 << 30); const int maxBlockNum_ = 16; - std::shared_ptr flags_; - std::shared_ptr flags4_; - std::shared_ptr flags8_; + uintptr_t flagBuffer_; + size_t flagBufferSize_; }; } // namespace collective } // namespace mscclpp diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp index 1077b122..97b72a2f 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp @@ -20,7 +20,7 @@ class AllreduceNvlsWithCopy : public AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); const size_t nvlsBufferSize_ = (1 << 30); void* scratchBuffer_; diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp index 7bfa9822..ca4ed1c6 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp @@ -23,7 +23,7 @@ class AllreduceNvlsWithCopy2 : public AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); const size_t nvlsBufferSize_ = (1 << 30); void* scratchBuffer_; diff --git 
a/src/ext/collectives/include/allreduce/allreduce_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_packet.hpp index f562aca5..f0438dea 100644 --- a/src/ext/collectives/include/allreduce/allreduce_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_packet.hpp @@ -9,8 +9,11 @@ namespace mscclpp { namespace collective { class AllreducePacket : public AlgorithmBuilder { public: - AllreducePacket(uintptr_t scratchBuffer, size_t scratchBufferSize) - : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + AllreducePacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize) + : scratchBuffer_((void*)scratchBuffer), + scratchBufferSize_(scratchBufferSize), + flagBuffer_(flagBuffer), + flagBufferSize_(flagBufferSize){}; std::shared_ptr build() override; private: @@ -21,16 +24,17 @@ class AllreducePacket : public AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; const int nSegmentsForScratchBuffer_ = 2; const int maxBlockNum_ = 56; std::vector conns_; + uintptr_t flagBuffer_; + size_t flagBufferSize_; std::vector> memorySemaphores_; std::vector registeredMemories_; - std::shared_ptr flags_; }; } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp index 26b57dbf..4c28a24a 100644 --- a/src/ext/collectives/include/allreduce/common.hpp +++ b/src/ext/collectives/include/allreduce/common.hpp @@ -75,7 +75,7 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src using AllreduceFunc = std::function*, mscclpp::DeviceHandle*, size_t, size_t, size_t, int, int, int, - size_t, cudaStream_t, void*, uint32_t, int, int)>; + size_t, cudaStream_t, void*, uint32_t, uint32_t, int, int)>; template