From fd76507e9a6b57a6de7ad832deb4a15dc4d60195 Mon Sep 17 00:00:00 2001 From: Ekow Wellington <34079588+ekwhoa@users.noreply.github.com> Date: Tue, 31 Mar 2026 14:27:33 -0500 Subject: [PATCH 01/21] Install default plans under MSCCLPP_CACHE_DIR/default (#769) ### Summary Update the installer to place bundled default execution plans under `/default`, which is where the runtime already looks for bundled plans. ### Background The C++ runtime treats `MSCCLPP_CACHE_DIR` as the cache *root* and loads bundled default plans from `/default`. When `MSCCLPP_CACHE_DIR` was set, the installer instead wrote bundled plans directly into the cache root, causing the runtime to miss them. This surfaced while running benchmarking tests with a non-default `MSCCLPP_CACHE_DIR`, where the bundled plans were not being discovered. ### Change This PR updates the installer to always install bundled default plans into `/default`, preserving the existing runtime contract. ### Scope - Installer-only change - No runtime behavior changes ### Validation Manual inspection of the updated install path. Successful build --------- Co-authored-by: Ekow Wellington --- docs/dsl/quick_start.md | 4 ++++ docs/dsl/results.md | 3 +++ python/mscclpp/__main__.py | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/dsl/quick_start.md b/docs/dsl/quick_start.md index 6c32ec32..afccd48e 100644 --- a/docs/dsl/quick_start.md +++ b/docs/dsl/quick_start.md @@ -12,6 +12,10 @@ After finishing the installation in the quick start section, you can add the fol python3 -m mscclpp --install ``` +This installs bundled default execution plans into `~/.cache/mscclpp/default` by default. +If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed into `MSCCLPP_CACHE_DIR/default`. +`MSCCLPP_CACHE_DIR` specifies the cache root directory, so it should be set without `default` in the path. + ## Your First Algorithm: AllGather Let's walk through a simple AllGather algorithm to understand the DSL basics. This example demonstrates the key concepts without diving into all the advanced features. diff --git a/docs/dsl/results.md b/docs/dsl/results.md index 99f19476..a1adad2a 100644 --- a/docs/dsl/results.md +++ b/docs/dsl/results.md @@ -59,6 +59,9 @@ After installation, the generated JSON execution plan can be found at: ~/.cache/mscclpp/default/ ``` +If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed under `MSCCLPP_CACHE_DIR/default/`. +`MSCCLPP_CACHE_DIR` specifies the cache root directory, so it should be set without `default` in the path. 
+ **Performance Results:** The figure below shows the performance characteristics for small message sizes in a two-node configuration: diff --git a/python/mscclpp/__main__.py b/python/mscclpp/__main__.py index d57cb362..6a6f5f28 100644 --- a/python/mscclpp/__main__.py +++ b/python/mscclpp/__main__.py @@ -57,7 +57,7 @@ default_algo_configs = [ def create_default_plans(): - plan_dir = os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp/default") + plan_dir = os.path.join(os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp"), "default") plan_path = Path(plan_dir) if plan_path.exists(): shutil.rmtree(plan_path) From 4f3638b60db4640eb5f0cd4c1c92e05a72227474 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 31 Mar 2026 15:34:43 -0700 Subject: [PATCH 02/21] Use PTX red for D2D semaphore signal (#768) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Replace the two-step `signal()` implementation (`incOutbound()` + `atomicStore()`) with a single fire-and-forget PTX `red.release.sys.global.add.u64` instruction - This eliminates one local atomic fetch-add and replaces a remote store with a remote atomic add that has no return value — more efficient on both NVIDIA (PTX `red`) and AMD (compiler optimizes `(void)fetch_add` to fire-and-forget `flat_atomic_add_x2`) - Add a C++ perf test (`PERF_TEST`) in `mp_unit` for signal+wait ping-pong latency ### Performance (H100, 2 ranks, signal+wait round-trip) ``` SemaphorePerfTest.SignalPingPong: Store-based (old): 2.595 us/iter Red-based (new): 2.345 us/iter Speedup: 1.11x ``` ## Test plan - [x] Builds successfully (`make mp_unit_tests`) - [x] `mpirun -np 2 ./build/bin/mp_unit_tests --filter "SemaphorePerfTest"` — 1.11x speedup 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 --- include/mscclpp/semaphore.hpp | 1 - include/mscclpp/semaphore_device.hpp | 34 ++++--------- python/csrc/semaphore_py.cpp | 1 - src/core/semaphore.cc | 5 +- test/mp_unit/CMakeLists.txt | 1 + test/mp_unit/mp_unit_tests.hpp | 6 +++ test/mp_unit/semaphore_perf_tests.cu | 73 ++++++++++++++++++++++++++++ 7 files changed, 91 insertions(+), 30 deletions(-) create mode 100644 test/mp_unit/semaphore_perf_tests.cu diff --git a/include/mscclpp/semaphore.hpp b/include/mscclpp/semaphore.hpp index 27f9aefa..85787c95 100644 --- a/include/mscclpp/semaphore.hpp +++ b/include/mscclpp/semaphore.hpp @@ -82,7 +82,6 @@ class MemoryDevice2DeviceSemaphore { private: Semaphore semaphore_; detail::UniqueGpuPtr expectedInboundToken_; - detail::UniqueGpuPtr outboundToken_; public: /// Constructor. diff --git a/include/mscclpp/semaphore_device.hpp b/include/mscclpp/semaphore_device.hpp index f1b01e89..a790a6e1 100644 --- a/include/mscclpp/semaphore_device.hpp +++ b/include/mscclpp/semaphore_device.hpp @@ -82,19 +82,20 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle { /// Signal remote device, ensures prior memory ops complete. MSCCLPP_DEVICE_INLINE void signal() { - auto outbound = incOutbound(); -#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ == 800) - // Using memoryOrderSeqCst is faster for A100. 
- atomicStore(remoteInboundToken, outbound, memoryOrderSeqCst); -#else - atomicStore(remoteInboundToken, outbound, memoryOrderRelease); +#if defined(MSCCLPP_DEVICE_CUDA) + asm volatile("red.release.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory"); +#elif defined(MSCCLPP_DEVICE_HIP) + (void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelease); #endif } /// Relaxed signal; no memory completion guarantee. Use it only for synchronizing execution, not data. MSCCLPP_DEVICE_INLINE void relaxedSignal() { - auto outbound = incOutbound(); - atomicStore(remoteInboundToken, outbound, memoryOrderRelaxed); +#if defined(MSCCLPP_DEVICE_CUDA) + asm volatile("red.relaxed.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory"); +#elif defined(MSCCLPP_DEVICE_HIP) + (void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelaxed); +#endif } /// Thread-safe read of expected inbound value. @@ -121,27 +122,12 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle { return atomicLoad(inboundToken, memoryOrderRelaxed); } - /// Thread-safe read of outbound value. - /// @return The outbound value. - MSCCLPP_DEVICE_INLINE uint64_t loadOutbound() { - return atomicLoad(outboundToken, memoryOrderRelaxed); - } - - /// Thread-safe increment of outbound value. - /// @return The incremented outbound value. - MSCCLPP_DEVICE_INLINE uint64_t incOutbound() { - return atomicFetchAdd(outboundToken, 1, memoryOrderRelaxed) + 1; - } #endif // defined(MSCCLPP_DEVICE_COMPILE) /// A local memory space where the remote device will write its semaphore value and the local device will read it. uint64_t* inboundToken; - /// A local memory space where the local device stores the semaphore value to be written to the remote device. - uint64_t* outboundToken; - - /// A remote memory space where the local device writes its outboundToken on. This is inboundToken of the - /// remote device. + /// A remote memory space where the local device atomically increments. This is inboundToken of the remote device. uint64_t* remoteInboundToken; /// A local memory space where the local device stores the expected value of the inboundToken to wait for. 
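A minimal device-side sketch (not part of this patch) of the PTX distinction the summary relies on: `atom` must return the prior value to the issuing SM, while `red` is fire-and-forget. The `red` form below is the one the new `signal()` emits; the `atom` form and both function names are only for contrast.

```cuda
#include <cstdint>

// Value-returning remote add: `atom` has a destination register, so the prior
// value must be shipped back to the issuing SM even if the caller discards it.
__device__ uint64_t valueReturningAdd(uint64_t* remote, uint64_t v) {
  uint64_t old;
  asm volatile("atom.release.sys.global.add.u64 %0, [%1], %2;"
               : "=l"(old)
               : "l"(remote), "l"(v)
               : "memory");
  return old;
}

// Fire-and-forget remote add: `red` has no destination operand, so the add is
// issued toward the remote location and the thread continues immediately.
__device__ void fireAndForgetAdd(uint64_t* remote) {
  asm volatile("red.release.sys.global.add.u64 [%0], %1;" ::"l"(remote), "l"((uint64_t)1) : "memory");
}
```

On AMD the same effect comes from the compiler lowering a discarded `fetch_add` to a fire-and-forget `flat_atomic_add_x2`, as noted in the summary.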
diff --git a/python/csrc/semaphore_py.cpp b/python/csrc/semaphore_py.cpp index 36d559f2..17c06a7d 100644 --- a/python/csrc/semaphore_py.cpp +++ b/python/csrc/semaphore_py.cpp @@ -43,7 +43,6 @@ void register_semaphore(nb::module_& m) { nb::class_(memoryDevice2DeviceSemaphore, "DeviceHandle") .def(nb::init<>()) .def_rw("inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::inboundToken) - .def_rw("outbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::outboundToken) .def_rw("remote_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::remoteInboundToken) .def_rw("expected_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::expectedInboundToken) .def_prop_ro("raw", [](const MemoryDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes { diff --git a/src/core/semaphore.cc b/src/core/semaphore.cc index c6eb1e23..bea43327 100644 --- a/src/core/semaphore.cc +++ b/src/core/semaphore.cc @@ -183,9 +183,7 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) { } MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::MemoryDevice2DeviceSemaphore(const Semaphore& semaphore) - : semaphore_(semaphore), - expectedInboundToken_(detail::gpuCallocUnique()), - outboundToken_(detail::gpuCallocUnique()) { + : semaphore_(semaphore), expectedInboundToken_(detail::gpuCallocUnique()) { if (connection().localDevice().type != DeviceType::GPU) { throw Error("Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage); } @@ -202,7 +200,6 @@ MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::DeviceHandle MemoryDevice2DeviceSe device.remoteInboundToken = reinterpret_cast(semaphore_.remoteMemory().data()); device.inboundToken = reinterpret_cast(semaphore_.localMemory().data()); device.expectedInboundToken = expectedInboundToken_.get(); - device.outboundToken = outboundToken_.get(); return device; }; diff --git a/test/mp_unit/CMakeLists.txt b/test/mp_unit/CMakeLists.txt index b99bb09d..d4004e8e 100644 --- a/test/mp_unit/CMakeLists.txt +++ b/test/mp_unit/CMakeLists.txt @@ -8,6 +8,7 @@ target_sources(mp_unit_tests PRIVATE communicator_tests.cu port_channel_tests.cu memory_channel_tests.cu + semaphore_perf_tests.cu switch_channel_tests.cu executor_tests.cc ) diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp index 03e4cbde..5f95d660 100644 --- a/test/mp_unit/mp_unit_tests.hpp +++ b/test/mp_unit/mp_unit_tests.hpp @@ -176,6 +176,12 @@ class MemoryChannelOneToOneTest : public CommunicatorTestBase { std::unordered_map> memorySemaphores; }; +class SemaphorePerfTest : public CommunicatorTestBase { + protected: + void SetUp() override; + void TearDown() override; +}; + class SwitchChannelTest : public CommunicatorTestBase { protected: void SetUp() override; diff --git a/test/mp_unit/semaphore_perf_tests.cu b/test/mp_unit/semaphore_perf_tests.cu new file mode 100644 index 00000000..92560539 --- /dev/null +++ b/test/mp_unit/semaphore_perf_tests.cu @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#include +#include + +#include "mp_unit_tests.hpp" + +void SemaphorePerfTest::SetUp() { + // Need at least two ranks within a node + if (gEnv->nRanksPerNode < 2) { + SKIP_TEST(); + } + setNumRanksToUse(2); + CommunicatorTestBase::SetUp(); +} + +void SemaphorePerfTest::TearDown() { CommunicatorTestBase::TearDown(); } + +__constant__ mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle gSemaphorePerfTestHandle; + +__global__ void kernelSemaphorePingPong(int rank, int nIters) { + mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle& sem = gSemaphorePerfTestHandle; + + // Warmup + for (int i = 0; i < 10; i++) { + if ((rank ^ (i & 1)) == 0) { + sem.signal(); + } else { + sem.wait(); + } + } + + // Timed iterations — alternating signal/wait like the memory channel ping-pong + for (int i = 0; i < nIters; i++) { + if ((rank ^ (i & 1)) == 0) { + sem.signal(); + } else { + sem.wait(); + } + } +} + +PERF_TEST(SemaphorePerfTest, SignalPingPong) { + if (gEnv->rank >= numRanksToUse) return; + + connectMesh(/*useIpc=*/true, /*useIb=*/false, /*useEthernet=*/false); + + int peerRank = (gEnv->rank == 0) ? 1 : 0; + auto d2dSemaphore = std::make_shared(*communicator, connections[peerRank]); + + auto devHandle = d2dSemaphore->deviceHandle(); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gSemaphorePerfTestHandle, &devHandle, sizeof(devHandle))); + + const int nIters = 1000; + const std::string testName = ::mscclpp::test::currentTestName(); + + // Warmup run + kernelSemaphorePingPong<<<1, 1>>>(gEnv->rank, nIters); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + + communicator->bootstrap()->barrier(); + + // Timed run + mscclpp::Timer timer; + kernelSemaphorePingPong<<<1, 1>>>(gEnv->rank, nIters); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + communicator->bootstrap()->barrier(); + + if (gEnv->rank == 0) { + std::cout << testName << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)nIters << " us/iter\n"; + } +} From d2f7056cf4d1956cb452ee475b331f8e19e1d886 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 31 Mar 2026 22:30:35 -0700 Subject: [PATCH 03/21] Add unit testing framework readme (#766) --- test/README.md | 130 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 test/README.md diff --git a/test/README.md b/test/README.md new file mode 100644 index 00000000..a69b66ad --- /dev/null +++ b/test/README.md @@ -0,0 +1,130 @@ +# MSCCL++ C++ Test Framework + +A lightweight, GTest-like test framework with MPI support for testing MSCCL++ C++ APIs. Defined in `framework.hpp` / `framework.cc`. + +## Adding a New Test (Step-by-Step) + +### Single-process test (unit/) + +1. **Create the test file** `test/unit/my_feature_tests.cc` (or `.cu` for CUDA): + + ```cpp + #include "../framework.hpp" + #include + + TEST(MyFeatureTest, BasicUsage) { + EXPECT_EQ(myFunction(), 42); + } + ``` + +2. **Register it in CMake** — add the filename to `test/unit/CMakeLists.txt`: + + ```cmake + target_sources(unit_tests PRIVATE + ... + my_feature_tests.cc # <-- add here + ) + ``` + +3. **Build and run**: + + ```bash + cmake --build build -j + ./build/test/unit_tests --filter=MyFeatureTest + ``` + +### Multi-process test (mp_unit/) + +1. **Create the test file** `test/mp_unit/my_feature_tests.cc` (or `.cu`): + + ```cpp + #include "mp_unit_tests.hpp" + + TEST(MyFeatureTest, MultiRank) { + int rank = gEnv->rank; + EXPECT_GE(rank, 0); + } + ``` + + Use fixtures from `mp_unit_tests.hpp` (e.g., `CommunicatorTest`) if you need pre-established connections. + +2. 
**Register it in CMake** — add the filename to `test/mp_unit/CMakeLists.txt`: + + ```cmake + target_sources(mp_unit_tests PRIVATE + ... + my_feature_tests.cc # <-- add here + ) + ``` + +3. **Build and run**: + + ```bash + cmake --build build -j + mpirun -np 2 ./build/test/mp_unit_tests --filter=MyFeatureTest + ``` + +### Notes + +- No separate test registration step is needed — `TEST()` auto-registers via static initialization. +- The `test_framework` static library is built from `framework.cc` in the top-level `test/CMakeLists.txt` and linked into both `unit_tests` and `mp_unit_tests`. You do not need to modify it. +- Use `.cu` extension for files that contain CUDA kernel code; use `.cc` for host-only tests. +- Each test binary needs a `main()` that calls `RUN_ALL_TESTS()`. See `unit/unit_tests_main.cc` (single-process) and `mp_unit/mp_unit_tests.cc` (multi-process with `Environment` setup). +- Additional run options: `--filter=-Pattern` (exclude), `--exclude-perf-tests` (skip `PERF_TEST`s). + +## Macros + +| Macro | Behavior | +|---|---| +| `TEST(Suite, Name)` | Register a test. If `Suite` is a defined class, it's used as a fixture. | +| `PERF_TEST(Suite, Name)` | Same as `TEST` but marked as perf (skippable via `--exclude-perf-tests`). | +| `EXPECT_*` | Non-fatal assertions: `EXPECT_TRUE`, `EXPECT_FALSE`, `EXPECT_EQ`, `EXPECT_NE`, `EXPECT_LT`, `EXPECT_LE`, `EXPECT_GT`, `EXPECT_GE` | +| `ASSERT_*` | Fatal assertions (abort test on failure): same variants as `EXPECT_*`, plus `ASSERT_NO_THROW` | +| `FAIL()` | Fail immediately. Supports streaming: `FAIL() << "reason";` | +| `SKIP_TEST()` | Skip the current test. Supports streaming: `SKIP_TEST() << "reason";` | +| `CUDA_CHECK(call)` | Check a CUDA API return code, throw on error. | + +## Fixtures + +Define a class inheriting from `mscclpp::test::TestCase` with `SetUp()` / `TearDown()`, then use the class name as the suite name: + +```cpp +class MyFixture : public mscclpp::test::TestCase { + public: + void SetUp() override { /* per-test setup */ } + void TearDown() override { /* per-test cleanup */ } + protected: + int sharedState_ = 0; +}; + +TEST(MyFixture, SomeTest) { + sharedState_ = 42; + EXPECT_EQ(sharedState_, 42); +} +``` + +See `mp_unit/mp_unit_tests.hpp` (`BootstrapTest`, `CommunicatorTest`, etc.) for real fixture examples. + +## Global Environments + +Register an `Environment` subclass for one-time global setup/teardown (e.g., MPI bootstrap): + +```cpp +class MyEnv : public mscclpp::test::Environment { + public: + void SetUp() override { /* global init */ } + void TearDown() override { /* global cleanup */ } +}; + +// In main(), before RUN_ALL_TESTS(): +mscclpp::test::TestRegistry::instance().addEnvironment(new MyEnv()); +``` + +See `mp_unit/mp_unit_tests.cc` for the `MultiProcessTestEnv` example. 
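Combining the macros above with the timer utilities listed in the next section, a perf test can report its own per-iteration latency. A minimal sketch, assuming `Timer`, `isMainRank()`, and `currentTestName()` behave as described below:

```cpp
#include <iostream>

#include "../framework.hpp"

PERF_TEST(MyFeatureTest, TightLoopLatency) {
  constexpr int nIters = 1000;
  mscclpp::test::utils::Timer timer;
  timer.start();
  for (int i = 0; i < nIters; ++i) {
    // ... operation under measurement ...
  }
  timer.stop();
  if (mscclpp::test::utils::isMainRank()) {
    std::cout << mscclpp::test::currentTestName() << ": "
              << timer.elapsedMilliseconds() / static_cast<double>(nIters) << " ms/iter" << std::endl;
  }
}
```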
+ +## Utilities + +- `mscclpp::test::utils::isMainRank()` — true on MPI rank 0 +- `mscclpp::test::utils::getMPIRank()` / `getMPISize()` +- `mscclpp::test::utils::Timer` — high-resolution timer with `start()`, `stop()`, `elapsedMilliseconds()` +- `mscclpp::test::currentTestName()` — returns `"Suite.Name"` for the running test \ No newline at end of file From be9126ca1b36c4817de622a0aebd87e5382b9a6b Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 1 Apr 2026 16:25:19 -0700 Subject: [PATCH 04/21] Fix run-remote.sh to support multi-command scripts (#770) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Fix `run-remote.sh` to correctly execute multi-command scripts (e.g., multiple `mpirun` calls) - The old approach piped decoded script through `base64 -d | bash`, which feeds the script via bash's **stdin**. When `mpirun` (or its child processes) runs, it can consume the remaining stdin, causing bash to never see subsequent commands — only the first command would execute. - The fix decodes the script to a **temp file** and runs `bash -euxo pipefail "$TMP"` instead, so bash reads commands from the file and `mpirun` consuming stdin has no effect. - Applied to both the docker path (pssh + docker exec) and the non-docker path (pssh only). 🤖 Generated with [Claude Code](https://claude.com/claude-code) --- test/deploy/run-remote.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh index b646ea92..2468243e 100755 --- a/test/deploy/run-remote.sh +++ b/test/deploy/run-remote.sh @@ -97,11 +97,14 @@ if $USE_DOCKER; then INNER+=" cd /root/mscclpp;" INNER+=" export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\\\$LD_LIBRARY_PATH;" INNER+=" CMD_B64='${CMD_B64}';" - INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail" + INNER+=" TMP=\\\$(mktemp);" + INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d > \\\"\\\$TMP\\\";" + INNER+=" bash -euxo pipefail \\\"\\\$TMP\\\";" + INNER+=" rm -f \\\"\\\$TMP\\\"" parallel-ssh -i "${PSSH_COMMON[@]}" \ "sudo docker exec mscclpp-test bash -c \"${INNER}\"" else parallel-ssh -i "${PSSH_COMMON[@]}" \ - "set -euxo pipefail; CMD_B64='${CMD_B64}'; printf '%s' \"\$CMD_B64\" | base64 -d | bash -euxo pipefail" + "set -euxo pipefail; CMD_B64='${CMD_B64}'; TMP=\$(mktemp); printf '%s' \"\$CMD_B64\" | base64 -d > \"\$TMP\"; bash -euxo pipefail \"\$TMP\"; rm -f \"\$TMP\"" fi From fa95e82e18c5f963b059aefe20939d5ca8a63df2 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 7 Apr 2026 08:41:51 -0700 Subject: [PATCH 05/21] Fix CI/CD pipeline issues (#773) This pull request updates the deployment pipeline to allow custom CMake arguments to be passed to the pip install process on remote VMs. --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .azure-pipelines/templates/deploy.yml | 24 ++++++++++++++++++++++-- .azure-pipelines/templates/ut-npkit.yml | 10 +++++----- test/deploy/setup.sh | 6 ++++++ tools/npkit/npkit_trace_generator.py | 16 ++++++++-------- 4 files changed, 41 insertions(+), 15 deletions(-) diff --git a/.azure-pipelines/templates/deploy.yml b/.azure-pipelines/templates/deploy.yml index fc116acf..2f642f1d 100644 --- a/.azure-pipelines/templates/deploy.yml +++ b/.azure-pipelines/templates/deploy.yml @@ -94,7 +94,27 @@ steps: du -sh build/bin/* 2>/dev/null || true workingDirectory: '$(System.DefaultWorkingDirectory)' -# 2. Download SSH key + install packages + start VMSS +# 2. 
Write CMake args for pip install on remote VMs +- task: Bash@3 + name: WritePipCmakeArgs + displayName: Write pip CMake args + inputs: + targetType: 'inline' + script: | + set -e + PIP_CMAKE_ARGS="" + if [ -n "${{ parameters.gpuArch }}" ]; then + PIP_CMAKE_ARGS="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}" + fi + CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}' + if [ -n "${CMAKE_EXTRA_ARGS}" ]; then + PIP_CMAKE_ARGS="${PIP_CMAKE_ARGS} ${CMAKE_EXTRA_ARGS}" + fi + echo "${PIP_CMAKE_ARGS}" > pip_cmake_args.txt + echo "pip CMake args: $(cat pip_cmake_args.txt)" + workingDirectory: '$(System.DefaultWorkingDirectory)' + +# 3. Download SSH key + install packages + start VMSS - task: DownloadSecureFile@1 name: SshKeyFile displayName: Download key file @@ -120,7 +140,7 @@ steps: inlineScript: | az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} -# 3. Deploy test environment +# 4. Deploy test environment - task: Bash@3 name: DeployTestEnv displayName: Deploy Test Env diff --git a/.azure-pipelines/templates/ut-npkit.yml b/.azure-pipelines/templates/ut-npkit.yml index e53b5cf5..1bd89caf 100644 --- a/.azure-pipelines/templates/ut-npkit.yml +++ b/.azure-pipelines/templates/ut-npkit.yml @@ -28,7 +28,7 @@ steps: grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json - grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json - template: run-remote-task.yml parameters: @@ -42,14 +42,14 @@ steps: grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json - grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json' python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json - grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json - grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json - grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_UNPACK_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json - template: stop.yml parameters: diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh index 80cd10b1..d4996cc2 100644 --- a/test/deploy/setup.sh +++ b/test/deploy/setup.sh @@ -30,6 +30,12 @@ fi if [ "${PLATFORM}" == "rocm" ]; then export CXX=/opt/rocm/bin/hipcc fi + 
+PIP_CMAKE_ARGS_FILE="/root/mscclpp/pip_cmake_args.txt" +if [ -f "${PIP_CMAKE_ARGS_FILE}" ]; then + export CMAKE_ARGS="$(cat ${PIP_CMAKE_ARGS_FILE})" + echo "Using CMAKE_ARGS: ${CMAKE_ARGS}" +fi cd /root/mscclpp && pip3 install . pip3 install setuptools_scm python3 -m setuptools_scm --force-write-version-files diff --git a/tools/npkit/npkit_trace_generator.py b/tools/npkit/npkit_trace_generator.py index c5ed6191..294516e6 100644 --- a/tools/npkit/npkit_trace_generator.py +++ b/tools/npkit/npkit_trace_generator.py @@ -14,25 +14,25 @@ def parse_npkit_event_header(npkit_event_header_path): "NOP", "BARRIER", "PUT", - "PUT_PACKET", - "READ_PUT_PACKET", + "PUT_PACKETS", + "READ_PUT_PACKETS", "PUT_WITH_SIGNAL", "PUT_WITH_SIGNAL_AND_FLUSH", "GET", "COPY", - "COPY_PACKET", - "TRANSFORM_TO_PACKET", + "COPY_PACKETS", + "UNPACK_PACKETS", "SIGNAL", "WAIT", "FLUSH", "REDUCE", - "REDUCE_PACKET", + "REDUCE_PACKETS", "REDUCE_COPY_PACKETS", "REDUCE_SEND", - "REDUCE_SEND_PACKET", + "REDUCE_SEND_PACKETS", "REDUCE_COPY_SEND_PACKETS", - "READ_REDUCE_COPY", - "READ_REDUCE_COPY_SEND", + "READ_REDUCE", + "READ_REDUCE_SEND", "MULTI_LOAD_REDUCE_STORE", "RELAXED_SIGNAL", "RELAXED_WAIT", From 96a72bbd3e71df14f8afca6b4daaf907bbad8e8e Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 7 Apr 2026 13:37:02 -0700 Subject: [PATCH 06/21] Support E4M3B15 datatype (#765) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - **Add `fp8_e4m3b15` datatype**: A software-defined FP8 type with 4 exponent bits, 3 mantissa bits, and bias=15 (max finite value: 0.9375). Implemented entirely in software with no HW dependency, using Triton-style bit manipulation through fp16 as intermediate for efficient conversion. - **Add mixed-precision accumulation for allreduce**: All allreduce algorithm variants (packet, NVLS packet, fullmesh, RSAG zero-copy, and others) now support a configurable `accumDtype` parameter, enabling FP8 inputs to be reduced in float16 or float32 for higher accuracy. - **Propagate `accumDtype` through the full API**: The new parameter is threaded from `Algorithm::execute()` → `NativeAlgorithm` → `KernelFunc` → dispatch → CUDA kernels, with `DataType::AUTO` as the default (resolves to input dtype at runtime). - **Add FP8 accumulation correctness tests**: New `test_fp8_accum.py` validates that higher-precision accumulation produces results at least as accurate as native FP8 accumulation across multiple algorithms and sizes. Skipped on CUDA SM < 89 (pre-Hopper); runs on HIP/ROCm. - **Add `test_fp8_accum.py` to CI**: Azure Pipeline `ut.yml` now runs FP8 accumulation tests alongside existing pytests. - **NCCL shim logging cleanup**: Migrated `printf`-style `WARN`/`INFO` calls to streaming-style logging. 
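As a concrete illustration of the bias-15 layout described above (a host-side sketch, not code from this patch; the helper name is hypothetical), decoding follows directly from the `[sign:1][exp:4][mantissa:3]` format:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Decode one fp8_e4m3b15 byte: bias = 15, exp=15 and the negative-zero pattern
// 0x80 are NaN (fnuz convention), and there are no infinities.
float decodeE4M3B15(uint8_t bits) {
  int exp = (bits >> 3) & 0xF;
  if (bits == 0x80 || exp == 15) return NAN;
  int mant = bits & 0x7;
  float mag = (exp == 0) ? std::ldexp(mant / 8.0f, 1 - 15)            // subnormal
                         : std::ldexp(1.0f + mant / 8.0f, exp - 15);  // normal
  return (bits & 0x80) ? -mag : mag;
}

int main() {
  std::printf("%g\n", decodeE4M3B15(0x77));  // 0.9375   (max finite: exp=14, mant=7)
  std::printf("%g\n", decodeE4M3B15(0x08));  // ~6.1e-05 (min normal: exp=1, mant=0)
  std::printf("%g\n", decodeE4M3B15(0x01));  // ~7.6e-06 (min subnormal)
}
```

The implementation in `gpu_data_types.hpp` reaches the same values by routing through fp16, since e4m3b15 and fp16 share the bias-15 exponent.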
## Key files | Area | Files | |------|-------| | New datatype + vector ops | `include/mscclpp/gpu_data_types.hpp` | | Accumulation reduce helpers | `src/core/include/reduce_kernel.hpp` | | Algorithm API (`accumDtype`) | `include/mscclpp/algorithm.hpp`, `src/core/algorithm.cc` | | Allreduce kernels | `src/ext/collectives/allreduce/*.cu` | | Dispatch + common | `src/ext/collectives/include/allreduce/common.hpp` | | Python bindings | `python/csrc/algorithm.cpp`, `python/mscclpp/_core/algorithm.py` | | Tests | `python/test/test_fp8_accum.py` | | CI | `.azure-pipelines/templates/ut.yml` | ## Test plan - [x] CI passes on H100 (CUDA SM 90) — full FP8 E4M3 + E4M3B15 accumulation tests - [x] CI passes on A100 (CUDA SM 80) — FP8 tests correctly skipped - [x] CI passes on MI300X (ROCm) — FP8 tests run via HIP - [x] Existing `test_mscclpp.py` tests continue to pass - [x] NCCL shim builds and runs correctly with new `accumDtype` defaults 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .azure-pipelines/templates/ut.yml | 1 + docs/guide/mscclpp-torch-integration.md | 3 +- .../customized_allgather.cu | 3 +- .../torch-integration/customized_allgather.cu | 3 +- include/mscclpp/algorithm.hpp | 15 +- include/mscclpp/gpu_data_types.hpp | 771 +++++++++++++++++- python/csrc/algorithm.cpp | 8 +- python/csrc/core_py.cpp | 3 +- python/csrc/gpu_utils_py.cpp | 13 + python/mscclpp/_core/algorithm.py | 8 +- python/test/test_fp8_accum.py | 391 +++++++++ src/core/algorithm.cc | 17 +- src/core/executor/execution_kernel.cu | 6 + src/core/include/execution_kernel.hpp | 27 +- src/core/include/reduce_kernel.hpp | 174 +++- .../allgather/allgather_fullmesh.cu | 3 +- .../allgather/allgather_fullmesh_2.cu | 3 +- .../allreduce/allreduce_allpair_packet.cu | 13 +- .../allreduce/allreduce_fullmesh.cu | 37 +- .../allreduce_nvls_block_pipeline.cu | 14 +- .../allreduce/allreduce_nvls_packet.cu | 45 +- .../allreduce/allreduce_nvls_warp_pipeline.cu | 19 +- .../allreduce/allreduce_nvls_zero_copy.cu | 15 +- .../collectives/allreduce/allreduce_packet.cu | 68 +- .../collectives/allreduce/allreduce_rsag.cu | 13 +- .../allreduce/allreduce_rsag_pipeline.cu | 19 +- .../allreduce/allreduce_rsag_zero_copy.cu | 31 +- .../allreduce/allreduce_allpair_packet.hpp | 2 +- .../include/allreduce/allreduce_fullmesh.hpp | 2 +- .../allreduce_nvls_block_pipeline.hpp | 2 +- .../allreduce/allreduce_nvls_packet.hpp | 4 +- .../allreduce_nvls_warp_pipeline.hpp | 2 +- .../allreduce/allreduce_nvls_zero_copy.hpp | 2 +- .../include/allreduce/allreduce_packet.hpp | 2 +- .../include/allreduce/allreduce_rsag.hpp | 2 +- .../allreduce/allreduce_rsag_pipeline.hpp | 2 +- .../allreduce/allreduce_rsag_zero_copy.hpp | 2 +- .../collectives/include/allreduce/common.hpp | 92 +-- src/ext/nccl/algorithm_selector.cc | 3 +- src/ext/nccl/datatype_conversion.hpp | 5 + src/ext/nccl/nccl.cc | 39 +- 41 files changed, 1623 insertions(+), 261 deletions(-) create mode 100644 python/test/test_fp8_accum.py diff --git a/.azure-pipelines/templates/ut.yml b/.azure-pipelines/templates/ut.yml index 9d17e923..743c66e6 100644 --- a/.azure-pipelines/templates/ut.yml +++ b/.azure-pipelines/templates/ut.yml @@ -41,6 +41,7 @@ steps: displayName: Run pytests remoteScript: | mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + mpirun --allow-run-as-root -tag-output -x 
MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_fp8_accum.py -x - template: stop.yml parameters: diff --git a/docs/guide/mscclpp-torch-integration.md b/docs/guide/mscclpp-torch-integration.md index 1c966155..b4e4fcdf 100644 --- a/docs/guide/mscclpp-torch-integration.md +++ b/docs/guide/mscclpp-torch-integration.md @@ -332,7 +332,8 @@ public: size_t inputSize, size_t outputSize, mscclpp::DataType dtype, mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) { + const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) { return self->kernelFunc(ctx, input, output, inputSize, dtype, stream); }, // Context initialization function diff --git a/examples/customized-collective-algorithm/customized_allgather.cu b/examples/customized-collective-algorithm/customized_allgather.cu index e78c4777..02df3685 100644 --- a/examples/customized-collective-algorithm/customized_allgather.cu +++ b/examples/customized-collective-algorithm/customized_allgather.cu @@ -101,7 +101,8 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { "allgather", "allgather", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) { return self->allgatherKernelFunc(ctx, input, output, inputSize, stream); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, diff --git a/examples/torch-integration/customized_allgather.cu b/examples/torch-integration/customized_allgather.cu index d48c4410..907b3ada 100644 --- a/examples/torch-integration/customized_allgather.cu +++ b/examples/torch-integration/customized_allgather.cu @@ -69,7 +69,8 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { "allgather", "allgather", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) { return self->allgatherKernelFunc(ctx, input, output, inputSize, dtype, stream); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, diff --git a/include/mscclpp/algorithm.hpp b/include/mscclpp/algorithm.hpp index 65b1ab3c..531cb857 100644 --- a/include/mscclpp/algorithm.hpp +++ b/include/mscclpp/algorithm.hpp @@ -103,12 +103,14 @@ class Algorithm { /// @param nThreadsPerBlock Number of threads per block (0 for auto-selection). /// @param symmetricMemory Whether to use symmetric memory optimization. /// @param extras Additional parameters for algorithm-specific customization. + /// @param accumDtype Data type for accumulation during reduction. DataType::AUTO resolves to dtype. /// @return The result of the operation. 
virtual CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, bool symmetricMemory = false, - const std::unordered_map& extras = {}) = 0; + const std::unordered_map& extras = {}, + DataType accumDtype = DataType::AUTO) = 0; /// Reset the algorithm state, clearing any cached contexts. virtual void reset() = 0; @@ -186,10 +188,11 @@ class NativeAlgorithm : public Algorithm { /// @param nBlocks Number of CUDA blocks. /// @param nThreadsPerBlock Number of threads per block. /// @param extras Additional algorithm-specific parameters. + /// @param accumDtype Data type for accumulation (resolved from input dtype if sentinel). /// @return The result of the operation. using KernelFunc = std::function, const void*, void*, size_t, size_t, DataType, ReduceOp, - cudaStream_t, int, int, const std::unordered_map&)>; + cudaStream_t, int, int, const std::unordered_map&, DataType)>; /// Function type for creating algorithm contexts. /// @param comm The communicator. @@ -233,8 +236,8 @@ class NativeAlgorithm : public Algorithm { CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, - bool symmetricMemory = false, - const std::unordered_map& extras = {}) override; + bool symmetricMemory = false, const std::unordered_map& extras = {}, + DataType accumDtype = DataType::AUTO) override; const std::string& name() const override; const std::string& collective() const override; const std::pair& messageRange() const override; @@ -285,8 +288,8 @@ class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enab CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, - bool symmetricMemory = false, - const std::unordered_map& extras = {}) override; + bool symmetricMemory = false, const std::unordered_map& extras = {}, + DataType accumDtype = DataType::AUTO) override; AlgorithmType type() const override { return AlgorithmType::DSL; } Constraint constraint() const override; void reset() override; diff --git a/include/mscclpp/gpu_data_types.hpp b/include/mscclpp/gpu_data_types.hpp index 1cecbea6..fa31a28f 100644 --- a/include/mscclpp/gpu_data_types.hpp +++ b/include/mscclpp/gpu_data_types.hpp @@ -64,18 +64,151 @@ using __bfloat162 = __nv_bfloat162; #endif +/// Software float8 with 4 exponent bits, 3 mantissa bits, exponent bias = 15. +/// Format (MSB first): [sign:1][exponent:4][mantissa:3] +/// No infinities; exp=15 is NaN. Negative zero is NaN (fnuz convention). +/// Max finite value: 0.9375, min normal: ~6.1e-5, min subnormal: ~7.6e-6. +struct alignas(1) __fp8_e4m3b15 { + uint8_t __x; + + __fp8_e4m3b15() = default; + + /// Construct from raw bits (use __fp8_e4m3b15::fromRaw() for clarity). + MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(uint8_t raw) : __x(raw) {} + + /// Construct from float32 (explicit to avoid ambiguous conversion chains). + MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(float val) : __x(fromFloat(val)) {} + + /// Convert to float32. 
+ MSCCLPP_HOST_DEVICE_INLINE operator float() const { return toFloat(__x); } + + /// Construct from a raw bit pattern without conversion. + static MSCCLPP_HOST_DEVICE_INLINE __fp8_e4m3b15 fromRaw(uint8_t bits) { + __fp8_e4m3b15 r; + r.__x = bits; + return r; + } + + private: + /// Decode fp8_e4m3b15 bits → float32. + /// + /// Uses bit manipulation through fp16 as intermediate, adapted from the Triton compiler. + /// fp8_e4m3b15 is identical to fp8_e4m3fn (NVIDIA) except exponent bias is 15 vs 7. + /// Algorithm: reinterpret fp8 bits into an fp16 bit pattern with exponent shifted by -8, + /// then convert fp16 → float32. + static MSCCLPP_HOST_DEVICE_INLINE float toFloat(uint8_t bits) { + // Handle special values: negative zero (0x80) → NaN, exponent=15 → NaN. + uint32_t exp = (bits >> 3) & 0xFu; + if (bits == 0x80 || exp == 15) { + union { + uint32_t u; + float f; + } nan_val = {0x7FC00000u}; + return nan_val.f; + } + if (bits == 0) return 0.0f; + + // Triton-style bit manipulation: fp8 → fp16 → fp32. + // fp8 layout: [S:1][E:4][M:3] (bias=15) + // fp16 layout: [S:1][E:5][M:10] (bias=15) + // + // Place fp8 in upper byte of fp16, then right-shift exponent+mantissa by 1 + // to convert E4 → E5 (both share bias=15). Sign bit stays at bit 15. + // Refer: + // https://github.com/triton-lang/triton/blob/cf34004b8a67d290a962da166f5aa2fc66751326/python/triton/language/extra/cuda/utils.py#L34 + uint16_t h = (uint16_t)bits << 8; // place fp8 in upper byte of fp16 + uint16_t sign16 = h & 0x8000u; // extract sign at fp16 position + uint16_t nosign = h & 0x7F00u; // exponent + mantissa (no sign) + uint16_t fp16_bits = sign16 | (nosign >> 1); // shift exponent right by 1 + + // For subnormals: when fp8 exponent=0, the above gives fp16 exponent=0 + // and fp16 mantissa = (fp8_mantissa << 7), which correctly represents + // the subnormal fp16 value since both share bias=15. + + // Convert fp16 bits to float via __half (works on host and device, CUDA and HIP). + union { + uint16_t u; + __half h; + } cvt = {fp16_bits}; + return __half2float(cvt.h); + } + + /// Encode float32 → fp8_e4m3b15 bits. + /// + /// Algorithm adapted from Triton: float32 → fp16 → bit-manipulate → fp8. + /// The key insight is to convert to fp16 first (which shares bias=15 with e4m3b15), + /// then pack the fp16 bits back into 8 bits by shifting the exponent left by 1. + static MSCCLPP_HOST_DEVICE_INLINE uint8_t fromFloat(float val) { + union { + float f; + uint32_t u; + } in = {val}; + + // NaN → 0x80 (negative-zero bit pattern = NaN in fnuz). + if ((in.u & 0x7F800000u) == 0x7F800000u && (in.u & 0x007FFFFFu) != 0) return 0x80u; + + // Convert float32 → fp16 bits via __half (works on host and device, CUDA and HIP). + __half h_val = __float2half_rn(val); + union { + __half h; + uint16_t u; + } cvt = {h_val}; + uint16_t fp16_bits = cvt.u; + + // Clamp absolute value to max finite e4m3b15: 0.9375 → fp16 = 0x3B80. + uint16_t abs_fp16 = fp16_bits & 0x7FFFu; + if (abs_fp16 > 0x3B80u) abs_fp16 = 0x3B80u; + + // Reconstruct with sign. + uint16_t sign16 = fp16_bits & 0x8000u; + + // Triton-style: fp16 → fp8. + // fp16 layout: [S:1][E:5][M:10] (bias=15) + // fp8 layout: [S:1][E:4][M:3] (bias=15) + // + // mad.lo.u32 a0, a0, 2, 0x00800080 → (abs_fp16 * 2 + 0x0080) + // This shifts left by 1 (undoing the right-shift in decode) and adds rounding bias. + // Then: lop3.b32 b0, $1, 0x80008000, a0, 0xea → (sign & 0x8000) | a0 + // Finally: prmt for byte extraction. 
+ // + // Simplified for scalar: shift abs_fp16 left by 1, add rounding bias, take upper byte. + uint16_t adjusted = (uint16_t)(abs_fp16 * 2u + 0x0080u); + // The upper byte now contains [E:4][M:3][round_bit]. + // Combine with sign and extract. + uint16_t with_sign = sign16 | adjusted; + uint8_t result = (uint8_t)(with_sign >> 8); + + // Zero → 0x00 (ensure positive zero, not negative zero which is NaN). + if ((result & 0x7Fu) == 0) result = 0x00u; + + return result; + } +}; + +/// Packed 2x fp8_e4m3b15 storage. +struct alignas(2) __fp8x2_e4m3b15 { + uint16_t __x; +}; + +/// Packed 4x fp8_e4m3b15 storage. +struct alignas(4) __fp8x4_e4m3b15 { + uint32_t __x; +}; + namespace mscclpp { /// Data types supported by mscclpp operations. enum class DataType { - INT32, // 32-bit signed integer. - UINT32, // 32-bit unsigned integer. - FLOAT16, // IEEE 754 half precision. - FLOAT32, // IEEE 754 single precision. - BFLOAT16, // bfloat16 precision. - FLOAT8_E4M3, // float8 with E4M3 layout. - FLOAT8_E5M2, // float8 with E5M2 layout. - UINT8, // 8-bit unsigned integer. + INT32, // 32-bit signed integer. + UINT32, // 32-bit unsigned integer. + FLOAT16, // IEEE 754 half precision. + FLOAT32, // IEEE 754 single precision. + BFLOAT16, // bfloat16 precision. + FLOAT8_E4M3, // float8 with E4M3 layout. + FLOAT8_E5M2, // float8 with E5M2 layout. + UINT8, // 8-bit unsigned integer. + FLOAT8_E4M3B15, // float8 with E4M3 layout, bias=15 (software, no HW accel). + AUTO = 255, // Sentinel: resolve to the input dtype at runtime. }; /// Word array. @@ -97,6 +230,7 @@ struct alignas(Bytes) Words {}; template union alignas(sizeof(T) * N) VectorTypeImpl { static_assert(N > 0, "N must be greater than 0"); + static_assert(sizeof(StorageT) >= sizeof(T) * N, "StorageT must cover the full vector size"); T data[N]; Words words; @@ -127,13 +261,14 @@ union alignas(sizeof(T) * N) VectorTypeImpl { MSCCLPP_HOST_DEVICE_INLINE const T& operator[](int i) const { return data[i]; } }; -// Helper template to get the appropriate vector type for a given element type and count +// Helper template to get the appropriate vector type for a given element type and count. template struct VectorTypeHelper { - using type = - VectorTypeImpl>>; + static constexpr int Bytes = N * sizeof(T); + using type = VectorTypeImpl< + T, N, + std::conditional_t>>>>; }; /// Vector type - clean user interface (automatically selects appropriate storage type) @@ -170,6 +305,11 @@ DEFINE_VEC(bf16x4, __bfloat16, 4, uint2); DEFINE_VEC(f16x8, __half, 8, uint4); DEFINE_VEC(bf16x8, __bfloat16, 8, uint4); +// Aliases for large vector types (>16 bytes) where no native CUDA storage type exists. 
+using f32x8 = VectorType; +using f32x16 = VectorType; +using f16x16 = VectorType<__half, 16>; + #if defined(__FP8_TYPES_EXIST__) DEFINE_VEC(f8_e4m3x2, __fp8_e4m3, 2, __fp8x2_e4m3); DEFINE_VEC(f8_e4m3x4, __fp8_e4m3, 4, __fp8x4_e4m3); @@ -181,6 +321,12 @@ DEFINE_VEC(f8_e5m2x4, __fp8_e5m2, 4, __fp8x4_e5m2); DEFINE_VEC(f8_e5m2x8, __fp8_e5m2, 8, uint2); DEFINE_VEC(f8_e5m2x16, __fp8_e5m2, 16, uint4); #endif + +// fp8_e4m3b15 vectors (always available — software type, no HW dependency) +DEFINE_VEC(f8_e4m3b15x2, __fp8_e4m3b15, 2, __fp8x2_e4m3b15); +DEFINE_VEC(f8_e4m3b15x4, __fp8_e4m3b15, 4, __fp8x4_e4m3b15); +DEFINE_VEC(f8_e4m3b15x8, __fp8_e4m3b15, 8, uint2); +DEFINE_VEC(f8_e4m3b15x16, __fp8_e4m3b15, 16, uint4); #undef DEFINE_VEC #if defined(MSCCLPP_DEVICE_COMPILE) @@ -254,6 +400,21 @@ MSCCLPP_DEVICE_INLINE __fp8_e5m2 clip(__fp8_e5m2 val) { } #endif +// --- f32x2 arithmetic --- + +template +MSCCLPP_DEVICE_INLINE f32x2 operator+(const f32x2& a, const f32x2& b) { +#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ >= 1000) + // Blackwell (SM 10.0+): packed float2 add in a single instruction. + return __fadd2_rn(a.storage, b.storage); +#else + f32x2 result; + result.data[0] = a.data[0] + b.data[0]; + result.data[1] = a.data[1] + b.data[1]; + return result; +#endif +} + template MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) { __half2 result; @@ -265,6 +426,18 @@ MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) { return result; } +template +MSCCLPP_DEVICE_INLINE f16x4 operator+(const f16x4& a, const f16x4& b) { + // Decompose into 2× packed __hadd2 (2 instructions instead of 4 scalar __hadd). + const f16x2* a2 = reinterpret_cast(&a); + const f16x2* b2 = reinterpret_cast(&b); + f16x4 result; + f16x2* r2 = reinterpret_cast(&result); + r2[0] = a2[0] + b2[0]; + r2[1] = a2[1] + b2[1]; + return result; +} + template MSCCLPP_DEVICE_INLINE bf16x2 operator+(const bf16x2& a, const bf16x2& b) { __bfloat162 result; @@ -449,6 +622,14 @@ MSCCLPP_DEVICE_INLINE T min(const T& a, const T& b) { return (a < b ? a : b); } +template <> +MSCCLPP_DEVICE_INLINE f32x2 min(const f32x2& a, const f32x2& b) { + f32x2 result; + result.data[0] = fminf(a.data[0], b.data[0]); + result.data[1] = fminf(a.data[1], b.data[1]); + return result; +} + template <> MSCCLPP_DEVICE_INLINE f16x2 min(const f16x2& a, const f16x2& b) { #if defined(MSCCLPP_DEVICE_HIP) @@ -489,6 +670,51 @@ MSCCLPP_DEVICE_INLINE u8x4 min(const u8x4& a, const u8x4& b) { #endif } +/// Convert a vector type From to vector type To. +/// Primary template with auto-decomposition: vectors with N > 4 elements decompose into x4 chunks, +/// vectors with N == 4 decompose into x2 chunks, enabling optimized x2/x4 specializations to be reached. +/// Specialized below for optimized FP8 conversion paths at x2/x4 level. 
+template +MSCCLPP_DEVICE_INLINE To to(const From& v) { + static_assert(To::Size == From::Size, "to: vector sizes must match"); + constexpr int N = From::Size; + + // Auto-decompose: N > 4 → split into x4 chunks + if constexpr (N > 4 && N % 4 == 0) { + constexpr int nChunks = N / 4; + using FromChunk = VectorType; + using ToChunk = VectorType; + const FromChunk* in = reinterpret_cast(&v); + To result; + ToChunk* out = reinterpret_cast(&result); +#pragma unroll + for (int c = 0; c < nChunks; ++c) { + out[c] = to(in[c]); + } + return result; + } + // Auto-decompose: N == 4 → split into 2x x2 chunks + else if constexpr (N == 4) { + using FromChunk = VectorType; + using ToChunk = VectorType; + const FromChunk* in = reinterpret_cast(&v); + To result; + ToChunk* out = reinterpret_cast(&result); + out[0] = to(in[0]); + out[1] = to(in[1]); + return result; + } + // Base case: element-wise conversion + else { + To result; +#pragma unroll + for (int i = 0; i < N; ++i) { + result.data[i] = static_cast(v.data[i]); + } + return result; + } +} + #if defined(__FP8_TYPES_EXIST__) template <> MSCCLPP_DEVICE_INLINE __fp8_e4m3 min(const __fp8_e4m3& a, const __fp8_e4m3& b) { @@ -551,7 +777,526 @@ MSCCLPP_DEVICE_INLINE f8_e5m2x4 min(const f8_e5m2x4& a, const f8_e5m2x4& b) { return result; } + +// --- f8_e4m3 -> f32 specializations --- + +/// f8_e4m3x2 -> f32x2. +/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float. +/// HIP gfx942: fp8 -> float (via __builtin_amdgcn_cvt_pk_f32_fp8). +template <> +MSCCLPP_DEVICE_INLINE f32x2 to(const f8_e4m3x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0); + f32x2 result; + result.data[0] = f[0]; + result.data[1] = f[1]; + return result; +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3); + f32x2 result; + result.data[0] = __half2float(bit_cast<__half>(h2.x)); + result.data[1] = __half2float(bit_cast<__half>(h2.y)); + return result; +#else + f32x2 result; + result.data[0] = float(v.data[0]); + result.data[1] = float(v.data[1]); + return result; +#endif +} + +/// f8_e4m3x4 -> f32x4. +template <> +MSCCLPP_DEVICE_INLINE f32x4 to(const f8_e4m3x4& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto lo = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, false); + auto hi = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, true); + f32x4 result; + result.data[0] = lo[0]; + result.data[1] = lo[1]; + result.data[2] = hi[0]; + result.data[3] = hi[1]; + return result; +#else + const f8_e4m3x2* pair = reinterpret_cast(&v); + f32x2 lo = to(pair[0]); + f32x2 hi = to(pair[1]); + f32x4 result; + result.data[0] = lo.data[0]; + result.data[1] = lo.data[1]; + result.data[2] = hi.data[0]; + result.data[3] = hi.data[1]; + return result; +#endif +} + +// --- f8_e5m2 -> f32 specializations --- + +/// f8_e5m2x2 -> f32x2. +/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float. +/// HIP gfx942: bf8 -> float (via __builtin_amdgcn_cvt_pk_f32_bf8). 
+template <> +MSCCLPP_DEVICE_INLINE f32x2 to(const f8_e5m2x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto f = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, 0); + f32x2 result; + result.data[0] = f[0]; + result.data[1] = f[1]; + return result; +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E5M2); + f32x2 result; + result.data[0] = __half2float(bit_cast<__half>(h2.x)); + result.data[1] = __half2float(bit_cast<__half>(h2.y)); + return result; +#else + f32x2 result; + result.data[0] = float(v.data[0]); + result.data[1] = float(v.data[1]); + return result; +#endif +} + +/// f8_e5m2x4 -> f32x4. +template <> +MSCCLPP_DEVICE_INLINE f32x4 to(const f8_e5m2x4& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto lo = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, false); + auto hi = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, true); + f32x4 result; + result.data[0] = lo[0]; + result.data[1] = lo[1]; + result.data[2] = hi[0]; + result.data[3] = hi[1]; + return result; +#else + const f8_e5m2x2* pair = reinterpret_cast(&v); + f32x2 lo = to(pair[0]); + f32x2 hi = to(pair[1]); + f32x4 result; + result.data[0] = lo.data[0]; + result.data[1] = lo.data[1]; + result.data[2] = hi.data[0]; + result.data[3] = hi.data[1]; + return result; +#endif +} + +// --- f32 -> f8_e4m3 specializations (downcast) --- + +/// f32x2 -> f8_e4m3x2. +/// HIP gfx942: float -> fp8 (via __builtin_amdgcn_cvt_pk_fp8_f32). +/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2). +/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise). +template <> +MSCCLPP_DEVICE_INLINE f8_e4m3x2 to(const f32x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false); + return bit_cast(static_cast<__hip_fp8x2_storage_t>(packed)); +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2; + h2.x = bit_cast(__float2half_rn(v.data[0])); + h2.y = bit_cast(__float2half_rn(v.data[1])); + __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3); + return bit_cast(fp8x2); +#elif defined(MSCCLPP_DEVICE_CUDA) + __half_raw h0, h1; + h0.x = bit_cast(__float2half_rn(v.data[0])); + h1.x = bit_cast(__float2half_rn(v.data[1])); + f8_e4m3x2 result; + result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3)); + result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3)); + return result; +#else + f8_e4m3x2 result; + result.data[0] = static_cast<__fp8_e4m3>(v.data[0]); + result.data[1] = static_cast<__fp8_e4m3>(v.data[1]); + return result; +#endif +} + +/// f32x4 -> f8_e4m3x4. 
+template <> +MSCCLPP_DEVICE_INLINE f8_e4m3x4 to(const f32x4& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false); + packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[2], v.data[3], packed, true); + return bit_cast(packed); +#else + f32x2 lo, hi; + lo.data[0] = v.data[0]; + lo.data[1] = v.data[1]; + hi.data[0] = v.data[2]; + hi.data[1] = v.data[3]; + f8_e4m3x2 lo_fp8 = to(lo); + f8_e4m3x2 hi_fp8 = to(hi); + f8_e4m3x4 result; + result.data[0] = lo_fp8.data[0]; + result.data[1] = lo_fp8.data[1]; + result.data[2] = hi_fp8.data[0]; + result.data[3] = hi_fp8.data[1]; + return result; +#endif +} + +// --- f32 -> f8_e5m2 specializations (downcast) --- + +/// f32x2 -> f8_e5m2x2. +/// HIP gfx942: float -> bf8 (via __builtin_amdgcn_cvt_pk_bf8_f32). +/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2 with __NV_E5M2). +/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise). +template <> +MSCCLPP_DEVICE_INLINE f8_e5m2x2 to(const f32x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false); + return bit_cast(static_cast<__hip_fp8x2_storage_t>(packed)); +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2; + h2.x = bit_cast(__float2half_rn(v.data[0])); + h2.y = bit_cast(__float2half_rn(v.data[1])); + __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E5M2); + return bit_cast(fp8x2); +#elif defined(MSCCLPP_DEVICE_CUDA) + __half_raw h0, h1; + h0.x = bit_cast(__float2half_rn(v.data[0])); + h1.x = bit_cast(__float2half_rn(v.data[1])); + f8_e5m2x2 result; + result.data[0] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E5M2)); + result.data[1] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E5M2)); + return result; +#else + f8_e5m2x2 result; + result.data[0] = static_cast<__fp8_e5m2>(v.data[0]); + result.data[1] = static_cast<__fp8_e5m2>(v.data[1]); + return result; +#endif +} + +/// f32x4 -> f8_e5m2x4. +template <> +MSCCLPP_DEVICE_INLINE f8_e5m2x4 to(const f32x4& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false); + packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[2], v.data[3], packed, true); + return bit_cast(packed); +#else + f32x2 lo, hi; + lo.data[0] = v.data[0]; + lo.data[1] = v.data[1]; + hi.data[0] = v.data[2]; + hi.data[1] = v.data[3]; + f8_e5m2x2 lo_fp8 = to(lo); + f8_e5m2x2 hi_fp8 = to(hi); + f8_e5m2x4 result; + result.data[0] = lo_fp8.data[0]; + result.data[1] = lo_fp8.data[1]; + result.data[2] = hi_fp8.data[0]; + result.data[3] = hi_fp8.data[1]; + return result; +#endif +} + +// --- f8_e4m3 <-> f16 conversion specializations --- + +/// f8_e4m3x2 -> f16x2. +/// NVIDIA SM90+: packed intrinsic (1 instruction). +/// HIP gfx942: fp8 -> float -> half (via AMD builtin). +/// Pre-SM90 / fallback: element-wise scalar conversion. 
+template <> +MSCCLPP_DEVICE_INLINE f16x2 to(const f8_e4m3x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0); + f16x2 result; + result.data[0] = __float2half(f[0]); + result.data[1] = __float2half(f[1]); + return result; +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3); + return bit_cast(h2); +#else + f16x2 result; + result.data[0] = static_cast<__half>(v.data[0]); + result.data[1] = static_cast<__half>(v.data[1]); + return result; +#endif +} + +/// f16x2 -> f8_e4m3x2. +/// NVIDIA SM90+: packed intrinsic (1 instruction). +/// HIP gfx942: half -> float -> fp8 (via AMD builtin). +/// Pre-SM90: element-wise scalar conversion. +template <> +MSCCLPP_DEVICE_INLINE f8_e4m3x2 to(const f16x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float f0 = __half2float(v.data[0]); + float f1 = __half2float(v.data[1]); + uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(f0, f1, 0, false); + return bit_cast(static_cast<__hip_fp8x2_storage_t>(packed)); +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2 = bit_cast<__half2_raw>(v); + __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3); + return bit_cast(fp8x2); +#elif defined(MSCCLPP_DEVICE_CUDA) + __half_raw h0, h1; + h0.x = bit_cast(v.data[0]); + h1.x = bit_cast(v.data[1]); + f8_e4m3x2 result; + result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3)); + result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3)); + return result; +#else + f8_e4m3x2 result; + result.data[0] = static_cast<__fp8_e4m3>(v.data[0]); + result.data[1] = static_cast<__fp8_e4m3>(v.data[1]); + return result; +#endif +} + #endif // defined(__FP8_TYPES_EXIST__) + +// --- fp8_e4m3b15 <-> fp16 direct conversion specializations --- +// These are the PRIMARY conversions: fp8_b15 <-> fp16 is just a 1-bit exponent shift +// (E4 bias=15 <-> E5 bias=15), no precision loss since fp16 has 10 mantissa bits +// vs fp8's 3. fp32 conversions are derived by routing through fp16. + +/// f8_e4m3b15x2 -> f16x2. +/// Direct fp8 -> fp16 via branch-free bit manipulation. +template <> +MSCCLPP_DEVICE_INLINE f16x2 to(const f8_e4m3b15x2& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + uint16_t in = v.storage.__x; + // Spread 2 fp8 bytes into packed fp16 pair, adjust exponent E4->E5. + uint32_t a0 = ((uint32_t)(in & 0xFFu) << 8) | ((uint32_t)(in >> 8) << 24); + uint32_t b0 = (a0 & 0x7f007f00u) >> 1; + uint32_t out0 = b0 | (a0 & 0x80008000u); + __half2 h; + asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast(&h)) : "r"(out0)); + return h; +#else + f16x2 result; + result.data[0] = __float2half(float(v.data[0])); + result.data[1] = __float2half(float(v.data[1])); + return result; +#endif +} + +/// f8_e4m3b15x4 -> f16x4. +/// Uses __byte_perm + lop3 for branch-free vectorized conversion. 
+template <> +MSCCLPP_DEVICE_INLINE f16x4 to(const f8_e4m3b15x4& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + uint32_t in = v.storage.__x; + uint32_t a0 = __byte_perm(0u, in, 0x5746u); + uint32_t a0_shr = a0 >> 1; + uint32_t a0_sign = a0 & 0x80008000u; + uint32_t out0; + asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out0) : "r"(a0_shr), "r"(0x3f803f80u), "r"(a0_sign)); + uint32_t a1 = __byte_perm(a0, 0u, 0x2301u); + uint32_t a1_shr = a1 >> 1; + uint32_t a1_sign = a1 & 0x80008000u; + uint32_t out1; + asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out1) : "r"(a1_shr), "r"(0x3f803f80u), "r"(a1_sign)); + f16x4 result; + asm("mov.b32 %0, %1;" : "=r"(result.words[0]) : "r"(out0)); + asm("mov.b32 %0, %1;" : "=r"(result.words[1]) : "r"(out1)); + return result; +#else + f16x4 result; +#pragma unroll + for (int i = 0; i < 4; ++i) { + result.data[i] = __float2half(float(v.data[i])); + } + return result; +#endif +} + +/// f16x2 -> f8_e4m3b15x2. +/// Direct fp16 -> fp8 via clamp + exponent shift E5->E4 + pack. +template <> +MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to(const f16x2& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + uint32_t in0; + asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(*reinterpret_cast(&v))); + // Clamp abs to max finite e4m3b15 (0x3B80 = 0.9375 in fp16). + uint32_t lo = in0 & 0xFFFFu, hi = in0 >> 16; + uint32_t alo = lo & 0x7FFFu, ahi = hi & 0x7FFFu; + alo = alo < 0x3B80u ? alo : 0x3B80u; + ahi = ahi < 0x3B80u ? ahi : 0x3B80u; + uint32_t a0 = alo | (ahi << 16); + a0 = a0 * 2u + 0x00800080u; + uint32_t b0 = a0 | (in0 & 0x80008000u); + uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u)); + return bit_cast(packed); +#else + f8_e4m3b15x2 result; + result.data[0] = __fp8_e4m3b15(__half2float(v.data[0])); + result.data[1] = __fp8_e4m3b15(__half2float(v.data[1])); + return result; +#endif +} + +/// f16x4 -> f8_e4m3b15x4. +/// Uses __vminu2 + lop3 + __byte_perm for branch-free vectorized conversion. +template <> +MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to(const f16x4& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + uint32_t in0, in1; + asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(v.words[0])); + asm("mov.b32 %0, %1;" : "=r"(in1) : "r"(v.words[1])); + uint32_t abs0 = in0 & 0x7fff7fffu; + uint32_t abs1 = in1 & 0x7fff7fffu; + uint32_t a0 = __vminu2(abs0, 0x3B803B80u); + uint32_t a1 = __vminu2(abs1, 0x3B803B80u); + a0 = a0 * 2u + 0x00800080u; + a1 = a1 * 2u + 0x00800080u; + uint32_t b0, b1; + asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b0) : "r"(a0), "r"(in0), "r"(0x80008000u)); + asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b1) : "r"(a1), "r"(in1), "r"(0x80008000u)); + uint32_t packed = __byte_perm(b0, b1, 0x7531u); + return bit_cast(packed); +#else + f8_e4m3b15x4 result; +#pragma unroll + for (int i = 0; i < 4; ++i) { + result.data[i] = __fp8_e4m3b15(__half2float(v.data[i])); + } + return result; +#endif +} + +// --- fp8_e4m3b15 <-> f32 conversion specializations --- +// Derived from fp16 conversions: fp8→f32 = fp8→fp16→f32, f32→fp8 = f32→fp16→fp8. + +/// f8_e4m3b15x2 -> f32x2. +/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32. +template <> +MSCCLPP_DEVICE_INLINE f32x2 to(const f8_e4m3b15x2& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + f16x2 h = to(v); + float2 f2 = __half22float2(h); + return bit_cast(f2); +#else + f32x2 result; + result.data[0] = float(v.data[0]); + result.data[1] = float(v.data[1]); + return result; +#endif +} + +/// f8_e4m3b15x4 -> f32x4. +/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32. 
+template <> +MSCCLPP_DEVICE_INLINE f32x4 to(const f8_e4m3b15x4& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + f16x4 h = to(v); + __half2 h0, h1; + asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast(&h0)) : "r"(h.words[0])); + asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast(&h1)) : "r"(h.words[1])); + float2 f0 = __half22float2(h0); + float2 f1 = __half22float2(h1); + f32x4 result; + result.data[0] = f0.x; + result.data[1] = f0.y; + result.data[2] = f1.x; + result.data[3] = f1.y; + return result; +#else + f32x4 result; +#pragma unroll + for (int i = 0; i < 4; ++i) { + result.data[i] = float(v.data[i]); + } + return result; +#endif +} + +/// f32x2 -> f8_e4m3b15x2. +/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack). +template <> +MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to(const f32x2& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + float2 f2 = {v.data[0], v.data[1]}; + __half2 h = __float22half2_rn(f2); + return to(h); +#else + f8_e4m3b15x2 result; + result.data[0] = __fp8_e4m3b15(v.data[0]); + result.data[1] = __fp8_e4m3b15(v.data[1]); + return result; +#endif +} + +/// f32x4 -> f8_e4m3b15x4. +/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack). +template <> +MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to(const f32x4& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + float2 f01 = {v.data[0], v.data[1]}; + float2 f23 = {v.data[2], v.data[3]}; + __half2 h01 = __float22half2_rn(f01); + __half2 h23 = __float22half2_rn(f23); + f16x4 h; + asm("mov.b32 %0, %1;" : "=r"(h.words[0]) : "r"(*reinterpret_cast(&h01))); + asm("mov.b32 %0, %1;" : "=r"(h.words[1]) : "r"(*reinterpret_cast(&h23))); + return to(h); +#else + f8_e4m3b15x4 result; +#pragma unroll + for (int i = 0; i < 4; ++i) { + result.data[i] = __fp8_e4m3b15(v.data[i]); + } + return result; +#endif +} + +// --- fp8_e4m3b15 arithmetic (software, always available) --- + +template +MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 operator+(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) { + return __fp8_e4m3b15(float(a) + float(b)); +} + +template +MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 operator+(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) { + f8_e4m3b15x2 result; + result.data[0] = __fp8_e4m3b15(float(a.data[0]) + float(b.data[0])); + result.data[1] = __fp8_e4m3b15(float(a.data[1]) + float(b.data[1])); + return result; +} + +template +MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 operator+(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) { + f8_e4m3b15x4 result; +#pragma unroll + for (int i = 0; i < 4; ++i) { + result.data[i] = __fp8_e4m3b15(float(a.data[i]) + float(b.data[i])); + } + return result; +} + +// --- fp8_e4m3b15 min (software) --- + +template <> +MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 min(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) { + return __fp8_e4m3b15(fminf(float(a), float(b))); +} + +MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 min(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) { + f8_e4m3b15x2 result; + result.data[0] = mscclpp::min(a.data[0], b.data[0]); + result.data[1] = mscclpp::min(a.data[1], b.data[1]); + return result; +} + +MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 min(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) { + f8_e4m3b15x4 result; +#pragma unroll + for (int i = 0; i < 4; ++i) { + result.data[i] = mscclpp::min(a.data[i], b.data[i]); + } + return result; +} + #endif // MSCCLPP_DEVICE_COMPILE } // namespace mscclpp diff --git a/python/csrc/algorithm.cpp b/python/csrc/algorithm.cpp index 1a93cbc0..1cb3f253 100644 --- a/python/csrc/algorithm.cpp +++ b/python/csrc/algorithm.cpp @@ -75,15 +75,17 @@ void register_algorithm(nb::module_& m) { 
[](Algorithm& self, std::shared_ptr comm, uintptr_t input, uintptr_t output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream, std::shared_ptr executor, int nBlocks, int nThreadsPerBlock, bool symmetricMemory, - std::unordered_map extras) { + std::unordered_map extras, int32_t accumDtype) { return self.execute(comm, reinterpret_cast(input), reinterpret_cast(output), inputSize, outputSize, dtype, op, reinterpret_cast(stream), executor, - nBlocks, nThreadsPerBlock, symmetricMemory, extras); + nBlocks, nThreadsPerBlock, symmetricMemory, extras, + static_cast(accumDtype)); }, nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"), nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr, nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, nb::arg("symmetric_memory") = false, - nb::arg("extras") = std::unordered_map()) + nb::arg("extras") = std::unordered_map(), + nb::arg("accum_dtype") = static_cast(DataType::AUTO)) .def("reset", &Algorithm::reset); nb::class_(algorithmClass, "Constraint") diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp index 47d76ac4..b8649564 100644 --- a/python/csrc/core_py.cpp +++ b/python/csrc/core_py.cpp @@ -47,7 +47,8 @@ void register_core(nb::module_& m) { .value("bfloat16", DataType::BFLOAT16) .value("float8_e4m3", DataType::FLOAT8_E4M3) .value("float8_e5m2", DataType::FLOAT8_E5M2) - .value("uint8", DataType::UINT8); + .value("uint8", DataType::UINT8) + .value("float8_e4m3b15", DataType::FLOAT8_E4M3B15); nb::class_(m, "CppBootstrap") .def("get_rank", &Bootstrap::getRank) diff --git a/python/csrc/gpu_utils_py.cpp b/python/csrc/gpu_utils_py.cpp index 6995756b..60880456 100644 --- a/python/csrc/gpu_utils_py.cpp +++ b/python/csrc/gpu_utils_py.cpp @@ -34,6 +34,19 @@ static DLDataType getDlType(std::string type) { return DLDataType{kDLBfloat, 16, 1}; } else if (type == "torch.float16") { return DLDataType{kDLFloat, 16, 1}; + } else if (type == "torch.float8_e4m3fn") { + return DLDataType{kDLFloat8_e4m3fn, 8, 1}; + } else if (type == "torch.float8_e4m3fnuz") { + return DLDataType{kDLFloat8_e4m3fnuz, 8, 1}; + } else if (type == "torch.float8_e5m2") { + return DLDataType{kDLFloat8_e5m2, 8, 1}; + } else if (type == "torch.float8_e5m2fnuz") { + return DLDataType{kDLFloat8_e5m2fnuz, 8, 1}; + } else if (type == "torch.uint8") { + return DLDataType{kDLUInt, 8, 1}; + } else if (type == "fp8_e4m3b15") { + // No standard DLPack code for fp8_e4m3b15; store as raw uint8 bytes. + return DLDataType{kDLUInt, 8, 1}; } else { throw Error("Unsupported type: " + type, ErrorCode::InvalidUsage); } diff --git a/python/mscclpp/_core/algorithm.py b/python/mscclpp/_core/algorithm.py index 744cf39e..f12a3027 100644 --- a/python/mscclpp/_core/algorithm.py +++ b/python/mscclpp/_core/algorithm.py @@ -177,6 +177,7 @@ class Algorithm: nthreads_per_block=0, symmetric_memory: bool = False, extras: Optional[Dict[str, int]] = None, + accum_dtype: Optional[CppDataType] = None, ) -> int: """Execute the collective algorithm. @@ -194,10 +195,14 @@ class Algorithm: nthreads_per_block: Number of threads per block (0 for auto-selection). symmetric_memory: Whether to use symmetric memory optimization (default: False). extras: Additional algorithm-specific parameters. + accum_dtype: Data type for accumulation during reduction. If None, defaults to + the same as dtype. Use DataType.float32 for high-precision FP8 accumulation. Returns: The result code (0 for success). 
""" + merged_extras = dict(extras) if extras is not None else {} + accum_dtype = accum_dtype if accum_dtype is not None else dtype return self._algorithm.execute( comm, int(input_buffer), @@ -211,7 +216,8 @@ class Algorithm: nblocks, nthreads_per_block, symmetric_memory, - extras if extras is not None else {}, + merged_extras, + int(accum_dtype), ) def reset(self): diff --git a/python/test/test_fp8_accum.py b/python/test/test_fp8_accum.py new file mode 100644 index 00000000..3a6c67f1 --- /dev/null +++ b/python/test/test_fp8_accum.py @@ -0,0 +1,391 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Correctness test for FP8 allreduce with different accumulation types. +# +# Verifies that FP8 allreduce with higher-precision accumulation produces +# results at least as accurate as native FP8 accumulation, by comparing +# against a float32 reference. +# +# Usage: +# mpirun -np 8 pytest python/test/test_fp8_accum.py -v + +import cupy as cp +import numpy as np +import pytest + +from mscclpp import CommGroup, GpuBuffer, DataType, ReduceOp, is_nvls_supported +from mscclpp.ext import AlgorithmCollectionBuilder +from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group + +# FP8 E4M3 (hardware) requires SM >= 89 (Ada / Hopper) on NVIDIA GPUs. +# On AMD/ROCm (e.g. MI300X), FP8 is supported natively — no skip needed. +_is_hip = hasattr(cp.cuda.runtime, "is_hip") and cp.cuda.runtime.is_hip +# TODO(binyli): Skip hip for now, will fix it in the next PR +_skip_fp8 = _is_hip or int(cp.cuda.Device().compute_capability) < 89 +pytestmark = pytest.mark.skipif(_skip_fp8, reason="FP8 accum tests require SM >= 89 on CUDA (HIP not yet supported)") + +# --------------------------------------------------------------------------- +# FP8 E4M3FN helpers (bias=7, no infinity, NaN = exp=15 & mant=7) +# --------------------------------------------------------------------------- + + +def e4m3fn_to_float(uint8_array): + """Decode a cupy uint8 array of E4M3FN bit patterns to float32.""" + bits = uint8_array.astype(cp.int32) + sign = (bits >> 7) & 1 + exp = (bits >> 3) & 0xF + mant = bits & 0x7 + + # Normal: (-1)^s * 2^(exp-7) * (1 + mant/8) + normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 7).astype(cp.int32)) + # Subnormal (exp==0): (-1)^s * 2^(-6) * (mant/8) + subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-6)) + + result = cp.where(exp == 0, subnormal_val, normal_val) + result = cp.where(sign == 1, -result, result) + # Zero + result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result) + # NaN: exp==15 & mant==7 + nan_mask = (exp == 15) & (mant == 7) + result = cp.where(nan_mask, cp.float32(float("nan")), result) + return result + + +def float_to_e4m3fn(f32_array, chunk_size=65536): + """Encode a cupy float32 array to uint8 E4M3FN bit patterns. + + Uses a lookup-table approach: precompute all 128 positive E4M3FN values, + then find nearest match per element via chunked broadcast comparison. 
+ """ + # Build lookup table of all 128 positive E4M3FN values (0x00..0x7F) + all_bytes = cp.arange(128, dtype=cp.uint8) + all_floats = e4m3fn_to_float(all_bytes) # (128,) float32 + # Mark NaN entries as inf so they're never selected as nearest + all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats) + + # Clamp input and extract sign + clamped = f32_array.astype(cp.float32) + clamped = cp.clip(clamped, -448.0, 448.0) + signs = (clamped < 0).astype(cp.uint8) + absval = cp.abs(clamped) + + result = cp.zeros(absval.shape, dtype=cp.uint8) + n = absval.size + absval_flat = absval.ravel() + result_flat = result.ravel() + + for start in range(0, n, chunk_size): + end = min(start + chunk_size, n) + chunk = absval_flat[start:end] + # (chunk_size, 128) difference matrix + diffs = cp.abs(chunk[:, None] - all_floats[None, :]) + result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8) + + # Combine with sign bit + result = result_flat.reshape(absval.shape) + result = result | (signs << 7) + # Handle exact zero + result = cp.where(absval == 0, cp.uint8(0), result) + return result + + +# --------------------------------------------------------------------------- +# FP8 E4M3B15 helpers (bias=15, max=0.9375, NaN = exp==15 or bits==0x80) +# --------------------------------------------------------------------------- + + +def e4m3b15_to_float(uint8_array): + """Decode a cupy uint8 array of E4M3B15 bit patterns to float32.""" + bits = uint8_array.astype(cp.int32) + sign = (bits >> 7) & 1 + exp = (bits >> 3) & 0xF + mant = bits & 0x7 + + # Normal: (-1)^s * 2^(exp-15) * (1 + mant/8) + normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 15).astype(cp.int32)) + # Subnormal (exp==0): (-1)^s * 2^(-14) * (mant/8) + subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-14)) + + result = cp.where(exp == 0, subnormal_val, normal_val) + result = cp.where(sign == 1, -result, result) + # Zero + result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result) + # NaN: exp==15 or negative zero (0x80) + nan_mask = (exp == 15) | (uint8_array.astype(cp.int32) == 0x80) + result = cp.where(nan_mask, cp.float32(float("nan")), result) + return result + + +def float_to_e4m3b15(f32_array, chunk_size=65536): + """Encode a cupy float32 array to uint8 E4M3B15 bit patterns. + + Same lookup-table approach as float_to_e4m3fn. 
+ """ + # Build lookup table of all 128 positive E4M3B15 values (0x00..0x7F) + all_bytes = cp.arange(128, dtype=cp.uint8) + all_floats = e4m3b15_to_float(all_bytes) # (128,) float32 + # Mark NaN entries as inf so they're never selected as nearest + all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats) + + # Clamp input and extract sign + clamped = f32_array.astype(cp.float32) + clamped = cp.clip(clamped, -0.9375, 0.9375) + signs = (clamped < 0).astype(cp.uint8) + absval = cp.abs(clamped) + + result = cp.zeros(absval.shape, dtype=cp.uint8) + n = absval.size + absval_flat = absval.ravel() + result_flat = result.ravel() + + for start in range(0, n, chunk_size): + end = min(start + chunk_size, n) + chunk = absval_flat[start:end] + # (chunk_size, 128) difference matrix + diffs = cp.abs(chunk[:, None] - all_floats[None, :]) + result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8) + + # Combine with sign bit + result = result_flat.reshape(absval.shape) + result = result | (signs << 7) + # Handle exact zero + result = cp.where(absval == 0, cp.uint8(0), result) + return result + + +# --------------------------------------------------------------------------- +# Shared test helpers +# --------------------------------------------------------------------------- + + +def setup_algorithms(mpi_group): + """Build default algorithms and return (comm_group, algo_map, scratch_buf).""" + comm_group = CommGroup(mpi_group.comm) + scratch = GpuBuffer(1 << 27, dtype=cp.uint8) # 128 MB + AlgorithmCollectionBuilder.reset() + builder = AlgorithmCollectionBuilder() + algorithms = builder.build_default_algorithms( + scratch_buffer=scratch.data.ptr, + scratch_buffer_size=scratch.nbytes, + rank=comm_group.my_rank, + ) + algo_map = {a.name: a for a in algorithms} + return comm_group, algo_map, scratch + + +def run_allreduce(algo, comm_group, buffer, dtype, accum_dtype=None, nblocks=0, nthreads_per_block=0): + """Run allreduce in-place on buffer and return a copy of the result.""" + ret = algo.execute( + comm=comm_group.communicator, + input_buffer=buffer.data.ptr, + output_buffer=buffer.data.ptr, + input_size=buffer.nbytes, + output_size=buffer.nbytes, + dtype=dtype, + op=ReduceOp.SUM, + stream=cp.cuda.get_current_stream().ptr, + nblocks=nblocks, + nthreads_per_block=nthreads_per_block, + symmetric_memory=True, + accum_dtype=accum_dtype, + ) + cp.cuda.Device().synchronize() + assert ret == 0, f"Allreduce failed with error code {ret}" + return buffer.copy() + + +# --------------------------------------------------------------------------- +# Test: FP8 E4M3 accumulation correctness +# --------------------------------------------------------------------------- + + +@parametrize_mpi_groups(8) +@pytest.mark.parametrize( + "algo_name", + [ + "default_allreduce_packet", + "default_allreduce_nvls_packet", + "default_allreduce_fullmesh", + "default_allreduce_rsag_zero_copy", + ], +) +@pytest.mark.parametrize("size", [1024, 4096, 16384, 65536, 262144, 1048576]) +def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int): + """Verify that FP8 E4M3 allreduce with higher-precision accumulation is at + least as accurate as native FP8 accumulation, across all algorithm variants.""" + rank = mpi_group.comm.rank + world_size = mpi_group.comm.size + + comm_group, algo_map, scratch = setup_algorithms(mpi_group) + if algo_name not in algo_map: + pytest.skip(f"{algo_name} not available") + algo = algo_map[algo_name] + + buf = GpuBuffer(size, dtype=cp.uint8) + + accum_configs = [ + 
("fp8_native", DataType.float8_e4m3), + ("float16", DataType.float16), + ("float32", DataType.float32), + ] + + # rsag_zero_copy and fullmesh need explicit block/thread counts + if "rsag" in algo_name: + nb = max(1, min(32, size // (world_size * 32))) + nt = 1024 + elif "fullmesh" in algo_name: + nb = 35 + nt = 512 + else: + nb = 0 + nt = 0 + + errors = {} + for accum_label, accum_dtype in accum_configs: + # Generate deterministic per-rank data + cp.random.seed(42 + rank) + src_f32 = cp.random.randn(size).astype(cp.float32) + src_f32 = cp.clip(src_f32, -240.0, 240.0) + src_fp8 = float_to_e4m3fn(src_f32) + + # Copy into symmetric buffer + buf[:] = src_fp8 + cp.cuda.Device().synchronize() + + # Run allreduce + result = run_allreduce( + algo, + comm_group, + buf, + dtype=DataType.float8_e4m3, + accum_dtype=accum_dtype, + nblocks=nb, + nthreads_per_block=nt, + ) + result_f32 = e4m3fn_to_float(result) + + # Compute float32 reference: sum all ranks' quantized FP8 inputs in float32 + ref_f32 = cp.zeros(size, dtype=cp.float32) + for r in range(world_size): + cp.random.seed(42 + r) + rank_data = cp.random.randn(size).astype(cp.float32) + rank_data = cp.clip(rank_data, -240.0, 240.0) + rank_data_fp8 = float_to_e4m3fn(rank_data) + ref_f32 += e4m3fn_to_float(rank_data_fp8) + + # Compute errors + abs_err = cp.abs(result_f32 - ref_f32) + mean_abs_err = float(cp.mean(abs_err)) + errors[accum_label] = mean_abs_err + + # Reset between runs + algo.reset() + + # Higher-precision accumulation should be at least as accurate as native fp8 + assert ( + errors["float16"] <= errors["fp8_native"] + 1e-6 + ), f"float16 accum ({errors['float16']:.6f}) worse than native ({errors['fp8_native']:.6f})" + assert ( + errors["float32"] <= errors["fp8_native"] + 1e-6 + ), f"float32 accum ({errors['float32']:.6f}) worse than native ({errors['fp8_native']:.6f})" + + +# --------------------------------------------------------------------------- +# Test: FP8 E4M3B15 accumulation correctness +# --------------------------------------------------------------------------- + + +@parametrize_mpi_groups(8) +@pytest.mark.parametrize( + "algo_name", + [ + "default_allreduce_packet", + "default_allreduce_nvls_packet", + "default_allreduce_rsag_zero_copy", + ], +) +@pytest.mark.parametrize("size", [1024, 4096, 65536]) +def test_fp8_e4m3b15_accum(mpi_group: MpiGroup, algo_name: str, size: int): + """Verify that FP8 E4M3B15 allreduce with higher-precision accumulation is at + least as accurate as native E4M3B15 accumulation.""" + rank = mpi_group.comm.rank + world_size = mpi_group.comm.size + + comm_group, algo_map, scratch = setup_algorithms(mpi_group) + if algo_name not in algo_map: + pytest.skip(f"{algo_name} not available") + + algo = algo_map[algo_name] + buf = GpuBuffer(size, dtype=cp.uint8) + + accum_configs = [ + ("e4m3b15_native", DataType.float8_e4m3b15), + ("float16", DataType.float16), + ("float32", DataType.float32), + ] + + # rsag_zero_copy needs explicit block/thread counts, scaled to data size + if "rsag" in algo_name: + nb = max(1, min(32, size // (world_size * 32))) + nt = 1024 + else: + nb = 0 + nt = 0 + + errors = {} + for accum_label, accum_dtype in accum_configs: + # Generate deterministic per-rank random uint8 values in valid e4m3b15 range + cp.random.seed(42 + rank) + raw = cp.random.randint(0, 0x78, (size,), dtype=cp.uint8) + signs = cp.random.randint(0, 2, (size,), dtype=cp.uint8).astype(cp.uint8) << 7 + src_uint8 = raw | signs + # Fix negative zero -> positive zero + src_uint8 = cp.where(src_uint8 == 0x80, 
cp.uint8(0), src_uint8) + + # Copy into symmetric buffer + buf[:] = src_uint8 + cp.cuda.Device().synchronize() + + # Run allreduce + result = run_allreduce( + algo, + comm_group, + buf, + dtype=DataType.float8_e4m3b15, + accum_dtype=accum_dtype, + nblocks=nb, + nthreads_per_block=nt, + ) + + # Decode result + result_f32 = e4m3b15_to_float(result) + + # Compute float32 reference + ref_f32 = cp.zeros(size, dtype=cp.float32) + for r in range(world_size): + cp.random.seed(42 + r) + raw_r = cp.random.randint(0, 0x78, (size,), dtype=cp.uint8) + signs_r = cp.random.randint(0, 2, (size,), dtype=cp.uint8).astype(cp.uint8) << 7 + bits_r = raw_r | signs_r + bits_r = cp.where(bits_r == 0x80, cp.uint8(0), bits_r) + ref_f32 += e4m3b15_to_float(bits_r) + + # Clamp reference to e4m3b15 representable range + ref_f32 = cp.clip(ref_f32, -0.9375, 0.9375) + + # Compute errors (only on valid entries) + valid = ~cp.isnan(result_f32) & ~cp.isnan(ref_f32) + abs_err = cp.abs(result_f32[valid] - ref_f32[valid]) + mean_abs_err = float(cp.mean(abs_err)) if abs_err.size > 0 else 0.0 + errors[accum_label] = mean_abs_err + + algo.reset() + + # Higher-precision accumulation should be at least as accurate as native + assert ( + errors["float16"] <= errors["e4m3b15_native"] + 1e-8 + ), f"float16 accum ({errors['float16']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})" + assert ( + errors["float32"] <= errors["e4m3b15_native"] + 1e-8 + ), f"float32 accum ({errors['float32']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})" diff --git a/src/core/algorithm.cc b/src/core/algorithm.cc index 99e7b031..ffa53aa8 100644 --- a/src/core/algorithm.cc +++ b/src/core/algorithm.cc @@ -41,7 +41,9 @@ NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitF CommResult NativeAlgorithm::execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr, int nBlocks, int nThreadsPerBlock, - bool symmetricMemory, const std::unordered_map& extras) { + bool symmetricMemory, const std::unordered_map& extras, + DataType accumDtype) { + if (accumDtype == DataType::AUTO) accumDtype = dtype; if (!initialized_) { initFunc_(comm); initialized_ = true; @@ -53,7 +55,7 @@ CommResult NativeAlgorithm::execute(std::shared_ptr comm, const vo contexts_[ctxKey] = ctx; } return kernelLaunchFunc_(contexts_[ctxKey], input, output, inputSize, outputSize, dtype, op, stream, nBlocks, - nThreadsPerBlock, extras); + nThreadsPerBlock, extras, accumDtype); } const std::string& NativeAlgorithm::name() const { return name_; } @@ -77,10 +79,7 @@ const CollectiveBufferMode& NativeAlgorithm::bufferMode() const { return bufferM Algorithm::Constraint NativeAlgorithm::constraint() const { return constraint_; } -void NativeAlgorithm::reset() { - contexts_.clear(); - initialized_ = false; -} +void NativeAlgorithm::reset() { contexts_.clear(); } void AlgorithmCollection::registerAlgorithm(const std::string collective, const std::string algoName, std::shared_ptr algorithm) { @@ -166,7 +165,7 @@ Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; } CommResult DslAlgorithm::execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream, std::shared_ptr executor, int, int, bool, - const std::unordered_map&) { + const std::unordered_map&, DataType) { if (!executor) { THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in 
DslAlgorithm::execute"); } @@ -192,6 +191,10 @@ CommResult DslAlgorithm::execute(std::shared_ptr comm, const void* plan_, stream); break; #endif + case DataType::FLOAT8_E4M3B15: + executor->execute(rank, (__fp8_e4m3b15*)input, (__fp8_e4m3b15*)output, inputSize, outputSize, + DataType::FLOAT8_E4M3B15, plan_, stream); + break; case DataType::INT32: case DataType::UINT32: executor->execute(rank, (int*)input, (int*)output, inputSize, outputSize, DataType::UINT32, plan_, stream); diff --git a/src/core/executor/execution_kernel.cu b/src/core/executor/execution_kernel.cu index 2d36bcf5..28ced77f 100644 --- a/src/core/executor/execution_kernel.cu +++ b/src/core/executor/execution_kernel.cu @@ -82,6 +82,12 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo case DataType::FLOAT8_E5M2: // FP8 is not supported in CUDA execution kernel. break; + case DataType::FLOAT8_E4M3B15: + // fp8_e4m3b15 is a software type not supported in the CUDA execution kernel. + break; + case DataType::AUTO: + // AUTO is a sentinel resolved before reaching this point; nothing to do. + break; } } diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp index 20147c30..87b88888 100644 --- a/src/core/include/execution_kernel.hpp +++ b/src/core/include/execution_kernel.hpp @@ -210,7 +210,7 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input sizeof(int4); void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.inputBufferRefs[index + 1].id]); val = mscclpp::read(remoteMemory, srcOffset + idx); - tmp = cal_vector(tmp, val); + tmp = calVector(tmp, val); } output4[outputOffset4 + idx] = tmp; if constexpr (SendToRemote) { @@ -353,9 +353,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* in for (uint32_t index = 0; index < nSrcs; ++index) { PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]); PacketPayload val = pkt[idx].read(flag_); - data = cal_vector(data, val); + data = calVector(data, val); } - data = cal_vector(data, srcPacketPayload[idx]); + data = calVector(data, srcPacketPayload[idx]); dstPacketPayload[idx] = data; if constexpr (SendToRemote) { @@ -394,9 +394,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceCopySendPackets(const Operation& op, void for (uint32_t index = 0; index < nSrcs; ++index) { PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]); PacketPayload val = pkt[idx].read(flag_); - data = cal_vector(data, val); + data = calVector(data, val); } - data = cal_vector(data, srcPacketPayload[idx]); + data = calVector(data, srcPacketPayload[idx]); dstPacketPayload[idx] = data; PacketType* dst_val = &dstPkt[idx]; dst_val->write(data, flag_); @@ -464,7 +464,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, vo size_t buffOffset = (inputOffsets[index] + getOffset(outputBufferRefs[index].type, offset)) / sizeof(int4); int4 val = buff4[buffOffset + idx]; - tmp = cal_vector(tmp, val); + tmp = calVector(tmp, val); } dst4[dstOffset4 + idx] = tmp; if constexpr (SendToRemote) { @@ -899,6 +899,17 @@ class ExecutionKernel { #endif break; #endif // __FP8_TYPES_EXIST__ + case DataType::FLOAT8_E4M3B15: + executionKernel<__fp8_e4m3b15, PacketType, ReuseScratch><<>>( + rank, (__fp8_e4m3b15*)src, (__fp8_e4m3b15*)dst, (__fp8_e4m3b15*)scratch, scratchOffset, scratchChunkSize, + plan, semaphores, localMemoryIdBegin, flag +#if defined(ENABLE_NPKIT) + , + NpKit::GetGpuEventCollectContexts(), 
NpKit::GetCpuTimestamp()); +#else + ); +#endif + break; case DataType::UINT8: executionKernel<<>>( rank, (uint8_t*)src, (uint8_t*)dst, (uint8_t*)scratch, scratchOffset, scratchChunkSize, plan, semaphores, @@ -910,6 +921,10 @@ class ExecutionKernel { ); #endif break; + case DataType::AUTO: + // AUTO is a sentinel that must be resolved before reaching this point. + assert(false && "DataType::AUTO must be resolved before kernel launch"); + break; } } #else // !defined(MSCCLPP_DEVICE_HIP) diff --git a/src/core/include/reduce_kernel.hpp b/src/core/include/reduce_kernel.hpp index fd9bd1e9..463f827d 100644 --- a/src/core/include/reduce_kernel.hpp +++ b/src/core/include/reduce_kernel.hpp @@ -14,7 +14,7 @@ namespace mscclpp { // Generic element-wise calculation helper template -MSCCLPP_DEVICE_INLINE T cal_elements(const T& a, const T& b) { +MSCCLPP_DEVICE_INLINE T calElements(const T& a, const T& b) { if constexpr (OpType == SUM) { return a + b; } else if constexpr (OpType == MIN) { @@ -24,56 +24,168 @@ MSCCLPP_DEVICE_INLINE T cal_elements(const T& a, const T& b) { } // Generic vector reduction helpers -template -MSCCLPP_DEVICE_INLINE int4 cal_vector_helper(const int4& a, const int4& b) { - int4 ret; - ret.w = bit_cast(cal_elements(bit_cast(a.w), bit_cast(b.w))); - ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); - ret.z = bit_cast(cal_elements(bit_cast(a.z), bit_cast(b.z))); - return ret; -} template -MSCCLPP_DEVICE_INLINE uint2 cal_vector_helper(const uint2& a, const uint2& b) { +MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) { uint2 ret; - ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); + ret.x = bit_cast(calElements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(calElements(bit_cast(a.y), bit_cast(b.y))); return ret; } -template -MSCCLPP_DEVICE_INLINE int cal_vector_helper(const int& a, const int& b) { - return bit_cast(cal_elements(bit_cast(a), bit_cast(b))); +/// f32x2 specialization for uint2: uses packed f32x2 operator+ (Blackwell __fadd2_rn when available). +template <> +MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) { + f32x2 fa = bit_cast(a); + f32x2 fb = bit_cast(b); + f32x2 fr = fa + fb; + return bit_cast(fr); +} + +template <> +MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) { + f32x2 fa = bit_cast(a); + f32x2 fb = bit_cast(b); + f32x2 fr = mscclpp::min(fa, fb); + return bit_cast(fr); } template -MSCCLPP_DEVICE_INLINE uint32_t cal_vector_helper(const uint32_t& a, const uint32_t& b) { - return bit_cast(cal_elements(bit_cast(a), bit_cast(b))); +MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) { + int4 ret; + ret.w = bit_cast(calElements(bit_cast(a.w), bit_cast(b.w))); + ret.x = bit_cast(calElements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(calElements(bit_cast(a.y), bit_cast(b.y))); + ret.z = bit_cast(calElements(bit_cast(a.z), bit_cast(b.z))); + return ret; } -// cal_vector wrapper - converts scalar types to vector types and calls cal_vector_helper +/// f32x2 specialization for int4: process as two uint2 pairs using packed f32x2 arithmetic. 
+template <> +MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) { + uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y}; + uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w}; + uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y}; + uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w}; + uint2 lo_r = calVectorHelper(lo_a, lo_b); + uint2 hi_r = calVectorHelper(hi_a, hi_b); + return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y}; +} + +template <> +MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) { + uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y}; + uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w}; + uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y}; + uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w}; + uint2 lo_r = calVectorHelper(lo_a, lo_b); + uint2 hi_r = calVectorHelper(hi_a, hi_b); + return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y}; +} + +template +MSCCLPP_DEVICE_INLINE int calVectorHelper(const int& a, const int& b) { + return bit_cast(calElements(bit_cast(a), bit_cast(b))); +} + +template +MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) { + return bit_cast(calElements(bit_cast(a), bit_cast(b))); +} + +/// f32x2 specialization for uint32_t: a single float packed in 32 bits (scalar fallback). +template <> +MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) { + float fa = bit_cast(a); + float fb = bit_cast(b); + return bit_cast(fa + fb); +} + +template <> +MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) { + float fa = bit_cast(a); + float fb = bit_cast(b); + return bit_cast(fminf(fa, fb)); +} + +// calVector wrapper – converts scalar types to vector types and calls calVectorHelper template -MSCCLPP_DEVICE_INLINE DataType cal_vector(const DataType& a, const DataType& b) { +MSCCLPP_DEVICE_INLINE DataType calVector(const DataType& a, const DataType& b) { // Define the vectorized computation type based on the element type static_assert(sizeof(DataType) % sizeof(T) == 0, "DataType size must be multiple of T size"); static_assert(sizeof(DataType) >= 4, "DataType size must be at least 4 bytes"); using CompType = typename std::conditional_t< - std::is_same_v, f16x2, + std::is_same_v, f32x2, std::conditional_t< - std::is_same_v, bf16x2, - std::conditional_t, u8x4, + std::is_same_v, f16x2, + std::conditional_t< + std::is_same_v, bf16x2, + std::conditional_t< + std::is_same_v, u8x4, + std::conditional_t, f8_e4m3b15x4, #if defined(__FP8_TYPES_EXIST__) - std::conditional_t, f8_e4m3x4, - std::conditional_t, f8_e5m2x4, -#endif - T -#if defined(__FP8_TYPES_EXIST__) - >>>>>; + std::conditional_t, f8_e4m3x4, + std::conditional_t, f8_e5m2x4, T>> #else - >>>; + T #endif - return cal_vector_helper(a, b); + >>>>>; + return calVectorHelper(a, b); +} + +/// Upcast a packed DataType (containing T elements) to a packed AccDataType (containing AccumT elements). +/// Uses the optimized to<>() specializations when available (e.g. FP8 -> float hardware intrinsics). +/// When AccumT == T, this is a no-op identity. +template +MSCCLPP_DEVICE_INLINE AccDataType upcastVector(const DataType& val) { + if constexpr (std::is_same_v) { + return val; + } else { + constexpr int nElems = sizeof(DataType) / sizeof(T); + using FromVec = VectorType; + using ToVec = VectorType; + ToVec result = mscclpp::to(reinterpret_cast(val)); + return reinterpret_cast(result); + } +} + +/// Downcast a packed AccDataType (containing AccumT elements) back to DataType (containing T elements). 
+/// Uses the optimized to<>() specializations when available. +/// When AccumT == T, this is a no-op identity. +template +MSCCLPP_DEVICE_INLINE DataType downcastVector(const AccDataType& val) { + if constexpr (std::is_same_v) { + return val; + } else { + constexpr int nElems = sizeof(DataType) / sizeof(T); + using FromVec = VectorType; + using ToVec = VectorType; + FromVec result = mscclpp::to(reinterpret_cast(val)); + return reinterpret_cast(result); + } +} + +/// Accumulate `val` (packed T elements in DataType) into `acc` (packed AccumT elements in AccDataType). +/// When AccumT == T, falls back to the standard calVector. +/// Otherwise, upcasts val to AccumT, reduces element-wise, and returns the AccumT accumulator. +template +MSCCLPP_DEVICE_INLINE AccDataType calVectorAccum(const AccDataType& acc, const DataType& val) { + if constexpr (std::is_same_v) { + return calVector(acc, val); + } else { + constexpr int nElems = sizeof(DataType) / sizeof(T); + using FromVec = VectorType; + using ToVec = VectorType; + + ToVec fv = mscclpp::to(reinterpret_cast(val)); + const ToVec& fa = reinterpret_cast(acc); + ToVec fr; +#pragma unroll + for (int i = 0; i < nElems; ++i) { + fr.data[i] = calElements(fa.data[i], fv.data[i]); + } + return reinterpret_cast(fr); + } } #endif // defined(MSCCLPP_DEVICE_COMPILE) diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index 0b288b38..fb51a342 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -183,7 +183,8 @@ std::shared_ptr AllgatherFullmesh::build() { [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, [[maybe_unused]] DataType dtype, [[maybe_unused]] ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) -> CommResult { + const std::unordered_map& extras, + [[maybe_unused]] DataType accumDtype) -> CommResult { return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu index cf6027d9..9d169d68 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu @@ -212,7 +212,8 @@ std::shared_ptr AllgatherFullmesh2::build() { [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, [[maybe_unused]] mscclpp::DataType dtype, [[maybe_unused]] ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) -> mscclpp::CommResult { + const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) -> mscclpp::CommResult { return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index 83950d7c..6cbc8977 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -47,7 +47,7 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* 
resultBuff, DeviceHand const int remoteRank = index < rank ? index : index + 1; LL8Packet* dstPkt = (LL8Packet*)scratchBuff + remoteRank * nelems; uint32_t val = dstPkt[idx].read(flag, -1); - data = cal_vector(val, data); + data = calVector(val, data); } dst[idx] = data; } @@ -67,7 +67,7 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int return {(worldSize - 1) * 4, 512}; } -template +template struct AllpairAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, @@ -94,7 +94,8 @@ void AllreduceAllpairPacket::initialize(std::shared_ptr comm) { CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + DataType accumDtype) { auto algoCtx = std::static_pointer_cast(ctx); std::pair blockAndThreadNum{nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { @@ -105,7 +106,7 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -161,9 +162,9 @@ std::shared_ptr AllreduceAllpairPacket::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index 13c63ba1..ee46fd77 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -9,7 +9,7 @@ namespace mscclpp { namespace collective { -template +template __global__ void __launch_bounds__(512, 1) allreduceFullmesh(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* memoryOutChannels, size_t channelOutDataOffset, int rank, @@ -26,6 +26,10 @@ __global__ void __launch_bounds__(512, 1) int4* scratch4 = reinterpret_cast((char*)scratch); int4* resultBuff4 = reinterpret_cast(resultBuff); + // AccumVec: wider vector for mixed-precision accumulation. When AccumT==T, this is just int4 (no-op). 
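// For instance (illustrative; the wide form is assumed to be mscclpp::VectorType<AccumT, nElemsPerInt4>,
// matching the comment above -- the exact template arguments are not visible in this hunk):
//   T = __fp8_e4m3, AccumT = float  ->  nElemsPerInt4 = 16, AccumVec = VectorType<float, 16>:
//     each int4 of fp8 input is widened once, reduced across peers in fp32, then narrowed once on store;
//   T = AccumT = __half             ->  AccumVec stays int4 and upcastVector/downcastVector are
//     identities, so this path behaves exactly as the previous single-precision-type kernel.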
+ constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T); + using AccumVec = std::conditional_t, int4, mscclpp::VectorType>; + // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4` constexpr size_t unitNInt4 = 512; const size_t maxNInt4PerBlock = @@ -81,12 +85,14 @@ __global__ void __launch_bounds__(512, 1) __syncthreads(); for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) { - int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + int4 rawData = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + AccumVec acc = mscclpp::upcastVector(rawData); for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; - data = cal_vector(val, data); + acc = mscclpp::calVectorAccum(acc, val); } + int4 data = mscclpp::downcastVector(acc); resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4), @@ -121,12 +127,14 @@ __global__ void __launch_bounds__(512, 1) __syncthreads(); for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) { - int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + int4 rawData = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + AccumVec acc = mscclpp::upcastVector(rawData); for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; - data = cal_vector(val, data); + acc = mscclpp::calVectorAccum(acc, val); } + int4 data = mscclpp::downcastVector(acc); resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4), @@ -144,7 +152,7 @@ __global__ void __launch_bounds__(512, 1) } } -template +template struct AllreduceAllconnectAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels, DeviceHandle*, DeviceHandle*, size_t, @@ -155,7 +163,7 @@ struct AllreduceAllconnectAdapter { size_t nelems = inputSize / sizeof(T); if (nBlocks == 0) nBlocks = 35; if (nThreadsPerBlock == 0) nThreadsPerBlock = 512; - allreduceFullmesh<<>>( + allreduceFullmesh<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, (ChannelType*)memoryOutChannels, channelOutDataOffset, rank, nRanksPerNode, worldSize, nelems); return cudaGetLastError(); @@ -174,10 +182,10 @@ void AllreduceFullmesh::initialize(std::shared_ptr comm) { localScratchMemory_ = std::move(localMemory); } -CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, - size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, - int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { +CommResult AllreduceFullmesh::allreduceKernelFunc( + const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, DataType dtype, + ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); size_t recvBytes; CUdeviceptr recvBasePtr; @@ -198,7 +206,7 @@ CommResult 
AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ct } inputChannelHandles = this->memoryChannelsMap_[input].second; - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", static_cast(op), static_cast(dtype)); @@ -261,9 +269,10 @@ std::shared_ptr AllreduceFullmesh::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) -> CommResult { + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index b542a6a6..2d71cd63 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -146,7 +146,7 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template +template struct NvlsBlockPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, @@ -155,6 +155,9 @@ struct NvlsBlockPipelineAdapter { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { return cudaErrorNotSupported; + } else if constexpr (std::is_same_v) { + // fp8_e4m3b15 is a software-only type with no hardware NVLS support. 
+ return cudaErrorNotSupported; } else #if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS if constexpr (std::is_same_v || std::is_same_v) { @@ -187,9 +190,10 @@ void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map& extras, + DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -235,9 +239,9 @@ std::shared_ptr AllreduceNvlsBlockPipeline::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index 9824fbcd..a616485e 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -1,15 +1,17 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +#include + #include "allreduce/allreduce_nvls_packet.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" -#include "debug.h" +#include "logger.hpp" namespace mscclpp { namespace collective { -template +template __global__ void __launch_bounds__(1024, 1) allreduceNvlsPacket([[maybe_unused]] const T* input, [[maybe_unused]] T* scratch, [[maybe_unused]] T* output, [[maybe_unused]] mscclpp::DeviceHandle* multicast, @@ -31,15 +33,16 @@ __global__ void __launch_bounds__(1024, 1) mscclpp::SwitchChannelDeviceHandle::multimemStore(*(mscclpp::f32x2*)(&pkt), multiPkt + i); } for (uint32_t i = tid; i < nPktPerRank * worldSize; i += blockDim.x * gridDim.x) { - uint data = src[i]; + // When T == AccumT, stay with raw uint to avoid type mismatch in identity path. 
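// For example (illustrative; the wide form is assumed to be VectorType<AccumT, sizeof(uint)/sizeof(T)>,
// mirroring the comment above -- the exact template arguments are not visible in this hunk):
//   T = AccumT = __half            ->  AccRaw = uint; calVectorAccum falls back to calVector on the
//                                      packed 32-bit payload, so the identity path does no conversions;
//   T = __fp8_e4m3, AccumT = float ->  AccRaw holds four fp32 lanes: the four fp8 values packed in one
//                                      32-bit payload word are widened, reduced across peers, and
//                                      narrowed once by downcastVector on the final store.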
+ using AccRaw = + std::conditional_t, uint, mscclpp::VectorType>; + AccRaw acc = mscclpp::upcastVector(src[i]); for (int peer = 0; peer < worldSize; peer++) { - if (peer == rank) { - continue; - } + if (peer == rank) continue; uint val = scratchPkt[peer * worldSize * nPktPerRank + i].read(flag); - data = cal_vector(data, val); + acc = mscclpp::calVectorAccum(acc, val); } - dst[i] = data; + dst[i] = mscclpp::downcastVector(acc); } __syncthreads(); if (threadIdx.x == 0) { @@ -62,13 +65,13 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize) { return {blockNum, threadNum}; } -template +template struct AllreduceNvlsPacketAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void*, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int, int worldSize, size_t inputSize, cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t, int nBlocks, int nThreadsPerBlock) { - allreduceNvlsPacket<<>>( + allreduceNvlsPacket<<>>( (const T*)input, (T*)scratch, (T*)output, nvlsChannels, inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags, flagBufferSize); return cudaGetLastError(); @@ -78,6 +81,8 @@ struct AllreduceNvlsPacketAdapter { void AllreduceNvlsPacket::initialize(std::shared_ptr comm) { int nSwitchChannels = 1; this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels); + this->switchChannels_ = + setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels); } AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { @@ -92,9 +97,7 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); // setup channels - int nSwitchChannels = 1; - ctx->switchChannels = - setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels); + ctx->switchChannels = this->switchChannels_; ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels); return ctx; } @@ -102,19 +105,20 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + mscclpp::DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize); } if (blockAndThreadNum.first > maxBlockNum_) { - WARN("Block number %d exceeds the maximum limit %d", blockAndThreadNum.first, maxBlockNum_); + WARN(ALGO, "Block number ", blockAndThreadNum.first, " exceeds the maximum limit ", maxBlockNum_); return CommResult::CommInvalidArgument; } - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { - WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); + WARN(ALGO, "Unsupported operation or data type for allreduce, dtype=", static_cast(dtype)); return CommResult::CommInvalidArgument; } cudaError_t error = @@ -122,7 +126,7 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const 
std::shared_ptr 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { - WARN("AllreduceNvlsPacket failed with error: %s", cudaGetErrorString(error)); + WARN(ALGO, "AllreduceNvlsPacket failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } return CommResult::CommSuccess; @@ -136,9 +140,10 @@ std::shared_ptr AllreduceNvlsPacket::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + mscclpp::DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index bc03ab26..3bb054da 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -109,7 +109,7 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template +template struct NvlsWarpPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, @@ -118,6 +118,9 @@ struct NvlsWarpPipelineAdapter { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { return cudaErrorNotSupported; + } else if constexpr (std::is_same_v) { + // fp8_e4m3b15 is a software-only type with no hardware NVLS support. 
+ return cudaErrorNotSupported; } else #if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS if constexpr (std::is_same_v || std::is_same_v) { @@ -147,12 +150,12 @@ void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr comm) { this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); } -CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, - void* output, size_t inputSize, DataType dtype, ReduceOp op, - cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { +CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc( + const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, DataType dtype, + ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -198,9 +201,9 @@ std::shared_ptr AllreduceNvlsWarpPipeline::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index f251bcda..e7f2028f 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -67,7 +67,7 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template +template struct NvlsAdapter { static cudaError_t call(const void*, void*, void*, void* memoryChannels, void*, mscclpp::DeviceHandle* nvlsChannels, @@ -77,6 +77,9 @@ struct NvlsAdapter { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { return cudaErrorNotSupported; + } else if constexpr (std::is_same_v) { + // fp8_e4m3b15 is a software-only type with no hardware NVLS support. 
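Both NVLS adapters reject `uint8_t` and the software-only `fp8_e4m3b15` type at compile time with `if constexpr`, so the common dispatcher can still instantiate the adapter for every element/accumulator combination while unsupported types fail fast before any launch. A hedged sketch of that guard follows; the `fp8_e4m3b15` struct here is only a placeholder, and the real adapters take many more launch parameters.

```
// Sketch of the if-constexpr type guard used by the NVLS adapters above.
#include <cuda_runtime.h>
#include <cstdint>
#include <type_traits>

struct fp8_e4m3b15 {
  uint8_t bits;  // placeholder for mscclpp's software-only FP8 type
};

template <typename T>
cudaError_t callNvlsAdapter(const T* input, T* output, size_t count, cudaStream_t stream) {
  if constexpr (std::is_same_v<T, uint8_t> || std::is_same_v<T, fp8_e4m3b15>) {
    // NVLS switch reduction has no hardware path for byte types or this FP8
    // variant, so report "not supported" instead of launching a kernel.
    return cudaErrorNotSupported;
  } else {
    // ... launch the NVLS reduction kernel for supported element types ...
    (void)input;
    (void)output;
    (void)count;
    (void)stream;
    return cudaSuccess;
  }
}
```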
+ return cudaErrorNotSupported; } else #if (!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000) if constexpr (std::is_same_v || std::is_same_v) { @@ -114,13 +117,14 @@ void AllreduceNvls::initialize(std::shared_ptr comm) { CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + [[maybe_unused]] const std::unordered_map& extras, + mscclpp::DataType accumDtype) { if (!symmetricMemory_) { WARN("AllreduceNvls requires symmetric memory for now."); return CommResult::CommInvalidArgument; } auto ctx = std::static_pointer_cast(ctx_void); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -203,9 +207,10 @@ std::shared_ptr AllreduceNvls::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + mscclpp::DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index ceb545ee..e2d8ef73 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -2,16 +2,17 @@ // Licensed under the MIT License. #include +#include #include "allreduce/allreduce_packet.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" -#include "debug.h" +#include "logger.hpp" namespace mscclpp { namespace collective { -template +template __global__ void __launch_bounds__(1024, 1) allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, @@ -92,12 +93,21 @@ __global__ void __launch_bounds__(1024, 1) // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint2 data = src[idx]; - for (int index = 0; index < nPeers; index++) { - const int remoteRank = index < rank ? index : index + 1; - mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; - uint2 val = dstPkt[idx].read(flag); - data.x = cal_vector(val.x, data.x); - data.y = cal_vector(val.y, data.y); + { + // When T == AccumT, stay with raw uint32_t to avoid type mismatch in identity path. + using AccRaw = std::conditional_t, uint32_t, + mscclpp::VectorType>; + AccRaw accX = mscclpp::upcastVector(data.x); + AccRaw accY = mscclpp::upcastVector(data.y); + for (int index = 0; index < nPeers; index++) { + const int remoteRank = index < rank ? 
index : index + 1; + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; + uint2 val = dstPkt[idx].read(flag); + accX = mscclpp::calVectorAccum(accX, val.x); + accY = mscclpp::calVectorAccum(accY, val.y); + } + data.x = mscclpp::downcastVector(accX); + data.y = mscclpp::downcastVector(accY); } dst[idx].x = data.x; @@ -142,7 +152,7 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template +template struct PacketAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, @@ -155,12 +165,12 @@ struct PacketAdapter { nBlocks = nBlocks / (worldSize - 1) * (worldSize - 1); #if defined(ENABLE_NPKIT) size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS; - allreducePacket<<>>( + allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else - allreducePacket<<>>( + allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff); #endif @@ -186,18 +196,22 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int } } -#if defined(__FP8_TYPES_EXIST__) // FP8-specific tuning for 32KB-256KB range - if (dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2) { - if (inputSize < (64 << 10)) { - nThreadsPerBlock = 64; - } else if (inputSize >= (64 << 10) && inputSize <= (128 << 10)) { - nThreadsPerBlock = 128; - } else if (inputSize >= (128 << 10) && inputSize <= (256 << 10)) { - nThreadsPerBlock = 256; + { + bool isFp8 = dtype == DataType::FLOAT8_E4M3B15; +#if defined(__FP8_TYPES_EXIST__) + isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2; +#endif + if (isFp8) { + if (inputSize < (64 << 10)) { + nThreadsPerBlock = 64; + } else if (inputSize >= (64 << 10) && inputSize <= (128 << 10)) { + nThreadsPerBlock = 128; + } else if (inputSize >= (128 << 10) && inputSize <= (256 << 10)) { + nThreadsPerBlock = 256; + } } } -#endif #endif return {nBlocks, nThreadsPerBlock}; } @@ -213,7 +227,8 @@ void AllreducePacket::initialize(std::shared_ptr comm) { CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, [[maybe_unused]] DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { @@ -225,9 +240,10 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); size_t channelInOffset = (char*)input - (char*)sendBasePtr; - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { - WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast(dtype)); + WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), + ", dtype=", static_cast(dtype)); return 
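The tuning hunk above replaces the preprocessor-only FP8 branch with a runtime `isFp8` flag, so `FLOAT8_E4M3B15` gets the small-message thread-count tuning even when the toolchain lacks compiler FP8 types. The selection logic is summarized below; the `DataType` enum is a stand-in that only lists the members this sketch needs, and the thresholds mirror the diff.

```
// Sketch of the FP8 threads-per-block heuristic from getDefaultBlockNumAndThreadNum.
#include <cstddef>

enum class DataType { FLOAT8_E4M3, FLOAT8_E5M2, FLOAT8_E4M3B15, FLOAT16, FLOAT32 };

inline int fp8ThreadsPerBlock(DataType dtype, size_t inputSize, int fallback) {
  bool isFp8 = dtype == DataType::FLOAT8_E4M3B15;
#if defined(__FP8_TYPES_EXIST__)
  // Compiler-provided FP8 types are only considered when the toolchain has them.
  isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2;
#endif
  if (!isFp8) return fallback;
  if (inputSize < (64 << 10)) return 64;    // below 64 KB
  if (inputSize <= (128 << 10)) return 128; // 64 KB - 128 KB
  if (inputSize <= (256 << 10)) return 256; // 128 KB - 256 KB
  return fallback;                          // larger sizes keep the default
}
```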
CommResult::CommInvalidArgument; } cudaError_t error = @@ -236,7 +252,7 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { - WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error)); + WARN(ALGO, "AllreducePacket failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } return CommResult::CommSuccess; @@ -280,9 +296,9 @@ std::shared_ptr AllreducePacket::build() { "default_allreduce_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu index d5be2257..db471b93 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -87,7 +87,7 @@ __global__ void __launch_bounds__(1024, 1) int rankIdx = (rank + i + 1) % nRanksPerNode; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; int4 data = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); - tmp = cal_vector(data, tmp); + tmp = calVector(data, tmp); } for (uint32_t i = 0; i < nPeers; i++) { int rankIdx = (rank + i + 1) % nRanksPerNode; @@ -123,7 +123,7 @@ __global__ void __launch_bounds__(1024, 1) } } -template +template struct AllreduceRsAgAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, @@ -166,9 +166,9 @@ void AllreduceRsAg::initialize(std::shared_ptr comm) { CommResult AllreduceRsAg::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, DataType accumDtype) { auto algoCtx = std::static_pointer_cast(ctx); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), ", dtype=", static_cast(dtype)); @@ -213,9 +213,10 @@ std::shared_ptr AllreduceRsAg::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) -> CommResult { + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, 
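Every `allreduceKernelFunc` now calls `dispatch(op, dtype, accumDtype)` rather than `dispatch(op, dtype)`, i.e. the accumulation type becomes a second template dimension of the adapter that the dispatcher resolves at runtime. The toy version below shows the idea only; the real mscclpp dispatch covers far more operations and types and returns the project's `AllreduceFunc`, whose full signature is not reproduced here.

```
// Conceptual sketch: map (op, dtype, accumDtype) to an adapter specialization.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstddef>
#include <functional>

enum class DataType { FLOAT16, FLOAT32 };
enum class ReduceOp { SUM };

using LaunchFn = std::function<cudaError_t(const void*, void*, size_t, cudaStream_t)>;

template <typename T, typename AccumT>
struct Adapter {
  static cudaError_t call(const void* in, void* out, size_t count, cudaStream_t stream) {
    // ... launch allreduce<T, AccumT> here ...
    (void)in; (void)out; (void)count; (void)stream;
    return cudaSuccess;
  }
};

inline LaunchFn dispatch(ReduceOp op, DataType dtype, DataType accumDtype) {
  if (op != ReduceOp::SUM) return nullptr;  // unsupported op: caller WARNs and bails out
  if (dtype == DataType::FLOAT16 && accumDtype == DataType::FLOAT32)
    return &Adapter<__half, float>::call;   // widened accumulation
  if (dtype == DataType::FLOAT16 && accumDtype == DataType::FLOAT16)
    return &Adapter<__half, __half>::call;  // identity path: accumulate in-type
  if (dtype == DataType::FLOAT32 && accumDtype == DataType::FLOAT32)
    return &Adapter<float, float>::call;
  return nullptr;                           // unsupported combination
}
```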
void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu index a230d8cd..eabe3dc5 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu @@ -168,7 +168,7 @@ __global__ void __launch_bounds__(1024, 1) uint32_t peerSlotOffset = baseOffset + remoteRankId * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; int4 data = scratch4[peerSlotOffset]; - tmp = cal_vector(data, tmp); + tmp = calVector(data, tmp); } storeVec(resultBuff, myChunkOffset, tmp, nelems); // Broadcast reduced result to all peers' scratch at SCATTER_AG_OFFSET + rank * nInt4PerIter @@ -220,7 +220,7 @@ __global__ void __launch_bounds__(1024, 1) } } -template +template struct AllreduceRsAgPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, @@ -274,12 +274,12 @@ void AllreduceRsAgPipeline::initialize(std::shared_ptr comm) { cudaMemcpyHostToDevice); } -CommResult AllreduceRsAgPipeline::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, - size_t inputSize, DataType dtype, ReduceOp op, - cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { +CommResult AllreduceRsAgPipeline::allreduceKernelFunc( + const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, + cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { auto algoCtx = std::static_pointer_cast(ctx); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), ", dtype=", static_cast(dtype)); @@ -320,9 +320,10 @@ std::shared_ptr AllreduceRsAgPipeline::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) -> CommResult { + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index caac07ae..f95ba7e3 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +#include + #include "allreduce/allreduce_rsag_zero_copy.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" @@ -36,7 +38,7 @@ __device__ mscclpp::DeviceSyncer globalSyncer; // the extra copy steps of the standard RSAG. The NRanksPerNode template // parameter enables compile-time unrolling of peer loops (supports 4 or 8). 
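As the comment above notes, `NRanksPerNode` is a compile-time template parameter so the per-peer loops can be fully unrolled; the adapter in the next hunk then picks the 4-rank or 8-rank instantiation from the runtime rank count. A simplified sketch of that selection is below (the kernel body and the real channel/scratch arguments are omitted).

```
// Sketch: dispatch a runtime rank count onto a compile-time template parameter.
#include <cuda_runtime.h>

template <int NRanksPerNode, typename T>
__global__ void rsAgZeroCopyKernel(T* buff, T* result, size_t nelems) {
  constexpr int NPeers = NRanksPerNode - 1;  // known at compile time, so loops unroll
  // ... reduce-scatter + all-gather over NPeers peers ...
  (void)NPeers; (void)buff; (void)result; (void)nelems;
}

template <typename T>
cudaError_t launchRsAgZeroCopy(T* buff, T* result, size_t nelems, int nRanksPerNode,
                               int nBlocks, int nThreads, cudaStream_t stream) {
  if (nRanksPerNode == 4) {
    rsAgZeroCopyKernel<4, T><<<nBlocks, nThreads, 0, stream>>>(buff, result, nelems);
  } else if (nRanksPerNode == 8) {
    rsAgZeroCopyKernel<8, T><<<nBlocks, nThreads, 0, stream>>>(buff, result, nelems);
  } else {
    return cudaErrorInvalidValue;  // only the instantiated rank counts are supported
  }
  return cudaGetLastError();
}
```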
-template +template __global__ void __launch_bounds__(1024, 1) allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* switchChannels, void* remoteMemories, int rank, int worldSize, @@ -73,19 +75,26 @@ __global__ void __launch_bounds__(1024, 1) } __syncthreads(); int4 data[NPeers]; + // AccumInt4: when AccumT != T, use a wider accumulator type. + // For AccumT == T, this is just int4 (no-op conversion). + constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T); + // When T == AccumT, stay with raw int4 to avoid type mismatch in identity path. + using AccumVec = std::conditional_t, int4, mscclpp::VectorType>; for (uint32_t idx = threadIdx.x; idx < nInt4PerBlock; idx += blockDim.x) { uint32_t offset = idx + offset4 + rank * nInt4PerRank; if (offset >= nInt4Total) continue; - int4 tmp = buff4[offset]; + int4 tmp_raw = buff4[offset]; #pragma unroll for (int i = 0; i < NPeers; i++) { int rankIdx = (rank + i + 1) % NRanksPerNode; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; data[i] = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); } + AccumVec acc = mscclpp::upcastVector(tmp_raw); for (int i = 0; i < NPeers; i++) { - tmp = cal_vector(data[i], tmp); + acc = mscclpp::calVectorAccum(acc, data[i]); } + int4 tmp = mscclpp::downcastVector(acc); #pragma unroll for (int i = 0; i < NPeers; i++) { int rankIdx = (rank + i + 1) % NRanksPerNode; @@ -102,7 +111,7 @@ __global__ void __launch_bounds__(1024, 1) } } -template +template struct AllreduceRsAgZeroCopyAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, @@ -118,11 +127,11 @@ struct AllreduceRsAgZeroCopyAdapter { } } if (nRanksPerNode == 4) { - allreduceRsAgZeroCopy<4, OpType, T> + allreduceRsAgZeroCopy<4, OpType, T, AccumT> <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, worldSize, nelems); } else if (nRanksPerNode == 8) { - allreduceRsAgZeroCopy<8, OpType, T> + allreduceRsAgZeroCopy<8, OpType, T, AccumT> <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, worldSize, nelems); } else { @@ -145,9 +154,10 @@ void AllreduceRsAgZeroCopy::initialize(std::shared_ptr comm) { CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + DataType accumDtype) { auto algoCtx = std::static_pointer_cast(ctx); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), ", dtype=", static_cast(dtype)); @@ -220,9 +230,10 @@ std::shared_ptr AllreduceRsAgZeroCopy::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) -> CommResult { + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + 
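These hunks also migrate warnings from the printf-style `WARN("...%d", x)` of `debug.h` to the stream-style `WARN(ALGO, "part ", x)` calls of `logger.hpp`, which take a category plus a list of parts to concatenate. The actual `logger.hpp` is not part of this patch and may differ; the sketch below is only one plausible shape for such a variadic interface.

```
// Hypothetical sketch of a stream-style warning helper comparable to the
// WARN(ALGO, ...) call sites above; the real mscclpp logger.hpp is not shown here.
#include <iostream>
#include <sstream>
#include <utility>

enum class LogCategory { ALGO, NET, INIT };

template <typename... Args>
void warnLog(LogCategory category, Args&&... args) {
  std::ostringstream oss;
  (oss << ... << std::forward<Args>(args));  // C++17 fold: concatenate all parts
  std::cerr << "[WARN][" << static_cast<int>(category) << "] " << oss.str() << "\n";
}

// Usage mirrors the diff's call sites:
//   warnLog(LogCategory::ALGO, "Block number ", nBlocks, " exceeds the maximum limit ", maxBlocks);
```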
extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp index bd402cfa..362308b2 100644 --- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp @@ -20,7 +20,7 @@ class AllreduceAllpairPacket : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp index fa811b15..a54352b3 100644 --- a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp @@ -16,7 +16,7 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp index 8b9b04ae..81b74add 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp @@ -19,7 +19,7 @@ class AllreduceNvlsBlockPipeline : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp index 65a48923..fb0c63b8 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp @@ -21,7 +21,8 @@ class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras); + int nThreadsPerBlock, const std::unordered_map& extras, + mscclpp::DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, mscclpp::DataType); @@ -34,6 +35,7 @@ class 
AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder { uintptr_t flagBuffer_; size_t flagBufferSize_; std::vector> nvlsConnections_; + std::vector switchChannels_; }; } // namespace collective } // namespace mscclpp diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp index e392b54e..8f02a873 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp @@ -19,7 +19,7 @@ class AllreduceNvlsWarpPipeline : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp index d0593500..d53ea180 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp @@ -19,7 +19,7 @@ class AllreduceNvls : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_packet.hpp index f0438dea..de7ca471 100644 --- a/src/ext/collectives/include/allreduce/allreduce_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_packet.hpp @@ -20,7 +20,7 @@ class AllreducePacket : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag.hpp index 6e033f67..1fd663da 100644 --- a/src/ext/collectives/include/allreduce/allreduce_rsag.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_rsag.hpp @@ -19,7 +19,7 @@ class AllreduceRsAg : public mscclpp::AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git 
a/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp index 2a740ac0..7629f2fe 100644 --- a/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp @@ -19,7 +19,7 @@ class AllreduceRsAgPipeline : public mscclpp::AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp index 6153a0e4..05bf2ef3 100644 --- a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp @@ -18,7 +18,7 @@ class AllreduceRsAgZeroCopy : public mscclpp::AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp index 9bfac69a..1e0e7e69 100644 --- a/src/ext/collectives/include/allreduce/common.hpp +++ b/src/ext/collectives/include/allreduce/common.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -#ifndef MSCCLPP_ALLREDUCE_COMMOM_HPP_ -#define MSCCLPP_ALLREDUCE_COMMOM_HPP_ +#ifndef MSCCLPP_ALLREDUCE_COMMON_HPP_ +#define MSCCLPP_ALLREDUCE_COMMON_HPP_ #include #include @@ -77,55 +77,51 @@ using AllreduceFunc = mscclpp::DeviceHandle*, size_t, size_t, size_t, int, int, int, size_t, cudaStream_t, void*, uint32_t, uint32_t, int, int)>; -template