From 0a26f9d5dec641cadfbc5c996d3e81243b790001 Mon Sep 17 00:00:00 2001
From: Caio Rocha <caiorocha@microsoft.com>
Date: Sat, 11 Apr 2026 21:07:09 +0000
Subject: [PATCH] wip

---
 .azure-pipelines/templates/deploy.yml         |  24 +-
 .azure-pipelines/templates/ut-npkit.yml       |  10 +-
 .azure-pipelines/templates/ut.yml             |   1 +
 .github/copilot-instructions.md               |   2 +-
 CMakeLists.txt                                |  21 +
 VERSION                                       |   2 +-
 cmake/FindGDRCopy.cmake                       |  50 ++
 cmake/FindMLX5.cmake                          |  38 +
 docker/base-dev-x.dockerfile                  |  19 +-
 docs/Makefile                                 |   2 +-
 docs/build_multiversion.py                    |  49 +
 docs/guide/mscclpp-torch-integration.md       |   3 +-
 docs/quickstart.md                            |   3 +
 .../customized_allgather.cu                   |   3 +-
 .../torch-integration/customized_allgather.cu |   3 +-
 .../customized_comm_with_tuning.py            | 666 +++++++++-----
 .../torch-integration/dsl_with_nccl_api.py    |  15 +-
 include/mscclpp/algorithm.hpp                 |  15 +-
 include/mscclpp/atomic_device.hpp             |   5 +-
 include/mscclpp/env.hpp                       |   5 +
 include/mscclpp/gpu_data_types.hpp            | 837 +++++++++++++++++-
 include/mscclpp/semaphore.hpp                 |  10 +
 python/csrc/CMakeLists.txt                    |   3 +
 python/csrc/algorithm.cpp                     |   8 +-
 python/csrc/core_py.cpp                       |   3 +-
 .../ext/algorithm_collection_builder_py.cpp   |   1 +
 python/csrc/gpu_utils_py.cpp                  |  13 +
 python/mscclpp/_core/algorithm.py             |   8 +-
 python/mscclpp/language/channel.py            |   2 +-
 .../mscclpp/language/internal/operations.py   |   2 +-
 .../tests/executor_tests/reduce_pack.py       |  67 --
 .../tests/executor_tests/reduce_pack_tbg.py   |  70 --
 .../tests/executor_tests/transfer_pack.py     |  60 --
 .../tests/executor_tests/transfer_pack_tbg.py |  63 --
 python/requirements_rocm6.txt                 |   4 +-
 python/test/test_fp8_accum.py                 | 397 +++++++++
 src/core/CMakeLists.txt                       |  10 +
 src/core/algorithm.cc                         |  17 +-
 src/core/connection.cc                        | 192 ++--
 src/core/context.cc                           |   2 -
 src/core/endpoint.cc                          |   4 +-
 src/core/env.cpp                              |   4 +-
 src/core/executor/execution_kernel.cu         |   6 +
 src/core/gdr.cc                               | 204 +++++
 src/core/ib.cc                                | 120 ++-
 src/core/include/connection.hpp               |  57 +-
 src/core/include/context.hpp                  |   2 -
 src/core/include/execution_kernel.hpp         |  27 +-
 src/core/include/gdr.hpp                      |  62 ++
 src/core/include/ib.hpp                       |  18 +-
 src/core/include/mlx5dv_wrapper.hpp           |  38 +
 src/core/include/reduce_kernel.hpp            | 174 +++-
 src/core/mlx5dv_wrapper.cc                    | 126 +++
 src/core/semaphore.cc                         |  58 +-
 .../allgather/allgather_fullmesh.cu           |   3 +-
 .../allgather/allgather_fullmesh_2.cu         |   3 +-
 .../allreduce/allreduce_allpair_packet.cu     |  33 +-
 .../allreduce/allreduce_fullmesh.cu           |  44 +-
 .../allreduce_nvls_block_pipeline.cu          |  14 +-
 .../allreduce/allreduce_nvls_packet.cu        |  45 +-
 .../allreduce/allreduce_nvls_warp_pipeline.cu |  19 +-
 .../allreduce/allreduce_nvls_zero_copy.cu     |  15 +-
 .../collectives/allreduce/allreduce_packet.cu |  68 +-
 .../collectives/allreduce/allreduce_rsag.cu   |  13 +-
 .../allreduce/allreduce_rsag_pipeline.cu      |  19 +-
 .../allreduce/allreduce_rsag_zero_copy.cu     |  31 +-
 .../allreduce/allreduce_allpair_packet.hpp    |   2 +-
 .../include/allreduce/allreduce_fullmesh.hpp  |   2 +-
 .../allreduce_nvls_block_pipeline.hpp         |   2 +-
 .../allreduce/allreduce_nvls_packet.hpp       |   4 +-
 .../allreduce_nvls_warp_pipeline.hpp          |   2 +-
 .../allreduce/allreduce_nvls_zero_copy.hpp    |   2 +-
 .../include/allreduce/allreduce_packet.hpp    |   2 +-
 .../include/allreduce/allreduce_rsag.hpp      |   2 +-
 .../allreduce/allreduce_rsag_pipeline.hpp     |   2 +-
 .../allreduce/allreduce_rsag_zero_copy.hpp    |   2 +-
 .../collectives/include/allreduce/common.hpp  |  92 +-
 src/ext/nccl/algorithm_selector.cc            |   3 +-
 src/ext/nccl/datatype_conversion.hpp          |   5 +
 src/ext/nccl/nccl.cc                          |  39 +-
 test/deploy/setup.sh                          |   6 +
 test/framework.cc                             |  46 +
 test/framework.hpp                            |   7 +
 test/mp_unit/ib_tests.cu                      | 127 ++-
 test/mp_unit/memory_channel_tests.cu          |   6 +-
 test/mp_unit/mp_unit_tests.hpp                |   1 +
 test/mp_unit/port_channel_tests.cu            | 140 ++-
 test/mp_unit/semaphore_perf_tests.cu          |   2 +-
 test/unit/CMakeLists.txt                      |   1 +
 test/unit/gdr_tests.cu                        | 251 ++++++
 tools/npkit/npkit_trace_generator.py          |  16 +-
 91 files changed, 3719 insertions(+), 957 deletions(-)
 create mode 100644 cmake/FindGDRCopy.cmake
 create mode 100644 cmake/FindMLX5.cmake
 create mode 100644 docs/build_multiversion.py
 delete mode 100644 python/mscclpp/language/tests/executor_tests/reduce_pack.py
 delete mode 100644 python/mscclpp/language/tests/executor_tests/reduce_pack_tbg.py
 delete mode 100644 python/mscclpp/language/tests/executor_tests/transfer_pack.py
 delete mode 100644 python/mscclpp/language/tests/executor_tests/transfer_pack_tbg.py
 create mode 100644 python/test/test_fp8_accum.py
 create mode 100644 src/core/gdr.cc
 create mode 100644 src/core/include/gdr.hpp
 create mode 100644 src/core/include/mlx5dv_wrapper.hpp
 create mode 100644 src/core/mlx5dv_wrapper.cc
 create mode 100644 test/unit/gdr_tests.cu

diff --git a/.azure-pipelines/templates/deploy.yml b/.azure-pipelines/templates/deploy.yml
index fc116acf..2f642f1d 100644
--- a/.azure-pipelines/templates/deploy.yml
+++ b/.azure-pipelines/templates/deploy.yml
@@ -94,7 +94,27 @@ steps:
       du -sh build/bin/* 2>/dev/null || true
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
-# 2. Download SSH key + install packages + start VMSS
+# 2. Write CMake args for pip install on remote VMs
+- task: Bash@3
+  name: WritePipCmakeArgs
+  displayName: Write pip CMake args
+  inputs:
+    targetType: 'inline'
+    script: |
+      set -e
+      PIP_CMAKE_ARGS=""
+      if [ -n "${{ parameters.gpuArch }}" ]; then
+        PIP_CMAKE_ARGS="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}"
+      fi
+      CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}'
+      if [ -n "${CMAKE_EXTRA_ARGS}" ]; then
+        PIP_CMAKE_ARGS="${PIP_CMAKE_ARGS} ${CMAKE_EXTRA_ARGS}"
+      fi
+      echo "${PIP_CMAKE_ARGS}" > pip_cmake_args.txt
+      echo "pip CMake args: $(cat pip_cmake_args.txt)"
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+# 3. Download SSH key + install packages + start VMSS
 - task: DownloadSecureFile@1
   name: SshKeyFile
   displayName: Download key file
@@ -120,7 +140,7 @@ steps:
     inlineScript: |
       az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}
 
-# 3. Deploy test environment
+# 4. Deploy test environment
 - task: Bash@3
   name: DeployTestEnv
   displayName: Deploy Test Env
diff --git a/.azure-pipelines/templates/ut-npkit.yml b/.azure-pipelines/templates/ut-npkit.yml
index e53b5cf5..1bd89caf 100644
--- a/.azure-pipelines/templates/ut-npkit.yml
+++ b/.azure-pipelines/templates/ut-npkit.yml
@@ -28,7 +28,7 @@ steps:
       grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
       grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
       grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
-      grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json
 
 - template: run-remote-task.yml
   parameters:
@@ -42,14 +42,14 @@ steps:
       grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
       grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
       grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
-      grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json
       rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
       mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'
       python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
       grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
-      grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
-      grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
-      grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
+      grep -q NPKIT_EVENT_EXECUTOR_UNPACK_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
 
 - template: stop.yml
   parameters:
diff --git a/.azure-pipelines/templates/ut.yml b/.azure-pipelines/templates/ut.yml
index 9d17e923..743c66e6 100644
--- a/.azure-pipelines/templates/ut.yml
+++ b/.azure-pipelines/templates/ut.yml
@@ -41,6 +41,7 @@ steps:
     displayName: Run pytests
     remoteScript: |
       mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
+      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_fp8_accum.py -x
 
 - template: stop.yml
   parameters:
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index 4f13c557..9d7e7798 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -43,7 +43,7 @@ For testing after successful build:
 # To run tests with two GPUs - two is enough for most tests
 mpirun -np 2 ./build/bin/mp_unit_tests
 # To run tests excluding IB-related ones (when IB is not available)
-mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib*
+mpirun -np 2 ./build/bin/mp_unit_tests --filter=-*Ib*
 ```
 
 For building a Python package:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9db54d15..ef8b785a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -224,9 +224,30 @@ if(MSCCLPP_USE_IB)
     if(NOT IBVERBS_FOUND)
         message(FATAL_ERROR "IBVerbs not found. Install libibverbs-dev or rdma-core-devel. If you want to disable InfiniBand, add `-DMSCCLPP_USE_IB=OFF` in your cmake command.")
     endif()
+    find_package(MLX5)
+    if(MLX5_FOUND)
+        message(STATUS "MLX5 Direct Verbs found: ${MLX5_LIBRARIES}")
+    else()
+        message(STATUS "MLX5 Direct Verbs not found, mlx5dv optimizations disabled")
+    endif()
 endif()
 find_package(NUMA REQUIRED)
 find_package(Threads REQUIRED)
+
+option(MSCCLPP_USE_GDRCOPY "Use GDRCopy for direct GPU memory access from host." ON)
+if(MSCCLPP_USE_ROCM)
+    set(MSCCLPP_USE_GDRCOPY OFF)
+endif()
+if(MSCCLPP_USE_GDRCOPY)
+    find_package(GDRCopy)
+    if(NOT GDRCOPY_FOUND)
+        message(STATUS "GDRCopy not found, disabling GDRCopy support")
+        set(MSCCLPP_USE_GDRCOPY OFF)
+    else()
+        message(STATUS "GDRCopy found: ${GDRCOPY_LIBRARIES}")
+    endif()
+endif()
+
 include(FetchContent)
 FetchContent_Declare(json
     GIT_REPOSITORY https://github.com/nlohmann/json.git
diff --git a/VERSION b/VERSION
index a3df0a69..ac39a106 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.8.0
+0.9.0
diff --git a/cmake/FindGDRCopy.cmake b/cmake/FindGDRCopy.cmake
new file mode 100644
index 00000000..54e0ba1c
--- /dev/null
+++ b/cmake/FindGDRCopy.cmake
@@ -0,0 +1,50 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+# Find the GDRCopy libraries (>= 2.5 required for gdr_pin_buffer_v2 / GDR_PIN_FLAG_FORCE_PCIE)
+#
+# The following variables are optionally searched for defaults
+#  GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found
+#  GDRCOPY_INCLUDE_DIR: Directory where GDRCopy headers are found
+#  GDRCOPY_LIB_DIR: Directory where GDRCopy libraries are found
+
+# The following are set after configuration is done:
+#  GDRCOPY_FOUND
+#  GDRCOPY_INCLUDE_DIRS
+#  GDRCOPY_LIBRARIES
+
+find_path(GDRCOPY_INCLUDE_DIRS
+  NAMES gdrapi.h
+  HINTS
+  ${GDRCOPY_INCLUDE_DIR}
+  ${GDRCOPY_ROOT_DIR}
+  ${GDRCOPY_ROOT_DIR}/include
+  /usr/local/include
+  /usr/include)
+
+find_library(GDRCOPY_LIBRARIES
+  NAMES gdrapi
+  HINTS
+  ${GDRCOPY_LIB_DIR}
+  ${GDRCOPY_ROOT_DIR}
+  ${GDRCOPY_ROOT_DIR}/lib
+  /usr/local/lib
+  /usr/lib
+  /usr/lib/x86_64-linux-gnu)
+
+if(GDRCOPY_INCLUDE_DIRS)
+    include(CheckSymbolExists)
+    set(CMAKE_REQUIRED_INCLUDES ${GDRCOPY_INCLUDE_DIRS})
+    set(CMAKE_REQUIRED_LIBRARIES ${GDRCOPY_LIBRARIES})
+    check_symbol_exists(gdr_pin_buffer_v2 "gdrapi.h" GDRCOPY_HAS_PIN_BUFFER_V2)
+    unset(CMAKE_REQUIRED_LIBRARIES)
+    unset(CMAKE_REQUIRED_INCLUDES)
+    if(NOT GDRCOPY_HAS_PIN_BUFFER_V2)
+        message(STATUS "GDRCopy found but too old (gdr_pin_buffer_v2 not available). Requires >= 2.5.")
+        set(GDRCOPY_INCLUDE_DIRS GDRCOPY_INCLUDE_DIRS-NOTFOUND)
+    endif()
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
+mark_as_advanced(GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
diff --git a/cmake/FindMLX5.cmake b/cmake/FindMLX5.cmake
new file mode 100644
index 00000000..9fd59127
--- /dev/null
+++ b/cmake/FindMLX5.cmake
@@ -0,0 +1,38 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+# Find the MLX5 Direct Verbs (mlx5dv) library
+#
+# The following variables are optionally searched for defaults
+#  MLX5_ROOT_DIR: Base directory where all MLX5 components are found
+#  MLX5_INCLUDE_DIR: Directory where MLX5 headers are found
+#  MLX5_LIB_DIR: Directory where MLX5 libraries are found
+
+# The following are set after configuration is done:
+#  MLX5_FOUND
+#  MLX5_INCLUDE_DIRS
+#  MLX5_LIBRARIES
+
+find_path(MLX5_INCLUDE_DIRS
+  NAMES infiniband/mlx5dv.h
+  HINTS
+  ${MLX5_INCLUDE_DIR}
+  ${MLX5_ROOT_DIR}
+  ${MLX5_ROOT_DIR}/include
+  /usr/local/include
+  /usr/include)
+
+find_library(MLX5_LIBRARIES
+  NAMES mlx5
+  HINTS
+  ${MLX5_LIB_DIR}
+  ${MLX5_ROOT_DIR}
+  ${MLX5_ROOT_DIR}/lib
+  /usr/local/lib
+  /usr/lib
+  /usr/lib/x86_64-linux-gnu)
+
+include(FindPackageHandleStandardArgs)
+
+find_package_handle_standard_args(MLX5 DEFAULT_MSG MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
+mark_as_advanced(MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile
index 7c6c927e..47436202 100644
--- a/docker/base-dev-x.dockerfile
+++ b/docker/base-dev-x.dockerfile
@@ -49,8 +49,25 @@ RUN OS_ARCH=$(uname -m) && \
     rm -rf ${CMAKE_HOME}.tar.gz && \
     ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/
 
-# Install ROCm-specific packages if building for ROCm
+# Install GDRCopy userspace library for CUDA targets
 ARG TARGET="cuda13.0"
+RUN if echo "$TARGET" | grep -q "^cuda"; then \
+        GDRCOPY_VERSION="2.5.2" && \
+        apt-get update -y && \
+        apt-get install -y --no-install-recommends devscripts debhelper fakeroot pkg-config dkms && \
+        cd /tmp && \
+        curl -L https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -o gdrcopy.tar.gz && \
+        tar xzf gdrcopy.tar.gz && \
+        cd gdrcopy-${GDRCOPY_VERSION}/packages && \
+        ./build-deb-packages.sh -k -t && \
+        dpkg -i libgdrapi_*.deb && \
+        cd / && rm -rf /tmp/gdrcopy* && \
+        apt-get autoremove -y && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* /tmp/*; \
+    fi
+
+# Install ROCm-specific packages if building for ROCm
 RUN if echo "$TARGET" | grep -q "^rocm"; then \
         apt-get update -y && \
         apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \
diff --git a/docs/Makefile b/docs/Makefile
index 5bc7422e..bf82c03a 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -5,7 +5,7 @@
 # from the environment for the first two.
 SPHINXOPTS    ?=
 SPHINXBUILD   ?= sphinx-build
-SPHINXMULTIVERSION ?= sphinx-multiversion
+SPHINXMULTIVERSION ?= python3 build_multiversion.py
 SOURCEDIR     = .
 BUILDDIR      = _build
 
diff --git a/docs/build_multiversion.py b/docs/build_multiversion.py
new file mode 100644
index 00000000..ace20fc0
--- /dev/null
+++ b/docs/build_multiversion.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Wrapper around sphinx-multiversion that patches copy_tree to generate
+_version.py in each tag checkout. This is needed because setuptools_scm
+generates _version.py at build time, but sphinx-multiversion uses
+`git archive` which only contains committed files.
+
+Usage (called by Makefile):
+    python3 build_multiversion.py <sourcedir> <outputdir> [sphinx-opts...]
+"""
+
+import os
+import re
+import subprocess
+import sys
+
+import sphinx_multiversion.git as smv_git
+from sphinx_multiversion import main as smv_main
+
+# Save the original copy_tree
+_original_copy_tree = smv_git.copy_tree
+
+
+def _patched_copy_tree(gitroot, src, dst, reference, sourcepath="."):
+    """Call original copy_tree, then generate _version.py from the VERSION file."""
+    _original_copy_tree(gitroot, src, dst, reference, sourcepath)
+
+    # Extract version from the tag name (e.g., "v0.9.0" -> "0.9.0")
+    refname = getattr(reference, "refname", "") or ""
+    match = re.search(r"v(\d+\.\d+\.\d+)", refname)
+    if not match:
+        return
+
+    version = match.group(1)
+    version_py_dir = os.path.join(dst, "python", "mscclpp")
+    if os.path.isdir(version_py_dir):
+        version_py = os.path.join(version_py_dir, "_version.py")
+        if not os.path.exists(version_py):
+            with open(version_py, "w") as f:
+                f.write(f'__version__ = "{version}"\n')
+
+
+# Monkey-patch
+smv_git.copy_tree = _patched_copy_tree
+
+if __name__ == "__main__":
+    sys.exit(smv_main(sys.argv[1:]))
diff --git a/docs/guide/mscclpp-torch-integration.md b/docs/guide/mscclpp-torch-integration.md
index 1c966155..b4e4fcdf 100644
--- a/docs/guide/mscclpp-torch-integration.md
+++ b/docs/guide/mscclpp-torch-integration.md
@@ -332,7 +332,8 @@ public:
                    size_t inputSize, size_t outputSize,
                    mscclpp::DataType dtype, mscclpp::ReduceOp op,
                    cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                   const std::unordered_map<std::string, uintptr_t>& extras) {
+                   const std::unordered_map<std::string, uintptr_t>& extras,
+                   [[maybe_unused]] mscclpp::DataType accumDtype) {
                 return self->kernelFunc(ctx, input, output, inputSize, dtype, stream);
             },
             // Context initialization function
diff --git a/docs/quickstart.md b/docs/quickstart.md
index b7a68050..c9c98128 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -31,6 +31,9 @@
         ```
         If you don't want to build Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)).
     * (Optional, for benchmarks) MPI
+    * (Optional, for NVIDIA platforms) [GDRCopy](https://github.com/NVIDIA/gdrcopy) >= 2.5.1
+        * GDRCopy is required for IB `HostNoAtomic` mode, which uses CPU-side signal forwarding to GPU memory via BAR1 mappings. This mode is used on platforms where RDMA atomics are not available (e.g., when using Data Direct Virtual Functions).
+        * Install GDRCopy from source or via packages. See the [GDRCopy installation guide](https://github.com/NVIDIA/gdrcopy#installation).
 * Others
     * For RDMA (InfiniBand or RoCE) support on NVIDIA platforms, [GPUDirect RDMA](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#gpudirect-rdma-and-gpudirect-storage) should be supported by the system. See the detailed prerequisites from [this NVIDIA documentation](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#common-prerequisites).
     * For NVLink SHARP (NVLS) support on NVIDIA platforms, the Linux kernel version should be 5.6 or above.
diff --git a/examples/customized-collective-algorithm/customized_allgather.cu b/examples/customized-collective-algorithm/customized_allgather.cu
index e78c4777..02df3685 100644
--- a/examples/customized-collective-algorithm/customized_allgather.cu
+++ b/examples/customized-collective-algorithm/customized_allgather.cu
@@ -101,7 +101,8 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
         "allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
         [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
                mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
-               int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
+               int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
+               [[maybe_unused]] mscclpp::DataType accumDtype) {
           return self->allgatherKernelFunc(ctx, input, output, inputSize, stream);
         },
         [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
diff --git a/examples/torch-integration/customized_allgather.cu b/examples/torch-integration/customized_allgather.cu
index d48c4410..907b3ada 100644
--- a/examples/torch-integration/customized_allgather.cu
+++ b/examples/torch-integration/customized_allgather.cu
@@ -69,7 +69,8 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
         "allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
         [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
                mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
-               int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
+               int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
+               [[maybe_unused]] mscclpp::DataType accumDtype) {
           return self->allgatherKernelFunc(ctx, input, output, inputSize, dtype, stream);
         },
         [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py
index 41be5825..060a0097 100644
--- a/examples/torch-integration/customized_comm_with_tuning.py
+++ b/examples/torch-integration/customized_comm_with_tuning.py
@@ -1,193 +1,117 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-# MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> torchrun --nnodes=1 --nproc_per_node=8  customized_comm_with_tuning.py
+# torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py
 
 import os
-import torch
-import mscclpp.utils as mscclpp_utils
-import mscclpp
-import mscclpp.ext
-import netifaces as ni
 import ipaddress
 
+import netifaces as ni
+import torch
+import mscclpp
+import mscclpp.ext
+import mscclpp.utils as mscclpp_utils
 
-def load_algorithms(scratch_buffer: torch.tensor, rank: int) -> mscclpp.AlgorithmCollection:
-    collection_builder = mscclpp.ext.AlgorithmCollectionBuilder()
-    return collection_builder.build_default_algorithms(
-        scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank
+# -- Helpers ------------------------------------------------------------------
+
+
+def _make_tensor(size_bytes: int, dtype: torch.dtype) -> torch.Tensor:
+    """Allocate a tensor backed by RawGpuBuffer (symmetric memory)."""
+    # PyTorch's from_dlpack does not support certain float8 DLPack type codes.
+    # Work around by importing as uint8 and reinterpreting via .view().
+    _DLPACK_UNSUPPORTED = (torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz)
+    if dtype in _DLPACK_UNSUPPORTED:
+        dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(torch.uint8))
+        return torch.utils.dlpack.from_dlpack(dlpack).view(dtype)
+    dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(dtype))
+    return torch.utils.dlpack.from_dlpack(dlpack)
+
+
+def _load_algorithms(scratch: torch.Tensor, rank: int):
+    return mscclpp.ext.AlgorithmCollectionBuilder().build_default_algorithms(
+        scratch_buffer=scratch.data_ptr(),
+        scratch_buffer_size=scratch.nbytes,
+        rank=rank,
     )
 
 
-def interfaces_for_ip_netifaces(ip: str):
+def _interfaces_for_ip(ip: str):
     target = ipaddress.ip_address(ip)
-    for interface in ni.interfaces():
-        addresses = ni.ifaddresses(interface)
-        if ni.AF_INET in addresses:
-            for link in addresses[ni.AF_INET]:
-                if "addr" in link:
-                    addr = ipaddress.ip_address(link["addr"])
-                    if addr == target:
-                        return interface
+    for iface in ni.interfaces():
+        addrs = ni.ifaddresses(iface)
+        if ni.AF_INET in addrs:
+            for link in addrs[ni.AF_INET]:
+                if "addr" in link and ipaddress.ip_address(link["addr"]) == target:
+                    return iface
     return None
 
 
-def to_mscclpp_reduce_op(op: torch.distributed.ReduceOp) -> mscclpp.ReduceOp:
+def _to_mscclpp_op(op) -> mscclpp.ReduceOp:
     if op == torch.distributed.ReduceOp.SUM:
         return mscclpp.ReduceOp.SUM
-    elif op == torch.distributed.ReduceOp.MIN:
+    if op == torch.distributed.ReduceOp.MIN:
         return mscclpp.ReduceOp.MIN
-    else:
-        raise ValueError(f"unsupported op: {op}")
+    raise ValueError(f"unsupported op: {op}")
+
+
+def _round_pow2(size: int) -> int:
+    """Round up to next power-of-2, clamped to [1024, 256 MB]."""
+    size = max(size, 1024)
+    size = min(size, 256 << 20)
+    return 1 << (size - 1).bit_length()
+
+
+# -- CustomizedComm -----------------------------------------------------------
 
 
 class CustomizedComm:
-    def __init__(self, comm: mscclpp.CommGroup):
+    """Exposes all_reduce, all_gather, barrier with lazy per-size tuning."""
+
+    _TUNE_N_WARMUP = 5
+    _TUNE_N_GRAPH_LAUNCHES = 10
+    _TUNE_N_OPS_PER_GRAPH = 100
+    _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 128]
+    _CANDIDATE_NTHREADS = [512, 768, 1024]
+    _NBLOCKS_LIMIT = {
+        "default_allreduce_nvls_packet": 16,
+        "default_allreduce_packet": 56,
+        "default_allreduce_allpair_packet": 56,
+        "default_allreduce_fullmesh": 64,
+        "default_allgather_fullmesh2": 32,
+    }
+
+    def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
         self.comm = comm
         self.rank = comm.my_rank
         self.world_size = comm.nranks
-        self.local_rank = comm.my_rank % comm.nranks_per_node
-        self.n_ranks_per_node = comm.nranks_per_node
-        dlpack = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
-        self.scratch_buffer = torch.utils.dlpack.from_dlpack(dlpack)
-        algorithms = load_algorithms(scratch_buffer=self.scratch_buffer, rank=self.rank)
-        self._algorithm_nvls_packet = [
-            algo
-            for algo in algorithms
-            if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_packet"
-        ][0]
-        self._algorithm_rsag_zero_copy = [
-            algo
-            for algo in algorithms
-            if algo.collective == "allreduce" and algo.name == "default_allreduce_rsag_zero_copy"
-        ][0]
-        self._algorithm_packet = [
-            algo for algo in algorithms if algo.collective == "allreduce" and algo.name == "default_allreduce_packet"
-        ][0]
-        if mscclpp.is_nvls_supported():
-            self._algorithm_nvls_zero_copy = [
-                algo
-                for algo in algorithms
-                if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_zero_copy"
-            ][0]
-        self._tune(n_warmup=5, n_graph_launches=10, n_ops_per_graph=100)
+        self.symmetric_memory = symmetric_memory
+        self._nvls = mscclpp.is_nvls_supported()
 
-    def _tune(self, n_warmup, n_graph_launches, n_ops_per_graph):
-        sizes = [1 << i for i in range(10, 28)]
-        # Pre-fill with defaults for barrier
-        self.best_configs = {1024: (self._algorithm_nvls_packet, 0, 0)}
+        self._scratch = _make_tensor(1 << 27, torch.float16)
+        self._barrier_tensor = _make_tensor(4096, torch.float32)
 
-        tune_tensor = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
-        tune_tensor = torch.utils.dlpack.from_dlpack(tune_tensor)
-        tune_tensor.normal_()
-        candidates_nblocks = [4, 8, 16, 24, 32, 48, 64, 128]
-        candidates_nthreads = [512, 768, 1024]
+        algos = _load_algorithms(self._scratch, self.rank)
+        self._algos = {(a.collective, a.name): a for a in algos}
 
-        for size in sizes:
-            algos = []
-            if mscclpp.is_nvls_supported():
-                algos.append(self._algorithm_nvls_zero_copy)
-            if size <= 4 * 1024 * 1024:
-                algos.append(self._algorithm_nvls_packet)
-                algos.append(self._algorithm_packet)
-            if size >= 512 * 1024:
-                algos.append(self._algorithm_rsag_zero_copy)
+        # {collective: {rounded_size: (algo, nblocks, nthreads)}}
+        self._tune_cache: dict[str, dict[int, tuple]] = {"allreduce": {}, "allgather": {}}
+        self._tune_buf = None
+        self._time_buf = None
 
-            best_time = float("inf")
-            best_config = None
+    def _algo(self, collective: str, name: str):
+        return self._algos.get((collective, name))
 
-            for algo in algos:
-                for nb in candidates_nblocks:
-                    if algo.name == "default_allreduce_nvls_packet" and nb > 16:
-                        continue
-                    if algo.name == "default_allreduce_packet" and nb > 56:
-                        continue
-                    for nt in candidates_nthreads:
-                        if self._run_algo(algo, tune_tensor, size, nb, nt) != 0:
-                            continue
+    def _default_ar_config(self):
+        """Fallback allreduce config for barrier / timing sync."""
+        pkt = self._algo("allreduce", "default_allreduce_nvls_packet")
+        if self._nvls and pkt:
+            return (pkt, 0, 0)
+        return (self._algo("allreduce", "default_allreduce_packet"), 0, 0)
 
-                        for _ in range(n_warmup):
-                            self._run_algo(algo, tune_tensor, size, nb, nt)
-                        self.barrier()
+    # -- low-level execute --
 
-                        capture_stream = torch.cuda.Stream()
-                        capture_stream.wait_stream(torch.cuda.current_stream())
-
-                        g = torch.cuda.CUDAGraph()
-                        # Warmup on capture stream
-                        with torch.cuda.stream(capture_stream):
-                            self._run_algo(algo, tune_tensor, size, nb, nt)
-                        capture_stream.synchronize()
-
-                        with torch.cuda.graph(g, stream=capture_stream):
-                            for _ in range(n_ops_per_graph):
-                                self._run_algo(algo, tune_tensor, size, nb, nt)
-
-                        start_event = torch.cuda.Event(enable_timing=True)
-                        end_event = torch.cuda.Event(enable_timing=True)
-                        start_event.record(capture_stream)
-                        with torch.cuda.stream(capture_stream):
-                            for _ in range(n_graph_launches):
-                                g.replay()
-                        end_event.record(capture_stream)
-                        end_event.synchronize()
-
-                        elapsed = start_event.elapsed_time(end_event)
-
-                        # Synchronize timing results across all ranks to ensure consistent algorithm selection
-                        # replicate n times such due to algo limitations
-                        time_tensor = torch.full((self.world_size,), elapsed, dtype=torch.float64, device="cuda").to(
-                            dtype=torch.float32
-                        )
-                        torch.cuda.current_stream().wait_stream(capture_stream)
-                        # TODO: use all_reduce may cause problem if the time elapsed between different algos are too close.
-                        # May change to broadcast in the future if that becomes an issue.
-                        self.all_reduce(time_tensor, op=torch.distributed.ReduceOp.SUM)
-                        avg_time = time_tensor[self.rank].item() / self.world_size
-
-                        if avg_time < best_time:
-                            best_time = avg_time
-                            best_config = (algo, nb, nt)
-
-            if best_config:
-                self.best_configs[size] = best_config
-                if self.rank == 0:
-                    print(
-                        f"Size {size}: Best Algo {best_config[0].name} nblocks {best_config[1]} nthreads {best_config[2]} Time {(best_time/(n_graph_launches * n_ops_per_graph))*1000:.2f} us"
-                    )
-        # reset the algorithms after tuning
-        torch.cuda.synchronize()
-        for algo in algos:
-            algo.reset()
-
-    def _run_algo(self, algo: mscclpp.Algorithm, tensor, size, nblocks, nthreads):
-        return algo.execute(
-            comm=self.comm.communicator,
-            input_buffer=tensor.data_ptr(),
-            output_buffer=tensor.data_ptr(),
-            input_size=size,
-            output_size=size,
-            dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
-            op=mscclpp.ReduceOp.SUM,
-            stream=torch.cuda.current_stream().cuda_stream,
-            nblocks=nblocks,
-            nthreads_per_block=nthreads,
-            symmetric_memory=True,
-        )
-
-    def get_tuned_config(self, size):
-        if size < 1024:
-            target_size = 1024
-        elif size > 256 * 1024 * 1024:
-            target_size = 256 * 1024 * 1024
-        else:
-            target_size = 1 << (size - 1).bit_length()
-        return self.best_configs.get(target_size)
-
-    def all_reduce(self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM, stream: torch.cuda.Stream = None):
-        assert op == torch.distributed.ReduceOp.SUM
-        config = self.get_tuned_config(tensor.nbytes)
-        algo, nblocks, nthreads = config if config else (self._algorithm_nvls_packet, 0, 0)
+    def _exec_ar(self, tensor, algo, nb, nt, op=mscclpp.ReduceOp.SUM, stream=None, accum_dtype=None, sym=True):
+        s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream
         ret = algo.execute(
             comm=self.comm.communicator,
             input_buffer=tensor.data_ptr(),
@@ -195,107 +119,357 @@ class CustomizedComm:
             input_size=tensor.nbytes,
             output_size=tensor.nbytes,
             dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
-            op=to_mscclpp_reduce_op(op),
-            stream=stream.cuda_stream if stream is not None else torch.cuda.current_stream().cuda_stream,
-            nblocks=nblocks,
-            nthreads_per_block=nthreads,
-            symmetric_memory=True,
+            op=op,
+            stream=s,
+            nblocks=nb,
+            nthreads_per_block=nt,
+            symmetric_memory=sym,
+            accum_dtype=accum_dtype,
         )
         if ret != 0:
-            print(f"Rank {self.rank}: Algo {algo.name} failed with error {ret}")
+            print(f"Rank {self.rank}: {algo.name} failed ({ret})")
+        return ret
+
+    def _exec_ag(self, inp, out, algo, nb, nt, stream=None, sym=None):
+        if sym is None:
+            sym = self.symmetric_memory
+        s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream
+        ret = algo.execute(
+            comm=self.comm.communicator,
+            input_buffer=inp.data_ptr(),
+            output_buffer=out.data_ptr(),
+            input_size=inp.nbytes,
+            output_size=out.nbytes,
+            dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(inp.dtype),
+            op=mscclpp.ReduceOp.NOP,
+            stream=s,
+            nblocks=nb,
+            nthreads_per_block=nt,
+            symmetric_memory=sym,
+        )
+        if ret != 0:
+            print(f"Rank {self.rank}: AG {algo.name} failed ({ret})")
+        return ret
+
+    def _barrier_internal(self):
+        a, nb, nt = self._default_ar_config()
+        self._exec_ar(self._barrier_tensor, a, nb, nt, sym=True)
+
+    # -- lazy tuning --
+
+    def _ensure_tune_bufs(self):
+        if self._tune_buf is None:
+            self._tune_buf = _make_tensor(1 << 27, torch.float16)
+            self._tune_buf.normal_()
+            self._time_buf = _make_tensor(4096, torch.float32)
+        return self._tune_buf
+
+    def _ar_candidates(self, size: int):
+        out = []
+        if size <= 4 << 20:
+            a = self._algo("allreduce", "default_allreduce_nvls_packet")
+            if self._nvls and a:
+                out.append(a)
+            a = self._algo("allreduce", "default_allreduce_packet")
+            if a:
+                out.append(a)
+            a = self._algo("allreduce", "default_allreduce_allpair_packet")
+            if a:
+                out.append(a)
+        if size >= 512 << 10:
+            a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
+            if self._nvls and self.symmetric_memory and a:
+                out.append(a)
+            a = self._algo("allreduce", "default_allreduce_rsag_zero_copy")
+            if a:
+                out.append(a)
+        if torch.version.hip is not None:
+            a = self._algo("allreduce", "default_allreduce_fullmesh")
+            if a:
+                out.append(a)
+        return out
+
+    def _ag_candidates(self):
+        a = self._algo("allgather", "default_allgather_fullmesh2")
+        return [a] if a else []
+
+    def _run_tune(self, collective, algo, buf, size, nb, nt):
+        """Single tune invocation for either collective."""
+        if collective == "allreduce":
+            return algo.execute(
+                comm=self.comm.communicator,
+                input_buffer=buf.data_ptr(),
+                output_buffer=buf.data_ptr(),
+                input_size=size,
+                output_size=size,
+                dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype),
+                op=mscclpp.ReduceOp.SUM,
+                stream=torch.cuda.current_stream().cuda_stream,
+                nblocks=nb,
+                nthreads_per_block=nt,
+                symmetric_memory=True,
+            )
+        else:
+            total = size * self.world_size
+            out_ptr = buf.data_ptr()
+            return algo.execute(
+                comm=self.comm.communicator,
+                input_buffer=out_ptr + self.rank * size,
+                output_buffer=out_ptr,
+                input_size=size,
+                output_size=total,
+                dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype),
+                op=mscclpp.ReduceOp.NOP,
+                stream=torch.cuda.current_stream().cuda_stream,
+                nblocks=nb,
+                nthreads_per_block=nt,
+                symmetric_memory=False,
+            )
+
+    def _tune_size(self, collective: str, target_size: int):
+        """Auto-tune one (collective, target_size) pair and cache result."""
+        buf = self._ensure_tune_bufs()
+        cands = self._ar_candidates(target_size) if collective == "allreduce" else self._ag_candidates()
+
+        best_time, best_cfg = float("inf"), None
+        used = set()
+        run = lambda a, nb, nt: self._run_tune(collective, a, buf, target_size, nb, nt)
+
+        for algo in cands:
+            nb_limit = self._NBLOCKS_LIMIT.get(algo.name, 128)
+            for nb in self._CANDIDATE_NBLOCKS:
+                if nb > nb_limit:
+                    continue
+                for nt in self._CANDIDATE_NTHREADS:
+                    # Feasibility — sync result across ranks so all agree
+                    ret = run(algo, nb, nt)
+                    torch.cuda.synchronize()
+                    self._time_buf[0] = float(ret)
+                    self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=True)
+                    if self._time_buf[0].item() != 0:
+                        continue
+                    used.add(algo)
+
+                    # Warmup
+                    for _ in range(self._TUNE_N_WARMUP):
+                        run(algo, nb, nt)
+
+                    # CUDA-graph timed benchmark
+                    cs = torch.cuda.Stream()
+                    cs.wait_stream(torch.cuda.current_stream())
+                    g = torch.cuda.CUDAGraph()
+                    with torch.cuda.graph(g, stream=cs):
+                        for _ in range(self._TUNE_N_OPS_PER_GRAPH):
+                            run(algo, nb, nt)
+
+                    start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+                    start.record(cs)
+                    with torch.cuda.stream(cs):
+                        for _ in range(self._TUNE_N_GRAPH_LAUNCHES):
+                            g.replay()
+                    end.record(cs)
+                    end.synchronize()
+                    elapsed = start.elapsed_time(end)
+
+                    # Cross-rank timing sync
+                    self._time_buf.fill_(elapsed)
+                    torch.cuda.current_stream().wait_stream(cs)
+                    self._exec_ar(self._time_buf, *self._default_ar_config(), sym=True)
+                    avg = self._time_buf[self.rank].item() / self.world_size
+
+                    if avg < best_time:
+                        best_time, best_cfg = avg, (algo, nb, nt)
+
+        if best_cfg:
+            self._tune_cache[collective][target_size] = best_cfg
+            if self.rank == 0:
+                n = self._TUNE_N_GRAPH_LAUNCHES * self._TUNE_N_OPS_PER_GRAPH
+                print(
+                    f"[tune] {collective} size={target_size}: {best_cfg[0].name} "
+                    f"nb={best_cfg[1]} nt={best_cfg[2]} time={best_time / n * 1000:.2f}us",
+                    flush=True,
+                )
+        else:
+            fb = (
+                self._default_ar_config()
+                if collective == "allreduce"
+                else ((self._ag_candidates()[0], 32, 512) if self._ag_candidates() else None)
+            )
+            self._tune_cache[collective][target_size] = fb
+
+        torch.cuda.synchronize()
+        self._barrier_internal()
+        for a in used:
+            a.reset()
+
+    # -- public API --
+
+    def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None, accum_dtype=None):
+        sz = _round_pow2(tensor.nbytes)
+        if sz not in self._tune_cache["allreduce"]:
+            self._tune_size("allreduce", sz)
+        a, nb, nt = self._tune_cache["allreduce"][sz]
+        self._exec_ar(
+            tensor, a, nb, nt, op=_to_mscclpp_op(op), stream=stream, accum_dtype=accum_dtype, sym=self.symmetric_memory
+        )
+
+    def all_gather(self, output_tensor, input_tensor, stream=None):
+        sz = _round_pow2(input_tensor.nbytes)
+        if sz not in self._tune_cache["allgather"]:
+            self._tune_size("allgather", sz)
+        a, nb, nt = self._tune_cache["allgather"][sz]
+        self._exec_ag(input_tensor, output_tensor, a, nb, nt, stream=stream, sym=self.symmetric_memory)
 
     def barrier(self):
-        tensor = torch.empty(self.world_size, dtype=torch.float, device=torch.device("cuda"))
-        self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM, stream=torch.cuda.current_stream())
-
-    def benchmark(self, n_warmup=10, n_graph_launches=10, n_iter_per_graph=100):
-        low = 5 * 1024
-        high = 80 * 1024 * 1024
-        sizes = []
-        curr = low
-        while curr <= high:
-            sizes.append(curr)
-            curr *= 2
-
-        if self.rank == 0:
-            print(f"{'Size (Bytes)':<20} {'Time (us)':<20} {'AlgoBW (GB/s)':<20}")
-
-        dtype = torch.float16
-        capture_stream = torch.cuda.Stream()
-
-        # Allocate a single large RawGpuBuffer (symmetric memory) and reuse it for all sizes.
-        # Cannot allocate per-size tensors with symmetric memory.
-        bench_buf = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(dtype))
-        bench_buf = torch.utils.dlpack.from_dlpack(bench_buf)
-        bench_buf.normal_()
-
-        for size in sizes:
-            n_elements = size // bench_buf.element_size()
-            tensor = bench_buf[:n_elements]
-
-            capture_stream.wait_stream(torch.cuda.current_stream())
-            # Capture Graph
-            g = torch.cuda.CUDAGraph()
-            with torch.cuda.graph(g, stream=capture_stream):
-                for _ in range(n_iter_per_graph):
-                    self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
-
-            # warmup: Execute the graph once to prime the driver
-            with torch.cuda.stream(capture_stream):
-                for _ in range(n_warmup):
-                    g.replay()
-                self.barrier()
-            capture_stream.synchronize()
-
-            # Benchmark
-            start_event = torch.cuda.Event(enable_timing=True)
-            end_event = torch.cuda.Event(enable_timing=True)
-
-            start_event.record(capture_stream)
-            with torch.cuda.stream(capture_stream):
-                for _ in range(n_graph_launches):
-                    g.replay()
-            end_event.record(capture_stream)
-            end_event.synchronize()
-
-            # Get elapsed time in milliseconds
-            elapsed_ms = start_event.elapsed_time(end_event)
-            avg_time_ms = elapsed_ms / (n_graph_launches * n_iter_per_graph)
-            time_us = avg_time_ms * 1000
-
-            alg_bw = size / (avg_time_ms * 1e-3) if avg_time_ms > 0 else 0
-            if self.rank == 0:
-                print(f"{size:<20} {time_us:<20.2f} {alg_bw / 1e9:<20.2f}")
+        self._barrier_internal()
 
     def destroy(self):
-        self._algorithm_nvls_nonzero_copy = None
-        self._algorithm_nvls_packet = None
-        self.scratch_buffer = None
-        self.comm = None
+        self._algos.clear()
+        self._tune_cache = {"allreduce": {}, "allgather": {}}
+        self._tune_buf = self._time_buf = self._barrier_tensor = self._scratch = self.comm = None
 
 
-def init_dist() -> CustomizedComm:
-    rank = int(os.environ["RANK"])
-    world = int(os.environ["WORLD_SIZE"])
-    master_addr = os.environ["MSCCLPP_MASTER_ADDR"]
-    master_port = os.environ["MSCCLPP_MASTER_PORT"]
-    interface = interfaces_for_ip_netifaces(master_addr)
-    if interface is None:
-        raise ValueError(f"Cannot find network interface for IP address {master_addr}")
-    interfaceIpPortTrio = f"{interface}:{master_addr}:{master_port}"
-    mscclpp_group = mscclpp.CommGroup(interfaceIpPortTrio=interfaceIpPortTrio, rank=rank, size=world)
-    return CustomizedComm(mscclpp_group)
+# -- Benchmarks (standalone) --------------------------------------------------
+
+
+def _bench_sizes(low=5 * 1024, high=80 << 20):
+    sizes, c = [], low
+    while c <= high:
+        sizes.append(c)
+        c *= 2
+    return sizes
+
+
+def benchmark_allreduce(
+    comm: CustomizedComm, dtype=torch.float16, accum_dtype=None, n_warmup=10, n_graph_launches=10, n_iter=100
+):
+    sizes = _bench_sizes()
+    if comm.rank == 0:
+        print(f"\n{'='*60}\nAllreduce Benchmark\n{'='*60}")
+        print(f"{'Nelements':<18} {'Size(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}")
+
+    cs = torch.cuda.Stream()
+    buf = _make_tensor(1 << 27, dtype)
+    buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0)
+
+    for size in sizes:
+        nelems = size // buf.element_size()
+        t = buf[: size // buf.element_size()]
+        comm.all_reduce(t, accum_dtype=accum_dtype)
+        torch.cuda.synchronize()
+
+        cs.wait_stream(torch.cuda.current_stream())
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g, stream=cs):
+            for _ in range(n_iter):
+                comm.all_reduce(t, accum_dtype=accum_dtype)
+        with torch.cuda.stream(cs):
+            for _ in range(n_warmup):
+                g.replay()
+            comm.barrier()
+        cs.synchronize()
+
+        s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+        s.record(cs)
+        with torch.cuda.stream(cs):
+            for _ in range(n_graph_launches):
+                g.replay()
+        e.record(cs)
+        e.synchronize()
+
+        ms = s.elapsed_time(e) / (n_graph_launches * n_iter)
+        if comm.rank == 0:
+            print(f"{nelems:<18} {size:<18} {ms*1000:<18.2f} {size/(ms*1e-3)/1e9:<18.2f}")
+
+
+def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, n_graph_launches=10, n_iter=100):
+    sizes = _bench_sizes()
+    if comm.rank == 0:
+        print(f"\n{'='*60}\nAllgather Benchmark\n{'='*60}")
+        print(f"{'PerRank(B)':<18} {'Total(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}")
+
+    cs = torch.cuda.Stream()
+    buf = _make_tensor(1 << 27, dtype)
+    buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0)
+
+    for prs in sizes:
+        total = prs * comm.world_size
+        if total > buf.nbytes:
+            break
+        nt = total // buf.element_size()
+        npr = prs // buf.element_size()
+        out = buf[:nt]
+        inp = out[comm.rank * npr : (comm.rank + 1) * npr]
+
+        comm.all_gather(out, inp)
+        torch.cuda.synchronize()
+
+        cs.wait_stream(torch.cuda.current_stream())
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g, stream=cs):
+            for _ in range(n_iter):
+                comm.all_gather(out, inp)
+        with torch.cuda.stream(cs):
+            for _ in range(n_warmup):
+                g.replay()
+            comm.barrier()
+        cs.synchronize()
+
+        s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+        s.record(cs)
+        with torch.cuda.stream(cs):
+            for _ in range(n_graph_launches):
+                g.replay()
+        e.record(cs)
+        e.synchronize()
+
+        ms = s.elapsed_time(e) / (n_graph_launches * n_iter)
+        if comm.rank == 0:
+            print(f"{prs:<18} {total:<18} {ms*1000:<18.2f} {total/(ms*1e-3)/1e9:<18.2f}")
+
+
+# -- Bootstrap & main ---------------------------------------------------------
+
+
+def init_dist() -> mscclpp.CommGroup:
+    addr = os.environ.get("MSCCLPP_MASTER_ADDR")
+    if addr:
+        rank, world = int(os.environ["RANK"]), int(os.environ["WORLD_SIZE"])
+        port = os.environ["MSCCLPP_MASTER_PORT"]
+        iface = _interfaces_for_ip(addr)
+        if not iface:
+            raise ValueError(f"No interface for {addr}")
+        return mscclpp.CommGroup(interfaceIpPortTrio=f"{iface}:{addr}:{port}", rank=rank, size=world)
+    import torch.distributed as dist
+
+    dist.init_process_group(backend="gloo")
+    return mscclpp.CommGroup(torch_group=dist.group.WORLD)
 
 
 def main():
     local = int(os.environ["LOCAL_RANK"])
     torch.cuda.set_device(local)
-    comm = init_dist()
-    comm.benchmark(n_warmup=5, n_graph_launches=10, n_iter_per_graph=100)
-    comm.barrier()
+
+    dtype_str = os.environ.get("DTYPE", "float16")
+    dtype = getattr(torch, dtype_str, torch.float16)
+    accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16}
+    accum_str = os.environ.get("ACCUM_DTYPE")
+    accum_dtype = accum_map.get(accum_str) if accum_str else None
+
+    comm_group = init_dist()
+    cc = CustomizedComm(comm_group)
+
+    print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...")
+    benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype)
+    cc.barrier()
     torch.cuda.synchronize()
-    comm.destroy()
-    print(f"rank {local} All-reduce operation completed successfully.")
+
+    benchmark_allgather(cc, dtype=dtype)
+    cc.barrier()
+    torch.cuda.synchronize()
+
+    cc.destroy()
+    print(f"rank {local} completed successfully.")
 
 
 if __name__ == "__main__":
diff --git a/examples/torch-integration/dsl_with_nccl_api.py b/examples/torch-integration/dsl_with_nccl_api.py
index 975d3749..5a4dd1c4 100644
--- a/examples/torch-integration/dsl_with_nccl_api.py
+++ b/examples/torch-integration/dsl_with_nccl_api.py
@@ -1,19 +1,20 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/nccl/libmscclpp_nccl.so  torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
+# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so  torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
 
 import os
 from typing import Any, Dict
 import torch, torch.distributed as dist
-import mscclpp
+import mscclpp.ext
 from mscclpp.language.collectives import AllReduce
 from mscclpp.language.channel import SwitchChannel, MemoryChannel, BufferType, SyncType
 from mscclpp.language.program import CollectiveProgram
 from mscclpp.language.rank import Rank
+from mscclpp.language.utils import AlgoSpec
 
 
-def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
+def allreduce_nvls(spec: AlgoSpec) -> CollectiveProgram:
     gpu_size = spec.world_size
     with CollectiveProgram.from_spec(spec) as program:
         # Creating Channels
@@ -63,8 +64,8 @@ def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
     return program
 
 
-def setup_plan(algo_collection_builder: mscclpp.AlgorithmCollectionBuilder, rank: int, world_size: int):
-    spec = mscclpp.AlgoSpec(
+def setup_plan(algo_collection_builder: mscclpp.ext.AlgorithmCollectionBuilder, rank: int, world_size: int):
+    spec = AlgoSpec(
         name="allreduce_nvls",
         collective=AllReduce(8, 1, True),
         nranks_per_node=8,
@@ -94,10 +95,10 @@ def init_dist():
     rank = int(os.environ["RANK"])
     world = int(os.environ["WORLD_SIZE"])
     local = int(os.environ["LOCAL_RANK"])
-    algorithm_collection_builder = mscclpp.AlgorithmCollectionBuilder()
+    algorithm_collection_builder = mscclpp.ext.AlgorithmCollectionBuilder()
     setup_plan(algorithm_collection_builder, rank, world)
     algorithm_collection_builder.set_algorithm_selector(selector)
-    dist.init_process_group(backend="nccl", device_id=local)
+    dist.init_process_group(backend="nccl", device_id=torch.device("cuda", local))
     return rank, world, local
 
 
diff --git a/include/mscclpp/algorithm.hpp b/include/mscclpp/algorithm.hpp
index 65b1ab3c..531cb857 100644
--- a/include/mscclpp/algorithm.hpp
+++ b/include/mscclpp/algorithm.hpp
@@ -103,12 +103,14 @@ class Algorithm {
   /// @param nThreadsPerBlock Number of threads per block (0 for auto-selection).
   /// @param symmetricMemory Whether to use symmetric memory optimization.
   /// @param extras Additional parameters for algorithm-specific customization.
+  /// @param accumDtype Data type for accumulation during reduction. DataType::AUTO resolves to dtype.
   /// @return The result of the operation.
   virtual CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
                              size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
                              std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
                              bool symmetricMemory = false,
-                             const std::unordered_map<std::string, uintptr_t>& extras = {}) = 0;
+                             const std::unordered_map<std::string, uintptr_t>& extras = {},
+                             DataType accumDtype = DataType::AUTO) = 0;
 
   /// Reset the algorithm state, clearing any cached contexts.
   virtual void reset() = 0;
@@ -186,10 +188,11 @@ class NativeAlgorithm : public Algorithm {
   /// @param nBlocks Number of CUDA blocks.
   /// @param nThreadsPerBlock Number of threads per block.
   /// @param extras Additional algorithm-specific parameters.
+  /// @param accumDtype Data type for accumulation (resolved from input dtype if sentinel).
   /// @return The result of the operation.
   using KernelFunc =
       std::function<CommResult(const std::shared_ptr<void>, const void*, void*, size_t, size_t, DataType, ReduceOp,
-                               cudaStream_t, int, int, const std::unordered_map<std::string, uintptr_t>&)>;
+                               cudaStream_t, int, int, const std::unordered_map<std::string, uintptr_t>&, DataType)>;
 
   /// Function type for creating algorithm contexts.
   /// @param comm The communicator.
@@ -233,8 +236,8 @@ class NativeAlgorithm : public Algorithm {
   CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
                      size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
                      std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
-                     bool symmetricMemory = false,
-                     const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
+                     bool symmetricMemory = false, const std::unordered_map<std::string, uintptr_t>& extras = {},
+                     DataType accumDtype = DataType::AUTO) override;
   const std::string& name() const override;
   const std::string& collective() const override;
   const std::pair<size_t, size_t>& messageRange() const override;
@@ -285,8 +288,8 @@ class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enab
   CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
                      size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
                      std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
-                     bool symmetricMemory = false,
-                     const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
+                     bool symmetricMemory = false, const std::unordered_map<std::string, uintptr_t>& extras = {},
+                     DataType accumDtype = DataType::AUTO) override;
   AlgorithmType type() const override { return AlgorithmType::DSL; }
   Constraint constraint() const override;
   void reset() override;
diff --git a/include/mscclpp/atomic_device.hpp b/include/mscclpp/atomic_device.hpp
index 74f6122f..d00bb50c 100644
--- a/include/mscclpp/atomic_device.hpp
+++ b/include/mscclpp/atomic_device.hpp
@@ -38,7 +38,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_o
   return cuda::atomic_ref<T, Scope>{*ptr}.fetch_add(val, memoryOrder);
 }
 
-#elif defined(MSCCLPP_DEVICE_HIP)
+#else  // !defined(MSCCLPP_DEVICE_CUDA)
 
 constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED;
 constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE;
@@ -46,7 +46,6 @@ constexpr auto memoryOrderRelease = __ATOMIC_RELEASE;
 constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL;
 constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST;
 
-// HIP does not have thread scope enums like CUDA
 constexpr auto scopeSystem = 0;
 constexpr auto scopeDevice = 0;
 
@@ -65,7 +64,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrde
   return __atomic_fetch_add(ptr, val, memoryOrder);
 }
 
-#endif  // defined(MSCCLPP_DEVICE_HIP)
+#endif  // !defined(MSCCLPP_DEVICE_CUDA)
 
 }  // namespace mscclpp
 
diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp
index 39f73e8d..fb1da22c 100644
--- a/include/mscclpp/env.hpp
+++ b/include/mscclpp/env.hpp
@@ -110,6 +110,11 @@ class Env {
   /// Default is false.
   const bool forceDisableNvls;
 
+  /// Env name: `MSCCLPP_FORCE_DISABLE_GDR`. If set to true, it will disable the GDRCopy support in MSCCL++.
+  /// When false (default), GDRCopy is auto-detected and enabled if the gdrcopy driver is loaded.
+  /// Default is false.
+  const bool forceDisableGdr;
+
  private:
   Env();
 
diff --git a/include/mscclpp/gpu_data_types.hpp b/include/mscclpp/gpu_data_types.hpp
index 1cecbea6..41bd5928 100644
--- a/include/mscclpp/gpu_data_types.hpp
+++ b/include/mscclpp/gpu_data_types.hpp
@@ -64,18 +64,151 @@ using __bfloat162 = __nv_bfloat162;
 
 #endif
 
+/// Software float8 with 4 exponent bits, 3 mantissa bits, exponent bias = 15.
+/// Format (MSB first): [sign:1][exponent:4][mantissa:3]
+/// No infinities; exp=15 is NaN. Negative zero is NaN (fnuz convention).
+/// Max finite value: 0.9375, min normal: ~6.1e-5, min subnormal: ~7.6e-6.
+struct alignas(1) __fp8_e4m3b15 {
+  uint8_t __x;
+
+  __fp8_e4m3b15() = default;
+
+  /// Construct from raw bits (use __fp8_e4m3b15::fromRaw() for clarity).
+  MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(uint8_t raw) : __x(raw) {}
+
+  /// Construct from float32 (explicit to avoid ambiguous conversion chains).
+  MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(float val) : __x(fromFloat(val)) {}
+
+  /// Convert to float32.
+  MSCCLPP_HOST_DEVICE_INLINE operator float() const { return toFloat(__x); }
+
+  /// Construct from a raw bit pattern without conversion.
+  static MSCCLPP_HOST_DEVICE_INLINE __fp8_e4m3b15 fromRaw(uint8_t bits) {
+    __fp8_e4m3b15 r;
+    r.__x = bits;
+    return r;
+  }
+
+ private:
+  /// Decode fp8_e4m3b15 bits → float32.
+  ///
+  /// Uses bit manipulation through fp16 as intermediate, adapted from the Triton compiler.
+  /// fp8_e4m3b15 is identical to fp8_e4m3fn (NVIDIA) except exponent bias is 15 vs 7.
+  /// Algorithm: reinterpret fp8 bits into an fp16 bit pattern with exponent shifted by -8,
+  /// then convert fp16 → float32.
+  static MSCCLPP_HOST_DEVICE_INLINE float toFloat(uint8_t bits) {
+    // Handle special values: negative zero (0x80) → NaN, exponent=15 → NaN.
+    uint32_t exp = (bits >> 3) & 0xFu;
+    if (bits == 0x80 || exp == 15) {
+      union {
+        uint32_t u;
+        float f;
+      } nan_val = {0x7FC00000u};
+      return nan_val.f;
+    }
+    if (bits == 0) return 0.0f;
+
+    // Triton-style bit manipulation: fp8 → fp16 → fp32.
+    // fp8 layout: [S:1][E:4][M:3]  (bias=15)
+    // fp16 layout: [S:1][E:5][M:10] (bias=15)
+    //
+    // Place fp8 in upper byte of fp16, then right-shift exponent+mantissa by 1
+    // to convert E4 → E5 (both share bias=15). Sign bit stays at bit 15.
+    // Refer:
+    // https://github.com/triton-lang/triton/blob/cf34004b8a67d290a962da166f5aa2fc66751326/python/triton/language/extra/cuda/utils.py#L34
+    uint16_t h = (uint16_t)bits << 8;             // place fp8 in upper byte of fp16
+    uint16_t sign16 = h & 0x8000u;                // extract sign at fp16 position
+    uint16_t nosign = h & 0x7F00u;                // exponent + mantissa (no sign)
+    uint16_t fp16_bits = sign16 | (nosign >> 1);  // shift exponent right by 1
+
+    // For subnormals: when fp8 exponent=0, the above gives fp16 exponent=0
+    // and fp16 mantissa = (fp8_mantissa << 7), which correctly represents
+    // the subnormal fp16 value since both share bias=15.
+
+    // Convert fp16 bits to float via __half (works on host and device, CUDA and HIP).
+    union {
+      uint16_t u;
+      __half h;
+    } cvt = {fp16_bits};
+    return __half2float(cvt.h);
+  }
+
+  /// Encode float32 → fp8_e4m3b15 bits.
+  ///
+  /// Algorithm adapted from Triton: float32 → fp16 → bit-manipulate → fp8.
+  /// The key insight is to convert to fp16 first (which shares bias=15 with e4m3b15),
+  /// then pack the fp16 bits back into 8 bits by shifting the exponent left by 1.
+  static MSCCLPP_HOST_DEVICE_INLINE uint8_t fromFloat(float val) {
+    union {
+      float f;
+      uint32_t u;
+    } in = {val};
+
+    // NaN → 0x80 (negative-zero bit pattern = NaN in fnuz).
+    if ((in.u & 0x7F800000u) == 0x7F800000u && (in.u & 0x007FFFFFu) != 0) return 0x80u;
+
+    // Convert float32 → fp16 bits via __half (works on host and device, CUDA and HIP).
+    __half h_val = __float2half_rn(val);
+    union {
+      __half h;
+      uint16_t u;
+    } cvt = {h_val};
+    uint16_t fp16_bits = cvt.u;
+
+    // Clamp absolute value to max finite e4m3b15: 0.9375 → fp16 = 0x3B80.
+    uint16_t abs_fp16 = fp16_bits & 0x7FFFu;
+    if (abs_fp16 > 0x3B80u) abs_fp16 = 0x3B80u;
+
+    // Reconstruct with sign.
+    uint16_t sign16 = fp16_bits & 0x8000u;
+
+    // Triton-style: fp16 → fp8.
+    // fp16 layout: [S:1][E:5][M:10] (bias=15)
+    // fp8 layout:  [S:1][E:4][M:3]  (bias=15)
+    //
+    // mad.lo.u32 a0, a0, 2, 0x00800080  →  (abs_fp16 * 2 + 0x0080)
+    // This shifts left by 1 (undoing the right-shift in decode) and adds rounding bias.
+    // Then: lop3.b32 b0, $1, 0x80008000, a0, 0xea  →  (sign & 0x8000) | a0
+    // Finally: prmt for byte extraction.
+    //
+    // Simplified for scalar: shift abs_fp16 left by 1, add rounding bias, take upper byte.
+    uint16_t adjusted = (uint16_t)(abs_fp16 * 2u + 0x0080u);
+    // The upper byte now contains [E:4][M:3][round_bit].
+    // Combine with sign and extract.
+    uint16_t with_sign = sign16 | adjusted;
+    uint8_t result = (uint8_t)(with_sign >> 8);
+
+    // Zero → 0x00 (ensure positive zero, not negative zero which is NaN).
+    if ((result & 0x7Fu) == 0) result = 0x00u;
+
+    return result;
+  }
+};
+
+/// Packed 2x fp8_e4m3b15 storage.
+struct alignas(2) __fp8x2_e4m3b15 {
+  uint16_t __x;
+};
+
+/// Packed 4x fp8_e4m3b15 storage.
+struct alignas(4) __fp8x4_e4m3b15 {
+  uint32_t __x;
+};
+
 namespace mscclpp {
 
 /// Data types supported by mscclpp operations.
 enum class DataType {
-  INT32,        // 32-bit signed integer.
-  UINT32,       // 32-bit unsigned integer.
-  FLOAT16,      // IEEE 754 half precision.
-  FLOAT32,      // IEEE 754 single precision.
-  BFLOAT16,     // bfloat16 precision.
-  FLOAT8_E4M3,  // float8 with E4M3 layout.
-  FLOAT8_E5M2,  // float8 with E5M2 layout.
-  UINT8,        // 8-bit unsigned integer.
+  INT32,           // 32-bit signed integer.
+  UINT32,          // 32-bit unsigned integer.
+  FLOAT16,         // IEEE 754 half precision.
+  FLOAT32,         // IEEE 754 single precision.
+  BFLOAT16,        // bfloat16 precision.
+  FLOAT8_E4M3,     // float8 with E4M3 layout.
+  FLOAT8_E5M2,     // float8 with E5M2 layout.
+  UINT8,           // 8-bit unsigned integer.
+  FLOAT8_E4M3B15,  // float8 with E4M3 layout, bias=15 (software, no HW accel).
+  AUTO = 255,      // Sentinel: resolve to the input dtype at runtime.
 };
 
 /// Word array.
@@ -97,6 +230,7 @@ struct alignas(Bytes) Words<Bytes, false> {};
 template <typename T, int N, typename StorageT>
 union alignas(sizeof(T) * N) VectorTypeImpl {
   static_assert(N > 0, "N must be greater than 0");
+  static_assert(sizeof(StorageT) >= sizeof(T) * N, "StorageT must cover the full vector size");
 
   T data[N];
   Words<sizeof(T) * N> words;
@@ -127,13 +261,14 @@ union alignas(sizeof(T) * N) VectorTypeImpl {
   MSCCLPP_HOST_DEVICE_INLINE const T& operator[](int i) const { return data[i]; }
 };
 
-// Helper template to get the appropriate vector type for a given element type and count
+// Helper template to get the appropriate vector type for a given element type and count.
 template <typename T, int N>
 struct VectorTypeHelper {
-  using type =
-      VectorTypeImpl<T, N,
-                     typename std::conditional_t<N * sizeof(T) == 4, uint32_t,
-                                                 typename std::conditional_t<N * sizeof(T) == 8, uint2, uint4>>>;
+  static constexpr int Bytes = N * sizeof(T);
+  using type = VectorTypeImpl<
+      T, N,
+      std::conditional_t<Bytes == 4, uint32_t,
+                         std::conditional_t<Bytes == 8, uint2, std::conditional_t<Bytes <= 16, uint4, Words<Bytes>>>>>;
 };
 
 /// Vector type - clean user interface (automatically selects appropriate storage type)
@@ -170,6 +305,11 @@ DEFINE_VEC(bf16x4, __bfloat16, 4, uint2);
 DEFINE_VEC(f16x8, __half, 8, uint4);
 DEFINE_VEC(bf16x8, __bfloat16, 8, uint4);
 
+// Aliases for large vector types (>16 bytes) where no native CUDA storage type exists.
+using f32x8 = VectorType<float, 8>;
+using f32x16 = VectorType<float, 16>;
+using f16x16 = VectorType<__half, 16>;
+
 #if defined(__FP8_TYPES_EXIST__)
 DEFINE_VEC(f8_e4m3x2, __fp8_e4m3, 2, __fp8x2_e4m3);
 DEFINE_VEC(f8_e4m3x4, __fp8_e4m3, 4, __fp8x4_e4m3);
@@ -181,6 +321,12 @@ DEFINE_VEC(f8_e5m2x4, __fp8_e5m2, 4, __fp8x4_e5m2);
 DEFINE_VEC(f8_e5m2x8, __fp8_e5m2, 8, uint2);
 DEFINE_VEC(f8_e5m2x16, __fp8_e5m2, 16, uint4);
 #endif
+
+// fp8_e4m3b15 vectors (always available — software type, no HW dependency)
+DEFINE_VEC(f8_e4m3b15x2, __fp8_e4m3b15, 2, __fp8x2_e4m3b15);
+DEFINE_VEC(f8_e4m3b15x4, __fp8_e4m3b15, 4, __fp8x4_e4m3b15);
+DEFINE_VEC(f8_e4m3b15x8, __fp8_e4m3b15, 8, uint2);
+DEFINE_VEC(f8_e4m3b15x16, __fp8_e4m3b15, 16, uint4);
 #undef DEFINE_VEC
 
 #if defined(MSCCLPP_DEVICE_COMPILE)
@@ -254,6 +400,21 @@ MSCCLPP_DEVICE_INLINE __fp8_e5m2 clip(__fp8_e5m2 val) {
 }
 #endif
 
+// --- f32x2 arithmetic ---
+
+template <bool UseClip = true>
+MSCCLPP_DEVICE_INLINE f32x2 operator+(const f32x2& a, const f32x2& b) {
+#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ >= 1000)
+  // Blackwell (SM 10.0+): packed float2 add in a single instruction.
+  return __fadd2_rn(a.storage, b.storage);
+#else
+  f32x2 result;
+  result.data[0] = a.data[0] + b.data[0];
+  result.data[1] = a.data[1] + b.data[1];
+  return result;
+#endif
+}
+
 template <bool UseClip = true>
 MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) {
   __half2 result;
@@ -265,6 +426,18 @@ MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) {
   return result;
 }
 
+template <bool UseClip = true>
+MSCCLPP_DEVICE_INLINE f16x4 operator+(const f16x4& a, const f16x4& b) {
+  // Decompose into 2× packed __hadd2 (2 instructions instead of 4 scalar __hadd).
+  const f16x2* a2 = reinterpret_cast<const f16x2*>(&a);
+  const f16x2* b2 = reinterpret_cast<const f16x2*>(&b);
+  f16x4 result;
+  f16x2* r2 = reinterpret_cast<f16x2*>(&result);
+  r2[0] = a2[0] + b2[0];
+  r2[1] = a2[1] + b2[1];
+  return result;
+}
+
 template <bool UseClip = true>
 MSCCLPP_DEVICE_INLINE bf16x2 operator+(const bf16x2& a, const bf16x2& b) {
   __bfloat162 result;
@@ -449,6 +622,14 @@ MSCCLPP_DEVICE_INLINE T min(const T& a, const T& b) {
   return (a < b ? a : b);
 }
 
+template <>
+MSCCLPP_DEVICE_INLINE f32x2 min(const f32x2& a, const f32x2& b) {
+  f32x2 result;
+  result.data[0] = fminf(a.data[0], b.data[0]);
+  result.data[1] = fminf(a.data[1], b.data[1]);
+  return result;
+}
+
 template <>
 MSCCLPP_DEVICE_INLINE f16x2 min(const f16x2& a, const f16x2& b) {
 #if defined(MSCCLPP_DEVICE_HIP)
@@ -489,6 +670,51 @@ MSCCLPP_DEVICE_INLINE u8x4 min(const u8x4& a, const u8x4& b) {
 #endif
 }
 
+/// Convert a vector type From to vector type To.
+/// Primary template with auto-decomposition: vectors with N > 4 elements decompose into x4 chunks,
+/// vectors with N == 4 decompose into x2 chunks, enabling optimized x2/x4 specializations to be reached.
+/// Specialized below for optimized FP8 conversion paths at x2/x4 level.
+template <typename To, typename From>
+MSCCLPP_DEVICE_INLINE To to(const From& v) {
+  static_assert(To::Size == From::Size, "to<To, From>: vector sizes must match");
+  constexpr int N = From::Size;
+
+  // Auto-decompose: N > 4 → split into x4 chunks
+  if constexpr (N > 4 && N % 4 == 0) {
+    constexpr int nChunks = N / 4;
+    using FromChunk = VectorType<typename From::ElementType, 4>;
+    using ToChunk = VectorType<typename To::ElementType, 4>;
+    const FromChunk* in = reinterpret_cast<const FromChunk*>(&v);
+    To result;
+    ToChunk* out = reinterpret_cast<ToChunk*>(&result);
+#pragma unroll
+    for (int c = 0; c < nChunks; ++c) {
+      out[c] = to<ToChunk>(in[c]);
+    }
+    return result;
+  }
+  // Auto-decompose: N == 4 → split into 2x x2 chunks
+  else if constexpr (N == 4) {
+    using FromChunk = VectorType<typename From::ElementType, 2>;
+    using ToChunk = VectorType<typename To::ElementType, 2>;
+    const FromChunk* in = reinterpret_cast<const FromChunk*>(&v);
+    To result;
+    ToChunk* out = reinterpret_cast<ToChunk*>(&result);
+    out[0] = to<ToChunk>(in[0]);
+    out[1] = to<ToChunk>(in[1]);
+    return result;
+  }
+  // Base case: element-wise conversion
+  else {
+    To result;
+#pragma unroll
+    for (int i = 0; i < N; ++i) {
+      result.data[i] = static_cast<typename To::ElementType>(v.data[i]);
+    }
+    return result;
+  }
+}
+
 #if defined(__FP8_TYPES_EXIST__)
 template <>
 MSCCLPP_DEVICE_INLINE __fp8_e4m3 min(const __fp8_e4m3& a, const __fp8_e4m3& b) {
@@ -551,7 +777,592 @@ MSCCLPP_DEVICE_INLINE f8_e5m2x4 min(const f8_e5m2x4& a, const f8_e5m2x4& b) {
 
   return result;
 }
+
+// --- f8_e4m3 -> f32 specializations ---
+
+/// f8_e4m3x2 -> f32x2.
+/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float.
+/// HIP gfx942: fp8 -> float (via __builtin_amdgcn_cvt_pk_f32_fp8).
+template <>
+MSCCLPP_DEVICE_INLINE f32x2 to<f32x2, f8_e4m3x2>(const f8_e4m3x2& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0);
+  f32x2 result;
+  result.data[0] = f[0];
+  result.data[1] = f[1];
+  return result;
+#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
+  __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3);
+  f32x2 result;
+  result.data[0] = __half2float(bit_cast<__half>(h2.x));
+  result.data[1] = __half2float(bit_cast<__half>(h2.y));
+  return result;
+#else
+  f32x2 result;
+  result.data[0] = float(v.data[0]);
+  result.data[1] = float(v.data[1]);
+  return result;
+#endif
+}
+
+/// f8_e4m3x4 -> f32x4.
+template <>
+MSCCLPP_DEVICE_INLINE f32x4 to<f32x4, f8_e4m3x4>(const f8_e4m3x4& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  auto lo = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, false);
+  auto hi = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, true);
+  f32x4 result;
+  result.data[0] = lo[0];
+  result.data[1] = lo[1];
+  result.data[2] = hi[0];
+  result.data[3] = hi[1];
+  return result;
+#else
+  const f8_e4m3x2* pair = reinterpret_cast<const f8_e4m3x2*>(&v);
+  f32x2 lo = to<f32x2>(pair[0]);
+  f32x2 hi = to<f32x2>(pair[1]);
+  f32x4 result;
+  result.data[0] = lo.data[0];
+  result.data[1] = lo.data[1];
+  result.data[2] = hi.data[0];
+  result.data[3] = hi.data[1];
+  return result;
+#endif
+}
+
+// --- f8_e5m2 -> f32 specializations ---
+
+/// f8_e5m2x2 -> f32x2.
+/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float.
+/// HIP gfx942: bf8 -> float (via __builtin_amdgcn_cvt_pk_f32_bf8).
+template <>
+MSCCLPP_DEVICE_INLINE f32x2 to<f32x2, f8_e5m2x2>(const f8_e5m2x2& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  auto f = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, 0);
+  f32x2 result;
+  result.data[0] = f[0];
+  result.data[1] = f[1];
+  return result;
+#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
+  __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E5M2);
+  f32x2 result;
+  result.data[0] = __half2float(bit_cast<__half>(h2.x));
+  result.data[1] = __half2float(bit_cast<__half>(h2.y));
+  return result;
+#else
+  f32x2 result;
+  result.data[0] = float(v.data[0]);
+  result.data[1] = float(v.data[1]);
+  return result;
+#endif
+}
+
+/// f8_e5m2x4 -> f32x4.
+template <>
+MSCCLPP_DEVICE_INLINE f32x4 to<f32x4, f8_e5m2x4>(const f8_e5m2x4& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  auto lo = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, false);
+  auto hi = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, true);
+  f32x4 result;
+  result.data[0] = lo[0];
+  result.data[1] = lo[1];
+  result.data[2] = hi[0];
+  result.data[3] = hi[1];
+  return result;
+#else
+  const f8_e5m2x2* pair = reinterpret_cast<const f8_e5m2x2*>(&v);
+  f32x2 lo = to<f32x2>(pair[0]);
+  f32x2 hi = to<f32x2>(pair[1]);
+  f32x4 result;
+  result.data[0] = lo.data[0];
+  result.data[1] = lo.data[1];
+  result.data[2] = hi.data[0];
+  result.data[3] = hi.data[1];
+  return result;
+#endif
+}
+
+// --- f32 -> f8_e4m3 specializations (downcast) ---
+
+/// f32x2 -> f8_e4m3x2.
+/// HIP gfx942: float -> fp8 (via __builtin_amdgcn_cvt_pk_fp8_f32).
+/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2).
+/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise).
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3x2 to<f8_e4m3x2, f32x2>(const f32x2& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false);
+  return bit_cast<f8_e4m3x2>(static_cast<__hip_fp8x2_storage_t>(packed));
+#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
+  __half2_raw h2;
+  h2.x = bit_cast<unsigned short>(__float2half_rn(v.data[0]));
+  h2.y = bit_cast<unsigned short>(__float2half_rn(v.data[1]));
+  __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3);
+  return bit_cast<f8_e4m3x2>(fp8x2);
+#elif defined(MSCCLPP_DEVICE_CUDA)
+  __half_raw h0, h1;
+  h0.x = bit_cast<unsigned short>(__float2half_rn(v.data[0]));
+  h1.x = bit_cast<unsigned short>(__float2half_rn(v.data[1]));
+  f8_e4m3x2 result;
+  result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3));
+  result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3));
+  return result;
+#else
+  f8_e4m3x2 result;
+  result.data[0] = static_cast<__fp8_e4m3>(v.data[0]);
+  result.data[1] = static_cast<__fp8_e4m3>(v.data[1]);
+  return result;
+#endif
+}
+
+/// f32x4 -> f8_e4m3x4.
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3x4 to<f8_e4m3x4, f32x4>(const f32x4& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false);
+  packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[2], v.data[3], packed, true);
+  return bit_cast<f8_e4m3x4>(packed);
+#else
+  f32x2 lo, hi;
+  lo.data[0] = v.data[0];
+  lo.data[1] = v.data[1];
+  hi.data[0] = v.data[2];
+  hi.data[1] = v.data[3];
+  f8_e4m3x2 lo_fp8 = to<f8_e4m3x2>(lo);
+  f8_e4m3x2 hi_fp8 = to<f8_e4m3x2>(hi);
+  f8_e4m3x4 result;
+  result.data[0] = lo_fp8.data[0];
+  result.data[1] = lo_fp8.data[1];
+  result.data[2] = hi_fp8.data[0];
+  result.data[3] = hi_fp8.data[1];
+  return result;
+#endif
+}
+
+// --- f32 -> f8_e5m2 specializations (downcast) ---
+
+/// f32x2 -> f8_e5m2x2.
+/// HIP gfx942: float -> bf8 (via __builtin_amdgcn_cvt_pk_bf8_f32).
+/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2 with __NV_E5M2).
+/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise).
+template <>
+MSCCLPP_DEVICE_INLINE f8_e5m2x2 to<f8_e5m2x2, f32x2>(const f32x2& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false);
+  return bit_cast<f8_e5m2x2>(static_cast<__hip_fp8x2_storage_t>(packed));
+#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
+  __half2_raw h2;
+  h2.x = bit_cast<unsigned short>(__float2half_rn(v.data[0]));
+  h2.y = bit_cast<unsigned short>(__float2half_rn(v.data[1]));
+  __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E5M2);
+  return bit_cast<f8_e5m2x2>(fp8x2);
+#elif defined(MSCCLPP_DEVICE_CUDA)
+  __half_raw h0, h1;
+  h0.x = bit_cast<unsigned short>(__float2half_rn(v.data[0]));
+  h1.x = bit_cast<unsigned short>(__float2half_rn(v.data[1]));
+  f8_e5m2x2 result;
+  result.data[0] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E5M2));
+  result.data[1] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E5M2));
+  return result;
+#else
+  f8_e5m2x2 result;
+  result.data[0] = static_cast<__fp8_e5m2>(v.data[0]);
+  result.data[1] = static_cast<__fp8_e5m2>(v.data[1]);
+  return result;
+#endif
+}
+
+/// f32x4 -> f8_e5m2x4.
+template <>
+MSCCLPP_DEVICE_INLINE f8_e5m2x4 to<f8_e5m2x4, f32x4>(const f32x4& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false);
+  packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[2], v.data[3], packed, true);
+  return bit_cast<f8_e5m2x4>(packed);
+#else
+  f32x2 lo, hi;
+  lo.data[0] = v.data[0];
+  lo.data[1] = v.data[1];
+  hi.data[0] = v.data[2];
+  hi.data[1] = v.data[3];
+  f8_e5m2x2 lo_fp8 = to<f8_e5m2x2>(lo);
+  f8_e5m2x2 hi_fp8 = to<f8_e5m2x2>(hi);
+  f8_e5m2x4 result;
+  result.data[0] = lo_fp8.data[0];
+  result.data[1] = lo_fp8.data[1];
+  result.data[2] = hi_fp8.data[0];
+  result.data[3] = hi_fp8.data[1];
+  return result;
+#endif
+}
+
+// --- f8_e4m3 <-> f16 conversion specializations ---
+
+/// f8_e4m3x2 -> f16x2.
+/// NVIDIA SM90+: packed intrinsic (1 instruction).
+/// HIP gfx942: fp8 -> float -> half (via AMD builtin).
+/// Pre-SM90 / fallback: element-wise scalar conversion.
+template <>
+MSCCLPP_DEVICE_INLINE f16x2 to<f16x2, f8_e4m3x2>(const f8_e4m3x2& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0);
+  f16x2 result;
+  result.data[0] = __float2half(f[0]);
+  result.data[1] = __float2half(f[1]);
+  return result;
+#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
+  __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3);
+  return bit_cast<f16x2>(h2);
+#else
+  f16x2 result;
+  result.data[0] = static_cast<__half>(v.data[0]);
+  result.data[1] = static_cast<__half>(v.data[1]);
+  return result;
+#endif
+}
+
+/// f16x2 -> f8_e4m3x2.
+/// NVIDIA SM90+: packed intrinsic (1 instruction).
+/// HIP gfx942: half -> float -> fp8 (via AMD builtin).
+/// Pre-SM90: element-wise scalar conversion.
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3x2 to<f8_e4m3x2, f16x2>(const f16x2& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  float f0 = __half2float(v.data[0]);
+  float f1 = __half2float(v.data[1]);
+  uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(f0, f1, 0, false);
+  return bit_cast<f8_e4m3x2>(static_cast<__hip_fp8x2_storage_t>(packed));
+#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
+  __half2_raw h2 = bit_cast<__half2_raw>(v);
+  __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3);
+  return bit_cast<f8_e4m3x2>(fp8x2);
+#elif defined(MSCCLPP_DEVICE_CUDA)
+  __half_raw h0, h1;
+  h0.x = bit_cast<unsigned short>(v.data[0]);
+  h1.x = bit_cast<unsigned short>(v.data[1]);
+  f8_e4m3x2 result;
+  result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3));
+  result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3));
+  return result;
+#else
+  f8_e4m3x2 result;
+  result.data[0] = static_cast<__fp8_e4m3>(v.data[0]);
+  result.data[1] = static_cast<__fp8_e4m3>(v.data[1]);
+  return result;
+#endif
+}
+
 #endif  // defined(__FP8_TYPES_EXIST__)
+
+// --- fp8_e4m3b15 <-> fp16 direct conversion specializations ---
+// These are the PRIMARY conversions: fp8_b15 <-> fp16 is just a 1-bit exponent shift
+// (E4 bias=15 <-> E5 bias=15), no precision loss since fp16 has 10 mantissa bits
+// vs fp8's 3. fp32 conversions are derived by routing through fp16.
+
+/// f8_e4m3b15x2 -> f16x2.
+/// Direct fp8 -> fp16 via branch-free bit manipulation.
+template <>
+MSCCLPP_DEVICE_INLINE f16x2 to<f16x2, f8_e4m3b15x2>(const f8_e4m3b15x2& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  uint16_t in = v.storage.__x;
+  // Spread 2 fp8 bytes into packed fp16 pair, adjust exponent E4->E5.
+  uint32_t a0 = ((uint32_t)(in & 0xFFu) << 8) | ((uint32_t)(in >> 8) << 24);
+  uint32_t b0 = (a0 & 0x7f007f00u) >> 1;
+  uint32_t out0 = b0 | (a0 & 0x80008000u);
+  __half2 h;
+  asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h)) : "r"(out0));
+  return h;
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  // gfx942: same bit manipulation as CUDA, store packed fp16 bits via words[].
+  uint16_t in = v.storage.__x;
+  uint32_t a0 = ((uint32_t)(in & 0xFFu) << 8) | ((uint32_t)(in >> 8) << 24);
+  uint32_t b0 = (a0 & 0x7f007f00u) >> 1;
+  uint32_t out0 = b0 | (a0 & 0x80008000u);
+  f16x2 result;
+  result.words[0] = out0;
+  return result;
+#else
+  f16x2 result;
+  result.data[0] = __float2half(float(v.data[0]));
+  result.data[1] = __float2half(float(v.data[1]));
+  return result;
+#endif
+}
+
+/// f8_e4m3b15x4 -> f16x4.
+/// Uses __byte_perm + lop3 for branch-free vectorized conversion.
+template <>
+MSCCLPP_DEVICE_INLINE f16x4 to<f16x4, f8_e4m3b15x4>(const f8_e4m3b15x4& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  uint32_t in = v.storage.__x;
+  uint32_t a0 = __byte_perm(0u, in, 0x5746u);
+  uint32_t a0_shr = a0 >> 1;
+  uint32_t a0_sign = a0 & 0x80008000u;
+  uint32_t out0;
+  asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out0) : "r"(a0_shr), "r"(0x3f803f80u), "r"(a0_sign));
+  uint32_t a1 = __byte_perm(a0, 0u, 0x2301u);
+  uint32_t a1_shr = a1 >> 1;
+  uint32_t a1_sign = a1 & 0x80008000u;
+  uint32_t out1;
+  asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out1) : "r"(a1_shr), "r"(0x3f803f80u), "r"(a1_sign));
+  f16x4 result;
+  asm("mov.b32 %0, %1;" : "=r"(result.words[0]) : "r"(out0));
+  asm("mov.b32 %0, %1;" : "=r"(result.words[1]) : "r"(out1));
+  return result;
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  // gfx942: __byte_perm + bitwise E4→E5 shift (no lop3), store via words[].
+  uint32_t in = v.storage.__x;
+  uint32_t a0 = __byte_perm(0u, in, 0x5746u);
+  uint32_t out0 = ((a0 >> 1) & 0x3f803f80u) | (a0 & 0x80008000u);
+  uint32_t a1 = __byte_perm(a0, 0u, 0x2301u);
+  uint32_t out1 = ((a1 >> 1) & 0x3f803f80u) | (a1 & 0x80008000u);
+  f16x4 result;
+  result.words[0] = out0;
+  result.words[1] = out1;
+  return result;
+#else
+  f16x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = __float2half(float(v.data[i]));
+  }
+  return result;
+#endif
+}
+
+/// f16x2 -> f8_e4m3b15x2.
+/// Direct fp16 -> fp8 via clamp + exponent shift E5->E4 + pack.
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2, f16x2>(const f16x2& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  uint32_t in0;
+  asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(*reinterpret_cast<const uint32_t*>(&v)));
+  // Clamp abs to max finite e4m3b15 (0x3B80 = 0.9375 in fp16).
+  uint32_t lo = in0 & 0xFFFFu, hi = in0 >> 16;
+  uint32_t alo = lo & 0x7FFFu, ahi = hi & 0x7FFFu;
+  alo = alo < 0x3B80u ? alo : 0x3B80u;
+  ahi = ahi < 0x3B80u ? ahi : 0x3B80u;
+  uint32_t a0 = alo | (ahi << 16);
+  a0 = a0 * 2u + 0x00800080u;
+  uint32_t b0 = a0 | (in0 & 0x80008000u);
+  uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u));
+  return bit_cast<f8_e4m3b15x2>(packed);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  // gfx942: read packed fp16 bits, clamp via v_pk_min_u16, shift E5→E4, pack.
+  uint32_t in0 = v.words[0];
+  uint32_t abs0 = in0 & 0x7fff7fffu;
+  uint32_t a0;
+  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
+  a0 = a0 * 2u + 0x00800080u;
+  uint32_t b0 = a0 | (in0 & 0x80008000u);
+  uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u));
+  return bit_cast<f8_e4m3b15x2>(packed);
+#else
+  f8_e4m3b15x2 result;
+  result.data[0] = __fp8_e4m3b15(__half2float(v.data[0]));
+  result.data[1] = __fp8_e4m3b15(__half2float(v.data[1]));
+  return result;
+#endif
+}
+
+/// f16x4 -> f8_e4m3b15x4.
+/// Uses __vminu2 + lop3 + __byte_perm for branch-free vectorized conversion.
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4, f16x4>(const f16x4& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  uint32_t in0, in1;
+  asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(v.words[0]));
+  asm("mov.b32 %0, %1;" : "=r"(in1) : "r"(v.words[1]));
+  uint32_t abs0 = in0 & 0x7fff7fffu;
+  uint32_t abs1 = in1 & 0x7fff7fffu;
+  uint32_t a0 = __vminu2(abs0, 0x3B803B80u);
+  uint32_t a1 = __vminu2(abs1, 0x3B803B80u);
+  a0 = a0 * 2u + 0x00800080u;
+  a1 = a1 * 2u + 0x00800080u;
+  uint32_t b0, b1;
+  asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b0) : "r"(a0), "r"(in0), "r"(0x80008000u));
+  asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b1) : "r"(a1), "r"(in1), "r"(0x80008000u));
+  uint32_t packed = __byte_perm(b0, b1, 0x7531u);
+  return bit_cast<f8_e4m3b15x4>(packed);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  // gfx942: read packed fp16 bits, clamp via v_pk_min_u16, shift E5→E4, __byte_perm pack.
+  uint32_t in0 = v.words[0], in1 = v.words[1];
+  uint32_t abs0 = in0 & 0x7fff7fffu, abs1 = in1 & 0x7fff7fffu;
+  uint32_t a0, a1;
+  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
+  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a1) : "v"(abs1), "v"(0x3B803B80u));
+  a0 = a0 * 2u + 0x00800080u;
+  a1 = a1 * 2u + 0x00800080u;
+  uint32_t b0 = a0 | (in0 & 0x80008000u);
+  uint32_t b1 = a1 | (in1 & 0x80008000u);
+  uint32_t packed = __byte_perm(b0, b1, 0x7531u);
+  return bit_cast<f8_e4m3b15x4>(packed);
+#else
+  f8_e4m3b15x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = __fp8_e4m3b15(__half2float(v.data[i]));
+  }
+  return result;
+#endif
+}
+
+// --- fp8_e4m3b15 <-> f32 conversion specializations (software, always available) ---
+
+/// f8_e4m3b15x2 -> f32x2.
+/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32.
+template <>
+MSCCLPP_DEVICE_INLINE f32x2 to<f32x2, f8_e4m3b15x2>(const f8_e4m3b15x2& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  f16x2 h = to<f16x2, f8_e4m3b15x2>(v);
+  float2 f2 = __half22float2(h);
+  return bit_cast<f32x2>(f2);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  f16x2 h = to<f16x2, f8_e4m3b15x2>(v);
+  f32x2 result;
+  result.data[0] = __half2float(h.data[0]);
+  result.data[1] = __half2float(h.data[1]);
+  return result;
+#else
+  f32x2 result;
+  result.data[0] = float(v.data[0]);
+  result.data[1] = float(v.data[1]);
+  return result;
+#endif
+}
+
+/// f8_e4m3b15x4 -> f32x4.
+/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32.
+template <>
+MSCCLPP_DEVICE_INLINE f32x4 to<f32x4, f8_e4m3b15x4>(const f8_e4m3b15x4& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  f16x4 h = to<f16x4, f8_e4m3b15x4>(v);
+  __half2 h0, h1;
+  asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h0)) : "r"(h.words[0]));
+  asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h1)) : "r"(h.words[1]));
+  float2 f0 = __half22float2(h0);
+  float2 f1 = __half22float2(h1);
+  f32x4 result;
+  result.data[0] = f0.x;
+  result.data[1] = f0.y;
+  result.data[2] = f1.x;
+  result.data[3] = f1.y;
+  return result;
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  f16x4 h = to<f16x4, f8_e4m3b15x4>(v);
+  f32x4 result;
+  result.data[0] = __half2float(h.data[0]);
+  result.data[1] = __half2float(h.data[1]);
+  result.data[2] = __half2float(h.data[2]);
+  result.data[3] = __half2float(h.data[3]);
+  return result;
+#else
+  f32x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = float(v.data[i]);
+  }
+  return result;
+#endif
+}
+
+/// f32x2 -> f8_e4m3b15x2.
+/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack).
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2, f32x2>(const f32x2& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  float2 f2 = {v.data[0], v.data[1]};
+  __half2 h = __float22half2_rn(f2);
+  return to<f8_e4m3b15x2, f16x2>(h);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  f16x2 h;
+  h.data[0] = __float2half_rn(v.data[0]);
+  h.data[1] = __float2half_rn(v.data[1]);
+  return to<f8_e4m3b15x2, f16x2>(h);
+#else
+  f8_e4m3b15x2 result;
+  result.data[0] = __fp8_e4m3b15(v.data[0]);
+  result.data[1] = __fp8_e4m3b15(v.data[1]);
+  return result;
+#endif
+}
+
+/// f32x4 -> f8_e4m3b15x4.
+/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack).
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4, f32x4>(const f32x4& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  float2 f01 = {v.data[0], v.data[1]};
+  float2 f23 = {v.data[2], v.data[3]};
+  __half2 h01 = __float22half2_rn(f01);
+  __half2 h23 = __float22half2_rn(f23);
+  f16x4 h;
+  asm("mov.b32 %0, %1;" : "=r"(h.words[0]) : "r"(*reinterpret_cast<uint32_t*>(&h01)));
+  asm("mov.b32 %0, %1;" : "=r"(h.words[1]) : "r"(*reinterpret_cast<uint32_t*>(&h23)));
+  return to<f8_e4m3b15x4, f16x4>(h);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  f16x4 h;
+  h.words[0] = __builtin_bit_cast(uint32_t, __builtin_amdgcn_cvt_pkrtz(v.data[0], v.data[1]));
+  h.words[1] = __builtin_bit_cast(uint32_t, __builtin_amdgcn_cvt_pkrtz(v.data[2], v.data[3]));
+  return to<f8_e4m3b15x4, f16x4>(h);
+#else
+  f8_e4m3b15x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = __fp8_e4m3b15(v.data[i]);
+  }
+  return result;
+#endif
+}
+
+// --- fp8_e4m3b15 arithmetic (software, always available) ---
+
+template <bool UseClip = true>
+MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 operator+(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) {
+  return __fp8_e4m3b15(float(a) + float(b));
+}
+
+template <bool UseClip = true>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 operator+(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) {
+  f8_e4m3b15x2 result;
+  result.data[0] = __fp8_e4m3b15(float(a.data[0]) + float(b.data[0]));
+  result.data[1] = __fp8_e4m3b15(float(a.data[1]) + float(b.data[1]));
+  return result;
+}
+
+template <bool UseClip = true>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 operator+(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) {
+  f8_e4m3b15x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = __fp8_e4m3b15(float(a.data[i]) + float(b.data[i]));
+  }
+  return result;
+}
+
+// --- fp8_e4m3b15 min (software) ---
+
+template <>
+MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 min(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) {
+  return __fp8_e4m3b15(fminf(float(a), float(b)));
+}
+
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 min(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) {
+  f8_e4m3b15x2 result;
+  result.data[0] = mscclpp::min(a.data[0], b.data[0]);
+  result.data[1] = mscclpp::min(a.data[1], b.data[1]);
+  return result;
+}
+
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 min(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) {
+  f8_e4m3b15x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = mscclpp::min(a.data[i], b.data[i]);
+  }
+  return result;
+}
+
 #endif  // MSCCLPP_DEVICE_COMPILE
 }  // namespace mscclpp
 
diff --git a/include/mscclpp/semaphore.hpp b/include/mscclpp/semaphore.hpp
index 85787c95..4d1f2e32 100644
--- a/include/mscclpp/semaphore.hpp
+++ b/include/mscclpp/semaphore.hpp
@@ -16,6 +16,7 @@ namespace mscclpp {
 class Host2DeviceSemaphore {
  private:
   Semaphore semaphore_;
+  std::shared_ptr<uint64_t> inboundToken_;
   detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
   std::unique_ptr<uint64_t> outboundToken_;
 
@@ -29,6 +30,15 @@ class Host2DeviceSemaphore {
   /// @param connection The connection associated with this semaphore.
   Host2DeviceSemaphore(Communicator& communicator, const Connection& connection);
 
+  /// Destructor.
+  ~Host2DeviceSemaphore();
+
+  /// Move constructor.
+  Host2DeviceSemaphore(Host2DeviceSemaphore&&) noexcept = default;
+
+  /// Move assignment operator.
+  Host2DeviceSemaphore& operator=(Host2DeviceSemaphore&&) noexcept = default;
+
   /// Returns the connection.
   /// @return The connection associated with this semaphore.
   Connection& connection();
diff --git a/python/csrc/CMakeLists.txt b/python/csrc/CMakeLists.txt
index 8759201f..44fb150f 100644
--- a/python/csrc/CMakeLists.txt
+++ b/python/csrc/CMakeLists.txt
@@ -24,4 +24,7 @@ set_target_properties(mscclpp_py PROPERTIES OUTPUT_NAME _mscclpp)
 set_target_properties(mscclpp_py PROPERTIES INSTALL_RPATH "\$ORIGIN/lib")
 target_link_libraries(mscclpp_py PRIVATE dlpack mscclpp mscclpp_collectives ${GPU_LIBRARIES})
 target_include_directories(mscclpp_py SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
+if(MSCCLPP_USE_ROCM)
+    target_compile_definitions(mscclpp_py PRIVATE MSCCLPP_USE_ROCM)
+endif()
 install(TARGETS mscclpp_py LIBRARY DESTINATION .)
diff --git a/python/csrc/algorithm.cpp b/python/csrc/algorithm.cpp
index 1a93cbc0..1cb3f253 100644
--- a/python/csrc/algorithm.cpp
+++ b/python/csrc/algorithm.cpp
@@ -75,15 +75,17 @@ void register_algorithm(nb::module_& m) {
               [](Algorithm& self, std::shared_ptr<Communicator> comm, uintptr_t input, uintptr_t output,
                  size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream,
                  std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock, bool symmetricMemory,
-                 std::unordered_map<std::string, uintptr_t> extras) {
+                 std::unordered_map<std::string, uintptr_t> extras, int32_t accumDtype) {
                 return self.execute(comm, reinterpret_cast<const void*>(input), reinterpret_cast<void*>(output),
                                     inputSize, outputSize, dtype, op, reinterpret_cast<cudaStream_t>(stream), executor,
-                                    nBlocks, nThreadsPerBlock, symmetricMemory, extras);
+                                    nBlocks, nThreadsPerBlock, symmetricMemory, extras,
+                                    static_cast<DataType>(accumDtype));
               },
               nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"),
               nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr,
               nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, nb::arg("symmetric_memory") = false,
-              nb::arg("extras") = std::unordered_map<std::string, uintptr_t>())
+              nb::arg("extras") = std::unordered_map<std::string, uintptr_t>(),
+              nb::arg("accum_dtype") = static_cast<int32_t>(DataType::AUTO))
           .def("reset", &Algorithm::reset);
 
   nb::class_<Algorithm::Constraint>(algorithmClass, "Constraint")
diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp
index 47d76ac4..b8649564 100644
--- a/python/csrc/core_py.cpp
+++ b/python/csrc/core_py.cpp
@@ -47,7 +47,8 @@ void register_core(nb::module_& m) {
       .value("bfloat16", DataType::BFLOAT16)
       .value("float8_e4m3", DataType::FLOAT8_E4M3)
       .value("float8_e5m2", DataType::FLOAT8_E5M2)
-      .value("uint8", DataType::UINT8);
+      .value("uint8", DataType::UINT8)
+      .value("float8_e4m3b15", DataType::FLOAT8_E4M3B15);
 
   nb::class_<Bootstrap>(m, "CppBootstrap")
       .def("get_rank", &Bootstrap::getRank)
diff --git a/python/csrc/ext/algorithm_collection_builder_py.cpp b/python/csrc/ext/algorithm_collection_builder_py.cpp
index be7f944e..4a3563d9 100644
--- a/python/csrc/ext/algorithm_collection_builder_py.cpp
+++ b/python/csrc/ext/algorithm_collection_builder_py.cpp
@@ -4,6 +4,7 @@
 #include <nanobind/nanobind.h>
 #include <nanobind/stl/function.h>
 #include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/string.h>
 #include <nanobind/stl/unordered_map.h>
 #include <nanobind/stl/vector.h>
 
diff --git a/python/csrc/gpu_utils_py.cpp b/python/csrc/gpu_utils_py.cpp
index 6995756b..60880456 100644
--- a/python/csrc/gpu_utils_py.cpp
+++ b/python/csrc/gpu_utils_py.cpp
@@ -34,6 +34,19 @@ static DLDataType getDlType(std::string type) {
     return DLDataType{kDLBfloat, 16, 1};
   } else if (type == "torch.float16") {
     return DLDataType{kDLFloat, 16, 1};
+  } else if (type == "torch.float8_e4m3fn") {
+    return DLDataType{kDLFloat8_e4m3fn, 8, 1};
+  } else if (type == "torch.float8_e4m3fnuz") {
+    return DLDataType{kDLFloat8_e4m3fnuz, 8, 1};
+  } else if (type == "torch.float8_e5m2") {
+    return DLDataType{kDLFloat8_e5m2, 8, 1};
+  } else if (type == "torch.float8_e5m2fnuz") {
+    return DLDataType{kDLFloat8_e5m2fnuz, 8, 1};
+  } else if (type == "torch.uint8") {
+    return DLDataType{kDLUInt, 8, 1};
+  } else if (type == "fp8_e4m3b15") {
+    // No standard DLPack code for fp8_e4m3b15; store as raw uint8 bytes.
+    return DLDataType{kDLUInt, 8, 1};
   } else {
     throw Error("Unsupported type: " + type, ErrorCode::InvalidUsage);
   }
diff --git a/python/mscclpp/_core/algorithm.py b/python/mscclpp/_core/algorithm.py
index 744cf39e..f12a3027 100644
--- a/python/mscclpp/_core/algorithm.py
+++ b/python/mscclpp/_core/algorithm.py
@@ -177,6 +177,7 @@ class Algorithm:
         nthreads_per_block=0,
         symmetric_memory: bool = False,
         extras: Optional[Dict[str, int]] = None,
+        accum_dtype: Optional[CppDataType] = None,
     ) -> int:
         """Execute the collective algorithm.
 
@@ -194,10 +195,14 @@ class Algorithm:
             nthreads_per_block: Number of threads per block (0 for auto-selection).
             symmetric_memory: Whether to use symmetric memory optimization (default: False).
             extras: Additional algorithm-specific parameters.
+            accum_dtype: Data type for accumulation during reduction. If None, defaults to
+                         the same as dtype. Use DataType.float32 for high-precision FP8 accumulation.
 
         Returns:
             The result code (0 for success).
         """
+        merged_extras = dict(extras) if extras is not None else {}
+        accum_dtype = accum_dtype if accum_dtype is not None else dtype
         return self._algorithm.execute(
             comm,
             int(input_buffer),
@@ -211,7 +216,8 @@ class Algorithm:
             nblocks,
             nthreads_per_block,
             symmetric_memory,
-            extras if extras is not None else {},
+            merged_extras,
+            int(accum_dtype),
         )
 
     def reset(self):
diff --git a/python/mscclpp/language/channel.py b/python/mscclpp/language/channel.py
index 1b22e4e2..23d76eda 100644
--- a/python/mscclpp/language/channel.py
+++ b/python/mscclpp/language/channel.py
@@ -140,7 +140,7 @@ class MemoryChannel:
 
         for tb_id in tb_list:
             tb_chunk_id = get_program().setup_remote_chunk(self.src_rank, tb_id, remote_chunk, self.channel_type)
-            tb_channel_ids = get_program().setup_channel(tb, self)
+            tb_channel_ids = get_program().setup_channel(tb_id, self)
             op = GetOperation(
                 src_buff=[RemoteChunk(src_chunk.buffer, src_chunk.index, src_chunk.size, tb_chunk_id)],
                 dst_buff=[LocalChunk(dst_chunk.buffer, dst_chunk.index, dst_chunk.size)],
diff --git a/python/mscclpp/language/internal/operations.py b/python/mscclpp/language/internal/operations.py
index 5f719c21..5fb392e3 100644
--- a/python/mscclpp/language/internal/operations.py
+++ b/python/mscclpp/language/internal/operations.py
@@ -745,7 +745,7 @@ class ReduceOperation(BaseOperation):
                 remote_dst_buff=self.remote_dst_buff + other.dst_buff,
                 channel_ids=self.channel_ids,
                 put_channel_ids=self.put_channel_ids + other.channel_ids,
-                channel_type=self.channel_type,
+                channel_type=other.channel_type,
                 reduce_operation=self.reduce_operation,
                 tbg_info=self.tbg_info,
                 packet=self.packet,
diff --git a/python/mscclpp/language/tests/executor_tests/reduce_pack.py b/python/mscclpp/language/tests/executor_tests/reduce_pack.py
deleted file mode 100644
index bf584a0f..00000000
--- a/python/mscclpp/language/tests/executor_tests/reduce_pack.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-import argparse
-from mscclpp.language.channel import *
-from mscclpp.language.rank import *
-from mscclpp.language.general import *
-from mscclpp.language.program import *
-from mscclpp.language.collectives import *
-from enum import Enum
-
-def allreduce_example(name, num_threads_per_block, min_message_size, max_message_size):
-    chunksperloop = 1
-    gpu_size = 2
-    collective = AllReduce(gpu_size, chunksperloop, True)
-    with CollectiveProgram(
-        name,
-        collective,
-        gpu_size,
-        protocol="LL",
-        num_threads_per_block=num_threads_per_block,
-        use_double_scratch_buffer=True,
-        min_message_size=min_message_size,
-        max_message_size=max_message_size,
-    ):
-        # Declaring Ranks, Channels, Buffers for 2 GPU allgather example
-        first_rank = Rank(0)
-        second_rank = Rank(1)
-        first_ch = MemoryChannel(1, 0)
-        second_ch = MemoryChannel(0, 1)
-        first_input_buffer = first_rank.get_input_buffer()
-        second_input_buffer = second_rank.get_input_buffer()
-        first_scratch_buffer = Buffer(0, 3)
-        second_scratch_buffer = Buffer(1, 3)
-
-        # First rank puts packets in the remote scratch buffer of the second rank
-        first_ch.put_packets(second_scratch_buffer[1: 2], first_input_buffer[1 : 2], tb=0)
-        second_ch.put_packets(first_scratch_buffer[0 : 1], second_input_buffer[0 : 1], tb=0)
-
-        # Second rank copy packets to scratch buffer and then read put packets to first rank output buffer
-        first_rank.reduce(first_input_buffer[0 : 1], [first_scratch_buffer[0 : 1]], tb=1, packet=True)
-        first_ch.put_packets(second_scratch_buffer[0 : 1], first_input_buffer[0 : 1], tb=1)
-
-        # First rank copy packets to scratch buffer and then read put packets to second rank output buffer
-        second_rank.reduce(second_input_buffer[1 : 2], [second_scratch_buffer[1 : 2]],  tb=1, packet=True)
-        second_rank.copy_packets(second_scratch_buffer[2: 3], second_input_buffer[1 : 2], tb=1)
-        second_ch.read_put_packets(first_scratch_buffer[1 : 2], second_scratch_buffer[2: 3], tb=1)
-
-        # First rank copy packets to scratch buffer and then read put packets to second rank output buffer
-        first_rank.unpack_packets(first_input_buffer[1 : 2], first_scratch_buffer[1 : 2], tb=2)
-        second_rank.unpack_packets(second_input_buffer[0 : 1], second_scratch_buffer[0 : 1], tb=2)
-
-
-        print(JSON())
-
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument("--name", type=str, help="name of the program")
-parser.add_argument("--num_gpus", type=int, help="number of gpus")
-parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
-parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
-parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
-
-args = parser.parse_args()
-
-allreduce_example(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/python/mscclpp/language/tests/executor_tests/reduce_pack_tbg.py b/python/mscclpp/language/tests/executor_tests/reduce_pack_tbg.py
deleted file mode 100644
index 9f7f9df8..00000000
--- a/python/mscclpp/language/tests/executor_tests/reduce_pack_tbg.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-import argparse
-from mscclpp.language.channel import *
-from mscclpp.language.rank import *
-from mscclpp.language.general import *
-from mscclpp.language.program import *
-from mscclpp.language.collectives import *
-from enum import Enum
-
-def allreduce_example(name, num_threads_per_block, min_message_size, max_message_size):
-    chunksperloop = 1
-    gpu_size = 2
-    collective = AllReduce(gpu_size, chunksperloop, True)
-    with CollectiveProgram(
-        name,
-        collective,
-        gpu_size,
-        protocol="LL",
-        num_threads_per_block=num_threads_per_block,
-        use_double_scratch_buffer=True,
-        min_message_size=min_message_size,
-        max_message_size=max_message_size,
-    ):
-        # Declaring Ranks, Channels, Buffers for 2 GPU allgather example
-        first_rank = Rank(0)
-        second_rank = Rank(1)
-        first_ch = MemoryChannel(1, 0)
-        second_ch = MemoryChannel(0, 1)
-        first_input_buffer = first_rank.get_input_buffer()
-        second_input_buffer = second_rank.get_input_buffer()
-        first_scratch_buffer = Buffer(0, 3)
-        second_scratch_buffer = Buffer(1, 3)
-        tbg = []
-        for i in range(3):
-            tbg.append(ThreadBlockGroup(tb_list = [2 * i, 2 * i + 1]))
-
-        # First rank puts packets in the remote scratch buffer of the second rank
-        first_ch.put_packets(second_scratch_buffer[1: 2], first_input_buffer[1 : 2], tb_group=tbg[0])
-        second_ch.put_packets(first_scratch_buffer[0 : 1], second_input_buffer[0 : 1], tb_group=tbg[0])
-
-        # Second rank copy packets to scratch buffer and then read put packets to first rank output buffer
-        first_rank.reduce(first_input_buffer[0 : 1], [first_scratch_buffer[0 : 1]], tb_group=tbg[1], packet=True)
-        first_ch.put_packets(second_scratch_buffer[0 : 1], first_input_buffer[0 : 1], tb_group=tbg[1])
-
-        # First rank copy packets to scratch buffer and then read put packets to second rank output buffer
-        second_rank.reduce(second_input_buffer[1 : 2], [second_scratch_buffer[1 : 2]],  tb_group=tbg[1], packet=True)
-        second_rank.copy_packets(second_scratch_buffer[2: 3], second_input_buffer[1 : 2], tb_group=tbg[1])
-        second_ch.read_put_packets(first_scratch_buffer[1 : 2], second_scratch_buffer[2: 3], tb_group=tbg[1])
-
-        # First rank copy packets to scratch buffer and then read put packets to second rank output buffer
-        first_rank.unpack_packets(first_input_buffer[1 : 2], first_scratch_buffer[1 : 2], tb_group=tbg[2])
-        second_rank.unpack_packets(second_input_buffer[0 : 1], second_scratch_buffer[0 : 1], tb_group=tbg[2])
-
-
-        print(JSON())
-
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument("--name", type=str, help="name of the program")
-parser.add_argument("--num_gpus", type=int, help="number of gpus")
-parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
-parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
-parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
-
-args = parser.parse_args()
-
-allreduce_example(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/python/mscclpp/language/tests/executor_tests/transfer_pack.py b/python/mscclpp/language/tests/executor_tests/transfer_pack.py
deleted file mode 100644
index bb3b5592..00000000
--- a/python/mscclpp/language/tests/executor_tests/transfer_pack.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-import argparse
-from mscclpp.language.channel import *
-from mscclpp.language.rank import *
-from mscclpp.language.general import *
-from mscclpp.language.program import *
-from mscclpp.language.collectives import *
-from enum import Enum
-
-def allgather_example(name, num_threads_per_block, min_message_size, max_message_size):
-    chunksperloop = 1
-    gpu_size = 2
-    collective = AllGather(gpu_size, chunksperloop, True)
-    with CollectiveProgram(
-        name,
-        collective,
-        gpu_size,
-        protocol="LL",
-        num_threads_per_block=num_threads_per_block,
-        use_double_scratch_buffer=True,
-        min_message_size=min_message_size,
-        max_message_size=max_message_size,
-    ):
-        # Declaring Ranks, Channels, Buffers for 2 GPU allgather example
-        first_rank = Rank(0)
-        second_rank = Rank(1)
-        first_ch = MemoryChannel(1, 0)
-        second_ch = MemoryChannel(0, 1)
-        first_output_buffer = first_rank.get_output_buffer()
-        second_output_buffer = second_rank.get_output_buffer()
-        first_scratch_buffer = Buffer(0, 2)
-        second_scratch_buffer = Buffer(1, 2)
-
-        # First rank puts packets in the remote scratch buffer of the second rank
-        first_ch.put_packets(second_scratch_buffer[0: 1], first_output_buffer[0 : 1], tb=0)
-
-        # Second rank copy packets to scratch buffer and then read put packets to first rank output buffer
-        second_rank.copy_packets(second_scratch_buffer[1 : 2], second_output_buffer[1 : 2], tb=0)
-        second_ch.read_put_packets(first_scratch_buffer[1 : 2], second_scratch_buffer[1 : 2], tb=1)
-
-        # Copying packets from local scratch buffer to local output buffer
-        first_rank.unpack_packets(first_output_buffer[1 : 2], first_scratch_buffer[1 : 2], tb=1)
-        second_rank.unpack_packets(second_output_buffer[0 : 1], second_scratch_buffer[0 : 1], tb=2) 
-
-        print(JSON())
-
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument("--name", type=str, help="name of the program")
-parser.add_argument("--num_gpus", type=int, help="number of gpus")
-parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
-parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
-parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
-
-args = parser.parse_args()
-
-allgather_example(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/python/mscclpp/language/tests/executor_tests/transfer_pack_tbg.py b/python/mscclpp/language/tests/executor_tests/transfer_pack_tbg.py
deleted file mode 100644
index 41c81a6e..00000000
--- a/python/mscclpp/language/tests/executor_tests/transfer_pack_tbg.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-import argparse
-from mscclpp.language.channel import *
-from mscclpp.language.rank import *
-from mscclpp.language.general import *
-from mscclpp.language.program import *
-from mscclpp.language.collectives import *
-from enum import Enum
-
-def allgather_example(name, num_threads_per_block, min_message_size, max_message_size):
-    chunksperloop = 1
-    gpu_size = 2
-    collective = AllGather(gpu_size, chunksperloop, True)
-    with CollectiveProgram(
-        name,
-        collective,
-        gpu_size,
-        protocol="LL",
-        num_threads_per_block=num_threads_per_block,
-        use_double_scratch_buffer=True,
-        min_message_size=min_message_size,
-        max_message_size=max_message_size,
-    ):
-        # Declaring Ranks, Channels, Buffers for 2 GPU allgather example
-        first_rank = Rank(0)
-        second_rank = Rank(1)
-        first_ch = MemoryChannel(1, 0)
-        second_ch = MemoryChannel(0, 1)
-        first_output_buffer = first_rank.get_output_buffer()
-        second_output_buffer = second_rank.get_output_buffer()
-        first_scratch_buffer = Buffer(0, 2)
-        second_scratch_buffer = Buffer(1, 2)
-        tbg = []
-        for i in range(3):
-            tbg.append(ThreadBlockGroup(tb_list = [2 * i, 2 * i + 1]))
-
-        # First rank puts packets in the remote scratch buffer of the second rank
-        first_ch.put_packets(second_scratch_buffer[0: 1], first_output_buffer[0 : 1], tb_group=tbg[0])
-
-        # Second rank copy packets to scratch buffer and then read put packets to first rank output buffer
-        second_rank.copy_packets(second_scratch_buffer[1 : 2], second_output_buffer[1 : 2], tb_group=tbg[0])
-        second_ch.read_put_packets(first_scratch_buffer[1 : 2], second_scratch_buffer[1 : 2], tb_group=tbg[1])
-
-        # Copying packets from local scratch buffer to local output buffer
-        first_rank.unpack_packets(first_output_buffer[1 : 2], first_scratch_buffer[1 : 2], tb_group=tbg[1])
-        second_rank.unpack_packets(second_output_buffer[0 : 1], second_scratch_buffer[0 : 1], tb_group=tbg[2]) 
-
-        print(JSON())
-
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument("--name", type=str, help="name of the program")
-parser.add_argument("--num_gpus", type=int, help="number of gpus")
-parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
-parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
-parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
-
-args = parser.parse_args()
-
-allgather_example(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/python/requirements_rocm6.txt b/python/requirements_rocm6.txt
index d2a3389b..7ed4fef3 100644
--- a/python/requirements_rocm6.txt
+++ b/python/requirements_rocm6.txt
@@ -1,5 +1,5 @@
-mpi4py==4.1.1
-cupy==13.6.0
+mpi4py
+cupy
 prettytable
 netifaces
 pytest
diff --git a/python/test/test_fp8_accum.py b/python/test/test_fp8_accum.py
new file mode 100644
index 00000000..82981ce1
--- /dev/null
+++ b/python/test/test_fp8_accum.py
@@ -0,0 +1,397 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+# Correctness test for FP8 allreduce with different accumulation types.
+#
+# Verifies that FP8 allreduce with higher-precision accumulation produces
+# results at least as accurate as native FP8 accumulation, by comparing
+# against a float32 reference.
+#
+# Usage:
+#   mpirun -np 8 pytest python/test/test_fp8_accum.py -v
+
+import cupy as cp
+import numpy as np
+import pytest
+
+from mscclpp import CommGroup, GpuBuffer, DataType, ReduceOp, is_nvls_supported
+from mscclpp.ext import AlgorithmCollectionBuilder
+from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
+
+# FP8 E4M3 (hardware) requires SM >= 89 (Ada / Hopper) on NVIDIA GPUs.
+# On AMD/ROCm (e.g. MI300X), FP8 is supported natively — no skip needed.
+_is_hip = hasattr(cp.cuda.runtime, "is_hip") and cp.cuda.runtime.is_hip
+_skip_fp8 = not _is_hip and int(cp.cuda.Device().compute_capability) < 89
+pytestmark = pytest.mark.skipif(_skip_fp8, reason="FP8 accum tests require SM >= 89 on CUDA")
+
+# ---------------------------------------------------------------------------
+# FP8 E4M3FN helpers (bias=7, no infinity, NaN = exp=15 & mant=7)
+# ---------------------------------------------------------------------------
+
+
+def e4m3fn_to_float(uint8_array):
+    """Decode a cupy uint8 array of E4M3FN bit patterns to float32."""
+    bits = uint8_array.astype(cp.int32)
+    sign = (bits >> 7) & 1
+    exp = (bits >> 3) & 0xF
+    mant = bits & 0x7
+
+    # Normal: (-1)^s * 2^(exp-7) * (1 + mant/8)
+    normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 7).astype(cp.int32))
+    # Subnormal (exp==0): (-1)^s * 2^(-6) * (mant/8)
+    subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-6))
+
+    result = cp.where(exp == 0, subnormal_val, normal_val)
+    result = cp.where(sign == 1, -result, result)
+    # Zero
+    result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result)
+    # NaN: exp==15 & mant==7
+    nan_mask = (exp == 15) & (mant == 7)
+    result = cp.where(nan_mask, cp.float32(float("nan")), result)
+    return result
+
+
+def float_to_e4m3fn(f32_array, chunk_size=65536):
+    """Encode a cupy float32 array to uint8 E4M3FN bit patterns.
+
+    Uses a lookup-table approach: precompute all 128 positive E4M3FN values,
+    then find nearest match per element via chunked broadcast comparison.
+    """
+    # Build lookup table of all 128 positive E4M3FN values (0x00..0x7F)
+    all_bytes = cp.arange(128, dtype=cp.uint8)
+    all_floats = e4m3fn_to_float(all_bytes)  # (128,) float32
+    # Mark NaN entries as inf so they're never selected as nearest
+    all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats)
+
+    # Clamp input and extract sign
+    clamped = f32_array.astype(cp.float32)
+    clamped = cp.clip(clamped, -448.0, 448.0)
+    signs = (clamped < 0).astype(cp.uint8)
+    absval = cp.abs(clamped)
+
+    result = cp.zeros(absval.shape, dtype=cp.uint8)
+    n = absval.size
+    absval_flat = absval.ravel()
+    result_flat = result.ravel()
+
+    for start in range(0, n, chunk_size):
+        end = min(start + chunk_size, n)
+        chunk = absval_flat[start:end]
+        # (chunk_size, 128) difference matrix
+        diffs = cp.abs(chunk[:, None] - all_floats[None, :])
+        result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8)
+
+    # Combine with sign bit
+    result = result_flat.reshape(absval.shape)
+    result = result | (signs << 7)
+    # Handle exact zero
+    result = cp.where(absval == 0, cp.uint8(0), result)
+    return result
+
+
+# ---------------------------------------------------------------------------
+# FP8 E4M3B15 helpers (bias=15, max=0.9375, NaN = exp==15 or bits==0x80)
+# ---------------------------------------------------------------------------
+
+
+def e4m3b15_to_float(uint8_array):
+    """Decode a cupy uint8 array of E4M3B15 bit patterns to float32."""
+    bits = uint8_array.astype(cp.int32)
+    sign = (bits >> 7) & 1
+    exp = (bits >> 3) & 0xF
+    mant = bits & 0x7
+
+    # Normal: (-1)^s * 2^(exp-15) * (1 + mant/8)
+    normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 15).astype(cp.int32))
+    # Subnormal (exp==0): (-1)^s * 2^(-14) * (mant/8)
+    subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-14))
+
+    result = cp.where(exp == 0, subnormal_val, normal_val)
+    result = cp.where(sign == 1, -result, result)
+    # Zero
+    result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result)
+    # NaN: exp==15 or negative zero (0x80)
+    nan_mask = (exp == 15) | (uint8_array.astype(cp.int32) == 0x80)
+    result = cp.where(nan_mask, cp.float32(float("nan")), result)
+    return result
+
+
+def float_to_e4m3b15(f32_array, chunk_size=65536):
+    """Encode a cupy float32 array to uint8 E4M3B15 bit patterns.
+
+    Same lookup-table approach as float_to_e4m3fn.
+    """
+    # Build lookup table of all 128 positive E4M3B15 values (0x00..0x7F)
+    all_bytes = cp.arange(128, dtype=cp.uint8)
+    all_floats = e4m3b15_to_float(all_bytes)  # (128,) float32
+    # Mark NaN entries as inf so they're never selected as nearest
+    all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats)
+
+    # Clamp input and extract sign
+    clamped = f32_array.astype(cp.float32)
+    clamped = cp.clip(clamped, -0.9375, 0.9375)
+    signs = (clamped < 0).astype(cp.uint8)
+    absval = cp.abs(clamped)
+
+    result = cp.zeros(absval.shape, dtype=cp.uint8)
+    n = absval.size
+    absval_flat = absval.ravel()
+    result_flat = result.ravel()
+
+    for start in range(0, n, chunk_size):
+        end = min(start + chunk_size, n)
+        chunk = absval_flat[start:end]
+        # (chunk_size, 128) difference matrix
+        diffs = cp.abs(chunk[:, None] - all_floats[None, :])
+        result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8)
+
+    # Combine with sign bit
+    result = result_flat.reshape(absval.shape)
+    result = result | (signs << 7)
+    # Handle exact zero
+    result = cp.where(absval == 0, cp.uint8(0), result)
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Shared test helpers
+# ---------------------------------------------------------------------------
+
+
+def setup_algorithms(mpi_group):
+    """Build default algorithms and return (comm_group, algo_map, scratch_buf)."""
+    comm_group = CommGroup(mpi_group.comm)
+    scratch = GpuBuffer(1 << 27, dtype=cp.uint8)  # 128 MB
+    AlgorithmCollectionBuilder.reset()
+    builder = AlgorithmCollectionBuilder()
+    algorithms = builder.build_default_algorithms(
+        scratch_buffer=scratch.data.ptr,
+        scratch_buffer_size=scratch.nbytes,
+        rank=comm_group.my_rank,
+    )
+    algo_map = {a.name: a for a in algorithms}
+    return comm_group, algo_map, scratch
+
+
+def run_allreduce(algo, comm_group, buffer, dtype, accum_dtype=None, nblocks=0, nthreads_per_block=0):
+    """Run allreduce in-place on buffer and return a copy of the result."""
+    ret = algo.execute(
+        comm=comm_group.communicator,
+        input_buffer=buffer.data.ptr,
+        output_buffer=buffer.data.ptr,
+        input_size=buffer.nbytes,
+        output_size=buffer.nbytes,
+        dtype=dtype,
+        op=ReduceOp.SUM,
+        stream=cp.cuda.get_current_stream().ptr,
+        nblocks=nblocks,
+        nthreads_per_block=nthreads_per_block,
+        symmetric_memory=True,
+        accum_dtype=accum_dtype,
+    )
+    cp.cuda.Device().synchronize()
+    assert ret == 0, f"Allreduce failed with error code {ret}"
+    return buffer.copy()
+
+
+# ---------------------------------------------------------------------------
+# Test: FP8 E4M3 accumulation correctness
+# ---------------------------------------------------------------------------
+
+
+@parametrize_mpi_groups(8)
+@pytest.mark.parametrize(
+    "algo_name",
+    [
+        "default_allreduce_packet",
+        "default_allreduce_nvls_packet",
+        "default_allreduce_fullmesh",
+        "default_allreduce_rsag_zero_copy",
+        "default_allreduce_allpair_packet",
+    ],
+)
+@pytest.mark.parametrize("size", [1024, 4096, 16384, 65536, 262144, 1048576])
+def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int):
+    """Verify that FP8 E4M3 allreduce with higher-precision accumulation is at
+    least as accurate as native FP8 accumulation, across all algorithm variants."""
+    rank = mpi_group.comm.rank
+    world_size = mpi_group.comm.size
+
+    comm_group, algo_map, scratch = setup_algorithms(mpi_group)
+    if algo_name not in algo_map:
+        pytest.skip(f"{algo_name} not available")
+    if "nvls" in algo_name and not is_nvls_supported():
+        pytest.skip(f"{algo_name} requires NVLS which is not supported on this platform")
+    algo = algo_map[algo_name]
+
+    buf = GpuBuffer(size, dtype=cp.uint8)
+
+    accum_configs = [
+        ("fp8_native", DataType.float8_e4m3),
+        ("float16", DataType.float16),
+        ("float32", DataType.float32),
+    ]
+
+    # rsag_zero_copy and fullmesh need explicit block/thread counts
+    if "rsag" in algo_name:
+        nb = max(1, min(32, size // (world_size * 32)))
+        nt = 1024
+    elif "fullmesh" in algo_name:
+        nb = 35
+        nt = 512
+    else:
+        nb = 0
+        nt = 0
+
+    errors = {}
+    for accum_label, accum_dtype in accum_configs:
+        # Generate deterministic per-rank data (use numpy to avoid hipRAND issues on ROCm)
+        rng = np.random.RandomState(42 + rank)
+        src_f32 = cp.asarray(rng.randn(size).astype(np.float32))
+        src_f32 = cp.clip(src_f32, -240.0, 240.0)
+        src_fp8 = float_to_e4m3fn(src_f32)
+
+        # Copy into symmetric buffer
+        buf[:] = src_fp8
+        cp.cuda.Device().synchronize()
+
+        # Run allreduce
+        result = run_allreduce(
+            algo,
+            comm_group,
+            buf,
+            dtype=DataType.float8_e4m3,
+            accum_dtype=accum_dtype,
+            nblocks=nb,
+            nthreads_per_block=nt,
+        )
+        result_f32 = e4m3fn_to_float(result)
+
+        # Compute float32 reference: sum all ranks' quantized FP8 inputs in float32
+        ref_f32 = cp.zeros(size, dtype=cp.float32)
+        for r in range(world_size):
+            rng_r = np.random.RandomState(42 + r)
+            rank_data = cp.asarray(rng_r.randn(size).astype(np.float32))
+            rank_data = cp.clip(rank_data, -240.0, 240.0)
+            rank_data_fp8 = float_to_e4m3fn(rank_data)
+            ref_f32 += e4m3fn_to_float(rank_data_fp8)
+
+        # Compute errors
+        abs_err = cp.abs(result_f32 - ref_f32)
+        mean_abs_err = float(cp.mean(abs_err))
+        errors[accum_label] = mean_abs_err
+
+        # Reset between runs
+        algo.reset()
+
+    # Higher-precision accumulation should be at least as accurate as native fp8
+    assert (
+        errors["float16"] <= errors["fp8_native"] + 1e-6
+    ), f"float16 accum ({errors['float16']:.6f}) worse than native ({errors['fp8_native']:.6f})"
+    assert (
+        errors["float32"] <= errors["fp8_native"] + 1e-6
+    ), f"float32 accum ({errors['float32']:.6f}) worse than native ({errors['fp8_native']:.6f})"
+
+
+# ---------------------------------------------------------------------------
+# Test: FP8 E4M3B15 accumulation correctness
+# ---------------------------------------------------------------------------
+
+
+@parametrize_mpi_groups(8)
+@pytest.mark.parametrize(
+    "algo_name",
+    [
+        "default_allreduce_packet",
+        "default_allreduce_nvls_packet",
+        "default_allreduce_rsag_zero_copy",
+        "default_allreduce_fullmesh",
+        "default_allreduce_allpair_packet",
+    ],
+)
+@pytest.mark.parametrize("size", [1024, 4096, 65536])
+def test_fp8_e4m3b15_accum(mpi_group: MpiGroup, algo_name: str, size: int):
+    """Verify that FP8 E4M3B15 allreduce with higher-precision accumulation is at
+    least as accurate as native E4M3B15 accumulation."""
+    rank = mpi_group.comm.rank
+    world_size = mpi_group.comm.size
+
+    comm_group, algo_map, scratch = setup_algorithms(mpi_group)
+    if algo_name not in algo_map:
+        pytest.skip(f"{algo_name} not available")
+    if "nvls" in algo_name and not is_nvls_supported():
+        pytest.skip(f"{algo_name} requires NVLS which is not supported on this platform")
+
+    algo = algo_map[algo_name]
+    buf = GpuBuffer(size, dtype=cp.uint8)
+
+    accum_configs = [
+        ("e4m3b15_native", DataType.float8_e4m3b15),
+        ("float16", DataType.float16),
+        ("float32", DataType.float32),
+    ]
+
+    # rsag_zero_copy needs explicit block/thread counts, scaled to data size
+    if "rsag" in algo_name:
+        nb = max(1, min(32, size // (world_size * 32)))
+        nt = 1024
+    else:
+        nb = 0
+        nt = 0
+
+    errors = {}
+    for accum_label, accum_dtype in accum_configs:
+        # Generate deterministic per-rank random uint8 values in valid e4m3b15 range
+        rng = np.random.RandomState(42 + rank)
+        raw = cp.asarray(rng.randint(0, 0x78, (size,)).astype(np.uint8))
+        signs = cp.asarray(rng.randint(0, 2, (size,)).astype(np.uint8)) << 7
+        src_uint8 = raw | signs
+        # Fix negative zero -> positive zero
+        src_uint8 = cp.where(src_uint8 == 0x80, cp.uint8(0), src_uint8)
+
+        # Copy into symmetric buffer
+        buf[:] = src_uint8
+        cp.cuda.Device().synchronize()
+
+        # Run allreduce
+        result = run_allreduce(
+            algo,
+            comm_group,
+            buf,
+            dtype=DataType.float8_e4m3b15,
+            accum_dtype=accum_dtype,
+            nblocks=nb,
+            nthreads_per_block=nt,
+        )
+
+        # Decode result
+        result_f32 = e4m3b15_to_float(result)
+
+        # Compute float32 reference
+        ref_f32 = cp.zeros(size, dtype=cp.float32)
+        for r in range(world_size):
+            rng_r = np.random.RandomState(42 + r)
+            raw_r = cp.asarray(rng_r.randint(0, 0x78, (size,)).astype(np.uint8))
+            signs_r = cp.asarray(rng_r.randint(0, 2, (size,)).astype(np.uint8)) << 7
+            bits_r = raw_r | signs_r
+            bits_r = cp.where(bits_r == 0x80, cp.uint8(0), bits_r)
+            ref_f32 += e4m3b15_to_float(bits_r)
+
+        # Clamp reference to e4m3b15 representable range
+        ref_f32 = cp.clip(ref_f32, -0.9375, 0.9375)
+
+        # Compute errors (only on valid entries)
+        valid = ~cp.isnan(result_f32) & ~cp.isnan(ref_f32)
+        abs_err = cp.abs(result_f32[valid] - ref_f32[valid])
+        mean_abs_err = float(cp.mean(abs_err)) if abs_err.size > 0 else 0.0
+        errors[accum_label] = mean_abs_err
+
+        algo.reset()
+
+    # Higher-precision accumulation should be at least as accurate as native
+    assert (
+        errors["float16"] <= errors["e4m3b15_native"] + 1e-8
+    ), f"float16 accum ({errors['float16']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})"
+    assert (
+        errors["float32"] <= errors["e4m3b15_native"] + 1e-8
+    ), f"float32 accum ({errors['float32']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})"
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index c1aa25bb..9ca5fed3 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -28,6 +28,16 @@ if(MSCCLPP_USE_IB)
     target_include_directories(mscclpp_obj SYSTEM PRIVATE ${IBVERBS_INCLUDE_DIRS})
     target_link_libraries(mscclpp_obj PRIVATE ${IBVERBS_LIBRARIES})
     target_compile_definitions(mscclpp_obj PUBLIC USE_IBVERBS)
+    if(MLX5_FOUND)
+        target_include_directories(mscclpp_obj SYSTEM PRIVATE ${MLX5_INCLUDE_DIRS})
+        target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_MLX5DV)
+    endif()
+endif()
+
+if(MSCCLPP_USE_GDRCOPY)
+    target_include_directories(mscclpp_obj SYSTEM PRIVATE ${GDRCOPY_INCLUDE_DIRS})
+    target_link_libraries(mscclpp_obj PRIVATE ${GDRCOPY_LIBRARIES})
+    target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_GDRCOPY)
 endif()
 
 set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
diff --git a/src/core/algorithm.cc b/src/core/algorithm.cc
index 99e7b031..ffa53aa8 100644
--- a/src/core/algorithm.cc
+++ b/src/core/algorithm.cc
@@ -41,7 +41,9 @@ NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitF
 CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output,
                                     size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op,
                                     cudaStream_t stream, std::shared_ptr<Executor>, int nBlocks, int nThreadsPerBlock,
-                                    bool symmetricMemory, const std::unordered_map<std::string, uintptr_t>& extras) {
+                                    bool symmetricMemory, const std::unordered_map<std::string, uintptr_t>& extras,
+                                    DataType accumDtype) {
+  if (accumDtype == DataType::AUTO) accumDtype = dtype;
   if (!initialized_) {
     initFunc_(comm);
     initialized_ = true;
@@ -53,7 +55,7 @@ CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const vo
     contexts_[ctxKey] = ctx;
   }
   return kernelLaunchFunc_(contexts_[ctxKey], input, output, inputSize, outputSize, dtype, op, stream, nBlocks,
-                           nThreadsPerBlock, extras);
+                           nThreadsPerBlock, extras, accumDtype);
 }
 
 const std::string& NativeAlgorithm::name() const { return name_; }
@@ -77,10 +79,7 @@ const CollectiveBufferMode& NativeAlgorithm::bufferMode() const { return bufferM
 
 Algorithm::Constraint NativeAlgorithm::constraint() const { return constraint_; }
 
-void NativeAlgorithm::reset() {
-  contexts_.clear();
-  initialized_ = false;
-}
+void NativeAlgorithm::reset() { contexts_.clear(); }
 
 void AlgorithmCollection::registerAlgorithm(const std::string collective, const std::string algoName,
                                             std::shared_ptr<Algorithm> algorithm) {
@@ -166,7 +165,7 @@ Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; }
 CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
                                  size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream,
                                  std::shared_ptr<Executor> executor, int, int, bool,
-                                 const std::unordered_map<std::string, uintptr_t>&) {
+                                 const std::unordered_map<std::string, uintptr_t>&, DataType) {
   if (!executor) {
     THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute");
   }
@@ -192,6 +191,10 @@ CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void*
                         plan_, stream);
       break;
 #endif
+    case DataType::FLOAT8_E4M3B15:
+      executor->execute(rank, (__fp8_e4m3b15*)input, (__fp8_e4m3b15*)output, inputSize, outputSize,
+                        DataType::FLOAT8_E4M3B15, plan_, stream);
+      break;
     case DataType::INT32:
     case DataType::UINT32:
       executor->execute(rank, (int*)input, (int*)output, inputSize, outputSize, DataType::UINT32, plan_, stream);
diff --git a/src/core/connection.cc b/src/core/connection.cc
index 6466ca2a..8b6c0afb 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -7,6 +7,7 @@
 #include <mscclpp/npkit/npkit.hpp>
 #endif
 
+#include <mscclpp/atomic_device.hpp>
 #include <mscclpp/numa.hpp>
 #include <mscclpp/utils.hpp>
 #include <sstream>
@@ -197,45 +198,54 @@ void IBConnection::recvThreadFunc() {
     }
   }
 
-  // Host-side buffer to receive newValue from imm_data (need 64-bit for cudaMemcpy)
+  uint32_t lastImmData = 0;
+  uint64_t immHighBits = 0;
   uint64_t newValueHost = 0;
 
-  while (!stopRecvThread_.load(std::memory_order_relaxed)) {
-    auto qp = qp_.lock();
-    if (!qp) break;
+  auto qp = qp_.lock();
+  if (!qp) return;
 
+  while (!stopRecvThread_.load(std::memory_order_relaxed)) {
     int wcNum = qp->pollRecvCq();
     if (wcNum < 0) {
-      WARN(NET, "IBConnection recvThreadFunc: pollRecvCq failed");
+      recvThreadErrorMsg_ = "pollRecvCq failed";
+      recvThreadError_.store(true, std::memory_order_release);
+      WARN(NET, "IBConnection recvThreadFunc: ", recvThreadErrorMsg_);
       break;
     }
 
     for (int i = 0; i < wcNum; ++i) {
       int status = qp->getRecvWcStatus(i);
       if (status != static_cast<int>(WsStatus::Success)) {
-        WARN(NET, "IBConnection recvThreadFunc: recv work completion failed: ", qp->getRecvWcStatusString(i));
-        // Post another recv to replace the failed one
-        qp->stageRecv(/*wrId=*/0);
-        qp->postRecv();
-        continue;
+        // A failed recv WC typically means the QP entered error state (e.g., WR Flushed Error).
+        // All remaining WRs will also fail — no recovery without QP recreation. Exit the thread
+        // and set the error flag so the main thread can detect it.
+        recvThreadErrorMsg_ = std::string("recv work completion failed: ") + qp->getRecvWcStatusString(i);
+        recvThreadError_.store(true, std::memory_order_release);
+        WARN(NET, "IBConnection recvThreadFunc: ", recvThreadErrorMsg_);
+        return;
       }
 
-      // The imm_data contains newValue (32-bit, extended to 64-bit)
-      // Note: getRecvWcImmData already converts from network byte order via ntohl
-      unsigned int immData = qp->getRecvWcImmData(i);
-      newValueHost = static_cast<uint64_t>(immData);
+      // Read the lower 32 bits of the token from imm_data. Reconstruct the full 64-bit value
+      // using wrap-around detection: tokens increase monotonically, so if the new lower 32 bits
+      // are less than the previous value, the upper 32 bits must have incremented by 1.
+      uint32_t immData = qp->getRecvWcImmData(i);
+      if (immData < lastImmData) {
+        immHighBits += (1ULL << 32);
+      }
+      lastImmData = immData;
+      newValueHost = immHighBits | static_cast<uint64_t>(immData);
 
-      // Read dstGpuAddr from the local stored address (set by setRemoteUpdateDstAddr)
-      uint64_t dstGpuAddr = remoteUpdateDstAddr_;
-      if (dstGpuAddr != 0) {
-        uint64_t* dstPtr = reinterpret_cast<uint64_t*>(dstGpuAddr);
-
-        // Use cudaMemcpyAsync with our dedicated stream to avoid blocking on the default stream
-        MSCCLPP_CUDATHROW(
-            cudaMemcpyAsync(dstPtr, &newValueHost, sizeof(uint64_t), cudaMemcpyHostToDevice, signalStream_));
-
-        INFO(CONN, "IBConnection recvThreadFunc: updated GPU ptr ", dstPtr, " to ", newValueHost, " (immData=", immData,
-             ")");
+      // Forward the token to the semaphore's inbound token address via atomicStore
+      // through the GDRCopy BAR1 mapping. The GPU reads with system-scope acquire.
+      if (signalAddr_ != 0) {
+        if (signalGdrMap_ && signalGdrMap_->valid()) {
+          atomicStore(signalGdrMap_->hostPtr(), newValueHost, memoryOrderRelaxed);
+        } else {
+          // For HIP/ROCm.
+          // NOTE: may need a fix in the future to ensure BAR1 mapping.
+          *reinterpret_cast<volatile uint64_t*>(signalAddr_) = newValueHost;
+        }
       }
 
       // Post another recv for future messages
@@ -250,60 +260,105 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
     : BaseConnection(context, localEndpoint),
       transport_(localEndpoint.transport()),
       remoteTransport_(remoteEndpoint.transport()),
-      dummyAtomicSource_(std::make_unique<uint64_t>(0)),
+      atomicSrc_(std::make_unique<uint64_t>(0)),
       ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_),
+      gdrSignalForwarding_(false),
       stopRecvThread_(false),
+      recvThreadError_(false),
       localGpuDeviceId_(localEndpoint.device().id),
-      signalStream_(nullptr),
-      remoteUpdateDstAddr_(0) {
+      signalAddr_(0) {
   qp_ = getImpl(localEndpoint).ibQp_;
   qp_.lock()->rtr(getImpl(remoteEndpoint).ibQpInfo_);
   qp_.lock()->rts();
-  dummyAtomicSourceMem_ = context->registerMemory(dummyAtomicSource_.get(), sizeof(uint64_t), transport_);
-  validateTransport(dummyAtomicSourceMem_, transport_);
-  dstTransportInfo_ = getImpl(dummyAtomicSourceMem_).getTransportInfo(transport_);
+  atomicSrcMem_ = context->registerMemory(atomicSrc_.get(), sizeof(uint64_t), transport_);
+  validateTransport(atomicSrcMem_, transport_);
+  atomicSrcTransportInfo_ = getImpl(atomicSrcMem_).getTransportInfo(transport_);
 
   if (ibNoAtomic_) {
-    // Create a CUDA stream for async memory copies
-    MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&signalStream_, cudaStreamNonBlocking));
+#if defined(MSCCLPP_USE_CUDA)
+    // On CUDA, HostNoAtomic requires GDRCopy for CPU→GPU signal forwarding through BAR1.
+    if (!gdrEnabled()) {
+      THROW(CONN, Error, ErrorCode::InvalidUsage,
+            "IB host-no-atomic mode on CUDA requires GDRCopy: ", gdrStatusMessage());
+    }
+    gdrSignalForwarding_ = true;
+#endif  // defined(MSCCLPP_USE_CUDA)
 
-    // Pre-post receive requests for incoming write-with-imm
+    // On platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200
+    // NVLink-C2C), HostNoAtomic requires Data Direct for correct memory ordering. Data Direct
+    // routes NIC DMA through the PCIe Data Direct engine, bypassing the bridge. It is available
+    // on Virtual Function (VF) devices. On platforms without such a bridge (x86, non-Grace
+    // aarch64), HostNoAtomic works without Data Direct.
+    //
+    // We cannot reliably detect the bridge at compile time or runtime, so we emit a warning
+    // when the device is not a VF. If data corruption occurs, switching to VF devices with
+    // Data Direct or using IbMode::Host with RDMA atomics will resolve it.
+    {
+      IbCtx* ibCtx = getImpl(*context).getIbContext(transport_);
+      if (!ibCtx->isVirtualFunction()) {
+        WARN(CONN,
+             "IB HostNoAtomic mode without a Virtual Function (VF) device may cause data corruption "
+             "on platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200). "
+             "Device ",
+             ibCtx->getDevName(),
+             " is not a VF. "
+             "If you experience data corruption, use VF devices with Data Direct or IbMode::Host.");
+      }
+    }
+
+    // Pre-post receive requests for incoming WRITE_WITH_IMM notifications.
+    // The recv CQE guarantees the preceding data WRITE has been committed to GPU memory.
     auto qp = qp_.lock();
     int maxRecvWr = localEndpoint.config().ib.maxRecvWr;
     for (int i = 0; i < maxRecvWr; ++i) {
       qp->stageRecv(/*wrId=*/0);
     }
     qp->postRecv();
-    // Start the background thread to poll recv CQ
-    recvThread_ = std::thread([this]() { this->recvThreadFunc(); });
-    INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with no-atomic mode");
+    // The recv thread is started later in startSignalForwarding() when the semaphore
+    // provides the signal forwarding destination. This ensures the thread lifetime is
+    // bounded by the GdrMap lifetime (created before start, destroyed after stop).
+    INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with signal forwarding (HostNoAtomic) mode");
   } else {
     INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with atomic mode");
   }
 }
 
-IBConnection::~IBConnection() {
-  if (ibNoAtomic_) {
-    stopRecvThread_.store(true, std::memory_order_relaxed);
-    if (recvThread_.joinable()) {
-      recvThread_.join();
-    }
-    if (signalStream_ != nullptr) {
-      // Synchronize stream to ensure all async copies are complete before destruction
-      // Ignore errors during teardown (CUDA context may already be destroyed)
-      MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamSynchronize(signalStream_));
-      MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamDestroy(signalStream_));
-    }
-  }
-}
+IBConnection::~IBConnection() { stopSignalForwarding(); }
 
 Transport IBConnection::transport() const { return transport_; }
 
 Transport IBConnection::remoteTransport() const { return remoteTransport_; }
 
-void IBConnection::setRemoteUpdateDstAddr(uint64_t addr) {
-  remoteUpdateDstAddr_ = addr;
-  INFO(CONN, "IBConnection setRemoteUpdateDstAddr: ", (void*)addr);
+bool IBConnection::isSignalForwarding() const { return ibNoAtomic_; }
+
+void IBConnection::startSignalForwarding(std::shared_ptr<uint64_t> mem) {
+  // Set up the forwarding destination and GdrMap, then start the recv thread.
+  // Order: set address → create GdrMap → start thread.
+  signalAddr_ = reinterpret_cast<uint64_t>(mem.get());
+  if (gdrSignalForwarding_) {
+    signalGdrMap_ = std::make_unique<GdrMap>(std::move(mem), localGpuDeviceId_);
+  }
+  if (ibNoAtomic_) {
+    stopRecvThread_.store(false, std::memory_order_relaxed);
+    recvThread_ = std::thread([this]() { this->recvThreadFunc(); });
+  }
+  INFO(CONN, "IBConnection startSignalForwarding: ", (void*)signalAddr_);
+}
+
+void IBConnection::stopSignalForwarding() {
+  // Stop the recv thread, then tear down GdrMap and address.
+  // Order: stop thread → destroy GdrMap → clear address.
+  if (ibNoAtomic_) {
+    stopRecvThread_.store(true, std::memory_order_relaxed);
+    if (recvThread_.joinable()) {
+      recvThread_.join();
+    }
+  }
+  if (gdrSignalForwarding_) {
+    signalGdrMap_.reset();
+  }
+  signalAddr_ = 0;
+  INFO(CONN, "IBConnection stopSignalForwarding");
 }
 
 void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
@@ -356,25 +411,29 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
   *src = newValue;
 
   if (ibNoAtomic_) {
-    // Use RDMA write-with-imm instead of atomic operation
-    // Send only newValue in imm_data (0-byte write)
-    // The remote's recvThreadFunc will use its stored remoteUpdateDstAddr_ to write
-
-    // Put newValue in imm_data (truncated to 32-bit; semaphore counters should fit)
+    // Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the lower 32 bits of the
+    // token in imm_data. The receiver reconstructs the full 64-bit value using wrap-around
+    // detection (tokens are monotonically increasing, so a decrease in the lower 32 bits
+    // indicates the upper 32 bits incremented by 1).
+    if (newValue <= oldValue) {
+      WARN(CONN, "IBConnection signal forwarding: token is not monotonically increasing: ", oldValue, " -> ", newValue);
+    } else if (newValue - oldValue >= (1ULL << 32)) {
+      WARN(CONN,
+           "IBConnection signal forwarding: token increment too large for 32-bit wrap-around detection: ", oldValue,
+           " -> ", newValue, " (delta ", newValue - oldValue, " >= 2^32)");
+    }
     unsigned int immData = static_cast<unsigned int>(newValue);
-
-    // Send 0-byte write-with-imm; use dstMrInfo as target (we don't actually write anything)
     qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
                                       /*size=*/0, /*wrId=*/0,
                                       /*srcOffset=*/0, /*dstOffset=*/0,
                                       /*signaled=*/true, /*immData=*/immData);
     qp_.lock()->postSend();
-    INFO(CONN, "IBConnection write-with-imm: value ", oldValue, " -> ", newValue);
+    INFO(CONN, "IBConnection signal forwarding: value ", oldValue, " -> ", newValue);
   } else {
-    qp_.lock()->stageSendAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue,
+    qp_.lock()->stageSendAtomicAdd(atomicSrcTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue,
                                    /*signaled=*/true);
     qp_.lock()->postSend();
-    INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
+    INFO(CONN, "IBConnection atomic write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
          " -> ", newValue);
   }
 
@@ -388,6 +447,11 @@ void IBConnection::flush(int64_t timeoutUsec) {
   NpKit::CollectCpuEvent(NPKIT_EVENT_CONN_IB_FLUSH_ENTRY, 0, 0, *NpKit::GetCpuTimestamp(), 0);
 #endif
 
+  // Check if the recv thread has already reported an error (e.g., QP entered error state).
+  if (recvThreadError_.load(std::memory_order_acquire)) {
+    THROW(CONN, Error, ErrorCode::SystemError, "IBConnection recv thread failed: ", recvThreadErrorMsg_);
+  }
+
   Timer timer;
   while (qp_.lock()->getNumSendCqItems()) {
     int wcNum = qp_.lock()->pollSendCq();
diff --git a/src/core/context.cc b/src/core/context.cc
index a5cdffb2..aabe71df 100644
--- a/src/core/context.cc
+++ b/src/core/context.cc
@@ -46,8 +46,6 @@ void CudaIpcStream::sync() {
   }
 }
 
-Context::Impl::Impl() {}
-
 IbCtx* Context::Impl::getIbContext(Transport ibTransport) {
   // Find IB context or create it
   auto it = ibContexts_.find(ibTransport);
diff --git a/src/core/endpoint.cc b/src/core/endpoint.cc
index 4795aa62..5ab4bad0 100644
--- a/src/core/endpoint.cc
+++ b/src/core/endpoint.cc
@@ -51,7 +51,7 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
 
     ibQp_ = contextImpl.getIbContext(config_.transport)
                 ->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
-                           config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend);
+                           config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
     ibQpInfo_ = ibQp_->getInfo();
   } else if (config_.transport == Transport::Ethernet) {
     // Configuring Ethernet Interfaces
@@ -74,6 +74,7 @@ Endpoint::Impl::Impl(const std::vector<char>& serialization) {
   if (AllIBTransports.has(config_.transport)) {
     ibLocal_ = false;
     it = detail::deserialize(it, ibQpInfo_);
+    it = detail::deserialize(it, ibNoAtomic_);
   } else if (config_.transport == Transport::Ethernet) {
     it = detail::deserialize(it, socketAddress_);
   }
@@ -103,6 +104,7 @@ MSCCLPP_API_CPP std::vector<char> Endpoint::serialize() const {
   detail::serialize(data, pimpl_->pidHash_);
   if (AllIBTransports.has(pimpl_->config_.transport)) {
     detail::serialize(data, pimpl_->ibQpInfo_);
+    detail::serialize(data, pimpl_->ibNoAtomic_);
   } else if (pimpl_->config_.transport == Transport::Ethernet) {
     detail::serialize(data, pimpl_->socketAddress_);
   }
diff --git a/src/core/env.cpp b/src/core/env.cpp
index 484b40af..96f53492 100644
--- a/src/core/env.cpp
+++ b/src/core/env.cpp
@@ -65,7 +65,8 @@ Env::Env()
       ncclSharedLibPath(readEnv<std::string>("MSCCLPP_NCCL_LIB_PATH", "")),
       forceNcclFallbackOperation(readEnv<std::string>("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
       ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
-      forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)) {}
+      forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
+      forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)) {}
 
 std::shared_ptr<Env> env() {
   static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
@@ -93,6 +94,7 @@ std::shared_ptr<Env> env() {
     logEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", globalEnv->forceNcclFallbackOperation);
     logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
     logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
+    logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
   }
   return globalEnv;
 }
diff --git a/src/core/executor/execution_kernel.cu b/src/core/executor/execution_kernel.cu
index 2d36bcf5..28ced77f 100644
--- a/src/core/executor/execution_kernel.cu
+++ b/src/core/executor/execution_kernel.cu
@@ -82,6 +82,12 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
     case DataType::FLOAT8_E5M2:
       // FP8 is not supported in CUDA execution kernel.
       break;
+    case DataType::FLOAT8_E4M3B15:
+      // fp8_e4m3b15 is a software type not supported in the CUDA execution kernel.
+      break;
+    case DataType::AUTO:
+      // AUTO is a sentinel resolved before reaching this point; nothing to do.
+      break;
   }
 }
 
diff --git a/src/core/gdr.cc b/src/core/gdr.cc
new file mode 100644
index 00000000..22ac15c9
--- /dev/null
+++ b/src/core/gdr.cc
@@ -0,0 +1,204 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "gdr.hpp"
+
+#if defined(MSCCLPP_USE_GDRCOPY)
+
+#include <gdrapi.h>
+#include <unistd.h>
+
+#include <mscclpp/env.hpp>
+#include <mscclpp/gpu_utils.hpp>
+
+#include "logger.hpp"
+
+#ifndef GPU_PAGE_SHIFT
+#define GPU_PAGE_SHIFT 16
+#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
+#define GPU_PAGE_MASK (~(GPU_PAGE_SIZE - 1))
+#endif
+
+namespace mscclpp {
+
+// GdrContext
+
+class GdrContext {
+ public:
+  GdrContext();
+  ~GdrContext();
+
+  GdrContext(const GdrContext&) = delete;
+  GdrContext& operator=(const GdrContext&) = delete;
+
+  GdrStatus status() const { return status_; }
+  gdr_t handle() const { return handle_; }
+
+ private:
+  GdrStatus status_;
+  gdr_t handle_;
+};
+
+static std::shared_ptr<GdrContext> gdrContext() {
+  static auto instance = std::make_shared<GdrContext>();
+  return instance;
+}
+
+GdrStatus gdrStatus() { return gdrContext()->status(); }
+
+bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; }
+
+const char* gdrStatusMessage() {
+  switch (gdrStatus()) {
+    case GdrStatus::Ok:
+      return "GDRCopy initialized successfully";
+    case GdrStatus::NotBuilt:
+      return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
+    case GdrStatus::Disabled:
+      return "GDRCopy is disabled via MSCCLPP_FORCE_DISABLE_GDR environment variable";
+    case GdrStatus::DriverMissing:
+      return "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
+    case GdrStatus::OpenFailed:
+      return "gdr_open() failed; GDRCopy driver may be misconfigured";
+    default:
+      return "unknown GDRCopy status";
+  }
+}
+
+GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr) {
+  if (env()->forceDisableGdr) {
+    INFO(GPU, "GDRCopy disabled via MSCCLPP_FORCE_DISABLE_GDR");
+    status_ = GdrStatus::Disabled;
+    return;
+  }
+
+  // Auto-detect: check if driver is available
+  if (access("/dev/gdrdrv", F_OK) != 0) {
+    INFO(GPU, "GDRCopy driver not detected, disabling GDRCopy");
+    status_ = GdrStatus::DriverMissing;
+    return;
+  }
+
+  handle_ = gdr_open();
+  if (handle_ == nullptr) {
+    INFO(GPU, "gdr_open() failed, disabling GDRCopy");
+    status_ = GdrStatus::OpenFailed;
+    return;
+  }
+
+  status_ = GdrStatus::Ok;
+  INFO(GPU, "GDRCopy initialized successfully");
+}
+
+GdrContext::~GdrContext() {
+  if (handle_ != nullptr) {
+    gdr_close(handle_);
+    handle_ = nullptr;
+  }
+}
+
+// GdrMap::Impl — real implementation with GDRCopy
+
+struct GdrMap::Impl {
+  std::shared_ptr<GdrContext> ctx;
+  std::shared_ptr<void> gpuMem;
+  gdr_mh_t mh;
+  void* barPtr;
+  uint64_t* hostDstPtr;
+  size_t mappedSize;
+};
+
+GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId) : pimpl_(std::make_unique<Impl>()) {
+  pimpl_->ctx = gdrContext();
+  pimpl_->gpuMem = std::move(gpuMem);
+  pimpl_->mh = {};
+  pimpl_->barPtr = nullptr;
+  pimpl_->hostDstPtr = nullptr;
+  pimpl_->mappedSize = 0;
+
+  // Ensure CUDA device context is active for gdr_pin_buffer
+  CudaDeviceGuard deviceGuard(deviceId);
+
+  uint64_t gpuAddr = reinterpret_cast<uint64_t>(pimpl_->gpuMem.get());
+  // Align to GPU page boundary and pin one page around the target address
+  unsigned long alignedAddr = gpuAddr & GPU_PAGE_MASK;
+  unsigned long pageOffset = gpuAddr - alignedAddr;
+  pimpl_->mappedSize = GPU_PAGE_SIZE;
+
+  // Pin the GPU memory for GDRCopy BAR1 mapping. Try GDR_PIN_FLAG_FORCE_PCIE first for optimal
+  // ordering on platforms that support it (e.g., GB200). Fall back to flags=0 if FORCE_PCIE is
+  // not supported. Both paths work correctly: CPU writes via atomicStore, GPU reads via
+  // system-scope acquire.
+  int ret =
+      gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, GDR_PIN_FLAG_FORCE_PCIE, &pimpl_->mh);
+  if (ret != 0) {
+    ret = gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, 0, &pimpl_->mh);
+    if (ret != 0) {
+      THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr,
+            ". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
+    }
+  }
+
+  ret = gdr_map(pimpl_->ctx->handle(), pimpl_->mh, &pimpl_->barPtr, pimpl_->mappedSize);
+  if (ret != 0) {
+    (void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh);
+    THROW(GPU, Error, ErrorCode::InternalError, "gdr_map failed (ret=", ret, ") for addr ", (void*)gpuAddr);
+  }
+
+  pimpl_->hostDstPtr = reinterpret_cast<uint64_t*>(reinterpret_cast<char*>(pimpl_->barPtr) + pageOffset);
+
+  INFO(GPU, "GDRCopy mapping established: GPU addr ", (void*)gpuAddr, " -> host ptr ", (const void*)pimpl_->hostDstPtr);
+}
+
+GdrMap::~GdrMap() {
+  if (pimpl_) {
+    if (pimpl_->barPtr != nullptr) {
+      (void)gdr_unmap(pimpl_->ctx->handle(), pimpl_->mh, pimpl_->barPtr, pimpl_->mappedSize);
+    }
+    if (pimpl_->hostDstPtr != nullptr) {
+      (void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh);
+    }
+  }
+}
+
+bool GdrMap::valid() const { return pimpl_ && pimpl_->hostDstPtr != nullptr; }
+
+uint64_t* GdrMap::hostPtr() const { return pimpl_ ? pimpl_->hostDstPtr : nullptr; }
+
+void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(pimpl_->mh, pimpl_->hostDstPtr, src, size); }
+
+void GdrMap::copyFrom(void* dst, size_t size) const {
+  gdr_copy_from_mapping(pimpl_->mh, dst, pimpl_->hostDstPtr, size);
+}
+
+}  // namespace mscclpp
+
+#else  // !defined(MSCCLPP_USE_GDRCOPY)
+
+namespace mscclpp {
+
+GdrStatus gdrStatus() { return GdrStatus::NotBuilt; }
+
+bool gdrEnabled() { return false; }
+
+const char* gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; }
+
+// GdrMap::Impl — stub (no GDRCopy)
+
+struct GdrMap::Impl {};
+
+GdrMap::GdrMap(std::shared_ptr<void> /*gpuMem*/, int /*deviceId*/) {}
+
+GdrMap::~GdrMap() = default;
+
+bool GdrMap::valid() const { return false; }
+
+uint64_t* GdrMap::hostPtr() const { return nullptr; }
+
+void GdrMap::copyTo(const void* /*src*/, size_t /*size*/) {}
+
+void GdrMap::copyFrom(void* /*dst*/, size_t /*size*/) const {}
+
+}  // namespace mscclpp
+
+#endif  // !defined(MSCCLPP_USE_GDRCOPY)
diff --git a/src/core/ib.cc b/src/core/ib.cc
index b8854a6e..557f0426 100644
--- a/src/core/ib.cc
+++ b/src/core/ib.cc
@@ -21,6 +21,9 @@
 #include "context.hpp"
 #if defined(USE_IBVERBS)
 #include "ibverbs_wrapper.hpp"
+#if defined(MSCCLPP_USE_MLX5DV)
+#include "mlx5dv_wrapper.hpp"
+#endif  // defined(MSCCLPP_USE_MLX5DV)
 #endif  // defined(USE_IBVERBS)
 #include "logger.hpp"
 
@@ -64,7 +67,7 @@ static inline bool isDmabufSupportedByGpu(int gpuId) {
   return ret;
 }
 
-IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff), size_(0) {
+IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nullptr), buff_(buff), size_(0) {
   if (size == 0) {
     THROW(NET, Error, ErrorCode::InvalidUsage, "invalid MR size: 0");
   }
@@ -80,13 +83,50 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff)
   bool isGpuBuff = (gpuId != -1);
   if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) {
 #if !defined(MSCCLPP_USE_ROCM)
-    int fd;
-    MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+    int fd = -1;
+    size_t rangeSize = pages * pageSize;
 
+    // Obtain a DMA-BUF file descriptor for the GPU memory range. On platforms with a CPU-GPU
+    // bridge that reorders posted writes (e.g., Grace/GB200 NVLink-C2C), the PCIe mapping flag
+    // routes DMA through the Data Direct engine for correct ordering and higher throughput.
+    // Fall back to the default (non-PCIe) mapping if the flag is unsupported.
+#if (CUDA_VERSION >= 12030)
+    CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+                                                   CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
+    if (cuRes != CUDA_SUCCESS || fd < 0) {
+      if (fd >= 0) ::close(fd);
+      fd = -1;
+    }
+    bool usedPcieFlag = (fd >= 0);
+#endif  // CUDA_VERSION >= 12030
+    if (fd < 0) {
+      MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+    }
+
+    // Register the DMA-BUF memory region. When Data Direct is available, use the mlx5dv API
+    // which enables hardware-level Data Direct routing for the MR. Otherwise use standard verbs.
     size_t offsetInDmaBuf = buffIntPtr % pageSize;
-    mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd,
-                                     IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
-                                         IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC);
+    int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
+                      IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;
+
+#if defined(MSCCLPP_USE_MLX5DV)
+    if (isDataDirect) {
+      mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
+    }
+#endif
+    if (mr_ == nullptr) {
+      mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
+    }
+
+    // If MR registration failed with a PCIe-mapped fd, retry with the default mapping.
+#if (CUDA_VERSION >= 12030)
+    if (mr_ == nullptr && usedPcieFlag) {
+      ::close(fd);
+      MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+      mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
+    }
+#endif  // CUDA_VERSION >= 12030
+
     ::close(fd);
     if (mr_ == nullptr) {
       THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")");
@@ -131,7 +171,7 @@ const void* IbMr::getBuff() const { return buff_; }
 uint32_t IbMr::getLkey() const { return mr_->lkey; }
 
 IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum,
-           int maxSendWr, int maxRecvWr, int maxWrPerSend)
+           int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic)
     : portNum_(portNum),
       gidIndex_(gidIndex),
       info_(),
@@ -151,7 +191,8 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
       maxSendCqPollNum_(maxSendCqPollNum),
       maxSendWr_(maxSendWr),
       maxWrPerSend_(maxWrPerSend),
-      maxRecvWr_(maxRecvWr) {
+      maxRecvWr_(maxRecvWr),
+      noAtomic_(noAtomic) {
   sendCq_ = IBVerbs::ibv_create_cq(ctx, maxSendCqSize, nullptr, nullptr, 0);
   if (sendCq_ == nullptr) {
     THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")");
@@ -211,7 +252,8 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
   qpAttr.qp_state = IBV_QPS_INIT;
   qpAttr.pkey_index = 0;
   qpAttr.port_num = portNum_;
-  qpAttr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
+  qpAttr.qp_access_flags = noAtomic_ ? IBV_ACCESS_REMOTE_WRITE
+                                     : (IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC);
   if (IBVerbs::ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS) != 0) {
     THROW(NET, IbError, errno, "ibv_modify_qp failed (errno ", errno, ")");
   }
@@ -240,7 +282,7 @@ void IbQp::rtr(const IbQpInfo& info) {
   qp_attr.path_mtu = static_cast<ibv_mtu>(info.mtu);
   qp_attr.dest_qp_num = info.qpn;
   qp_attr.rq_psn = 0;
-  qp_attr.max_dest_rd_atomic = 1;
+  qp_attr.max_dest_rd_atomic = noAtomic_ ? 0 : 1;
   qp_attr.min_rnr_timer = 0x12;
   if (info.linkLayer == IBV_LINK_LAYER_ETHERNET || info.isGrh) {
     qp_attr.ah_attr.is_global = 1;
@@ -272,7 +314,7 @@ void IbQp::rts() {
   qp_attr.retry_cnt = 7;
   qp_attr.rnr_retry = 7;
   qp_attr.sq_psn = 0;
-  qp_attr.max_rd_atomic = 1;
+  qp_attr.max_rd_atomic = noAtomic_ ? 0 : 1;
   int ret = IBVerbs::ibv_modify_qp(
       qp_, &qp_attr,
       IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC);
@@ -434,12 +476,38 @@ std::string IbQp::getRecvWcStatusString(int idx) const { return IBVerbs::ibv_wc_
 
 unsigned int IbQp::getRecvWcImmData(int idx) const { return ntohl((*recvWcs_)[idx].imm_data); }
 
-IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_(nullptr), supportsRdmaAtomics_(false) {
+IbCtx::IbCtx(const std::string& devName)
+    : devName_(devName),
+      ctx_(nullptr),
+      pd_(nullptr),
+      supportsRdmaAtomics_(false),
+      isMlx5_(false),
+      isDataDirect_(false),
+      isVF_(false) {
   int num;
   struct ibv_device** devices = IBVerbs::ibv_get_device_list(&num);
   for (int i = 0; i < num; ++i) {
     if (std::string(devices[i]->name) == devName_) {
       ctx_ = IBVerbs::ibv_open_device(devices[i]);
+
+      // Detect if this IB device is a Virtual Function (VF).
+      // VFs have a 'physfn' sysfs symlink pointing to their parent PF; PFs do not.
+      {
+        std::string physfnPath = "/sys/class/infiniband/" + devName_ + "/device/physfn";
+        isVF_ = (access(physfnPath.c_str(), F_OK) == 0);
+        if (isVF_) {
+          INFO(NET, "IB device ", devName_, " is a Virtual Function (Data Direct ordering available)");
+        }
+      }
+
+#if defined(MSCCLPP_USE_MLX5DV)
+      if (MLX5DV::isAvailable()) {
+        isMlx5_ = MLX5DV::mlx5dv_is_supported(devices[i]);
+        if (isMlx5_) {
+          INFO(NET, "IB device ", devName_, " supports mlx5 Direct Verbs");
+        }
+      }
+#endif  // defined(MSCCLPP_USE_MLX5DV)
       break;
     }
   }
@@ -452,6 +520,20 @@ IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_
     THROW(NET, IbError, errno, "ibv_alloc_pd failed (errno ", errno, ")");
   }
 
+  // Detect Data Direct support via mlx5dv_get_data_direct_sysfs_path
+#if defined(MSCCLPP_USE_MLX5DV)
+  if (isMlx5_ && MLX5DV::isAvailable()) {
+    char sysfsPath[256];
+    int ret = MLX5DV::mlx5dv_get_data_direct_sysfs_path(ctx_, sysfsPath, sizeof(sysfsPath));
+    if (ret == 0) {
+      isDataDirect_ = true;
+      INFO(NET, "IB device ", devName_, " supports Data Direct (sysfs: ", sysfsPath, ")");
+    } else {
+      INFO(NET, "IB device ", devName_, " does not support Data Direct");
+    }
+  }
+#endif  // defined(MSCCLPP_USE_MLX5DV)
+
   // Query and cache RDMA atomics capability
   struct ibv_device_attr attr = {};
   if (IBVerbs::ibv_query_device(ctx_, &attr) == 0) {
@@ -512,7 +594,7 @@ int IbCtx::getAnyUsablePort(int gidIndex) const {
 }
 
 std::shared_ptr<IbQp> IbCtx::createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
-                                      int maxRecvWr, int maxWrPerSend) {
+                                      int maxRecvWr, int maxWrPerSend, bool noAtomic) {
   if (port == -1) {
     port = this->getAnyUsablePort(gidIndex);
     if (port == -1) {
@@ -521,16 +603,22 @@ std::shared_ptr<IbQp> IbCtx::createQp(int port, int gidIndex, int maxSendCqSize,
   } else if (!this->isPortUsable(port, gidIndex)) {
     THROW(NET, Error, ErrorCode::InvalidUsage, "invalid IB port: ", port);
   }
-  return std::shared_ptr<IbQp>(
-      new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr, maxRecvWr, maxWrPerSend));
+  return std::shared_ptr<IbQp>(new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr,
+                                        maxRecvWr, maxWrPerSend, noAtomic));
 }
 
 std::unique_ptr<const IbMr> IbCtx::registerMr(void* buff, std::size_t size) {
-  return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size));
+  return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size, isDataDirect_));
 }
 
 bool IbCtx::supportsRdmaAtomics() const { return supportsRdmaAtomics_; }
 
+bool IbCtx::isMlx5() const { return isMlx5_; }
+
+bool IbCtx::isDataDirect() const { return isDataDirect_; }
+
+bool IbCtx::isVirtualFunction() const { return isVF_; }
+
 MSCCLPP_API_CPP int getIBDeviceCount() {
   int num;
   IBVerbs::ibv_get_device_list(&num);
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index 06e733c7..22a9930f 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -5,6 +5,7 @@
 #define MSCCLPP_CONNECTION_HPP_
 
 #include <atomic>
+#include <memory>
 #include <mscclpp/core.hpp>
 #include <mscclpp/gpu_utils.hpp>
 #include <mutex>
@@ -15,6 +16,7 @@
 #include "communicator.hpp"
 #include "context.hpp"
 #include "endpoint.hpp"
+#include "gdr.hpp"
 #include "ib.hpp"
 #include "registered_memory.hpp"
 #include "socket.h"
@@ -35,11 +37,18 @@ class BaseConnection {
 
   virtual void flush(int64_t timeoutUsec = -1) = 0;
 
-  /// Set the local address where remote updateAndSync operations should write.
-  /// This is called by the receiver to specify where incoming signals should be written.
-  /// Default implementation is a no-op for connections that don't need it.
-  /// @param addr The local address for incoming writes.
-  virtual void setRemoteUpdateDstAddr(uint64_t /*addr*/) {}
+  /// Start signal forwarding to the given memory address.
+  /// Called by the semaphore to specify where incoming signals should be written.
+  /// @param mem Shared pointer to the GPU memory for the signal token.
+  virtual void startSignalForwarding(std::shared_ptr<uint64_t> /*mem*/) {}
+
+  /// Stop signal forwarding and release associated resources.
+  virtual void stopSignalForwarding() {}
+
+  /// Whether this connection uses signal forwarding (e.g., IB host-no-atomic mode).
+  /// When true, the semaphore must allocate a separate inboundToken_ for the recv thread to write to.
+  /// When false, the NIC writes directly to the semaphore's registered memory (e.g., via atomics).
+  virtual bool isSignalForwarding() const { return false; }
 
   virtual Transport transport() const = 0;
 
@@ -91,22 +100,29 @@ class IBConnection : public BaseConnection {
   Transport transport_;
   Transport remoteTransport_;
   std::weak_ptr<IbQp> qp_;
-  std::unique_ptr<uint64_t> dummyAtomicSource_;  // not used anywhere but IB needs a source
-  RegisteredMemory dummyAtomicSourceMem_;
-  mscclpp::TransportInfo dstTransportInfo_;
+  std::unique_ptr<uint64_t> atomicSrc_;
+  RegisteredMemory atomicSrcMem_;
+  mscclpp::TransportInfo atomicSrcTransportInfo_;
 
   // For write-with-imm mode (HostNoAtomic): uses RDMA write-with-imm to signal
   // instead of atomic operations, with a host thread forwarding to GPU for memory consistency.
   bool ibNoAtomic_;
+  bool gdrSignalForwarding_;  // ibNoAtomic_ && gdrEnabled() — decided once at construction
   std::thread recvThread_;
   std::atomic<bool> stopRecvThread_;
-  int localGpuDeviceId_;  // Local GPU device ID for setting CUDA context in recv thread
-  cudaStream_t signalStream_;
+  std::atomic<bool> recvThreadError_;  // Set by recv thread on fatal error
+  std::string recvThreadErrorMsg_;     // Error message from recv thread (written before recvThreadError_ is set)
+  int localGpuDeviceId_;               // Local GPU device ID for CUDA context and GDR mapping
 
-  // Write-with-imm design:
-  // - Sender: 0-byte RDMA write-with-imm to dst MR, newValue in imm_data (32-bit)
-  // - Receiver: uses remoteUpdateDstAddr_ (set via setRemoteUpdateDstAddr) to know where to write
-  uint64_t remoteUpdateDstAddr_;
+  // Signal forwarding design (HostNoAtomic mode):
+  // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the lower 32 bits of the token in imm_data.
+  // - Receiver: CPU recv thread polls recv CQ for WRITE_WITH_IMM completions (CQE), reads
+  //   the lower 32 bits from imm_data, reconstructs the full 64-bit token using wrap-around
+  //   detection (monotonically increasing tokens: if lower 32 bits decrease, the upper half
+  //   incremented), then writes it to signalAddr_ via atomicStore through GDRCopy BAR1.
+  uint64_t signalAddr_;
+
+  std::unique_ptr<GdrMap> signalGdrMap_;
 
   void recvThreadFunc();
 
@@ -114,10 +130,15 @@ class IBConnection : public BaseConnection {
   IBConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint);
   ~IBConnection();
 
-  /// Set the local address where remote updateAndSync operations will write.
-  /// Must be called before the remote sends any updateAndSync in host-no-atomic mode.
-  /// @param addr The local address for incoming writes.
-  void setRemoteUpdateDstAddr(uint64_t addr) override;
+  /// Start signal forwarding to the given memory address.
+  /// Must be called before the remote sends any updateAndSync in HostNoAtomic mode.
+  /// @param mem Shared pointer to the GPU memory for the signal token.
+  void startSignalForwarding(std::shared_ptr<uint64_t> mem) override;
+
+  /// Stop signal forwarding and release associated resources.
+  void stopSignalForwarding() override;
+
+  bool isSignalForwarding() const override;
 
   Transport transport() const override;
 
diff --git a/src/core/include/context.hpp b/src/core/include/context.hpp
index ee84d0f7..42d03db1 100644
--- a/src/core/include/context.hpp
+++ b/src/core/include/context.hpp
@@ -42,8 +42,6 @@ struct Context::Impl {
   std::shared_ptr<TokenPool> tokenPool_;
   const size_t maxNumTokens_ = 1 << 15;  // 32K tokens
 
-  Impl();
-
   IbCtx* getIbContext(Transport ibTransport);
   std::shared_ptr<uint64_t> getToken();
 };
diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp
index 01f25435..0cffd112 100644
--- a/src/core/include/execution_kernel.hpp
+++ b/src/core/include/execution_kernel.hpp
@@ -240,7 +240,7 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input
           sizeof(int4);
       void* remoteMemory = static_cast<char*>(memoryChannelBufferPtrs_[op.inputBufferRefs[index + 1].id]);
       val = mscclpp::read<int4>(remoteMemory, srcOffset + idx);
-      tmp = cal_vector<T, OpType>(tmp, val);
+      tmp = calVector<T, OpType>(tmp, val);
     }
     output4[outputOffset4 + idx] = tmp;
     if constexpr (SendToRemote) {
@@ -396,9 +396,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* in
     for (uint32_t index = 0; index < nSrcs; ++index) {
       PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * (inputOffsets[index] + tbgOff));
       PacketPayload<PacketType> val = pkt[idx].read(flag_);
-      data = cal_vector<T, OpType>(data, val);
+      data = calVector<T, OpType>(data, val);
     }
-    data = cal_vector<T, OpType>(data, srcPacketPayload[idx]);
+    data = calVector<T, OpType>(data, srcPacketPayload[idx]);
     dstPacketPayload[idx] = data;
 
     if constexpr (SendToRemote) {
@@ -441,9 +441,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceCopySendPackets(const Operation& op, void
     for (uint32_t index = 0; index < nSrcs; ++index) {
       PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * (inputOffsets[index] + tbgOff));
       PacketPayload<PacketType> val = pkt[idx].read(flag_);
-      data = cal_vector<T, OpType>(data, val);
+      data = calVector<T, OpType>(data, val);
     }
-    data = cal_vector<T, OpType>(data, srcPacketPayload[idx]);
+    data = calVector<T, OpType>(data, srcPacketPayload[idx]);
     dstPacketPayload[idx] = data;
     PacketType* dst_val = &dstPkt[idx];
     dst_val->write(data, flag_);
@@ -527,7 +527,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, vo
           (inputOffsets[index] + getOffset<ReuseScratch>(outputBufferRefs[index].type, offset + tbgOffset)) /
           sizeof(int4);
       int4 val = buff4[buffOffset + idx];
-      tmp = cal_vector<T, OpType>(tmp, val);
+      tmp = calVector<T, OpType>(tmp, val);
     }
     dst4[dstOffset4 + idx] = tmp;
     if constexpr (SendToRemote) {
@@ -973,6 +973,17 @@ class ExecutionKernel {
 #endif
         break;
 #endif  // __FP8_TYPES_EXIST__
+      case DataType::FLOAT8_E4M3B15:
+        executionKernel<__fp8_e4m3b15, PacketType, ReuseScratch><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
+            rank, (__fp8_e4m3b15*)src, (__fp8_e4m3b15*)dst, (__fp8_e4m3b15*)scratch, scratchOffset, scratchChunkSize,
+            plan, semaphores, localMemoryIdBegin, flag
+#if defined(ENABLE_NPKIT)
+            ,
+            NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
+#else
+        );
+#endif
+        break;
       case DataType::UINT8:
         executionKernel<uint8_t, PacketType, ReuseScratch><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
             rank, (uint8_t*)src, (uint8_t*)dst, (uint8_t*)scratch, scratchOffset, scratchChunkSize, plan, semaphores,
@@ -984,6 +995,10 @@ class ExecutionKernel {
         );
 #endif
         break;
+      case DataType::AUTO:
+        // AUTO is a sentinel that must be resolved before reaching this point.
+        assert(false && "DataType::AUTO must be resolved before kernel launch");
+        break;
     }
   }
 #else   // !defined(MSCCLPP_DEVICE_HIP)
diff --git a/src/core/include/gdr.hpp b/src/core/include/gdr.hpp
new file mode 100644
index 00000000..e0c7f006
--- /dev/null
+++ b/src/core/include/gdr.hpp
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#ifndef MSCCLPP_GDR_HPP_
+#define MSCCLPP_GDR_HPP_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace mscclpp {
+
+enum class GdrStatus {
+  Ok,             // GDRCopy initialized successfully
+  NotBuilt,       // Built without MSCCLPP_USE_GDRCOPY
+  Disabled,       // Disabled via MSCCLPP_FORCE_DISABLE_GDR
+  DriverMissing,  // /dev/gdrdrv not found
+  OpenFailed,     // gdr_open() failed
+};
+
+/// Return the detailed status of the global GDRCopy context.
+GdrStatus gdrStatus();
+
+/// Whether the global GDRCopy context is enabled (shorthand for gdrStatus() == GdrStatus::Ok).
+bool gdrEnabled();
+
+/// Return a human-readable error message for the current GDRCopy status.
+const char* gdrStatusMessage();
+
+/// RAII wrapper for a GDRCopy BAR1 mapping of a GPU address.
+/// When GDRCopy is not available, all operations are no-ops and valid() returns false.
+class GdrMap {
+ public:
+  /// Pin and map a GPU address for direct host-side access.
+  /// @param gpuMem   Shared pointer to the GPU memory (e.g. from gpuCallocShared).
+  /// @param deviceId The CUDA device ID for setting context.
+  GdrMap(std::shared_ptr<void> gpuMem, int deviceId);
+  ~GdrMap();
+
+  GdrMap(const GdrMap&) = delete;
+  GdrMap& operator=(const GdrMap&) = delete;
+
+  /// Whether the mapping was established successfully.
+  bool valid() const;
+
+  /// Return the BAR1-mapped host pointer to the GPU location.
+  uint64_t* hostPtr() const;
+
+  /// Copy data from host memory to the mapped GPU location.
+  void copyTo(const void* src, size_t size);
+
+  /// Copy data from the mapped GPU location to host memory.
+  void copyFrom(void* dst, size_t size) const;
+
+ private:
+  struct Impl;
+  std::unique_ptr<Impl> pimpl_;
+};
+
+}  // namespace mscclpp
+
+#endif  // MSCCLPP_GDR_HPP_
diff --git a/src/core/include/ib.hpp b/src/core/include/ib.hpp
index e9363e9c..36c5a237 100644
--- a/src/core/include/ib.hpp
+++ b/src/core/include/ib.hpp
@@ -36,7 +36,7 @@ class IbMr {
   uint32_t getLkey() const;
 
  private:
-  IbMr(ibv_pd* pd, void* buff, std::size_t size);
+  IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect);
 
   ibv_mr* mr_;
   void* buff_;
@@ -101,7 +101,7 @@ class IbQp {
   };
 
   IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
-       int maxRecvWr, int maxWrPerSend);
+       int maxRecvWr, int maxWrPerSend, bool noAtomic);
   SendWrInfo getNewSendWrInfo();
   RecvWrInfo getNewRecvWrInfo();
 
@@ -128,6 +128,7 @@ class IbQp {
   const int maxSendWr_;
   const int maxWrPerSend_;
   const int maxRecvWr_;
+  const bool noAtomic_;
 
   friend class IbCtx;
 };
@@ -139,18 +140,24 @@ class IbCtx {
   ~IbCtx();
 
   std::shared_ptr<IbQp> createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
-                                 int maxRecvWr, int maxWrPerSend);
+                                 int maxRecvWr, int maxWrPerSend, bool noAtomic);
   std::unique_ptr<const IbMr> registerMr(void* buff, std::size_t size);
   bool supportsRdmaAtomics() const;
+  bool isMlx5() const;
+  bool isDataDirect() const;
+  bool isVirtualFunction() const;
 #else
   IbCtx([[maybe_unused]] const std::string& devName) {}
   ~IbCtx() {}
 
-  std::shared_ptr<IbQp> createQp(int, int, int, int, int, int, int) { return nullptr; }
+  std::shared_ptr<IbQp> createQp(int, int, int, int, int, int, int, bool) { return nullptr; }
   std::unique_ptr<const IbMr> registerMr([[maybe_unused]] void* buff, [[maybe_unused]] std::size_t size) {
     return nullptr;
   }
   bool supportsRdmaAtomics() const { return false; }
+  bool isMlx5() const { return false; }
+  bool isDataDirect() const { return false; }
+  bool isVirtualFunction() const { return false; }
 #endif
 
   const std::string& getDevName() const { return devName_; };
@@ -163,6 +170,9 @@ class IbCtx {
   ibv_context* ctx_;
   ibv_pd* pd_;
   bool supportsRdmaAtomics_;
+  bool isMlx5_;
+  bool isDataDirect_;
+  bool isVF_;
 };
 
 }  // namespace mscclpp
diff --git a/src/core/include/mlx5dv_wrapper.hpp b/src/core/include/mlx5dv_wrapper.hpp
new file mode 100644
index 00000000..79403a36
--- /dev/null
+++ b/src/core/include/mlx5dv_wrapper.hpp
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#ifndef MSCCLPP_MLX5DV_WRAPPER_HPP_
+#define MSCCLPP_MLX5DV_WRAPPER_HPP_
+
+#if defined(MSCCLPP_USE_MLX5DV)
+
+#include <infiniband/verbs.h>
+
+#include <string>
+
+namespace mscclpp {
+
+struct MLX5DV {
+  /// Whether libmlx5.so was successfully loaded at runtime.
+  static bool isAvailable();
+
+  /// Check if the given IB device supports mlx5 Direct Verbs.
+  static bool mlx5dv_is_supported(struct ibv_device* device);
+
+  /// Register a DMABUF memory region using mlx5dv extensions.
+  /// Returns nullptr if mlx5dv_reg_dmabuf_mr is not available in this rdma-core version.
+  static struct ibv_mr* mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
+                                             int access);
+
+  /// Query the Data Direct sysfs path for the given IB context.
+  /// Returns 0 on success (device supports Data Direct), non-zero otherwise.
+  static int mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len);
+
+ private:
+  static void* dlsym(const std::string& symbol, bool allowReturnNull = false);
+};
+
+}  // namespace mscclpp
+
+#endif  // defined(MSCCLPP_USE_MLX5DV)
+#endif  // MSCCLPP_MLX5DV_WRAPPER_HPP_
diff --git a/src/core/include/reduce_kernel.hpp b/src/core/include/reduce_kernel.hpp
index fd9bd1e9..463f827d 100644
--- a/src/core/include/reduce_kernel.hpp
+++ b/src/core/include/reduce_kernel.hpp
@@ -14,7 +14,7 @@ namespace mscclpp {
 
 // Generic element-wise calculation helper
 template <typename T, ReduceOp OpType>
-MSCCLPP_DEVICE_INLINE T cal_elements(const T& a, const T& b) {
+MSCCLPP_DEVICE_INLINE T calElements(const T& a, const T& b) {
   if constexpr (OpType == SUM) {
     return a + b;
   } else if constexpr (OpType == MIN) {
@@ -24,56 +24,168 @@ MSCCLPP_DEVICE_INLINE T cal_elements(const T& a, const T& b) {
 }
 
 // Generic vector reduction helpers
-template <typename T, ReduceOp OpType>
-MSCCLPP_DEVICE_INLINE int4 cal_vector_helper(const int4& a, const int4& b) {
-  int4 ret;
-  ret.w = bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a.w), bit_cast<T, int>(b.w)));
-  ret.x = bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
-  ret.y = bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
-  ret.z = bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a.z), bit_cast<T, int>(b.z)));
-  return ret;
-}
 
 template <typename T, ReduceOp OpType>
-MSCCLPP_DEVICE_INLINE uint2 cal_vector_helper(const uint2& a, const uint2& b) {
+MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) {
   uint2 ret;
-  ret.x = bit_cast<uint32_t, T>(cal_elements<T, OpType>(bit_cast<T, uint32_t>(a.x), bit_cast<T, uint32_t>(b.x)));
-  ret.y = bit_cast<uint32_t, T>(cal_elements<T, OpType>(bit_cast<T, uint32_t>(a.y), bit_cast<T, uint32_t>(b.y)));
+  ret.x = bit_cast<uint32_t, T>(calElements<T, OpType>(bit_cast<T, uint32_t>(a.x), bit_cast<T, uint32_t>(b.x)));
+  ret.y = bit_cast<uint32_t, T>(calElements<T, OpType>(bit_cast<T, uint32_t>(a.y), bit_cast<T, uint32_t>(b.y)));
   return ret;
 }
 
-template <typename T, ReduceOp OpType>
-MSCCLPP_DEVICE_INLINE int cal_vector_helper(const int& a, const int& b) {
-  return bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a), bit_cast<T, int>(b)));
+/// f32x2 specialization for uint2: uses packed f32x2 operator+ (Blackwell __fadd2_rn when available).
+template <>
+MSCCLPP_DEVICE_INLINE uint2 calVectorHelper<f32x2, SUM>(const uint2& a, const uint2& b) {
+  f32x2 fa = bit_cast<f32x2, uint2>(a);
+  f32x2 fb = bit_cast<f32x2, uint2>(b);
+  f32x2 fr = fa + fb;
+  return bit_cast<uint2, f32x2>(fr);
+}
+
+template <>
+MSCCLPP_DEVICE_INLINE uint2 calVectorHelper<f32x2, MIN>(const uint2& a, const uint2& b) {
+  f32x2 fa = bit_cast<f32x2, uint2>(a);
+  f32x2 fb = bit_cast<f32x2, uint2>(b);
+  f32x2 fr = mscclpp::min(fa, fb);
+  return bit_cast<uint2, f32x2>(fr);
 }
 
 template <typename T, ReduceOp OpType>
-MSCCLPP_DEVICE_INLINE uint32_t cal_vector_helper(const uint32_t& a, const uint32_t& b) {
-  return bit_cast<uint32_t, T>(cal_elements<T, OpType>(bit_cast<T, uint32_t>(a), bit_cast<T, uint32_t>(b)));
+MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) {
+  int4 ret;
+  ret.w = bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a.w), bit_cast<T, int>(b.w)));
+  ret.x = bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
+  ret.y = bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
+  ret.z = bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a.z), bit_cast<T, int>(b.z)));
+  return ret;
 }
 
-// cal_vector wrapper - converts scalar types to vector types and calls cal_vector_helper
+/// f32x2 specialization for int4: process as two uint2 pairs using packed f32x2 arithmetic.
+template <>
+MSCCLPP_DEVICE_INLINE int4 calVectorHelper<f32x2, SUM>(const int4& a, const int4& b) {
+  uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y};
+  uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w};
+  uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y};
+  uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w};
+  uint2 lo_r = calVectorHelper<f32x2, SUM>(lo_a, lo_b);
+  uint2 hi_r = calVectorHelper<f32x2, SUM>(hi_a, hi_b);
+  return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y};
+}
+
+template <>
+MSCCLPP_DEVICE_INLINE int4 calVectorHelper<f32x2, MIN>(const int4& a, const int4& b) {
+  uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y};
+  uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w};
+  uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y};
+  uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w};
+  uint2 lo_r = calVectorHelper<f32x2, MIN>(lo_a, lo_b);
+  uint2 hi_r = calVectorHelper<f32x2, MIN>(hi_a, hi_b);
+  return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y};
+}
+
+template <typename T, ReduceOp OpType>
+MSCCLPP_DEVICE_INLINE int calVectorHelper(const int& a, const int& b) {
+  return bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a), bit_cast<T, int>(b)));
+}
+
+template <typename T, ReduceOp OpType>
+MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) {
+  return bit_cast<uint32_t, T>(calElements<T, OpType>(bit_cast<T, uint32_t>(a), bit_cast<T, uint32_t>(b)));
+}
+
+/// f32x2 specialization for uint32_t: a single float packed in 32 bits (scalar fallback).
+template <>
+MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper<f32x2, SUM>(const uint32_t& a, const uint32_t& b) {
+  float fa = bit_cast<float, uint32_t>(a);
+  float fb = bit_cast<float, uint32_t>(b);
+  return bit_cast<uint32_t, float>(fa + fb);
+}
+
+template <>
+MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper<f32x2, MIN>(const uint32_t& a, const uint32_t& b) {
+  float fa = bit_cast<float, uint32_t>(a);
+  float fb = bit_cast<float, uint32_t>(b);
+  return bit_cast<uint32_t, float>(fminf(fa, fb));
+}
+
+// calVector wrapper – converts scalar types to vector types and calls calVectorHelper
 template <typename T, ReduceOp OpType, typename DataType>
-MSCCLPP_DEVICE_INLINE DataType cal_vector(const DataType& a, const DataType& b) {
+MSCCLPP_DEVICE_INLINE DataType calVector(const DataType& a, const DataType& b) {
   // Define the vectorized computation type based on the element type
   static_assert(sizeof(DataType) % sizeof(T) == 0, "DataType size must be multiple of T size");
   static_assert(sizeof(DataType) >= 4, "DataType size must be at least 4 bytes");
   using CompType = typename std::conditional_t<
-      std::is_same_v<T, __half>, f16x2,
+      std::is_same_v<T, float>, f32x2,
       std::conditional_t<
-          std::is_same_v<T, __bfloat16>, bf16x2,
-          std::conditional_t<std::is_same_v<T, uint8_t>, u8x4,
+          std::is_same_v<T, __half>, f16x2,
+          std::conditional_t<
+              std::is_same_v<T, __bfloat16>, bf16x2,
+              std::conditional_t<
+                  std::is_same_v<T, uint8_t>, u8x4,
+                  std::conditional_t<std::is_same_v<T, __fp8_e4m3b15>, f8_e4m3b15x4,
 #if defined(__FP8_TYPES_EXIST__)
-                             std::conditional_t<std::is_same_v<T, __fp8_e4m3>, f8_e4m3x4,
-                                                std::conditional_t<std::is_same_v<T, __fp8_e5m2>, f8_e5m2x4,
-#endif
-                                                                   T
-#if defined(__FP8_TYPES_EXIST__)
-                                                                   >>>>>;
+                                     std::conditional_t<std::is_same_v<T, __fp8_e4m3>, f8_e4m3x4,
+                                                        std::conditional_t<std::is_same_v<T, __fp8_e5m2>, f8_e5m2x4, T>>
 #else
-                             >>>;
+                                     T
 #endif
-  return cal_vector_helper<CompType, OpType>(a, b);
+                                     >>>>>;
+  return calVectorHelper<CompType, OpType>(a, b);
+}
+
+/// Upcast a packed DataType (containing T elements) to a packed AccDataType (containing AccumT elements).
+/// Uses the optimized to<>() specializations when available (e.g. FP8 -> float hardware intrinsics).
+/// When AccumT == T, this is a no-op identity.
+template <typename T, typename AccumT, typename AccDataType, typename DataType>
+MSCCLPP_DEVICE_INLINE AccDataType upcastVector(const DataType& val) {
+  if constexpr (std::is_same_v<T, AccumT>) {
+    return val;
+  } else {
+    constexpr int nElems = sizeof(DataType) / sizeof(T);
+    using FromVec = VectorType<T, nElems>;
+    using ToVec = VectorType<AccumT, nElems>;
+    ToVec result = mscclpp::to<ToVec>(reinterpret_cast<const FromVec&>(val));
+    return reinterpret_cast<const AccDataType&>(result);
+  }
+}
+
+/// Downcast a packed AccDataType (containing AccumT elements) back to DataType (containing T elements).
+/// Uses the optimized to<>() specializations when available.
+/// When AccumT == T, this is a no-op identity.
+template <typename T, typename AccumT, typename DataType, typename AccDataType>
+MSCCLPP_DEVICE_INLINE DataType downcastVector(const AccDataType& val) {
+  if constexpr (std::is_same_v<T, AccumT>) {
+    return val;
+  } else {
+    constexpr int nElems = sizeof(DataType) / sizeof(T);
+    using FromVec = VectorType<T, nElems>;
+    using ToVec = VectorType<AccumT, nElems>;
+    FromVec result = mscclpp::to<FromVec>(reinterpret_cast<const ToVec&>(val));
+    return reinterpret_cast<const DataType&>(result);
+  }
+}
+
+/// Accumulate `val` (packed T elements in DataType) into `acc` (packed AccumT elements in AccDataType).
+/// When AccumT == T, falls back to the standard calVector.
+/// Otherwise, upcasts val to AccumT, reduces element-wise, and returns the AccumT accumulator.
+template <typename T, typename AccumT, ReduceOp OpType, typename AccDataType, typename DataType>
+MSCCLPP_DEVICE_INLINE AccDataType calVectorAccum(const AccDataType& acc, const DataType& val) {
+  if constexpr (std::is_same_v<T, AccumT>) {
+    return calVector<T, OpType>(acc, val);
+  } else {
+    constexpr int nElems = sizeof(DataType) / sizeof(T);
+    using FromVec = VectorType<T, nElems>;
+    using ToVec = VectorType<AccumT, nElems>;
+
+    ToVec fv = mscclpp::to<ToVec>(reinterpret_cast<const FromVec&>(val));
+    const ToVec& fa = reinterpret_cast<const ToVec&>(acc);
+    ToVec fr;
+#pragma unroll
+    for (int i = 0; i < nElems; ++i) {
+      fr.data[i] = calElements<AccumT, OpType>(fa.data[i], fv.data[i]);
+    }
+    return reinterpret_cast<const AccDataType&>(fr);
+  }
 }
 
 #endif  // defined(MSCCLPP_DEVICE_COMPILE)
diff --git a/src/core/mlx5dv_wrapper.cc b/src/core/mlx5dv_wrapper.cc
new file mode 100644
index 00000000..a56fad96
--- /dev/null
+++ b/src/core/mlx5dv_wrapper.cc
@@ -0,0 +1,126 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#if defined(MSCCLPP_USE_MLX5DV)
+
+// _GNU_SOURCE is required for dlvsym()
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "mlx5dv_wrapper.hpp"
+
+#include <dlfcn.h>
+#include <infiniband/mlx5dv.h>
+
+#ifndef MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
+#define MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT (1 << 0)
+#endif
+
+#include <memory>
+
+#include "logger.hpp"
+
+namespace mscclpp {
+
+static std::unique_ptr<void, int (*)(void*)> globalMLX5Handle(nullptr, &::dlclose);
+
+void* MLX5DV::dlsym(const std::string& symbol, bool allowReturnNull) {
+  if (!globalMLX5Handle) {
+    const char* possibleLibNames[] = {"libmlx5.so", "libmlx5.so.1", nullptr};
+    for (int i = 0; possibleLibNames[i] != nullptr; i++) {
+      void* handle = ::dlopen(possibleLibNames[i], RTLD_NOW);
+      if (handle) {
+        globalMLX5Handle.reset(handle);
+        break;
+      }
+    }
+    if (!globalMLX5Handle) {
+      if (allowReturnNull) return nullptr;
+      THROW(NET, SysError, errno, "Failed to open libmlx5: ", std::string(::dlerror()));
+    }
+  }
+  void* ptr = ::dlsym(globalMLX5Handle.get(), symbol.c_str());
+  if (!ptr && !allowReturnNull) {
+    THROW(NET, SysError, errno, "Failed to load libmlx5 symbol: ", symbol);
+  }
+  return ptr;
+}
+
+bool MLX5DV::isAvailable() {
+  static int available = -1;
+  if (available == -1) {
+    // Try to load the library; if it fails, mlx5dv is not available
+    const char* possibleLibNames[] = {"libmlx5.so", "libmlx5.so.1", nullptr};
+    for (int i = 0; possibleLibNames[i] != nullptr; i++) {
+      void* handle = ::dlopen(possibleLibNames[i], RTLD_NOW);
+      if (handle) {
+        if (!globalMLX5Handle) {
+          globalMLX5Handle.reset(handle);
+        } else {
+          ::dlclose(handle);
+        }
+        available = 1;
+        INFO(NET, "libmlx5 loaded successfully");
+        return true;
+      }
+    }
+    available = 0;
+    DEBUG(NET, "libmlx5 not available");
+  }
+  return available == 1;
+}
+
+bool MLX5DV::mlx5dv_is_supported(struct ibv_device* device) {
+  using FuncType = bool (*)(struct ibv_device*);
+  static FuncType impl = nullptr;
+  if (!impl) {
+    void* ptr = MLX5DV::dlsym("mlx5dv_is_supported", /*allowReturnNull=*/true);
+    if (!ptr) return false;
+    impl = reinterpret_cast<FuncType>(ptr);
+  }
+  return impl(device);
+}
+
+struct ibv_mr* MLX5DV::mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
+                                            int access) {
+  // mlx5dv_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access) — the last arg is mlx5-specific flags.
+  // Must use dlvsym with "MLX5_1.25" version to get the Data Direct-capable symbol.
+  using FuncType = struct ibv_mr* (*)(struct ibv_pd*, uint64_t, size_t, uint64_t, int, int, int);
+  static FuncType impl = nullptr;
+  static bool resolved = false;
+  if (!resolved) {
+    if (globalMLX5Handle) {
+      void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_reg_dmabuf_mr", "MLX5_1.25");
+      if (!ptr) {
+        ptr = MLX5DV::dlsym("mlx5dv_reg_dmabuf_mr", /*allowReturnNull=*/true);
+      }
+      impl = ptr ? reinterpret_cast<FuncType>(ptr) : nullptr;
+    }
+    resolved = true;
+  }
+  if (!impl) return nullptr;
+  return impl(pd, offset, length, iova, fd, access, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT);
+}
+
+int MLX5DV::mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len) {
+  using FuncType = int (*)(struct ibv_context*, char*, size_t);
+  static FuncType impl = nullptr;
+  static bool resolved = false;
+  if (!resolved) {
+    if (globalMLX5Handle) {
+      void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_get_data_direct_sysfs_path", "MLX5_1.25");
+      if (!ptr) {
+        ptr = MLX5DV::dlsym("mlx5dv_get_data_direct_sysfs_path", /*allowReturnNull=*/true);
+      }
+      impl = ptr ? reinterpret_cast<FuncType>(ptr) : nullptr;
+    }
+    resolved = true;
+  }
+  if (!impl) return -1;
+  return impl(context, buf, buf_len);
+}
+
+}  // namespace mscclpp
+
+#endif  // defined(MSCCLPP_USE_MLX5DV)
diff --git a/src/core/semaphore.cc b/src/core/semaphore.cc
index bea43327..49a3791b 100644
--- a/src/core/semaphore.cc
+++ b/src/core/semaphore.cc
@@ -8,6 +8,7 @@
 #include "atomic.hpp"
 #include "connection.hpp"
 #include "context.hpp"
+#include "logger.hpp"
 #include "registered_memory.hpp"
 #include "serialization.hpp"
 
@@ -48,12 +49,12 @@ SemaphoreStub::Impl::Impl(const Connection& connection) : connection_(connection
     token_ = std::make_shared<uint64_t>(0);
   } else if (localDevice.type == DeviceType::GPU) {
     if (localDevice.id < 0) {
-      throw Error("Local GPU ID is not provided", ErrorCode::InvalidUsage);
+      THROW(CONN, Error, ErrorCode::InvalidUsage, "Local GPU ID is not provided");
     }
     CudaDeviceGuard deviceGuard(localDevice.id);
     token_ = gpuCallocToken(connection_.context());
   } else {
-    throw Error("Unsupported local device type", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Unsupported local device type");
   }
   idMemory_ = std::move(connection_.context()->registerMemory(token_.get(), sizeof(uint64_t), connection_.transport()));
 }
@@ -78,7 +79,7 @@ MSCCLPP_API_CPP SemaphoreStub SemaphoreStub::deserialize(const std::vector<char>
   RegisteredMemory idMemory(std::make_shared<RegisteredMemory::Impl>(data.begin(), memEnd));
   auto it = detail::deserialize(memEnd, device);
   if (it != data.end()) {
-    throw Error("SemaphoreStub deserialize failed", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "SemaphoreStub deserialize failed");
   }
   return SemaphoreStub(std::make_shared<Impl>(std::move(idMemory), device));
 }
@@ -119,15 +120,35 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& sema
       expectedInboundToken_(detail::gpuCallocUnique<uint64_t>()),
       outboundToken_(std::make_unique<uint64_t>()) {
   if (connection().localDevice().type != DeviceType::GPU) {
-    throw Error("Local endpoint device type of Host2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2DeviceSemaphore should be GPU");
   }
-  BaseConnection::getImpl(connection())
-      ->setRemoteUpdateDstAddr(reinterpret_cast<uint64_t>(semaphore_.localMemory().data()));
+  auto connImpl = BaseConnection::getImpl(connection());
+  if (connImpl->isSignalForwarding()) {
+    // Signal forwarding (HostNoAtomic): the receiver's recv thread polls the recv CQ for
+    // WRITE_WITH_IMM completions, then forwards the token to inboundToken_ via GDRCopy.
+    CudaDeviceGuard deviceGuard(connection().localDevice().id);
+#if defined(MSCCLPP_USE_ROCM)
+    inboundToken_ = detail::gpuCallocUncachedShared<uint64_t>();
+#else
+    inboundToken_ = detail::gpuCallocShared<uint64_t>();
+#endif
+    connImpl->startSignalForwarding(inboundToken_);
+  }
+  // When isSignalForwarding() is false (atomic mode), inboundToken_ stays null
+  // and the GPU polls the SemaphoreStub token directly (the NIC atomic target).
 }
 
 MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator, const Connection& connection)
     : Host2DeviceSemaphore(buildSemaphoreFromConnection(communicator, connection)) {}
 
+MSCCLPP_API_CPP Host2DeviceSemaphore::~Host2DeviceSemaphore() {
+  if (inboundToken_) {
+    // Clear the connection's signal forwarding destination (and GdrMap)
+    // before inboundToken_ is freed, to avoid use-after-free on the pinned GPU memory.
+    BaseConnection::getImpl(connection())->stopSignalForwarding();
+  }
+}
+
 MSCCLPP_API_CPP Connection& Host2DeviceSemaphore::connection() { return semaphore_.connection(); }
 
 MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {
@@ -136,7 +157,11 @@ MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {
 
 MSCCLPP_API_CPP Host2DeviceSemaphore::DeviceHandle Host2DeviceSemaphore::deviceHandle() const {
   Host2DeviceSemaphore::DeviceHandle device;
-  device.inboundToken = reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
+  // If inboundToken_ is allocated (signal forwarding mode), the GPU polls it.
+  // Otherwise (atomic mode), the GPU polls the SemaphoreStub token directly,
+  // which is the same address targeted by the NIC's atomic operation.
+  device.inboundToken =
+      inboundToken_ ? inboundToken_.get() : reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
   device.expectedInboundToken = expectedInboundToken_.get();
   return device;
 }
@@ -146,13 +171,19 @@ MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(const Semaphore& semaphor
       expectedInboundToken_(std::make_unique<uint64_t>()),
       outboundToken_(std::make_unique<uint64_t>()) {
   if (connection().transport() == Transport::CudaIpc) {
-    throw Error("Host2HostSemaphore cannot be used with CudaIpc transport", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Host2HostSemaphore cannot be used with CudaIpc transport");
   }
   if (connection().localDevice().type != DeviceType::CPU) {
-    throw Error("Local endpoint device type of Host2HostSemaphore should be CPU", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2HostSemaphore should be CPU");
+  }
+  auto connImpl = BaseConnection::getImpl(connection());
+  if (connImpl->isSignalForwarding()) {
+    // Signal forwarding mode: tell the recv thread where to write the incoming token.
+    // Non-owning shared_ptr: Host2HostSemaphore outlives the connection, so the memory stays valid.
+    auto token =
+        std::shared_ptr<uint64_t>(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()), [](uint64_t*) {});
+    connImpl->startSignalForwarding(std::move(token));
   }
-  BaseConnection::getImpl(connection())
-      ->setRemoteUpdateDstAddr(reinterpret_cast<uint64_t>(semaphore_.localMemory().data()));
 }
 
 MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(Communicator& communicator, const Connection& connection)
@@ -177,7 +208,7 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) {
   while (atomicLoad(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()), memoryOrderAcquire) <
          (*expectedInboundToken_)) {
     if (maxSpinCount >= 0 && spinCount++ == maxSpinCount) {
-      throw Error("Host2HostSemaphore::wait timed out", ErrorCode::Timeout);
+      THROW(CONN, Error, ErrorCode::Timeout, "Host2HostSemaphore::wait timed out");
     }
   }
 }
@@ -185,7 +216,8 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) {
 MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::MemoryDevice2DeviceSemaphore(const Semaphore& semaphore)
     : semaphore_(semaphore), expectedInboundToken_(detail::gpuCallocUnique<uint64_t>()) {
   if (connection().localDevice().type != DeviceType::GPU) {
-    throw Error("Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage,
+          "Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU");
   }
 }
 
diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu
index 0b288b38..fb51a342 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh.cu
@@ -183,7 +183,8 @@ std::shared_ptr<Algorithm> AllgatherFullmesh::build() {
       [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize, [[maybe_unused]] DataType dtype, [[maybe_unused]] ReduceOp op,
              cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-             const std::unordered_map<std::string, uintptr_t>& extras) -> CommResult {
+             const std::unordered_map<std::string, uintptr_t>& extras,
+             [[maybe_unused]] DataType accumDtype) -> CommResult {
         return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras);
       },
       [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu
index cf6027d9..9d169d68 100644
--- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu
+++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu
@@ -212,7 +212,8 @@ std::shared_ptr<Algorithm> AllgatherFullmesh2::build() {
       [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize, [[maybe_unused]] mscclpp::DataType dtype, [[maybe_unused]] ReduceOp op,
              cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-             const std::unordered_map<std::string, uintptr_t>& extras) -> mscclpp::CommResult {
+             const std::unordered_map<std::string, uintptr_t>& extras,
+             [[maybe_unused]] mscclpp::DataType accumDtype) -> mscclpp::CommResult {
         return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras);
       },
       [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
index 83950d7c..17bcfc33 100644
--- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu
@@ -2,6 +2,7 @@
 // Licensed under the MIT license.
 
 #include <collective_utils.hpp>
+#include <type_traits>
 
 #include "allreduce/allreduce_allpair_packet.hpp"
 #include "allreduce/common.hpp"
@@ -11,7 +12,7 @@
 namespace mscclpp {
 namespace collective {
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
                                   size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode,
                                   int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags,
@@ -43,13 +44,16 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand
   // step 2: Reduce Data
   for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nelems; idx += blockDim.x * gridDim.x) {
     uint32_t data = src[idx];
+    using AccRaw = std::conditional_t<std::is_same_v<T, AccumT>, uint32_t,
+                                      mscclpp::VectorType<AccumT, sizeof(uint32_t) / sizeof(T)>>;
+    AccRaw acc = mscclpp::upcastVector<T, AccumT, AccRaw>(data);
     for (int index = 0; index < nPeers; index++) {
       const int remoteRank = index < rank ? index : index + 1;
       LL8Packet* dstPkt = (LL8Packet*)scratchBuff + remoteRank * nelems;
       uint32_t val = dstPkt[idx].read(flag, -1);
-      data = cal_vector<T, OpType>(val, data);
+      acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccRaw>(acc, val);
     }
-    dst[idx] = data;
+    dst[idx] = mscclpp::downcastVector<T, AccumT, uint32_t>(acc);
   }
   __syncthreads();
   if (threadIdx.x == 0) {
@@ -67,7 +71,7 @@ inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int
   return {(worldSize - 1) * 4, 512};
 }
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllpairAdapter {
   static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
@@ -76,7 +80,12 @@ struct AllpairAdapter {
                           int nThreadsPerBlock = 0) {
     using ChannelType = DeviceHandle<MemoryChannel>;
     const size_t nelems = inputSize / sizeof(T);
-    allreduceAllPairs<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
+    // Round nBlocks to multiple of nPeers so every block maps to a valid peer.
+    const int nPeers = worldSize - 1;
+    if (nPeers > 0) {
+      nBlocks = (nBlocks / nPeers) * nPeers;
+    }
+    allreduceAllPairs<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
         nRanksPerNode, worldSize, nelems, numScratchBuff, flags, flagSize);
     return cudaGetLastError();
@@ -94,18 +103,24 @@ void AllreduceAllpairPacket::initialize(std::shared_ptr<Communicator> comm) {
 CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output,
                                                        size_t inputSize, [[maybe_unused]] DataType dtype, ReduceOp op,
                                                        cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                                       const std::unordered_map<std::string, uintptr_t>&) {
+                                                       const std::unordered_map<std::string, uintptr_t>&,
+                                                       DataType accumDtype) {
   auto algoCtx = std::static_pointer_cast<AlgorithmCtx>(ctx);
   std::pair<int, int> blockAndThreadNum{nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
     blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->workSize);
   }
+  // nBlocks must be at least nPeers for allpair — each block maps to one peer.
+  const int nPeers = algoCtx->nRanksPerNode - 1;
+  if (nPeers > 0 && blockAndThreadNum.first < nPeers) {
+    return CommResult::CommInvalidArgument;
+  }
   size_t sendBytes;
   CUdeviceptr sendBasePtr;
   MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
   size_t channelInOffset = (char*)input - (char*)sendBasePtr;
 
-  AllreduceFunc allreduce = dispatch<AllpairAdapter>(op, dtype);
+  AllreduceFunc allreduce = dispatch<AllpairAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast<int>(dtype));
     return CommResult::CommInvalidArgument;
@@ -161,9 +176,9 @@ std::shared_ptr<Algorithm> AllreduceAllpairPacket::build() {
       [self](std::shared_ptr<Communicator> comm) { self->initialize(comm); },
       [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
-             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
+             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
         return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock,
-                                         extras);
+                                         extras, accumDtype);
       },
       [self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize,
diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
index 13c63ba1..24d2a31c 100644
--- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu
+++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu
@@ -9,7 +9,7 @@
 namespace mscclpp {
 namespace collective {
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(512, 1)
     allreduceFullmesh(T* buff, T* scratch, T* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
                       DeviceHandle<MemoryChannel>* memoryOutChannels, size_t channelOutDataOffset, int rank,
@@ -26,6 +26,10 @@ __global__ void __launch_bounds__(512, 1)
   int4* scratch4 = reinterpret_cast<int4*>((char*)scratch);
   int4* resultBuff4 = reinterpret_cast<int4*>(resultBuff);
 
+  // AccumVec: wider vector for mixed-precision accumulation. When AccumT==T, this is just int4 (no-op).
+  constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T);
+  using AccumVec = std::conditional_t<std::is_same_v<T, AccumT>, int4, mscclpp::VectorType<AccumT, nElemsPerInt4>>;
+
   // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4`
   constexpr size_t unitNInt4 = 512;
   const size_t maxNInt4PerBlock =
@@ -81,12 +85,14 @@ __global__ void __launch_bounds__(512, 1)
     __syncthreads();
 
     for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) {
-      int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
+      int4 rawData = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
+      AccumVec acc = mscclpp::upcastVector<T, AccumT, AccumVec>(rawData);
       for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
         const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
         int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx];
-        data = cal_vector<T, OpType>(val, data);
+        acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccumVec>(acc, val);
       }
+      int4 data = mscclpp::downcastVector<T, AccumT, int4>(acc);
       resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data;
       for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
         outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4),
@@ -121,12 +127,14 @@ __global__ void __launch_bounds__(512, 1)
     __syncthreads();
 
     for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) {
-      int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
+      int4 rawData = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
+      AccumVec acc = mscclpp::upcastVector<T, AccumT, AccumVec>(rawData);
       for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
         const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
         int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx];
-        data = cal_vector<T, OpType>(val, data);
+        acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccumVec>(acc, val);
       }
+      int4 data = mscclpp::downcastVector<T, AccumT, int4>(acc);
       resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data;
       for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
         outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4),
@@ -144,7 +152,7 @@ __global__ void __launch_bounds__(512, 1)
   }
 }
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceAllconnectAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels,
                           DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t,
@@ -155,7 +163,7 @@ struct AllreduceAllconnectAdapter {
     size_t nelems = inputSize / sizeof(T);
     if (nBlocks == 0) nBlocks = 35;
     if (nThreadsPerBlock == 0) nThreadsPerBlock = 512;
-    allreduceFullmesh<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
+    allreduceFullmesh<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, (ChannelType*)memoryOutChannels,
         channelOutDataOffset, rank, nRanksPerNode, worldSize, nelems);
     return cudaGetLastError();
@@ -174,10 +182,10 @@ void AllreduceFullmesh::initialize(std::shared_ptr<Communicator> comm) {
   localScratchMemory_ = std::move(localMemory);
 }
 
-CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr<void> ctx_void, const void* input, void* output,
-                                                  size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
-                                                  int nBlocks, int nThreadsPerBlock,
-                                                  const std::unordered_map<std::string, uintptr_t>&) {
+CommResult AllreduceFullmesh::allreduceKernelFunc(
+    const std::shared_ptr<void> ctx_void, const void* input, void* output, size_t inputSize, DataType dtype,
+    ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
+    [[maybe_unused]] const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
   size_t recvBytes;
   CUdeviceptr recvBasePtr;
@@ -198,13 +206,20 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr<void> ct
   }
   inputChannelHandles = this->memoryChannelsMap_[input].second;
 
-  AllreduceFunc allreduce = dispatch<AllreduceAllconnectAdapter>(op, dtype);
+  AllreduceFunc allreduce = dispatch<AllreduceAllconnectAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", static_cast<int>(op),
          static_cast<int>(dtype));
     return CommResult::CommInvalidArgument;
   }
   std::pair<int, int> numBlocksAndThreads = {nBlocks, nThreadsPerBlock};
+  if (numBlocksAndThreads.first > 64) {
+    WARN("AllreduceFullmesh: number of blocks exceeds maximum supported blocks, which is 64");
+    return mscclpp::CommResult::CommInvalidArgument;
+  }
+  if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) {
+    numBlocksAndThreads = {35, 512};
+  }
   cudaError_t error =
       allreduce(input, this->scratchBuffer_, output, inputChannelHandles.get(), ctx->memoryChannelDeviceHandles.get(),
                 nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize,
@@ -261,9 +276,10 @@ std::shared_ptr<Algorithm> AllreduceFullmesh::build() {
       [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
       [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
-             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) -> CommResult {
+             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
+             DataType accumDtype) -> CommResult {
         return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock,
-                                         extras);
+                                         extras, accumDtype);
       },
       [self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize,
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
index b542a6a6..2d71cd63 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
@@ -146,7 +146,7 @@ __global__ void __launch_bounds__(1024, 1)
 #endif
 }
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 struct NvlsBlockPipelineAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
@@ -155,6 +155,9 @@ struct NvlsBlockPipelineAdapter {
     // uint8_t is not supported for NVLS (no hardware support for byte-level reduction)
     if constexpr (std::is_same_v<T, uint8_t>) {
       return cudaErrorNotSupported;
+    } else if constexpr (std::is_same_v<T, __fp8_e4m3b15>) {
+      // fp8_e4m3b15 is a software-only type with no hardware NVLS support.
+      return cudaErrorNotSupported;
     } else
 #if defined(__CUDA_ARCH__)  // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS
       if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
@@ -187,9 +190,10 @@ void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr<Communicator> comm)
 CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr<void> ctx_void, const void* input,
                                                            void* output, size_t inputSize, DataType dtype, ReduceOp op,
                                                            cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                                           const std::unordered_map<std::string, uintptr_t>&) {
+                                                           const std::unordered_map<std::string, uintptr_t>& extras,
+                                                           DataType accumDtype) {
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
-  AllreduceFunc allreduce = dispatch<NvlsBlockPipelineAdapter>(op, dtype);
+  AllreduceFunc allreduce = dispatch<NvlsBlockPipelineAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
     return CommResult::CommInvalidArgument;
@@ -235,9 +239,9 @@ std::shared_ptr<Algorithm> AllreduceNvlsBlockPipeline::build() {
       [self](std::shared_ptr<Communicator> comm) { self->initialize(comm); },
       [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
-             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
+             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
         return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock,
-                                         extras);
+                                         extras, accumDtype);
       },
       [self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize,
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
index 9824fbcd..a616485e 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu
@@ -1,15 +1,17 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
+#include <type_traits>
+
 #include "allreduce/allreduce_nvls_packet.hpp"
 #include "allreduce/common.hpp"
 #include "collective_utils.hpp"
-#include "debug.h"
+#include "logger.hpp"
 
 namespace mscclpp {
 namespace collective {
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceNvlsPacket([[maybe_unused]] const T* input, [[maybe_unused]] T* scratch, [[maybe_unused]] T* output,
                         [[maybe_unused]] mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicast,
@@ -31,15 +33,16 @@ __global__ void __launch_bounds__(1024, 1)
     mscclpp::SwitchChannelDeviceHandle::multimemStore(*(mscclpp::f32x2*)(&pkt), multiPkt + i);
   }
   for (uint32_t i = tid; i < nPktPerRank * worldSize; i += blockDim.x * gridDim.x) {
-    uint data = src[i];
+    // When T == AccumT, stay with raw uint to avoid type mismatch in identity path.
+    using AccRaw =
+        std::conditional_t<std::is_same_v<T, AccumT>, uint, mscclpp::VectorType<AccumT, sizeof(uint) / sizeof(T)>>;
+    AccRaw acc = mscclpp::upcastVector<T, AccumT, AccRaw>(src[i]);
     for (int peer = 0; peer < worldSize; peer++) {
-      if (peer == rank) {
-        continue;
-      }
+      if (peer == rank) continue;
       uint val = scratchPkt[peer * worldSize * nPktPerRank + i].read(flag);
-      data = cal_vector<T, OpType>(data, val);
+      acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccRaw>(acc, val);
     }
-    dst[i] = data;
+    dst[i] = mscclpp::downcastVector<T, AccumT, uint>(acc);
   }
   __syncthreads();
   if (threadIdx.x == 0) {
@@ -62,13 +65,13 @@ inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize) {
   return {blockNum, threadNum};
 }
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceNvlsPacketAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void*, void*,
                           DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
                           size_t scratchBufferSize, int rank, int, int worldSize, size_t inputSize, cudaStream_t stream,
                           void* flags, uint32_t flagBufferSize, uint32_t, int nBlocks, int nThreadsPerBlock) {
-    allreduceNvlsPacket<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
+    allreduceNvlsPacket<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (const T*)input, (T*)scratch, (T*)output, nvlsChannels, inputSize / sizeof(T), scratchBufferSize, rank,
         worldSize, flags, flagBufferSize);
     return cudaGetLastError();
@@ -78,6 +81,8 @@ struct AllreduceNvlsPacketAdapter {
 void AllreduceNvlsPacket::initialize(std::shared_ptr<Communicator> comm) {
   int nSwitchChannels = 1;
   this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels);
+  this->switchChannels_ =
+      setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels);
 }
 
 AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) {
@@ -92,9 +97,7 @@ std::shared_ptr<void> AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr<
   ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
 
   // setup channels
-  int nSwitchChannels = 1;
-  ctx->switchChannels =
-      setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels);
+  ctx->switchChannels = this->switchChannels_;
   ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
   return ctx;
 }
@@ -102,19 +105,20 @@ std::shared_ptr<void> AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr<
 CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_void, const void* input,
                                                     void* output, size_t inputSize, mscclpp::DataType dtype,
                                                     ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                                    const std::unordered_map<std::string, uintptr_t>&) {
+                                                    const std::unordered_map<std::string, uintptr_t>&,
+                                                    mscclpp::DataType accumDtype) {
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
     blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize);
   }
   if (blockAndThreadNum.first > maxBlockNum_) {
-    WARN("Block number %d exceeds the maximum limit %d", blockAndThreadNum.first, maxBlockNum_);
+    WARN(ALGO, "Block number ", blockAndThreadNum.first, " exceeds the maximum limit ", maxBlockNum_);
     return CommResult::CommInvalidArgument;
   }
-  AllreduceFunc allreduce = dispatch<AllreduceNvlsPacketAdapter>(op, dtype);
+  AllreduceFunc allreduce = dispatch<AllreduceNvlsPacketAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
-    WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
+    WARN(ALGO, "Unsupported operation or data type for allreduce, dtype=", static_cast<int>(dtype));
     return CommResult::CommInvalidArgument;
   }
   cudaError_t error =
@@ -122,7 +126,7 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr<void>
                 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream,
                 (void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
-    WARN("AllreduceNvlsPacket failed with error: %s", cudaGetErrorString(error));
+    WARN(ALGO, "AllreduceNvlsPacket failed with error: ", cudaGetErrorString(error));
     return CommResult::CommUnhandledCudaError;
   }
   return CommResult::CommSuccess;
@@ -136,9 +140,10 @@ std::shared_ptr<mscclpp::Algorithm> AllreduceNvlsPacket::build() {
       [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
       [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
-             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
+             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
+             mscclpp::DataType accumDtype) {
         return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock,
-                                         extras);
+                                         extras, accumDtype);
       },
       [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize,
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
index bc03ab26..3bb054da 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
@@ -109,7 +109,7 @@ __global__ void __launch_bounds__(1024, 1)
 #endif
 }
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 struct NvlsWarpPipelineAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
@@ -118,6 +118,9 @@ struct NvlsWarpPipelineAdapter {
     // uint8_t is not supported for NVLS (no hardware support for byte-level reduction)
     if constexpr (std::is_same_v<T, uint8_t>) {
       return cudaErrorNotSupported;
+    } else if constexpr (std::is_same_v<T, __fp8_e4m3b15>) {
+      // fp8_e4m3b15 is a software-only type with no hardware NVLS support.
+      return cudaErrorNotSupported;
     } else
 #if defined(__CUDA_ARCH__)  // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS
       if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
@@ -147,12 +150,12 @@ void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr<Communicator> comm) {
   this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
 }
 
-CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc(const std::shared_ptr<void> ctx_void, const void* input,
-                                                          void* output, size_t inputSize, DataType dtype, ReduceOp op,
-                                                          cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                                          const std::unordered_map<std::string, uintptr_t>&) {
+CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc(
+    const std::shared_ptr<void> ctx_void, const void* input, void* output, size_t inputSize, DataType dtype,
+    ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
+    [[maybe_unused]] const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
-  AllreduceFunc allreduce = dispatch<NvlsWarpPipelineAdapter>(op, dtype);
+  AllreduceFunc allreduce = dispatch<NvlsWarpPipelineAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
     return CommResult::CommInvalidArgument;
@@ -198,9 +201,9 @@ std::shared_ptr<Algorithm> AllreduceNvlsWarpPipeline::build() {
       [self](std::shared_ptr<Communicator> comm) { self->initialize(comm); },
       [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
-             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
+             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
         return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock,
-                                         extras);
+                                         extras, accumDtype);
       },
       [self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize,
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
index f251bcda..e7f2028f 100644
--- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -67,7 +67,7 @@ __global__ void __launch_bounds__(1024, 1)
 #endif
 }
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 struct NvlsAdapter {
   static cudaError_t call(const void*, void*, void*, void* memoryChannels, void*,
                           mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
@@ -77,6 +77,9 @@ struct NvlsAdapter {
     // uint8_t is not supported for NVLS (no hardware support for byte-level reduction)
     if constexpr (std::is_same_v<T, uint8_t>) {
       return cudaErrorNotSupported;
+    } else if constexpr (std::is_same_v<T, __fp8_e4m3b15>) {
+      // fp8_e4m3b15 is a software-only type with no hardware NVLS support.
+      return cudaErrorNotSupported;
     } else
 #if (!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000)
         if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
@@ -114,13 +117,14 @@ void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm) {
 CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_void, const void* input, void* output,
                                               size_t inputSize, mscclpp::DataType dtype, ReduceOp op,
                                               cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                              const std::unordered_map<std::string, uintptr_t>&) {
+                                              [[maybe_unused]] const std::unordered_map<std::string, uintptr_t>& extras,
+                                              mscclpp::DataType accumDtype) {
   if (!symmetricMemory_) {
     WARN("AllreduceNvls requires symmetric memory for now.");
     return CommResult::CommInvalidArgument;
   }
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
-  AllreduceFunc allreduce = dispatch<NvlsAdapter>(op, dtype);
+  AllreduceFunc allreduce = dispatch<NvlsAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
     return CommResult::CommInvalidArgument;
@@ -203,9 +207,10 @@ std::shared_ptr<mscclpp::Algorithm> AllreduceNvls::build() {
       [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
       [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
-             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
+             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
+             mscclpp::DataType accumDtype) {
         return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock,
-                                         extras);
+                                         extras, accumDtype);
       },
       [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize,
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
index ceb545ee..e2d8ef73 100644
--- a/src/ext/collectives/allreduce/allreduce_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -2,16 +2,17 @@
 // Licensed under the MIT License.
 
 #include <mscclpp/algorithm.hpp>
+#include <type_traits>
 
 #include "allreduce/allreduce_packet.hpp"
 #include "allreduce/common.hpp"
 #include "collective_utils.hpp"
-#include "debug.h"
+#include "logger.hpp"
 
 namespace mscclpp {
 namespace collective {
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memoryChannels,
                     size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize,
@@ -92,12 +93,21 @@ __global__ void __launch_bounds__(1024, 1)
   // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer
   for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
     uint2 data = src[idx];
-    for (int index = 0; index < nPeers; index++) {
-      const int remoteRank = index < rank ? index : index + 1;
-      mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank;
-      uint2 val = dstPkt[idx].read(flag);
-      data.x = cal_vector<T, OpType>(val.x, data.x);
-      data.y = cal_vector<T, OpType>(val.y, data.y);
+    {
+      // When T == AccumT, stay with raw uint32_t to avoid type mismatch in identity path.
+      using AccRaw = std::conditional_t<std::is_same_v<T, AccumT>, uint32_t,
+                                        mscclpp::VectorType<AccumT, sizeof(uint32_t) / sizeof(T)>>;
+      AccRaw accX = mscclpp::upcastVector<T, AccumT, AccRaw>(data.x);
+      AccRaw accY = mscclpp::upcastVector<T, AccumT, AccRaw>(data.y);
+      for (int index = 0; index < nPeers; index++) {
+        const int remoteRank = index < rank ? index : index + 1;
+        mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank;
+        uint2 val = dstPkt[idx].read(flag);
+        accX = mscclpp::calVectorAccum<T, AccumT, OpType, AccRaw>(accX, val.x);
+        accY = mscclpp::calVectorAccum<T, AccumT, OpType, AccRaw>(accY, val.y);
+      }
+      data.x = mscclpp::downcastVector<T, AccumT, uint32_t>(accX);
+      data.y = mscclpp::downcastVector<T, AccumT, uint32_t>(accY);
     }
 
     dst[idx].x = data.x;
@@ -142,7 +152,7 @@ __global__ void __launch_bounds__(1024, 1)
 #endif
 }
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 struct PacketAdapter {
   static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
                           DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
@@ -155,12 +165,12 @@ struct PacketAdapter {
     nBlocks = nBlocks / (worldSize - 1) * (worldSize - 1);
 #if defined(ENABLE_NPKIT)
     size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS;
-    allreducePacket<OpType><<<nBlocks, nThreadsPerBlock, sharedMemSize, stream>>>(
+    allreducePacket<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, sharedMemSize, stream>>>(
         (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
         nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(),
         NpKit::GetCpuTimestamp());
 #else
-    allreducePacket<OpType><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
+    allreducePacket<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
         (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
         nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff);
 #endif
@@ -186,18 +196,22 @@ inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int
     }
   }
 
-#if defined(__FP8_TYPES_EXIST__)
   // FP8-specific tuning for 32KB-256KB range
-  if (dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2) {
-    if (inputSize < (64 << 10)) {
-      nThreadsPerBlock = 64;
-    } else if (inputSize >= (64 << 10) && inputSize <= (128 << 10)) {
-      nThreadsPerBlock = 128;
-    } else if (inputSize >= (128 << 10) && inputSize <= (256 << 10)) {
-      nThreadsPerBlock = 256;
+  {
+    bool isFp8 = dtype == DataType::FLOAT8_E4M3B15;
+#if defined(__FP8_TYPES_EXIST__)
+    isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2;
+#endif
+    if (isFp8) {
+      if (inputSize < (64 << 10)) {
+        nThreadsPerBlock = 64;
+      } else if (inputSize >= (64 << 10) && inputSize <= (128 << 10)) {
+        nThreadsPerBlock = 128;
+      } else if (inputSize >= (128 << 10) && inputSize <= (256 << 10)) {
+        nThreadsPerBlock = 256;
+      }
     }
   }
-#endif
 #endif
   return {nBlocks, nThreadsPerBlock};
 }
@@ -213,7 +227,8 @@ void AllreducePacket::initialize(std::shared_ptr<Communicator> comm) {
 CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_void, const void* input, void* output,
                                                 size_t inputSize, [[maybe_unused]] DataType dtype, ReduceOp op,
                                                 cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                                const std::unordered_map<std::string, uintptr_t>&) {
+                                                const std::unordered_map<std::string, uintptr_t>&,
+                                                DataType accumDtype) {
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
   std::pair<int, int> blockAndThreadNum = {nBlocks, nThreadsPerBlock};
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
@@ -225,9 +240,10 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_
   MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
   size_t channelInOffset = (char*)input - (char*)sendBasePtr;
 
-  AllreduceFunc allreduce = dispatch<PacketAdapter>(op, dtype);
+  AllreduceFunc allreduce = dispatch<PacketAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
-    WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast<int>(dtype));
+    WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast<int>(op),
+         ", dtype=", static_cast<int>(dtype));
     return CommResult::CommInvalidArgument;
   }
   cudaError_t error =
@@ -236,7 +252,7 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<void> ctx_
                 stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_,
                 blockAndThreadNum.first, blockAndThreadNum.second);
   if (error != cudaSuccess) {
-    WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error));
+    WARN(ALGO, "AllreducePacket failed with error: ", cudaGetErrorString(error));
     return CommResult::CommUnhandledCudaError;
   }
   return CommResult::CommSuccess;
@@ -280,9 +296,9 @@ std::shared_ptr<Algorithm> AllreducePacket::build() {
       "default_allreduce_packet", "allreduce", [self](std::shared_ptr<Communicator> comm) { self->initialize(comm); },
       [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
-             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
+             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
         return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock,
-                                         extras);
+                                         extras, accumDtype);
       },
       [self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize,
diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu
index d5be2257..db471b93 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag.cu
@@ -87,7 +87,7 @@ __global__ void __launch_bounds__(1024, 1)
       int rankIdx = (rank + i + 1) % nRanksPerNode;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       int4 data = mscclpp::read<int4>(((void**)remoteMemories)[peerIdx], offset);
-      tmp = cal_vector<T, OpType>(data, tmp);
+      tmp = calVector<T, OpType>(data, tmp);
     }
     for (uint32_t i = 0; i < nPeers; i++) {
       int rankIdx = (rank + i + 1) % nRanksPerNode;
@@ -123,7 +123,7 @@ __global__ void __launch_bounds__(1024, 1)
   }
 }
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceRsAgAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories,
                           DeviceHandle<SwitchChannel>* switchChannel, DeviceHandle<SwitchChannel>*, size_t, size_t,
@@ -166,9 +166,9 @@ void AllreduceRsAg::initialize(std::shared_ptr<Communicator> comm) {
 CommResult AllreduceRsAg::allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output,
                                               size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
                                               int nBlocks, int nThreadsPerBlock,
-                                              const std::unordered_map<std::string, uintptr_t>&) {
+                                              const std::unordered_map<std::string, uintptr_t>&, DataType accumDtype) {
   auto algoCtx = std::static_pointer_cast<AlgorithmCtx>(ctx);
-  AllreduceFunc allreduce = dispatch<AllreduceRsAgAdapter>(op, dtype);
+  AllreduceFunc allreduce = dispatch<AllreduceRsAgAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast<int>(op),
          ", dtype=", static_cast<int>(dtype));
@@ -213,9 +213,10 @@ std::shared_ptr<Algorithm> AllreduceRsAg::build() {
       [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
       [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
-             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) -> CommResult {
+             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
+             DataType accumDtype) -> CommResult {
         return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock,
-                                         extras);
+                                         extras, accumDtype);
       },
       [self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize,
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
index a230d8cd..eabe3dc5 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu
@@ -168,7 +168,7 @@ __global__ void __launch_bounds__(1024, 1)
           uint32_t peerSlotOffset =
               baseOffset + remoteRankId * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut;
           int4 data = scratch4[peerSlotOffset];
-          tmp = cal_vector<T, OpType>(data, tmp);
+          tmp = calVector<T, OpType>(data, tmp);
         }
         storeVec(resultBuff, myChunkOffset, tmp, nelems);
         // Broadcast reduced result to all peers' scratch at SCATTER_AG_OFFSET + rank * nInt4PerIter
@@ -220,7 +220,7 @@ __global__ void __launch_bounds__(1024, 1)
   }
 }
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceRsAgPipelineAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories,
                           DeviceHandle<SwitchChannel>* switchChannel, DeviceHandle<SwitchChannel>*, size_t, size_t,
@@ -274,12 +274,12 @@ void AllreduceRsAgPipeline::initialize(std::shared_ptr<Communicator> comm) {
             cudaMemcpyHostToDevice);
 }
 
-CommResult AllreduceRsAgPipeline::allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output,
-                                                      size_t inputSize, DataType dtype, ReduceOp op,
-                                                      cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                                      const std::unordered_map<std::string, uintptr_t>&) {
+CommResult AllreduceRsAgPipeline::allreduceKernelFunc(
+    const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op,
+    cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
+    [[maybe_unused]] const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
   auto algoCtx = std::static_pointer_cast<AlgorithmCtx>(ctx);
-  AllreduceFunc allreduce = dispatch<AllreduceRsAgPipelineAdapter>(op, dtype);
+  AllreduceFunc allreduce = dispatch<AllreduceRsAgPipelineAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast<int>(op),
          ", dtype=", static_cast<int>(dtype));
@@ -320,9 +320,10 @@ std::shared_ptr<Algorithm> AllreduceRsAgPipeline::build() {
       [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
       [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
-             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) -> CommResult {
+             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
+             DataType accumDtype) -> CommResult {
         return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock,
-                                         extras);
+                                         extras, accumDtype);
       },
       [self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize,
diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
index caac07ae..f95ba7e3 100644
--- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
+++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
+#include <type_traits>
+
 #include "allreduce/allreduce_rsag_zero_copy.hpp"
 #include "allreduce/common.hpp"
 #include "collective_utils.hpp"
@@ -36,7 +38,7 @@ __device__ mscclpp::DeviceSyncer globalSyncer;
 // the extra copy steps of the standard RSAG. The NRanksPerNode template
 // parameter enables compile-time unrolling of peer loops (supports 4 or 8).
 
-template <int NRanksPerNode, ReduceOp OpType, typename T>
+template <int NRanksPerNode, ReduceOp OpType, typename T, typename AccumT = T>
 __global__ void __launch_bounds__(1024, 1)
     allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle<BaseMemoryChannel>* memoryChannels,
                           DeviceHandle<SwitchChannel>* switchChannels, void* remoteMemories, int rank, int worldSize,
@@ -73,19 +75,26 @@ __global__ void __launch_bounds__(1024, 1)
   }
   __syncthreads();
   int4 data[NPeers];
+  // AccumInt4: when AccumT != T, use a wider accumulator type.
+  // For AccumT == T, this is just int4 (no-op conversion).
+  constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T);
+  // When T == AccumT, stay with raw int4 to avoid type mismatch in identity path.
+  using AccumVec = std::conditional_t<std::is_same_v<T, AccumT>, int4, mscclpp::VectorType<AccumT, nElemsPerInt4>>;
   for (uint32_t idx = threadIdx.x; idx < nInt4PerBlock; idx += blockDim.x) {
     uint32_t offset = idx + offset4 + rank * nInt4PerRank;
     if (offset >= nInt4Total) continue;
-    int4 tmp = buff4[offset];
+    int4 tmp_raw = buff4[offset];
 #pragma unroll
     for (int i = 0; i < NPeers; i++) {
       int rankIdx = (rank + i + 1) % NRanksPerNode;
       int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1;
       data[i] = mscclpp::read<int4>(((void**)remoteMemories)[peerIdx], offset);
     }
+    AccumVec acc = mscclpp::upcastVector<T, AccumT, AccumVec>(tmp_raw);
     for (int i = 0; i < NPeers; i++) {
-      tmp = cal_vector<T, OpType>(data[i], tmp);
+      acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccumVec>(acc, data[i]);
     }
+    int4 tmp = mscclpp::downcastVector<T, AccumT, int4>(acc);
 #pragma unroll
     for (int i = 0; i < NPeers; i++) {
       int rankIdx = (rank + i + 1) % NRanksPerNode;
@@ -102,7 +111,7 @@ __global__ void __launch_bounds__(1024, 1)
   }
 }
 
-template <ReduceOp OpType, typename T>
+template <ReduceOp OpType, typename T, typename AccumT = T>
 struct AllreduceRsAgZeroCopyAdapter {
   static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories,
                           DeviceHandle<SwitchChannel>* switchChannel, DeviceHandle<SwitchChannel>*, size_t, size_t,
@@ -118,11 +127,11 @@ struct AllreduceRsAgZeroCopyAdapter {
       }
     }
     if (nRanksPerNode == 4) {
-      allreduceRsAgZeroCopy<4, OpType, T>
+      allreduceRsAgZeroCopy<4, OpType, T, AccumT>
           <<<nBlocks, nThreadsPerBlock, 0, stream>>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels,
                                                      switchChannel, remoteMemories, rank, worldSize, nelems);
     } else if (nRanksPerNode == 8) {
-      allreduceRsAgZeroCopy<8, OpType, T>
+      allreduceRsAgZeroCopy<8, OpType, T, AccumT>
           <<<nBlocks, nThreadsPerBlock, 0, stream>>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels,
                                                      switchChannel, remoteMemories, rank, worldSize, nelems);
     } else {
@@ -145,9 +154,10 @@ void AllreduceRsAgZeroCopy::initialize(std::shared_ptr<Communicator> comm) {
 CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output,
                                                       size_t inputSize, DataType dtype, ReduceOp op,
                                                       cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                                      const std::unordered_map<std::string, uintptr_t>&) {
+                                                      const std::unordered_map<std::string, uintptr_t>&,
+                                                      DataType accumDtype) {
   auto algoCtx = std::static_pointer_cast<AlgorithmCtx>(ctx);
-  AllreduceFunc allreduce = dispatch<AllreduceRsAgZeroCopyAdapter>(op, dtype);
+  AllreduceFunc allreduce = dispatch<AllreduceRsAgZeroCopyAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast<int>(op),
          ", dtype=", static_cast<int>(dtype));
@@ -220,9 +230,10 @@ std::shared_ptr<Algorithm> AllreduceRsAgZeroCopy::build() {
       [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
       [self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
-             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) -> CommResult {
+             int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
+             DataType accumDtype) -> CommResult {
         return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock,
-                                         extras);
+                                         extras, accumDtype);
       },
       [self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
              [[maybe_unused]] size_t outputSize,
diff --git a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
index bd402cfa..362308b2 100644
--- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp
@@ -20,7 +20,7 @@ class AllreduceAllpairPacket : public AlgorithmBuilder {
   void initialize(std::shared_ptr<Communicator> comm);
   CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
                                  DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                 const std::unordered_map<std::string, uintptr_t>& extras);
+                                 const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype);
 
   std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
                                              DataType);
diff --git a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp
index fa811b15..a54352b3 100644
--- a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp
@@ -16,7 +16,7 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder {
   void initialize(std::shared_ptr<Communicator> comm);
   CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
                                  DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                 const std::unordered_map<std::string, uintptr_t>& extras);
+                                 const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype);
 
   std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
                                              DataType);
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
index 8b9b04ae..81b74add 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
@@ -19,7 +19,7 @@ class AllreduceNvlsBlockPipeline : public AlgorithmBuilder {
   void initialize(std::shared_ptr<Communicator> comm);
   CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
                                  DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                 const std::unordered_map<std::string, uintptr_t>& extras);
+                                 const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype);
 
   std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
                                              DataType);
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp
index 65a48923..fb0c63b8 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp
@@ -21,7 +21,8 @@ class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder {
   void initialize(std::shared_ptr<mscclpp::Communicator> comm);
   CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
                                  mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
-                                 int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras);
+                                 int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
+                                 mscclpp::DataType accumDtype);
 
   std::shared_ptr<void> initAllreduceContext(std::shared_ptr<mscclpp::Communicator> comm, const void*, void* output,
                                              size_t, mscclpp::DataType);
@@ -34,6 +35,7 @@ class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder {
   uintptr_t flagBuffer_;
   size_t flagBufferSize_;
   std::vector<std::shared_ptr<NvlsConnection>> nvlsConnections_;
+  std::vector<SwitchChannel> switchChannels_;
 };
 }  // namespace collective
 }  // namespace mscclpp
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
index e392b54e..8f02a873 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
@@ -19,7 +19,7 @@ class AllreduceNvlsWarpPipeline : public AlgorithmBuilder {
   void initialize(std::shared_ptr<Communicator> comm);
   CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
                                  DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                 const std::unordered_map<std::string, uintptr_t>& extras);
+                                 const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype);
 
   std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
                                              DataType);
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
index d0593500..d53ea180 100644
--- a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
@@ -19,7 +19,7 @@ class AllreduceNvls : public AlgorithmBuilder {
   void initialize(std::shared_ptr<Communicator> comm);
   CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
                                  DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                 const std::unordered_map<std::string, uintptr_t>& extras);
+                                 const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype);
 
   std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
                                              DataType);
diff --git a/src/ext/collectives/include/allreduce/allreduce_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_packet.hpp
index f0438dea..de7ca471 100644
--- a/src/ext/collectives/include/allreduce/allreduce_packet.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_packet.hpp
@@ -20,7 +20,7 @@ class AllreducePacket : public AlgorithmBuilder {
   void initialize(std::shared_ptr<Communicator> comm);
   CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
                                  DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                 const std::unordered_map<std::string, uintptr_t>& extras);
+                                 const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype);
 
   std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
                                              DataType);
diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag.hpp
index 6e033f67..1fd663da 100644
--- a/src/ext/collectives/include/allreduce/allreduce_rsag.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_rsag.hpp
@@ -19,7 +19,7 @@ class AllreduceRsAg : public mscclpp::AlgorithmBuilder {
   void initialize(std::shared_ptr<Communicator> comm);
   CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
                                  DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                 const std::unordered_map<std::string, uintptr_t>& extras);
+                                 const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype);
 
   std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
                                              DataType);
diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp
index 2a740ac0..7629f2fe 100644
--- a/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp
@@ -19,7 +19,7 @@ class AllreduceRsAgPipeline : public mscclpp::AlgorithmBuilder {
   void initialize(std::shared_ptr<Communicator> comm);
   CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
                                  DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                 const std::unordered_map<std::string, uintptr_t>& extras);
+                                 const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype);
 
   std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
                                              DataType);
diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp
index 6153a0e4..05bf2ef3 100644
--- a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp
+++ b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp
@@ -18,7 +18,7 @@ class AllreduceRsAgZeroCopy : public mscclpp::AlgorithmBuilder {
   void initialize(std::shared_ptr<Communicator> comm);
   CommResult allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
                                  DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
-                                 const std::unordered_map<std::string, uintptr_t>& extras);
+                                 const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype);
 
   std::shared_ptr<void> initAllreduceContext(std::shared_ptr<Communicator> comm, const void*, void* output, size_t,
                                              DataType);
diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp
index 9bfac69a..1e0e7e69 100644
--- a/src/ext/collectives/include/allreduce/common.hpp
+++ b/src/ext/collectives/include/allreduce/common.hpp
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
-#ifndef MSCCLPP_ALLREDUCE_COMMOM_HPP_
-#define MSCCLPP_ALLREDUCE_COMMOM_HPP_
+#ifndef MSCCLPP_ALLREDUCE_COMMON_HPP_
+#define MSCCLPP_ALLREDUCE_COMMON_HPP_
 
 #include <cmath>
 #include <mscclpp/algorithm.hpp>
@@ -77,55 +77,51 @@ using AllreduceFunc =
                               mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t, int, int, int,
                               size_t, cudaStream_t, void*, uint32_t, uint32_t, int, int)>;
 
-template <template <ReduceOp, typename> class Adapter>
-AllreduceFunc dispatch(ReduceOp op, mscclpp::DataType dtype) {
-  if (op == SUM) {
-    if (dtype == mscclpp::DataType::FLOAT16) {
-      return Adapter<SUM, half>::call;
-    } else if (dtype == mscclpp::DataType::FLOAT32) {
-      return Adapter<SUM, float>::call;
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-    } else if (dtype == mscclpp::DataType::BFLOAT16) {
-      return Adapter<SUM, __bfloat16>::call;
-#endif
-#if defined(__FP8_TYPES_EXIST__)
-    } else if (dtype == mscclpp::DataType::FLOAT8_E4M3) {
-      return Adapter<SUM, __fp8_e4m3>::call;
-    } else if (dtype == mscclpp::DataType::FLOAT8_E5M2) {
-      return Adapter<SUM, __fp8_e5m2>::call;
-#endif
-    } else if (dtype == mscclpp::DataType::INT32 || dtype == mscclpp::DataType::UINT32) {
-      return Adapter<SUM, int>::call;
-    } else if (dtype == mscclpp::DataType::UINT8) {
-      return Adapter<SUM, uint8_t>::call;
-    } else {
-      return nullptr;
-    }
-  } else if (op == MIN) {
-    if (dtype == mscclpp::DataType::FLOAT16) {
-      return Adapter<MIN, half>::call;
-    } else if (dtype == mscclpp::DataType::FLOAT32) {
-      return Adapter<MIN, float>::call;
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-    } else if (dtype == mscclpp::DataType::BFLOAT16) {
-      return Adapter<MIN, __bfloat16>::call;
-#endif
-#if defined(__FP8_TYPES_EXIST__)
-    } else if (dtype == mscclpp::DataType::FLOAT8_E4M3) {
-      return Adapter<MIN, __fp8_e4m3>::call;
-    } else if (dtype == mscclpp::DataType::FLOAT8_E5M2) {
-      return Adapter<MIN, __fp8_e5m2>::call;
-#endif
-    } else if (dtype == mscclpp::DataType::INT32 || dtype == mscclpp::DataType::UINT32) {
-      return Adapter<MIN, int>::call;
-    } else if (dtype == mscclpp::DataType::UINT8) {
-      return Adapter<MIN, uint8_t>::call;
-    } else {
-      return nullptr;
-    }
+/// Dispatch helper for FP8 types with a configurable accumulation type.
+template <ReduceOp Op, typename FP8T, template <ReduceOp, typename, typename> class Adapter>
+AllreduceFunc dispatchFp8Accum(mscclpp::DataType accumDtype, mscclpp::DataType dtype) {
+  if (accumDtype == mscclpp::DataType::FLOAT32) {
+    return Adapter<Op, FP8T, float>::call;
+  } else if (accumDtype == mscclpp::DataType::FLOAT16) {
+    return Adapter<Op, FP8T, half>::call;
+  } else if (accumDtype == dtype) {
+    return Adapter<Op, FP8T, FP8T>::call;
   }
   return nullptr;
 }
+
+template <ReduceOp Op, template <ReduceOp, typename, typename> class Adapter>
+AllreduceFunc dispatchByDtype(mscclpp::DataType dtype, mscclpp::DataType accumDtype) {
+  if (dtype == mscclpp::DataType::FLOAT16) {
+    return Adapter<Op, half, half>::call;
+  } else if (dtype == mscclpp::DataType::FLOAT32) {
+    return Adapter<Op, float, float>::call;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  } else if (dtype == mscclpp::DataType::BFLOAT16) {
+    return Adapter<Op, __bfloat16, __bfloat16>::call;
+#endif
+#if defined(__FP8_TYPES_EXIST__)
+  } else if (dtype == mscclpp::DataType::FLOAT8_E4M3) {
+    return dispatchFp8Accum<Op, __fp8_e4m3, Adapter>(accumDtype, dtype);
+  } else if (dtype == mscclpp::DataType::FLOAT8_E5M2) {
+    return dispatchFp8Accum<Op, __fp8_e5m2, Adapter>(accumDtype, dtype);
+#endif
+  } else if (dtype == mscclpp::DataType::FLOAT8_E4M3B15) {
+    return dispatchFp8Accum<Op, __fp8_e4m3b15, Adapter>(accumDtype, dtype);
+  } else if (dtype == mscclpp::DataType::INT32 || dtype == mscclpp::DataType::UINT32) {
+    return Adapter<Op, int, int>::call;
+  } else if (dtype == mscclpp::DataType::UINT8) {
+    return Adapter<Op, uint8_t, uint8_t>::call;
+  }
+  return nullptr;
+}
+
+template <template <ReduceOp, typename, typename> class Adapter>
+AllreduceFunc dispatch(ReduceOp op, mscclpp::DataType dtype, mscclpp::DataType accumDtype) {
+  if (op == SUM) return dispatchByDtype<SUM, Adapter>(dtype, accumDtype);
+  if (op == MIN) return dispatchByDtype<MIN, Adapter>(dtype, accumDtype);
+  return nullptr;
+}
 }  // namespace collective
 }  // namespace mscclpp
 
diff --git a/src/ext/nccl/algorithm_selector.cc b/src/ext/nccl/algorithm_selector.cc
index 82dd2d9e..0b9592d7 100644
--- a/src/ext/nccl/algorithm_selector.cc
+++ b/src/ext/nccl/algorithm_selector.cc
@@ -15,7 +15,8 @@ static bool isNvlsSupportedForDataType(const AlgorithmSelectorConfig& config, Da
   bool nvlsSupported = config.nvlsSupported;
 
   // NVLS does not support uint8_t (no hardware support for byte-level reduction)
-  if (dtype == DataType::UINT8) {
+  // NVLS also does not support float8_e4m3b15 (software-defined type with no hardware NVLS reduction support)
+  if (dtype == DataType::UINT8 || dtype == DataType::FLOAT8_E4M3B15) {
     return false;
   }
 
diff --git a/src/ext/nccl/datatype_conversion.hpp b/src/ext/nccl/datatype_conversion.hpp
index 0270a753..dcfb645a 100644
--- a/src/ext/nccl/datatype_conversion.hpp
+++ b/src/ext/nccl/datatype_conversion.hpp
@@ -43,6 +43,7 @@ inline size_t getDataTypeSize(mscclpp::DataType dtype) {
     case mscclpp::DataType::UINT8:
     case mscclpp::DataType::FLOAT8_E4M3:
     case mscclpp::DataType::FLOAT8_E5M2:
+    case mscclpp::DataType::FLOAT8_E4M3B15:
       return 1;
     case mscclpp::DataType::FLOAT16:
     case mscclpp::DataType::BFLOAT16:
@@ -76,6 +77,10 @@ static inline ncclDataType_t mscclppToNcclDataType(mscclpp::DataType dtype) {
     case mscclpp::DataType::FLOAT8_E5M2:
       return ncclFloat8e5m2;
 #endif
+    case mscclpp::DataType::FLOAT8_E4M3B15:
+      // float8_e4m3b15 has no NCCL equivalent; NCCL cannot reduce this type correctly.
+      THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
+            "FLOAT8_E4M3B15 (float8_e4m3b15) has no NCCL equivalent and cannot be used with NCCL collectives");
     default:
       THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
             "Unsupported mscclpp::DataType: " + std::to_string(static_cast<int>(dtype)));
diff --git a/src/ext/nccl/nccl.cc b/src/ext/nccl/nccl.cc
index e12f12b6..2d6c5f9d 100644
--- a/src/ext/nccl/nccl.cc
+++ b/src/ext/nccl/nccl.cc
@@ -83,17 +83,17 @@ static inline int mscclppNcclDlopenInit() {
   const char* ncclLibPath = mscclpp::env()->ncclSharedLibPath.c_str();
   if (ncclLibPath != nullptr && ncclLibPath[0] != '\0') {
     if (std::filesystem::is_directory(ncclLibPath)) {
-      WARN(MSCCLPP_NCCL, "The value of the environment variable %s is a directory", ncclLibPath);
+      WARN(MSCCLPP_NCCL, "MSCCLPP_NCCL_LIB_PATH points to a directory: ", ncclLibPath);
       return dlopenError;
     }
 
     mscclppNcclDlHandle = dlopen(ncclLibPath, RTLD_LAZY | RTLD_NODELETE | RTLD_DEEPBIND);
     if (!mscclppNcclDlHandle) {
-      WARN(MSCCLPP_NCCL, "Cannot open the shared library specified by MSCCLPP_NCCL_LIB_PATH: %s\n", dlerror());
+      WARN(MSCCLPP_NCCL, "Cannot open the shared library specified by MSCCLPP_NCCL_LIB_PATH: ", dlerror());
       return dlopenError;
     }
   } else {
-    WARN(MSCCLPP_NCCL, "The value of MSCCLPP_NCCL_LIB_PATH is empty!\n");
+    WARN(MSCCLPP_NCCL, "The value of MSCCLPP_NCCL_LIB_PATH is empty!");
     return dlopenError;
   }
 
@@ -270,19 +270,18 @@ static std::shared_ptr<mscclpp::Algorithm> algoSelector(
     return mscclpp::nccl::selectSingleNodeAllreduce(algoMap, request, config);
   }
 
-  INFO(MSCCLPP_NCCL, "No suitable algorithm found for collective '%s', fallback to nccl/rccl",
-       request.collective.c_str());
+  INFO(MSCCLPP_NCCL, "No suitable algorithm found for collective '", request.collective, "', fallback to nccl/rccl");
   return nullptr;
 }
 
 NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) {
-  INFO(MSCCLPP_NCCL, "Initializing NCCL communicator for rank %d, world_size=%d", rank, nranks);
+  INFO(MSCCLPP_NCCL, "Initializing NCCL communicator for rank ", rank, ", world_size=", nranks);
   if (comm == nullptr) {
     WARN(MSCCLPP_NCCL, "comm is nullptr");
     return ncclInvalidArgument;
   }
   if (nranks < 0 || rank < 0 || rank >= nranks) {
-    WARN(MSCCLPP_NCCL, "nranks is %d, rank is %d", nranks, rank);
+    WARN(MSCCLPP_NCCL, "nranks is ", nranks, ", rank is ", rank);
     return ncclInvalidArgument;
   }
   std::shared_ptr<mscclpp::TcpBootstrap> bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, nranks);
@@ -560,8 +559,8 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
     return ncclInvalidArgument;
   }
 
-  INFO(MSCCLPP_NCCL, "rank %d broadcast sendbuff %p recvbuff %p count %ld, dtype %d, comm: %p", rank, sendbuff,
-       recvbuff, count, datatype, comm);
+  INFO(MSCCLPP_NCCL, "rank ", rank, " broadcast sendbuff ", sendbuff, " recvbuff ", recvbuff, " count ", count,
+       ", dtype ", datatype, ", comm: ", (void*)comm);
 
   const char* fallbackList = mscclpp::env()->forceNcclFallbackOperation.c_str();
   if (mscclppNcclDlopenSharedLib == true && mscclppNcclInFallbackList("broadcast", fallbackList)) {
@@ -619,8 +618,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t
   }
   // Declarating variables
   int rank = comm->comm->bootstrap()->getRank();
-  INFO(MSCCLPP_NCCL, "rank %d allreduce sendbuff %p recvbuff %p count %ld, dtype %d comm is %p", rank, sendbuff,
-       recvbuff, count, datatype, comm);
+  INFO(MSCCLPP_NCCL, "rank ", rank, " allreduce sendbuff ", sendbuff, " recvbuff ", recvbuff, " count ", count,
+       ", dtype ", datatype, " comm is ", (void*)comm);
 
   const char* fallbackList = mscclpp::env()->forceNcclFallbackOperation.c_str();
   if (mscclppNcclDlopenSharedLib && mscclppNcclInFallbackList("allreduce", fallbackList)) {
@@ -673,8 +672,8 @@ NCCL_API ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, si
     return ncclInvalidArgument;
   }
 
-  INFO(MSCCLPP_NCCL, "ReduceScatter recvcount: %ld, datatype: %d, op: %d, messageSize: %ld", recvcount, datatype, op,
-       bytes * comm->comm->bootstrap()->getNranks());
+  INFO(MSCCLPP_NCCL, "ReduceScatter recvcount: ", recvcount, ", datatype: ", datatype, ", op: ", op,
+       ", messageSize: ", bytes * comm->comm->bootstrap()->getNranks());
 
   const char* fallbackList = mscclpp::env()->forceNcclFallbackOperation.c_str();
   if (mscclppNcclDlopenSharedLib == true && mscclppNcclInFallbackList("reducescatter", fallbackList)) {
@@ -730,8 +729,8 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
 
   int rank = comm->comm->bootstrap()->getRank();
   int nRank = comm->comm->bootstrap()->getNranks();
-  INFO(MSCCLPP_NCCL, "rank %d allgather sendbuff %p recvbuff %p count %ld, dtype %d, comm %p", rank, sendbuff, recvbuff,
-       sendcount, datatype, comm);
+  INFO(MSCCLPP_NCCL, "rank ", rank, " allgather sendbuff ", sendbuff, " recvbuff ", recvbuff, " count ", sendcount,
+       ", dtype ", datatype, ", comm ", (void*)comm);
 
   const char* fallbackList = mscclpp::env()->forceNcclFallbackOperation.c_str();
   if (mscclppNcclDlopenSharedLib == true && mscclppNcclInFallbackList("allgather", fallbackList)) {
@@ -866,20 +865,20 @@ ncclResult_t ncclMemAlloc(void** ptr, size_t size) {
     }
   } catch (const mscclpp::Error& e) {
     if (e.getErrorCode() == mscclpp::ErrorCode::InvalidUsage) {
-      WARN(MSCCLPP_NCCL, "Invalid usage: %s", e.what());
+      WARN(MSCCLPP_NCCL, "Invalid usage: ", e.what());
       return ncclInvalidUsage;
     } else {
-      WARN(MSCCLPP_NCCL, "Internal error: %s", e.what());
+      WARN(MSCCLPP_NCCL, "Internal error: ", e.what());
       return ncclInternalError;
     }
   } catch (const mscclpp::CudaError& e) {
-    WARN(MSCCLPP_NCCL, "Cuda error: %s", e.what());
+    WARN(MSCCLPP_NCCL, "Cuda error: ", e.what());
     return ncclUnhandledCudaError;
   } catch (const mscclpp::CuError& e) {
-    WARN(MSCCLPP_NCCL, "Cu error: %s", e.what());
+    WARN(MSCCLPP_NCCL, "Cu error: ", e.what());
     return ncclUnhandledCudaError;
   } catch (const mscclpp::BaseError& e) {
-    WARN(MSCCLPP_NCCL, "Base error: %s", e.what());
+    WARN(MSCCLPP_NCCL, "Base error: ", e.what());
     return ncclInternalError;
   }
   ptrMap[sharedPtr.get()] = sharedPtr;
diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh
index 80cd10b1..d4996cc2 100644
--- a/test/deploy/setup.sh
+++ b/test/deploy/setup.sh
@@ -30,6 +30,12 @@ fi
 if [ "${PLATFORM}" == "rocm" ]; then
     export CXX=/opt/rocm/bin/hipcc
 fi
+
+PIP_CMAKE_ARGS_FILE="/root/mscclpp/pip_cmake_args.txt"
+if [ -f "${PIP_CMAKE_ARGS_FILE}" ]; then
+    export CMAKE_ARGS="$(cat ${PIP_CMAKE_ARGS_FILE})"
+    echo "Using CMAKE_ARGS: ${CMAKE_ARGS}"
+fi
 cd /root/mscclpp && pip3 install .
 pip3 install setuptools_scm
 python3 -m setuptools_scm --force-write-version-files
diff --git a/test/framework.cc b/test/framework.cc
index 73cf1272..941fdcba 100644
--- a/test/framework.cc
+++ b/test/framework.cc
@@ -20,8 +20,30 @@ static bool gCurrentTestPassed = true;
 static std::string gCurrentTestFailureMessage;
 static std::string gCurrentTestName;
 
+// Performance result collection
+struct PerfResult {
+  std::string label;
+  double value;
+  std::string unit;
+};
+struct PerfTestResults {
+  std::string testName;
+  std::vector<PerfResult> results;
+};
+static std::vector<PerfTestResults> gPerfResults;
+
 std::string currentTestName() { return gCurrentTestName; }
 
+void reportPerfResult(const std::string& label, double value, const std::string& unit) {
+  if (gMpiRank != 0) return;
+  if (gCurrentTestName.empty()) return;
+  // Find or create entry for the current test
+  if (gPerfResults.empty() || gPerfResults.back().testName != gCurrentTestName) {
+    gPerfResults.push_back({gCurrentTestName, {}});
+  }
+  gPerfResults.back().results.push_back({label, value, unit});
+}
+
 namespace utils {
 
 void initializeMPI(int argc, char* argv[]) {
@@ -151,6 +173,7 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
   // Parse command line arguments
   std::string filter;
   bool excludePerfTests = false;
+  bool onlyPerfTests = false;
 
   for (int i = 1; i < argc; ++i) {
     std::string arg = argv[i];
@@ -161,6 +184,8 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
       ++i;
     } else if (arg == "--exclude-perf-tests") {
       excludePerfTests = true;
+    } else if (arg == "--only-perf-tests") {
+      onlyPerfTests = true;
     }
   }
 
@@ -189,6 +214,10 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
       skippedByFilter++;
       continue;
     }
+    if (onlyPerfTests && !entry.isPerfTest) {
+      skippedByFilter++;
+      continue;
+    }
     if (!matchesFilter(fullName, filter)) {
       skippedByFilter++;
       continue;
@@ -208,6 +237,7 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     std::string fullName = entry.suiteName + "." + entry.testName;
 
     if (excludePerfTests && entry.isPerfTest) continue;
+    if (onlyPerfTests && !entry.isPerfTest) continue;
     if (!matchesFilter(fullName, filter)) continue;
 
     gCurrentTestPassed = true;
@@ -285,6 +315,9 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
         passed++;
       } else {
         std::cout << "[  FAILED  ] " << fullName << std::endl;
+        if (!gCurrentTestFailureMessage.empty()) {
+          std::cout << "            Reason: " << gCurrentTestFailureMessage << std::endl;
+        }
         failed++;
       }
     }
@@ -301,6 +334,19 @@ int TestRegistry::runAllTests(int argc, char* argv[]) {
     if (failed > 0) {
       std::cout << "[  FAILED  ] " << failed << " tests.\n";
     }
+
+    // Print collected performance results
+    if (!gPerfResults.empty()) {
+      std::cout << "\n[   PERF   ] Performance results:\n";
+      for (const auto& testResult : gPerfResults) {
+        std::cout << "[   PERF   ] " << testResult.testName << "\n";
+        for (const auto& r : testResult.results) {
+          std::cout << "[   PERF   ]   " << std::setw(12) << r.label << ": " << std::setprecision(4) << r.value << " "
+                    << r.unit << "\n";
+        }
+      }
+      gPerfResults.clear();
+    }
   }
 
   // Tear down global test environments (in reverse order)
diff --git a/test/framework.hpp b/test/framework.hpp
index 26a32d5b..b2431ed9 100644
--- a/test/framework.hpp
+++ b/test/framework.hpp
@@ -63,6 +63,13 @@ class TestRegistry {
 // Returns "Suite.Name" for the currently running test, or "" if none.
 std::string currentTestName();
 
+/// Collect a performance result for the current test. Results are printed together
+/// after all tests complete. Only rank 0 should call this (results are ignored on other ranks).
+/// @param label A label for this measurement (e.g., "128 MB" or "latency").
+/// @param value The numeric result.
+/// @param unit The unit string (e.g., "GB/s", "us/iter").
+void reportPerfResult(const std::string& label, double value, const std::string& unit);
+
 // Utility functions
 namespace utils {
 
diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu
index 04ab402d..e5945563 100644
--- a/test/mp_unit/ib_tests.cu
+++ b/test/mp_unit/ib_tests.cu
@@ -3,8 +3,12 @@
 
 #include <mpi.h>
 
+#include <atomic>
+#include <mscclpp/atomic_device.hpp>
 #include <mscclpp/gpu_utils.hpp>
+#include <thread>
 
+#include "gdr.hpp"
 #include "mp_unit_tests.hpp"
 #include "utils_internal.hpp"
 
@@ -40,7 +44,11 @@ void IbPeerToPeerTest::SetUp() {
   int ib_gid_index = std::stoi(gEnv->args["ib_gid_index"]);
 
   ibCtx = std::make_shared<mscclpp::IbCtx>(ibDevName);
-  qp = ibCtx->createQp(-1, ib_gid_index, 1024, 1, 8192, 0, 64);
+  bool noAtomic = !ibCtx->supportsRdmaAtomics();
+  // When atomics are not supported, the MemoryConsistency test uses
+  // write-with-imm which requires recv WRs on the receiver side.
+  int maxRecvWr = noAtomic ? 64 : 0;
+  qp = ibCtx->createQp(-1, ib_gid_index, 1024, 1, 8192, maxRecvWr, 64, noAtomic);
 
   qpInfo[gEnv->rank] = qp->getInfo();
   bootstrap->allGather(qpInfo.data(), sizeof(mscclpp::IbQpInfo));
@@ -78,7 +86,7 @@ void IbPeerToPeerTest::stageSendWriteWithImm(uint32_t size, uint64_t wrId, uint6
   qp->stageSendWriteWithImm(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled, immData);
 }
 
-TEST(IbPeerToPeerTest, SimpleSendRecv) {
+PERF_TEST(IbPeerToPeerTest, SimpleSendRecv) {
   if (gEnv->rank >= 2) {
     // This test needs only two ranks
     return;
@@ -114,7 +122,7 @@ TEST(IbPeerToPeerTest, SimpleSendRecv) {
       }
     }
     float us = (float)timer.elapsed();
-    std::cout << "IbPeerToPeerTest.SimpleSendRecv: " << us / maxIter << " us/iter" << std::endl;
+    ::mscclpp::test::reportPerfResult("latency", us / maxIter, "us/iter");
   }
   bootstrap->barrier();
 }
@@ -199,11 +207,33 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
     return;
   }
 
+  // Use atomic path if supported by the IB device.
+  bool useAtomic = ibCtx->supportsRdmaAtomics();
+
   const uint64_t signalPeriod = 1024;
   const uint64_t maxIter = 10000;
   const uint64_t nelem = 65536 + 1;
   auto data = mscclpp::detail::gpuCallocUnique<uint64_t>(nelem);
 
+  // For no-atomic mode: allocate a separate signal buffer for write-with-imm destination.
+  // The sender writes-with-imm to this buffer; the receiver's CPU thread reads the imm_data
+  // from the recv CQ and writes the iteration value to data[0] via GDRCopy atomicStore.
+  std::shared_ptr<uint64_t> signalBuf;
+  std::unique_ptr<const mscclpp::IbMr> signalMr;
+  std::array<mscclpp::IbMrInfo, 2> signalMrInfo{};
+  if (!useAtomic) {
+    signalBuf = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+    signalMr = ibCtx->registerMr(signalBuf.get(), sizeof(uint64_t));
+    signalMrInfo[gEnv->rank] = signalMr->getInfo();
+    bootstrap->allGather(signalMrInfo.data(), sizeof(mscclpp::IbMrInfo));
+
+    // Pre-post recv WRs for write-with-imm on both ranks
+    for (int i = 0; i < 64; ++i) {
+      qp->stageRecv(0);
+    }
+    qp->postRecv();
+  }
+
   registerBufferAndConnect(data.get(), sizeof(uint64_t) * nelem);
 
   uint64_t res = 0;
@@ -222,6 +252,40 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
     ASSERT_EQ(*ptrCurIter, 0);
     ASSERT_EQ(*ptrResult, 0);
 
+    // For no-atomic mode: create a GDRCopy mapping for data[0] and start a CPU thread that
+    // polls recv CQ and forwards the signal via GDRCopy BAR1 write — the same mechanism
+    // used by IBConnection::recvThreadFunc for port channels.
+    std::atomic<bool> stopRecvThread(false);
+    std::thread recvThread;
+    std::unique_ptr<mscclpp::GdrMap> dataGdrMap;
+    if (!useAtomic) {
+      if (!mscclpp::gdrEnabled()) {
+        SKIP_TEST() << "No-atomic mode requires GDRCopy but it is not available.";
+      }
+      // Create GDRCopy BAR1 mapping for data[0] — same as how connection.cc maps inboundToken_
+      dataGdrMap =
+          std::make_unique<mscclpp::GdrMap>(std::shared_ptr<void>(data.get(), [](void*) {}),  // non-owning shared_ptr
+                                            cudaDevId);
+
+      recvThread = std::thread([&]() {
+        while (!stopRecvThread.load(std::memory_order_relaxed)) {
+          int wcNum = qp->pollRecvCq();
+          if (wcNum <= 0) continue;
+          for (int i = 0; i < wcNum; ++i) {
+            int status = qp->getRecvWcStatus(i);
+            if (status != static_cast<int>(mscclpp::WsStatus::Success)) continue;
+            uint64_t val = static_cast<uint64_t>(qp->getRecvWcImmData(i));
+            // Write the iteration value to data[0] via GDRCopy BAR1 atomicStore —
+            // same pattern as IBConnection::recvThreadFunc.
+            mscclpp::atomicStore(dataGdrMap->hostPtr(), val, mscclpp::memoryOrderRelaxed);
+            // Re-post recv
+            qp->stageRecv(0);
+            qp->postRecv();
+          }
+        }
+      });
+    }
+
     kernelMemoryConsistency<<<1, 1024>>>(data.get(), ptrCurIter, ptrResult, nelem, maxIter);
     MSCCLPP_CUDATHROW(cudaGetLastError());
 
@@ -243,6 +307,11 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
     }
 
     MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+
+    if (!useAtomic) {
+      stopRecvThread.store(true, std::memory_order_relaxed);
+      if (recvThread.joinable()) recvThread.join();
+    }
   } else if (gEnv->rank == 1) {
     // Sender
     std::vector<uint64_t> hostBuffer(nelem, 0);
@@ -263,15 +332,20 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
       stageSendWrite(sizeof(uint64_t) * (nelem - 1), 0, sizeof(uint64_t), sizeof(uint64_t), signaled);
       qp->postSend();
 
-#if 0
-      // For reference: send the first element using a normal send. This should occasionally see a wrong result.
-      stageSendWrite(sizeof(uint64_t), 0, 0, 0, false);
-      qp->postSend();
-#else
-      // Send the first element using AtomicAdd. This should see the correct result.
-      stageSendAtomicAdd(0, 0, 1, false);
-      qp->postSend();
-#endif
+      if (useAtomic) {
+        // Send the first element using AtomicAdd. The non-posted PCIe atomic operation
+        // provides end-to-end ordering: data[1..N] are guaranteed visible when data[0] updates.
+        stageSendAtomicAdd(0, 0, 1, false);
+        qp->postSend();
+      } else {
+        // No-atomic mode: send a 0-byte WRITE_WITH_IMM carrying the iteration in imm_data.
+        // The receiver's CPU thread polls the recv CQ and writes the value to data[0]
+        // via GDRCopy atomicStore.
+        // QP ordering guarantees data[1..N] WRITE completes before this write-with-imm.
+        const mscclpp::IbMrInfo& remoteSignalMrInfo = signalMrInfo[(gEnv->rank == 1) ? 0 : 1];
+        qp->stageSendWriteWithImm(nullptr, remoteSignalMrInfo, 0, 0, 0, 0, false, static_cast<unsigned int>(iter));
+        qp->postSend();
+      }
 
       if (signaled) {
         int wcNum = qp->pollSendCq();
@@ -292,20 +366,33 @@ TEST(IbPeerToPeerTest, MemoryConsistency) {
     }
   }
 
-  if (res & 2) {
-    FAIL() << "The receiver is stuck at iteration " << iter << ".";
-  } else if (res != 0 && res != 1) {
-    FAIL() << "Unknown error is detected at iteration " << iter << ". res =" << res;
+  if (useAtomic) {
+    // With RDMA atomics, memory consistency must be guaranteed.
+    if (res & 2) {
+      FAIL() << "The receiver is stuck at iteration " << iter << ".";
+    }
+    EXPECT_EQ(res, 0);
+  } else {
+    if (res == 0) {
+      // No-atomic path works correctly here.
+    } else if (res & 2) {
+      SKIP_TEST() << "No-atomic signal forwarding: receiver stuck at iteration " << iter
+                  << ". NIC DMA and CPU writes are not ordered on this platform.";
+    } else {
+      SKIP_TEST() << "No-atomic signal forwarding: memory inconsistency detected at iteration " << iter
+                  << ". NIC DMA and CPU writes are not ordered on this platform.";
+    }
   }
-
-  EXPECT_EQ(res, 0);
 }
 
-TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
+PERF_TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
   if (gEnv->rank >= 2) {
     // This test needs only two ranks
     return;
   }
+  if (!ibCtx->supportsRdmaAtomics()) {
+    SKIP_TEST() << "This test requires RDMA atomics support.";
+  }
 
   mscclpp::Timer timeout(3);
 
@@ -339,7 +426,7 @@ TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
       }
     }
     float us = (float)timer.elapsed();
-    std::cout << "IbPeerToPeerTest.SimpleAtomicAdd: " << us / maxIter << " us/iter" << std::endl;
+    ::mscclpp::test::reportPerfResult("latency", us / maxIter, "us/iter");
   }
   bootstrap->barrier();
 }
diff --git a/test/mp_unit/memory_channel_tests.cu b/test/mp_unit/memory_channel_tests.cu
index 318d301a..1ce9eb0b 100644
--- a/test/mp_unit/memory_channel_tests.cu
+++ b/test/mp_unit/memory_channel_tests.cu
@@ -103,7 +103,7 @@ void MemoryChannelOneToOneTest::packetPingPongTest(const std::string testName,
   communicator->bootstrap()->barrier();
 
   if (gEnv->rank == 0) {
-    std::cout << testName << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)(nTries) << " us/iter\n";
+    ::mscclpp::test::reportPerfResult("latency", (float)timer.elapsed() / (float)(nTries), "us/iter");
   }
 }
 
@@ -324,14 +324,14 @@ __global__ void kernelMemLL16PacketPingPong(int* buff, int rank, int nElem, int*
   }
 }
 
-TEST(MemoryChannelOneToOneTest, LL8PacketPingPong) {
+PERF_TEST(MemoryChannelOneToOneTest, LL8PacketPingPong) {
   auto kernelMemLL8PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) {
     kernelMemLL8PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries);
   };
   packetPingPongTest("memoryLL8PacketPingPong", kernelMemLL8PacketPingPongWrapper);
 }
 
-TEST(MemoryChannelOneToOneTest, LL16PacketPingPong) {
+PERF_TEST(MemoryChannelOneToOneTest, LL16PacketPingPong) {
   auto kernelMemLL16PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) {
     kernelMemLL16PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries);
   };
diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp
index 5f95d660..f4a26cf9 100644
--- a/test/mp_unit/mp_unit_tests.hpp
+++ b/test/mp_unit/mp_unit_tests.hpp
@@ -159,6 +159,7 @@ class PortChannelOneToOneTest : public CommunicatorTestBase {
   void testPingPongPerf(PingPongTestParams params);
   void testPacketPingPong(bool useIbOnly, IbMode ibMode = IbMode::Default);
   void testPacketPingPongPerf(bool useIbOnly, IbMode ibMode = IbMode::Default);
+  void testBandwidth(PingPongTestParams params);
 
   std::shared_ptr<mscclpp::ProxyService> proxyService;
 };
diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu
index 764c3299..3b14ed31 100644
--- a/test/mp_unit/port_channel_tests.cu
+++ b/test/mp_unit/port_channel_tests.cu
@@ -4,9 +4,38 @@
 #include <cstdint>
 #include <mscclpp/concurrency_device.hpp>
 
+#include "gdr.hpp"
 #include "mp_unit_tests.hpp"
 #include "utils_internal.hpp"
 
+// Skip the current test if the given IB mode will require GDRCopy on CUDA but it is unavailable.
+// On CUDA, HostNoAtomic requires GDRCopy for BAR1 signal forwarding. When IbMode::Host or
+// IbMode::Default is used and the IB device does not support RDMA atomics, the endpoint falls
+// back to no-atomic mode, which also requires GDRCopy.
+// On ROCm, no-atomic mode uses direct volatile writes and does not need GDRCopy.
+#if defined(MSCCLPP_USE_CUDA)
+inline void requireGdrForIbMode(IbMode mode, mscclpp::Transport ibTransport) {
+  if (mscclpp::gdrEnabled()) return;  // GDRCopy available — nothing to skip.
+  if (mode == IbMode::HostNoAtomic) {
+    SKIP_TEST() << "HostNoAtomic requires GDRCopy on CUDA: " << mscclpp::gdrStatusMessage();
+  }
+  // For Host/Default modes: check whether the IB device lacks RDMA atomics,
+  // which would cause an automatic fallback to no-atomic mode.
+  if (mode == IbMode::Host || mode == IbMode::Default) {
+    std::string devName = mscclpp::getIBDeviceName(ibTransport);
+    mscclpp::IbCtx ibCtx(devName);
+    if (!ibCtx.supportsRdmaAtomics()) {
+      SKIP_TEST() << "IB device " << devName
+                  << " lacks RDMA atomics; Host mode falls back to HostNoAtomic which requires GDRCopy: "
+                  << mscclpp::gdrStatusMessage();
+    }
+  }
+}
+#define REQUIRE_GDR_FOR_IB_MODE(mode) requireGdrForIbMode((mode), ibTransport)
+#else
+#define REQUIRE_GDR_FOR_IB_MODE(mode)  // No extra requirements on non-CUDA platforms.
+#endif
+
 void PortChannelOneToOneTest::SetUp() {
   // Use only two ranks
   setNumRanksToUse(2);
@@ -226,7 +255,7 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
   communicator->bootstrap()->barrier();
 
   if (gEnv->rank == 0) {
-    std::cout << testName << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)nTries << " us/iter\n";
+    ::mscclpp::test::reportPerfResult("latency", (float)timer.elapsed() / (float)nTries, "us/iter");
   }
 
   proxyService->stopProxy();
@@ -239,6 +268,7 @@ TEST(PortChannelOneToOneTest, PingPong) {
 
 TEST(PortChannelOneToOneTest, PingPongIbHostMode) {
   REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::Host);
   testPingPong(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host});
 }
@@ -255,28 +285,31 @@ TEST(PortChannelOneToOneTest, PingPongWithPoll) {
 
 TEST(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) {
   REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::Host);
   testPingPong(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Host});
 }
 
-TEST(PortChannelOneToOneTest, PingPongPerf) {
+PERF_TEST(PortChannelOneToOneTest, PingPongPerf) {
   testPingPongPerf(PingPongTestParams{
       .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
 }
 
-TEST(PortChannelOneToOneTest, PingPongPerfIbHostMode) {
+PERF_TEST(PortChannelOneToOneTest, PingPongPerfIbHostMode) {
   REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::Host);
   testPingPongPerf(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host});
 }
 
-TEST(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
+PERF_TEST(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
   REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::HostNoAtomic);
   testPingPongPerf(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
 }
 
-TEST(PortChannelOneToOneTest, PingPongPerfEthernet) {
+PERF_TEST(PortChannelOneToOneTest, PingPongPerfEthernet) {
   testPingPongPerf(PingPongTestParams{
       .useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default});
 }
@@ -443,7 +476,7 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode)
   communicator->bootstrap()->barrier();
 
   if (gEnv->rank == 0) {
-    std::cout << testName << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)nTries << " us/iter\n";
+    ::mscclpp::test::reportPerfResult("latency", (float)timer.elapsed() / (float)nTries, "us/iter");
   }
 
   proxyService->stopProxy();
@@ -453,28 +486,117 @@ TEST(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode
 
 TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
   REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::Host);
   testPacketPingPong(true, IbMode::Host);
 }
 
-TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
+PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
 
-TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
+PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
   REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::Host);
   testPacketPingPongPerf(true, IbMode::Host);
 }
 
-TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
+PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
   REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::HostNoAtomic);
   testPacketPingPongPerf(true, IbMode::HostNoAtomic);
 }
 
 TEST(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) {
   REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::HostNoAtomic);
   testPingPong(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
 }
 
 TEST(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) {
   REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::HostNoAtomic);
   testPacketPingPong(true, IbMode::HostNoAtomic);
 }
+
+// Bandwidth test: bidirectional bulk transfer matching the tutorial pattern.
+// Both ranks do signal+wait+putWithSignal+wait per iteration.
+__global__ void kernelBandwidthBidir(int* buff, int nElem, int nIters, int rank) {
+  DeviceHandle<mscclpp::PortChannel>& portChan = gChannelOneToOneTestConstPortChans;
+  if (threadIdx.x != 0) return;
+  const uint64_t srcOffset = rank * nElem * sizeof(int);
+  const uint64_t dstOffset = srcOffset;
+  for (int i = 0; i < nIters; i++) {
+    portChan.signal();
+    portChan.wait();
+    portChan.putWithSignal(dstOffset, srcOffset, nElem * sizeof(int));
+    portChan.wait();
+  }
+}
+
+void PortChannelOneToOneTest::testBandwidth(PingPongTestParams params) {
+  if (gEnv->rank >= numRanksToUse) return;
+
+  const int maxElem = 32 * 1024 * 1024;  // 128 MB per direction
+  const int bufElem = maxElem * 2;       // 2x for bidirectional
+
+  std::vector<mscclpp::PortChannel> portChannels;
+  std::shared_ptr<int> buff = mscclpp::GpuBuffer<int>(bufElem).memory();
+  setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), bufElem * sizeof(int),
+                       nullptr, 0, params.ibMode);
+
+  std::vector<DeviceHandle<mscclpp::PortChannel>> portChannelHandles;
+  for (auto& ch : portChannels) portChannelHandles.push_back(ch.deviceHandle());
+
+  ASSERT_EQ(portChannels.size(), 1);
+  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstPortChans, portChannelHandles.data(),
+                                       sizeof(DeviceHandle<mscclpp::PortChannel>)));
+
+  proxyService->startProxy();
+
+  const std::string testName = ::mscclpp::test::currentTestName();
+  const int nIters = 1000;
+
+  for (int nElem : {256, 16 * 1024, 256 * 1024, 1024 * 1024, 4 * 1024 * 1024, 16 * 1024 * 1024, 32 * 1024 * 1024}) {
+    // Warm-up
+    kernelBandwidthBidir<<<1, 1024>>>(buff.get(), nElem, 10, gEnv->rank);
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    communicator->bootstrap()->barrier();
+
+    // Measure
+    mscclpp::Timer timer;
+    kernelBandwidthBidir<<<1, 1024>>>(buff.get(), nElem, nIters, gEnv->rank);
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    double elapsedUs = timer.elapsed();
+    communicator->bootstrap()->barrier();
+
+    if (gEnv->rank == 0) {
+      double copyBytes = (double)nElem * sizeof(int);
+      double elapsedMsPerIter = elapsedUs / 1e3 / nIters;
+      double gbps = copyBytes / elapsedMsPerIter * 1e-6;
+      double sizeKB = copyBytes / 1024.0;
+      std::string label =
+          (sizeKB >= 1024.0) ? (std::to_string((int)(sizeKB / 1024.0)) + " MB") : (std::to_string((int)sizeKB) + " KB");
+      ::mscclpp::test::reportPerfResult(label, gbps, "GB/s");
+    }
+  }
+
+  proxyService->stopProxy();
+}
+
+PERF_TEST(PortChannelOneToOneTest, Bandwidth) {
+  testBandwidth(PingPongTestParams{
+      .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
+}
+
+PERF_TEST(PortChannelOneToOneTest, BandwidthIbHostMode) {
+  REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::Host);
+  testBandwidth(PingPongTestParams{
+      .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host});
+}
+
+PERF_TEST(PortChannelOneToOneTest, BandwidthIbHostNoAtomicMode) {
+  REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::HostNoAtomic);
+  testBandwidth(PingPongTestParams{
+      .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
+}
diff --git a/test/mp_unit/semaphore_perf_tests.cu b/test/mp_unit/semaphore_perf_tests.cu
index 92560539..a4c0e29f 100644
--- a/test/mp_unit/semaphore_perf_tests.cu
+++ b/test/mp_unit/semaphore_perf_tests.cu
@@ -68,6 +68,6 @@ PERF_TEST(SemaphorePerfTest, SignalPingPong) {
   communicator->bootstrap()->barrier();
 
   if (gEnv->rank == 0) {
-    std::cout << testName << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)nIters << " us/iter\n";
+    ::mscclpp::test::reportPerfResult("latency", (float)timer.elapsed() / (float)nIters, "us/iter");
   }
 }
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 7836e063..a345effc 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -4,6 +4,7 @@
 target_sources(unit_tests PRIVATE
     unit_tests_main.cc
     core_tests.cc
+    gdr_tests.cu
     gpu_utils_tests.cc
     errors_tests.cc
     fifo_tests.cu
diff --git a/test/unit/gdr_tests.cu b/test/unit/gdr_tests.cu
new file mode 100644
index 00000000..78bb2e1a
--- /dev/null
+++ b/test/unit/gdr_tests.cu
@@ -0,0 +1,251 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <mscclpp/atomic_device.hpp>
+#include <mscclpp/errors.hpp>
+#include <mscclpp/gpu_utils.hpp>
+
+#include "../framework.hpp"
+#include "gdr.hpp"
+
+// GdrStatus and gdrEnabled
+
+class GdrStatusTest : public ::mscclpp::test::TestCase {};
+
+TEST(GdrStatusTest, StatusIsValid) {
+  // gdrStatus() should return one of the defined enum values
+  auto status = mscclpp::gdrStatus();
+  ASSERT_TRUE(status == mscclpp::GdrStatus::Ok || status == mscclpp::GdrStatus::NotBuilt ||
+              status == mscclpp::GdrStatus::Disabled || status == mscclpp::GdrStatus::DriverMissing ||
+              status == mscclpp::GdrStatus::OpenFailed);
+}
+
+TEST(GdrStatusTest, EnabledConsistentWithStatus) {
+  // gdrEnabled() should be true iff gdrStatus() == Ok
+  EXPECT_EQ(mscclpp::gdrEnabled(), mscclpp::gdrStatus() == mscclpp::GdrStatus::Ok);
+}
+
+// GdrMap tests — only run when GDRCopy is available
+
+class GdrMapTest : public ::mscclpp::test::TestCase {
+ protected:
+  void SetUp() override {
+    if (!mscclpp::gdrEnabled()) {
+      SKIP_TEST() << "GDRCopy not enabled on this platform.";
+    }
+    MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_));
+    // Try creating a GDRCopy mapping to check if pin+map works on this platform.
+    try {
+      auto testMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+      mscclpp::GdrMap testMap(std::static_pointer_cast<void>(testMem), deviceId_);
+    } catch (const std::exception&) {
+      SKIP_TEST() << "GDRCopy mapping not supported on this platform.";
+    }
+  }
+
+  int deviceId_ = 0;
+};
+
+TEST(GdrMapTest, BasicMapping) {
+  // Allocate GPU memory via cudaMalloc (not VMM) and create a GDRCopy mapping
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+
+  ASSERT_TRUE(map.valid());
+  EXPECT_NE(map.hostPtr(), nullptr);
+}
+
+TEST(GdrMapTest, CopyToAndFrom) {
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+  ASSERT_TRUE(map.valid());
+
+  // Write a value to GPU via GDRCopy
+  uint64_t writeVal = 0xDEADBEEFCAFE0123ULL;
+  map.copyTo(&writeVal, sizeof(uint64_t));
+
+  // Read it back via GDRCopy
+  uint64_t readVal = 0;
+  map.copyFrom(&readVal, sizeof(uint64_t));
+  EXPECT_EQ(readVal, writeVal);
+
+  // Also verify via cudaMemcpy
+  uint64_t cudaVal = 0;
+  MSCCLPP_CUDATHROW(cudaMemcpy(&cudaVal, gpuMem.get(), sizeof(uint64_t), cudaMemcpyDeviceToHost));
+  EXPECT_EQ(cudaVal, writeVal);
+}
+
+TEST(GdrMapTest, CopyToVisibleFromGpu) {
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+  ASSERT_TRUE(map.valid());
+
+  // Write via GDRCopy, verify GPU sees it via cudaMemcpy
+  uint64_t val = 42;
+  map.copyTo(&val, sizeof(uint64_t));
+
+  uint64_t result = 0;
+  MSCCLPP_CUDATHROW(cudaMemcpy(&result, gpuMem.get(), sizeof(uint64_t), cudaMemcpyDeviceToHost));
+  EXPECT_EQ(result, 42);
+}
+
+TEST(GdrMapTest, MultipleWritesReadBack) {
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+  ASSERT_TRUE(map.valid());
+
+  // Write multiple values sequentially and verify each
+  for (uint64_t i = 1; i <= 100; ++i) {
+    map.copyTo(&i, sizeof(uint64_t));
+    uint64_t readback = 0;
+    map.copyFrom(&readback, sizeof(uint64_t));
+    EXPECT_EQ(readback, i);
+    if (readback != i) break;
+  }
+}
+
+TEST(GdrMapTest, HostPtrIsWritable) {
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+  ASSERT_TRUE(map.valid());
+
+  // Write directly through the hostPtr (volatile store)
+  volatile uint64_t* ptr = reinterpret_cast<volatile uint64_t*>(map.hostPtr());
+  *ptr = 12345;
+
+  // Read back via GDRCopy
+  uint64_t readback = 0;
+  map.copyFrom(&readback, sizeof(uint64_t));
+  EXPECT_EQ(readback, 12345);
+}
+
+TEST(GdrMapTest, HostPtrIsReadable) {
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+  ASSERT_TRUE(map.valid());
+
+  // Write via GDRCopy copyTo (same BAR1 path as the read)
+  uint64_t val = 99999;
+  map.copyTo(&val, sizeof(uint64_t));
+
+  // Read through the hostPtr (volatile load via BAR1)
+  volatile uint64_t* ptr = reinterpret_cast<volatile uint64_t*>(map.hostPtr());
+  EXPECT_EQ(*ptr, 99999);
+}
+
+TEST(GdrMapTest, DestroyDoesNotCrash) {
+  auto gpuMem = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  {
+    mscclpp::GdrMap map(std::static_pointer_cast<void>(gpuMem), deviceId_);
+    ASSERT_TRUE(map.valid());
+    uint64_t val = 1;
+    map.copyTo(&val, sizeof(uint64_t));
+  }
+  // After GdrMap is destroyed, gpuMem should still be valid
+  uint64_t result = 0;
+  MSCCLPP_CUDATHROW(cudaMemcpy(&result, gpuMem.get(), sizeof(uint64_t), cudaMemcpyDeviceToHost));
+  EXPECT_EQ(result, 1);
+}
+
+// GPU kernel: polls signalFromCpu until it reaches expectedIter, then writes expectedIter to ackToHost.
+// Repeats for maxIter iterations. The GPU uses system-scope acquire loads on signalFromCpu
+// and plain stores to ackToHost (which is host-pinned memory visible to CPU).
+__global__ void kernelGdrVisibilityPingPong(volatile uint64_t* signalFromCpu, volatile uint64_t* ackToHost,
+                                            uint64_t maxIter) {
+  for (uint64_t iter = 1; iter <= maxIter; ++iter) {
+    // Poll until CPU writes the expected iteration value via GDRCopy BAR1
+    while (*signalFromCpu < iter) {
+    }
+    // Ack back to CPU via host-pinned memory
+    *ackToHost = iter;
+  }
+}
+
+TEST(GdrMapTest, CpuGpuVisibilityPingPong) {
+  const uint64_t maxIter = 10000;
+
+  // signalBuf: GPU memory mapped via GDRCopy BAR1. CPU writes here, GPU polls.
+  auto signalBuf = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap signalMap(std::static_pointer_cast<void>(signalBuf), deviceId_);
+  ASSERT_TRUE(signalMap.valid());
+
+  // ackBuf: host-pinned memory (gpuCallocHostShared). GPU writes here, CPU polls.
+  auto ackBuf = mscclpp::detail::gpuCallocHostShared<uint64_t>(1);
+  volatile uint64_t* ackPtr = reinterpret_cast<volatile uint64_t*>(ackBuf.get());
+  *ackPtr = 0;
+
+  // Launch kernel — it will poll signalBuf and write ackBuf for each iteration
+  kernelGdrVisibilityPingPong<<<1, 1>>>(signalBuf.get(), ackBuf.get(), maxIter);
+  MSCCLPP_CUDATHROW(cudaGetLastError());
+
+  for (uint64_t iter = 1; iter <= maxIter; ++iter) {
+    // CPU writes iteration value to GPU via GDRCopy BAR1
+    uint64_t val = iter;
+    signalMap.copyTo(&val, sizeof(uint64_t));
+
+    // CPU polls host-pinned ack until GPU confirms it saw the value
+    int spin = 0;
+    while (*ackPtr < iter) {
+      if (++spin > 100000000) {
+        FAIL() << "GPU did not ack iteration " << iter << " (ack=" << *ackPtr << ")";
+      }
+    }
+  }
+
+  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+  EXPECT_EQ(*ackPtr, maxIter);
+}
+
+// GPU kernel that polls a counter using system-scope acquire load.
+// When counter >= expectedIter, writes ack.
+__global__ void kernelCounterWait(uint64_t* counter, volatile uint64_t* ackToHost, uint64_t maxIter) {
+  for (uint64_t iter = 1; iter <= maxIter; ++iter) {
+    // System-scope acquire load — matches the atomicStore(relaxed) on the CPU side
+    uint64_t got;
+    do {
+      got = mscclpp::atomicLoad(counter, mscclpp::memoryOrderAcquire);
+    } while (got < iter);
+    // Ack back
+    *ackToHost = iter;
+  }
+}
+
+// Test the GDRCopy counter pattern used by HostNoAtomic mode:
+// - GPU memory allocated via gpuCallocShared (cudaMalloc)
+// - GdrMap for BAR1 mapping
+// - CPU writes via atomicStore(relaxed) through GDRCopy BAR1 mapping
+// - GPU reads via atomicLoad with memory_order_acquire
+TEST(GdrMapTest, AtomicStoreCounterPingPong) {
+  const uint64_t maxIter = 10000;
+
+  // Allocate GPU memory via gpuCallocShared
+  auto counterBuf = mscclpp::detail::gpuCallocShared<uint64_t>(1);
+  mscclpp::GdrMap counterMap(std::static_pointer_cast<void>(counterBuf), deviceId_);
+  ASSERT_TRUE(counterMap.valid());
+
+  // Ack buffer: host-pinned memory
+  auto ackBuf = mscclpp::detail::gpuCallocHostShared<uint64_t>(1);
+  volatile uint64_t* ackPtr = reinterpret_cast<volatile uint64_t*>(ackBuf.get());
+  *ackPtr = 0;
+
+  // Launch kernel — polls counterBuf with system-scope acquire load
+  kernelCounterWait<<<1, 1>>>(counterBuf.get(), ackBuf.get(), maxIter);
+  MSCCLPP_CUDATHROW(cudaGetLastError());
+
+  for (uint64_t iter = 1; iter <= maxIter; ++iter) {
+    // CPU writes counter via atomicStore (relaxed — GPU uses acquire on read)
+    mscclpp::atomicStore(counterMap.hostPtr(), iter, mscclpp::memoryOrderRelaxed);
+
+    // Wait for GPU ack
+    int spin = 0;
+    while (*ackPtr < iter) {
+      if (++spin > 100000000) {
+        MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+        FAIL() << "GPU did not ack iteration " << iter;
+      }
+    }
+  }
+
+  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+  EXPECT_EQ(*ackPtr, maxIter);
+}
diff --git a/tools/npkit/npkit_trace_generator.py b/tools/npkit/npkit_trace_generator.py
index c5ed6191..294516e6 100644
--- a/tools/npkit/npkit_trace_generator.py
+++ b/tools/npkit/npkit_trace_generator.py
@@ -14,25 +14,25 @@ def parse_npkit_event_header(npkit_event_header_path):
         "NOP",
         "BARRIER",
         "PUT",
-        "PUT_PACKET",
-        "READ_PUT_PACKET",
+        "PUT_PACKETS",
+        "READ_PUT_PACKETS",
         "PUT_WITH_SIGNAL",
         "PUT_WITH_SIGNAL_AND_FLUSH",
         "GET",
         "COPY",
-        "COPY_PACKET",
-        "TRANSFORM_TO_PACKET",
+        "COPY_PACKETS",
+        "UNPACK_PACKETS",
         "SIGNAL",
         "WAIT",
         "FLUSH",
         "REDUCE",
-        "REDUCE_PACKET",
+        "REDUCE_PACKETS",
         "REDUCE_COPY_PACKETS",
         "REDUCE_SEND",
-        "REDUCE_SEND_PACKET",
+        "REDUCE_SEND_PACKETS",
         "REDUCE_COPY_SEND_PACKETS",
-        "READ_REDUCE_COPY",
-        "READ_REDUCE_COPY_SEND",
+        "READ_REDUCE",
+        "READ_REDUCE_SEND",
         "MULTI_LOAD_REDUCE_STORE",
         "RELAXED_SIGNAL",
         "RELAXED_WAIT",