diff --git a/.azure-pipelines/codecov.yml b/.azure-pipelines/codecov.yml new file mode 100644 index 00000000..c4abeaa7 --- /dev/null +++ b/.azure-pipelines/codecov.yml @@ -0,0 +1,93 @@ +trigger: + branches: + include: + - main + - release/* + paths: + exclude: + - .devcontainer/** + - .github/** + - apps/** + - docker/** + - docs/** + - '**/*.md' + +pr: + branches: + include: + - main + - release/* + drafts: false + paths: + exclude: + - .devcontainer/** + - .github/** + - apps/** + - docker/** + - docs/** + - '**/*.md' + +jobs: +- job: CodeCoverageA100 + timeoutInMinutes: 40 + pool: + name: msccl-ci + variables: + - group: mscclpp + strategy: + matrix: + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + + container: + image: $(containerImage) + + steps: + - template: templates/codecov.yml + parameters: + subscription: mscclpp-ci + vmssName: mscclpp-ci + gpuArch: '80' + +- job: CodeCoverageH100 + timeoutInMinutes: 40 + pool: + name: msccl-ci-h100 + variables: + - group: mscclpp + strategy: + matrix: + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + + container: + image: $(containerImage) + + steps: + - template: templates/codecov.yml + parameters: + subscription: mscclpp-ci-h100 + vmssName: mscclpp-h100-ci + gpuArch: '90' + +- job: CodeCoverageMI300X + timeoutInMinutes: 40 + pool: + name: msccl-ci-mi300x + variables: + - group: mscclpp + strategy: + matrix: + rocm6_2: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 + + container: + image: $(containerImage) + + steps: + - template: templates/codecov.yml + parameters: + subscription: mscclpp-ci-mi300x + vmssName: mscclpp-mi300x-ci + platform: rocm + gpuArch: gfx942 diff --git a/.azure-pipelines/integration-test-rocm.yml b/.azure-pipelines/integration-test-rocm.yml deleted file mode 100644 index a4ffcfc3..00000000 --- a/.azure-pipelines/integration-test-rocm.yml +++ /dev/null @@ -1,114 +0,0 @@ -trigger: - branches: - include: - - main - - release/* - paths: - exclude: - - .devcontainer/** - - .github/** - - docker/** - - docs/** - - '**/*.md' - -pr: - branches: - include: - - main - - release/* - drafts: false - paths: - exclude: - - .devcontainer/** - - .github/** - - docker/** - - docs/** - - '**/*.md' - -jobs: -- job: IntegrationTestRocm - displayName: Integration test ROCm - strategy: - matrix: - rocm6.2: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 - - pool: - name: mscclpp-rocm - container: - image: $[ variables['containerImage'] ] - options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1 - - steps: - - task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: Bash@3 - name: InstallRcclTest - displayName: Install rccl-test - inputs: - targetType: 'inline' - script: | - git clone https://github.com/ROCm/rccl-tests.git - cd rccl-tests - make MPI=1 MPI_HOME=/usr/local/mpi HIP_HOME=/opt/rocm -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: Bash@3 - name: InstallDep - displayName: Install dependencies - inputs: - targetType: 'inline' - script: | - set -e - git clone https://github.com/Azure/msccl-tools.git - cd msccl-tools - pip3 install . 
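Note on the new coverage pipeline above: the three jobs differ only in pool, subscription, platform, and GPU arch; all of the actual logic lives in `templates/codecov.yml`. As a rough local approximation of what that template builds (the coverage flag and cmake options are taken from the template further below; the clone step and paths are illustrative, not part of this pipeline):

```bash
# Sketch: approximate the CI coverage build locally (illustrative paths).
git clone https://github.com/microsoft/mscclpp.git
cd mscclpp
mkdir -p build && cd build
cmake -DCMAKE_BUILD_TYPE=Debug \
      -DMSCCLPP_BYPASS_GPU_CHECK=ON \
      -DMSCCLPP_USE_CUDA=ON \
      -DMSCCLPP_ENABLE_COVERAGE=ON ..   # the flag codecov.yml passes via cmakeArgs
make -j
./bin/unit_tests                        # running tests drops .gcda files next to the objects
lcov --directory . --capture --output-file coverage.info
```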
- - - task: Bash@3 - name: GenerateExectionFiles - displayName: Generate execution files - inputs: - targetType: 'inline' - script: | - set -e - git clone https://$(GIT_USER):$(GIT_PAT)@msazure.visualstudio.com/DefaultCollection/One/_git/msccl-users - cd msccl-users - mkdir execution-files - python3 algos/allreduce_mi300_packet.py 8 8 > execution-files/allreduce_mi300_packet.json - python3 algos/allreduce_mi300_sm_mscclpp.py 8 8 > execution-files/allreduce_mi300_sm_mscclpp.json - - - task: Bash@3 - name: AllReduceTest - displayName: Run mscclpp allReduce test - inputs: - targetType: 'inline' - script: | - set -e - export PATH=/usr/local/mpi/bin:$PATH - sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/lib/libmscclpp_nccl.so" \ - -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100 - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: Bash@3 - name: AllReduceWithExecutionFileTest - displayName: Run mscclpp allReduce with execution file - inputs: - targetType: 'inline' - script: | - set -e - export PATH=/usr/local/mpi/bin:$PATH - sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \ - -x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \ - -x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \ - -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \ - -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100 - workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index f6fe3a47..d5d5f9bd 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -41,11 +41,10 @@ jobs: image: $(containerImage) steps: - - template: templates/integration-test.yaml + - template: templates/integration-test.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem gpuArch: '80' - job: IntegrationTestH100 @@ -61,10 +60,9 @@ jobs: image: $(containerImage) steps: - - template: templates/integration-test.yaml + - template: templates/integration-test.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem perfBaselineFile: test/deploy/perf_ndmv5.jsonl gpuArch: '90' diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 97a95c94..d4924879 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -37,33 +37,6 @@ jobs: image: $[ variables['containerImage'] ] steps: - - task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: mscclpp-ssh.key - - - task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - - task: Bash@3 displayName: Add HostEntry inputs: @@ -77,107 +50,46 @@ jobs: echo "Entry already exists, nothing to do." fi - - task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name mscclit-vmss --resource-group msccl-IT + - template: templates/deploy.yml + parameters: + subscription: msccl-it + vmssName: mscclit-vmss + resourceGroup: msccl-IT - - task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - workingDirectory: '$(System.DefaultWorkingDirectory)' + - template: templates/run-remote-task.yml + parameters: + name: RunMscclppTest + displayName: Run multi-nodes mscclpp-test + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test - - task: Bash@3 - name: RunMscclppTest - displayName: Run multi-nodes mscclpp-test - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! - parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' - kill $CHILD_PID + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodeUnitTest + displayName: Run multi-nodes unit tests + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh mp-ut - - task: Bash@3 - name: RunMultiNodeUnitTest - displayName: Run multi-nodes unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! 
- parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' - kill $CHILD_PID + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodePythonTests + displayName: Run multi-nodes python tests + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh pytests - - task: Bash@3 - name: RunMultiNodePythonTests - displayName: Run multi-nodes python tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! - parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' - kill $CHILD_PID + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodePythonBenchmark + displayName: Run multi-nodes python benchmark + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark - - task: Bash@3 - name: RunMultiNodePythonBenchmark - displayName: Run multi-nodes python benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! 
- parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' - kill $CHILD_PID - - - task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name mscclit-vmss --resource-group msccl-IT + - template: templates/stop.yml + parameters: + subscription: msccl-it + vmssName: mscclit-vmss + resourceGroup: msccl-IT diff --git a/.azure-pipelines/nccl-api-test.yaml b/.azure-pipelines/nccl-api-test.yml similarity index 88% rename from .azure-pipelines/nccl-api-test.yaml rename to .azure-pipelines/nccl-api-test.yml index 4951c5bd..cc017412 100644 --- a/.azure-pipelines/nccl-api-test.yaml +++ b/.azure-pipelines/nccl-api-test.yml @@ -40,11 +40,10 @@ jobs: image: $(containerImage) steps: - - template: templates/nccl-test.yaml + - template: templates/nccl-test.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem nvccGencode: "-gencode=arch=compute_80,code=sm_80" - job: NcclTestH100 @@ -61,9 +60,8 @@ jobs: image: $(containerImage) steps: - - template: templates/nccl-test.yaml + - template: templates/nccl-test.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem nvccGencode: "-gencode=arch=compute_90,code=sm_90" \ No newline at end of file diff --git a/.azure-pipelines/rccl-api-test.yml b/.azure-pipelines/rccl-api-test.yml new file mode 100644 index 00000000..43841079 --- /dev/null +++ b/.azure-pipelines/rccl-api-test.yml @@ -0,0 +1,47 @@ +trigger: + branches: + include: + - main + - release/* + paths: + exclude: + - .devcontainer/** + - .github/** + - docker/** + - docs/** + - '**/*.md' + +pr: + branches: + include: + - main + - release/* + drafts: false + paths: + exclude: + - .devcontainer/** + - .github/** + - docker/** + - docs/** + - '**/*.md' + +jobs: +- job: RcclTestMI300X + displayName: Run MSCCLPP over RCCL Test (MI300X) + pool: + name: msccl-ci-mi300x + + strategy: + matrix: + rocm6_2: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 + + container: + image: $(containerImage) + + steps: + - template: templates/rccl-test.yml + parameters: + subscription: mscclpp-ci-mi300x + vmssName: mscclpp-mi300x-ci + gpuArch: gfx942 diff --git a/.azure-pipelines/templates/codecov.yml b/.azure-pipelines/templates/codecov.yml new file mode 100644 index 00000000..08797351 --- /dev/null +++ b/.azure-pipelines/templates/codecov.yml @@ -0,0 +1,110 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: ${{ parameters.platform }} + gpuArch: ${{ parameters.gpuArch }} + buildType: Debug + cmakeArgs: '-DMSCCLPP_ENABLE_COVERAGE=ON' + buildDisplayName: 'Build with coverage' + buildName: BuildCoverage + deployArgs: 'single-node-test true ${{ parameters.platform }}' + +- template: run-remote-task.yml + parameters: + name: TestsCoverageNonPerf + displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage + remoteScript: | + BUILD_PREFIX=$(cat build/BUILD_PREFIX) + STRIP_COUNT=$(echo $BUILD_PREFIX | tr -cd / | wc -c) + export GCOV_PREFIX=/root/mscclpp + export 
GCOV_PREFIX_STRIP=$STRIP_COUNT + + echo "Running unit_tests..." + ./build/bin/unit_tests + echo "unit_tests: PASSED" + + echo "Running mp_unit_tests -np 2..." + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests + echo "mp_unit_tests -np 2: PASSED" + + echo "Running mp_unit_tests -np 4..." + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests + echo "mp_unit_tests -np 4: PASSED" + +- template: run-remote-task.yml + parameters: + name: CaptureCoverage + displayName: Capture coverage data with lcov + remoteScript: | + BUILD_PREFIX=$(cat build/BUILD_PREFIX) + + GCOV_TOOL_ARG="" + if [ "${{ parameters.platform }}" = "rocm" ]; then + apt-get update -qq && apt-get install -y -qq llvm 2>/dev/null | tail -1 + GCOV_WRAPPER=$(mktemp) + printf '#!/bin/sh\nexec llvm-cov gcov "$@"\n' > "$GCOV_WRAPPER" + chmod +x "$GCOV_WRAPPER" + GCOV_TOOL_ARG="--gcov-tool ${GCOV_WRAPPER}" + fi + + lcov --version + LCOV_CAPTURE_ARGS="" + if lcov --help 2>&1 | grep -q "inconsistent"; then + LCOV_CAPTURE_ARGS="--ignore-errors inconsistent" + fi + + lcov ${GCOV_TOOL_ARG} --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS} + if [ ! -s coverage.info ]; then + echo "ERROR: coverage.info was not generated." + exit 1 + fi + + lcov ${GCOV_TOOL_ARG} --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info + lcov --list coverage.info + ls -la coverage.info + +- task: Bash@3 + name: FetchCoverage + displayName: Fetch coverage data from remote VM + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + HOST=$(head -1 ${HOSTFILE}) + ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \ + 'sudo docker cp mscclpp-test:/root/mscclpp/coverage.info /tmp/coverage.info' + scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: UploadCodecov + displayName: Upload coverage to Codecov + inputs: + targetType: 'inline' + script: | + set -e + curl -Os https://cli.codecov.io/latest/linux/codecov + chmod +x codecov + ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }} + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/deploy.yml b/.azure-pipelines/templates/deploy.yml new file mode 100644 index 00000000..2f642f1d --- /dev/null +++ b/.azure-pipelines/templates/deploy.yml @@ -0,0 +1,151 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: resourceGroup + type: string + default: mscclpp +# Build parameters +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + default: '' +- name: buildType + type: string + default: 'Release' +- name: buildTests + type: string + default: 'true' +- name: cmakeArgs + type: string + default: '' +- name: buildName + type: string + default: 'Build' +- name: buildDisplayName + type: string + default: 'Build' +# Deploy parameters +- name: deployArgs + type: string + default: '' + +steps: +# 0. Ensure Azure CLI exists before running AzureCLI@2 tasks. 
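For reference, the `GCOV_PREFIX`/`GCOV_PREFIX_STRIP` pair exported by the coverage step above redirects the `.gcda` files that instrumented binaries write at exit: objects compiled under the CI agent's checkout embed absolute paths that do not exist inside the test container, so the runtime prefix is swapped. A minimal sketch, assuming an illustrative build prefix:

```bash
# Objects built under this prefix embed absolute .gcda paths such as
# /home/agent/work/mscclpp/src/foo.gcda (the prefix value is illustrative).
BUILD_PREFIX=/home/agent/work/mscclpp
# Strip one leading path component per '/' in the prefix (here 4).
STRIP_COUNT=$(echo "$BUILD_PREFIX" | tr -cd / | wc -c)
export GCOV_PREFIX=/root/mscclpp        # where the container checkout lives
export GCOV_PREFIX_STRIP=$STRIP_COUNT
# The gcov runtime now writes /root/mscclpp/src/foo.gcda instead, which is
# exactly where the lcov capture step that follows expects to find it.
```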
+- task: Bash@3 + name: EnsureAzureCLI + displayName: Ensure Azure CLI Installed + inputs: + targetType: inline + script: | + set -e + if command -v az >/dev/null 2>&1; then + az version >/dev/null + exit 0 + fi + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + +# 1. Build +- task: Bash@3 + name: ${{ parameters.buildName }} + displayName: ${{ parameters.buildDisplayName }} + inputs: + targetType: 'inline' + script: | + set -e + rm -rf build + mkdir -p build && cd build + BUILD_TESTS_ARG="" + if [ "${{ parameters.buildTests }}" = "true" ]; then + BUILD_TESTS_ARG="-DMSCCLPP_BUILD_TESTS=ON" + fi + + GPU_ARCH_ARG="" + if [ -n "${{ parameters.gpuArch }}" ]; then + GPU_ARCH_ARG="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}" + fi + + CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}' + if [ "${{ parameters.platform }}" = "rocm" ]; then + eval CXX=/opt/rocm/bin/hipcc cmake \ + -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \ + -DMSCCLPP_BYPASS_GPU_CHECK=ON \ + -DMSCCLPP_USE_ROCM=ON \ + ${BUILD_TESTS_ARG} \ + ${GPU_ARCH_ARG} \ + ${CMAKE_EXTRA_ARGS} .. + else + eval cmake \ + -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \ + -DMSCCLPP_BYPASS_GPU_CHECK=ON \ + -DMSCCLPP_USE_CUDA=ON \ + ${BUILD_TESTS_ARG} \ + ${GPU_ARCH_ARG} \ + ${CMAKE_EXTRA_ARGS} .. + fi + make -j + cd .. + pwd > build/BUILD_PREFIX + echo "=== Build artifacts ===" + ls -la build/bin/ || echo "ERROR: build/bin/ missing after build" + du -sh build/bin/* 2>/dev/null || true + workingDirectory: '$(System.DefaultWorkingDirectory)' + +# 2. Write CMake args for pip install on remote VMs +- task: Bash@3 + name: WritePipCmakeArgs + displayName: Write pip CMake args + inputs: + targetType: 'inline' + script: | + set -e + PIP_CMAKE_ARGS="" + if [ -n "${{ parameters.gpuArch }}" ]; then + PIP_CMAKE_ARGS="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}" + fi + CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}' + if [ -n "${CMAKE_EXTRA_ARGS}" ]; then + PIP_CMAKE_ARGS="${PIP_CMAKE_ARGS} ${CMAKE_EXTRA_ARGS}" + fi + echo "${PIP_CMAKE_ARGS}" > pip_cmake_args.txt + echo "pip CMake args: $(cat pip_cmake_args.txt)" + workingDirectory: '$(System.DefaultWorkingDirectory)' + +# 3. Download SSH key + install packages + start VMSS +- task: DownloadSecureFile@1 + name: SshKeyFile + displayName: Download key file + inputs: + secureFile: mscclpp.pem + +- task: Bash@3 + name: InstallPackages + displayName: Install Packages + inputs: + targetType: 'inline' + script: | + sudo apt-get update -y + sudo apt-get install pssh -y + +- task: AzureCLI@2 + name: StartVMSS + displayName: Start VMSS + inputs: + azureSubscription: ${{ parameters.subscription }} + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} + +# 4. 
Deploy test environment +- task: Bash@3 + name: DeployTestEnv + displayName: Deploy Test Env + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + arguments: ${{ parameters.deployArgs }} + workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/templates/integration-test.yaml b/.azure-pipelines/templates/integration-test.yaml deleted file mode 100644 index b9dac24b..00000000 --- a/.azure-pipelines/templates/integration-test.yaml +++ /dev/null @@ -1,242 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: perfBaselineFile - type: string - default: 'test/deploy/perf_ndmv4.jsonl' -- name: gpuArch - type: string - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: inline - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: inline - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test" - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: AllGatherTest - displayName: Run mscclpp AllGather test - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - set -e; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: SendRecvTest - displayName: Run mscclpp SendRecv test - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: AllReduceTest - displayName: Run mscclpp AllReduce test - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: AllToAll - displayName: Run mscclpp AllToAll test - inputs: - targetType: 'inline' - script: | - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: CheckPerfNumber - displayName: Check collective primitives performance - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - cd /root/mscclpp; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PythonAllReduceBenchmark - displayName: Python Allreduce Benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - set -e; \ - cd /root/mscclpp; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - python3 -m pip install .; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: FifoPerfBenchmark - displayName: FIFO Performance Benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - ./build/bin/perf/fifo_test"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp \ No newline at end of file diff --git a/.azure-pipelines/templates/integration-test.yml b/.azure-pipelines/templates/integration-test.yml new file mode 100644 index 00000000..b686e4f2 --- /dev/null +++ b/.azure-pipelines/templates/integration-test.yml @@ -0,0 +1,76 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: perfBaselineFile + type: string + default: 'test/deploy/perf_ndmv4.jsonl' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + deployArgs: 'single-node-test' + +- template: run-remote-task.yml + parameters: + name: AllGatherTest + displayName: Run mscclpp AllGather test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x 
MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: SendRecvTest + displayName: Run mscclpp SendRecv test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: AllReduceTest + displayName: Run mscclpp AllReduce test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: AllToAll + displayName: Run mscclpp AllToAll test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: CheckPerfNumber + displayName: Check collective primitives performance + remoteScript: | + python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }} + +- template: run-remote-task.yml + parameters: + name: PythonAllReduceBenchmark + displayName: Python Allreduce Benchmark + remoteScript: | + python3 -m pip install . + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} \ No newline at end of file diff --git a/.azure-pipelines/templates/nccl-test.yaml b/.azure-pipelines/templates/nccl-test.yaml deleted file mode 100644 index bc804a94..00000000 --- a/.azure-pipelines/templates/nccl-test.yaml +++ /dev/null @@ -1,280 +0,0 @@ -# .azure-pipelines/templates/nccl-test.yaml -# ---------------------------------------- -# A step‐template that runs the entire MSCCLPP→NCCL test suite on one pool/container. 
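The sweep flags repeated throughout the integration-test template above follow the nccl-tests convention (assumed here, since mscclpp-test mirrors that CLI): `-b`/`-e` bound the message size, `-f 2` doubles the size each step, `-k` selects a kernel variant, and `-o` appends one JSON line per size so `CheckPerfNumber` can diff the run against a baseline. One representative pair, chained the way the template chains them:

```bash
# One sweep (1 KiB to 1 GiB, doubling) followed by the baseline comparison.
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN \
  ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
python3 test/mscclpp-test/check_perf_result.py \
  --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
```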
-# -# Parameters: -# subscription – Azure subscription to use for VMSS start/stop -# sshKeySecureFile – the secureFile name for your SSH key - -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: nvccGencode - type: string - default: "-gencode=arch=compute_80,code=sm_80" - -steps: -- checkout: self -- checkout: git://One/msccl-users -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: mscclpp/test/deploy/deploy.sh - arguments: nccltest-single-node - workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp - -- task: Bash@3 - name: CopyMscclUsers - displayName: Copy msccl-users - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - DST_DIR="/tmp/mscclpp/msccl-users" - parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR} - workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: GenerateExecutionFile -# displayName: Generate execution file -# inputs: -# targetType: 'inline' -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp/msccl-users; \ -# mkdir -p execution-files; \ -# cd /root/mscclpp/msccl-users; \ -# bash algos/mscclpp_a100/generate_execution_plan.sh"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: InstallNcclTests - displayName: Install NCCL Tests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd; git clone https://github.com/NVIDIA/nccl-tests.git; \ - cd nccl-tests; \ - MPI=1 MPI_HOME=/usr/local/mpi make -j"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclAllReduceTest -# displayName: Run NCCL 
AllReduce Test -# inputs: -# targetType: inline -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclAllGatherTest -# displayName: Run NCCL AllGather Test -# inputs: -# targetType: inline -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclReduceScatterTest -# displayName: Run NCCL Reduce Scatter Test -# inputs: -# targetType: inline -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: InstallNccl - displayName: Install NCCL - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd; git clone https://github.com/NVIDIA/nccl.git; \ - cd nccl; \ - make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunNcclAllGatherFallbaclkToNcclTest - displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - 
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunNcclAllReduceFallbaclkToNcclTest - displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x 
NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunNcclBroadcastFallbaclkToNcclTest - displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclReduceScatterFallbaclkToNcclTest -# displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation -# inputs: -# targetType: 'inline' -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x 
MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ -# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/nccl-test.yml b/.azure-pipelines/templates/nccl-test.yml new file mode 100644 index 00000000..211e2393 --- /dev/null +++ b/.azure-pipelines/templates/nccl-test.yml @@ -0,0 +1,76 @@ +# .azure-pipelines/templates/nccl-test.yml +# ---------------------------------------- +# A step‐template that runs the entire MSCCLPP→NCCL test suite on one pool/container. 
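Every fallback test in the new template below follows one pattern: `libmscclpp_nccl.so` is `LD_PRELOAD`ed over the benchmark's NCCL, and a chosen collective is forced back onto the real NCCL build named by `MSCCLPP_NCCL_LIB_PATH`, so a single binary exercises both code paths. Stripped of the mpirun plumbing, the environment contract looks like this (a sketch of the pattern as used in this template, not an exhaustive list of knobs):

```bash
# Interpose MSCCLPP's NCCL-compatible shim over the benchmark...
export LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so
# ...and allow it to delegate selected collectives to a real NCCL.
export MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE
export MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so
# Force one operation (here allgather) through the fallback path.
export MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather"
/root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
```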
+# +# Parameters: +# subscription – Azure subscription to use for VMSS start/stop + +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: nvccGencode + type: string + default: "-gencode=arch=compute_80,code=sm_80" + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + deployArgs: 'nccltest-single-node' + +- template: run-remote-task.yml + parameters: + name: InstallNcclTests + displayName: Install NCCL Tests + remoteScript: | + cd + git clone https://github.com/NVIDIA/nccl-tests.git + cd nccl-tests + MPI=1 MPI_HOME=/usr/local/mpi make -j + +- template: run-remote-task.yml + parameters: + name: InstallNccl + displayName: Install NCCL + remoteScript: | + LATEST_TAG=$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\" -f4) + if [ -z "$LATEST_TAG" ]; then + echo "Failed to fetch latest NCCL tag" + exit 1 + fi + cd + git clone --branch $LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git + cd nccl + make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }} + +- template: run-remote-task.yml + parameters: + name: RunNcclAllGatherFallbackToNcclTest + displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: run-remote-task.yml + parameters: + name: RunNcclAllReduceFallbackToNcclTest + displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: run-remote-task.yml + parameters: + name: RunNcclBroadcastFallbackToNcclTest + displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x 
MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/rccl-test.yml b/.azure-pipelines/templates/rccl-test.yml new file mode 100644 index 00000000..8e247161 --- /dev/null +++ b/.azure-pipelines/templates/rccl-test.yml @@ -0,0 +1,63 @@ +# .azure-pipelines/templates/rccl-test.yml +# ------------------------------------------------ +# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container. +# +# Parameters: +# subscription – Azure subscription to use for VMSS start/stop +# vmssName – VMSS name to start/stop +# gpuArch – GPU architecture (e.g. gfx942) + +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: gpuArch + type: string + default: "gfx942" + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: rocm + gpuArch: ${{ parameters.gpuArch }} + buildTests: false + deployArgs: 'single-node-test true rocm' + + +- template: run-remote-task.yml + parameters: + name: InstallRcclTests + displayName: Install RCCL Tests + remoteScript: | + cd + git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git + cd rocm-systems + git sparse-checkout init --cone + git sparse-checkout set projects/rccl-tests + git checkout + cd projects/rccl-tests + MPI=1 MPI_HOME=/usr/local/mpi make -j + +- template: run-remote-task.yml + parameters: + name: RunRcclAllGatherTest + displayName: Run RCCL AllGather Test with or without MSCCLPP Lib + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: run-remote-task.yml + parameters: + name: RunRcclAllReduceTest + displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/run-remote-task.yml b/.azure-pipelines/templates/run-remote-task.yml new file mode 100644 index 00000000..37b3a7d7 --- /dev/null +++ b/.azure-pipelines/templates/run-remote-task.yml @@ -0,0 +1,27 @@ +parameters: +- name: name + 
type: string + default: '' +- name: displayName + type: string +- name: runRemoteArgs + type: string + default: '' +- name: remoteScript + type: string +- name: workingDirectory + type: string + default: '$(System.DefaultWorkingDirectory)' + +steps: +- task: Bash@3 + ${{ if ne(parameters.name, '') }}: + name: ${{ parameters.name }} + displayName: ${{ parameters.displayName }} + inputs: + targetType: 'inline' + script: | + test/deploy/run-remote.sh ${{ parameters.runRemoteArgs }} <<'REMOTE_CMD' + ${{ parameters.remoteScript }} + REMOTE_CMD + workingDirectory: ${{ parameters.workingDirectory }} diff --git a/.azure-pipelines/templates/stop.yml b/.azure-pipelines/templates/stop.yml new file mode 100644 index 00000000..40498c29 --- /dev/null +++ b/.azure-pipelines/templates/stop.yml @@ -0,0 +1,20 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: resourceGroup + type: string + default: mscclpp + +steps: +- task: AzureCLI@2 + name: StopVMSS + displayName: Deallocate VMSS + condition: always() + inputs: + azureSubscription: ${{ parameters.subscription }} + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss deallocate --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} diff --git a/.azure-pipelines/templates/ut-executor.yml b/.azure-pipelines/templates/ut-executor.yml new file mode 100644 index 00000000..426daf17 --- /dev/null +++ b/.azure-pipelines/templates/ut-executor.yml @@ -0,0 +1,42 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: ${{ parameters.platform }} + gpuArch: ${{ parameters.gpuArch }} + deployArgs: 'single-node-test true ${{ parameters.platform }}' + + +- template: run-remote-task.yml + parameters: + name: ExecutorTest + displayName: Run executor tests + remoteScript: | + python3 -m pip install . 
+ PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans + TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack_tbg.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_tbg.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack_tbg.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls_pipeline.json --size 32M --in_place + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/ut-no-ib-env.yaml b/.azure-pipelines/templates/ut-no-ib-env.yaml deleted file mode 100644 index aa21c407..00000000 --- a/.azure-pipelines/templates/ut-no-ib-env.yaml +++ /dev/null @@ -1,89 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: gpuArch - type: string - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: single-node-test false - workingDirectory: $(System.DefaultWorkingDirectory) - -- task: Bash@3 - name: PyTests - displayName: Run pytests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py::test_executor -x"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp \ No newline at end of file diff --git a/.azure-pipelines/templates/ut-no-ib-env.yml b/.azure-pipelines/templates/ut-no-ib-env.yml new file mode 100644 index 00000000..a62f1a77 --- /dev/null +++ b/.azure-pipelines/templates/ut-no-ib-env.yml @@ -0,0 +1,95 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + cmakeArgs: '-DMSCCLPP_USE_IB=OFF' + deployArgs: 'single-node-test false' + +- template: run-remote-task.yml + parameters: + name: UnitTests + displayName: Run mscclpp unit tests + remoteScript: | + ./build/bin/unit_tests + +- template: run-remote-task.yml + parameters: + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + remoteScript: | + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests + +- template: run-remote-task.yml + parameters: + name: PyTests + displayName: Run pytests + remoteScript: | + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + +- template: run-remote-task.yml + parameters: + name: StopContainer + displayName: Stop existing container + runRemoteArgs: '--no-docker --no-log' + remoteScript: | + sudo docker stop mscclpp-test || true + sudo docker rm mscclpp-test || true + +- task: Bash@3 + displayName: Remove generated SSH key files + inputs: + targetType: 'inline' + script: | + rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: BuildWithIb + displayName: Rebuild with IB + inputs: + targetType: 'inline' + script: | + set -e + rm -rf build + mkdir -p build && cd build + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMSCCLPP_BYPASS_GPU_CHECK=ON \ + -DMSCCLPP_USE_CUDA=ON \ + -DMSCCLPP_BUILD_TESTS=ON \ + -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. 
+ make -j + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: DeployTestEnvWithIb + displayName: Deploy Test Env (with IB build) + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + arguments: single-node-test false + workingDirectory: $(System.DefaultWorkingDirectory) + +- template: run-remote-task.yml + parameters: + name: PyTestsWithIbBuildDisableIb + displayName: Run pytests (IB build, IB tests disabled) + remoteScript: | + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/ut-npkit.yaml b/.azure-pipelines/templates/ut-npkit.yaml deleted file mode 100644 index 0ab733c9..00000000 --- a/.azure-pipelines/templates/ut-npkit.yaml +++ /dev/null @@ -1,145 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: gpuArch - type: string - - -steps: -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: inline - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test" - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - set -e; \ - cd /root/mscclpp; \ - mkdir -p build && cd build; \ - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \ - make -j"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: MpUnitTests - displayName: Run mscclpp multi-process unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \ - python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \ - grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PyTests - displayName: Run pytests - inputs: - targetType: 'inline' - script: | - # set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \ - python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \ - grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \ - rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'; \ - python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \ - grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: 
bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/ut-npkit.yml b/.azure-pipelines/templates/ut-npkit.yml new file mode 100644 index 00000000..1bd89caf --- /dev/null +++ b/.azure-pipelines/templates/ut-npkit.yml @@ -0,0 +1,57 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: gpuArch + type: string + + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"' + deployArgs: 'single-node-test' + +- template: run-remote-task.yml + parameters: + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + remoteScript: | + rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output + export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter="ExecutorTest.TwoNodesAllreduce" + python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output + grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json + +- template: run-remote-task.yml + parameters: + name: PyTests + displayName: Run pytests + remoteScript: | + rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output + export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json' + python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output + grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json + rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json' + python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output + grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_UNPACK_PACKETS_ENTRY 
./npkit_output/npkit_event_trace.json + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml deleted file mode 100644 index 093a6094..00000000 --- a/.azure-pipelines/templates/ut.yaml +++ /dev/null @@ -1,135 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: gpuArch - type: string - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test" - workingDirectory: '$(System.DefaultWorkingDirectory)' - - -- task: Bash@3 - name: UnitTests - displayName: Run mscclpp unit tests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - ./build/bin/unit_tests"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: MpUnitTests - displayName: Run mscclpp multi-process unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - cd /root/mscclpp; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \ - mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \ - mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PyTests - displayName: Run pytests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/ut.yml b/.azure-pipelines/templates/ut.yml new file mode 100644 index 00000000..743c66e6 --- /dev/null +++ b/.azure-pipelines/templates/ut.yml @@ -0,0 +1,49 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: ${{ parameters.platform }} + gpuArch: ${{ parameters.gpuArch }} + deployArgs: 'single-node-test true ${{ parameters.platform }}' + + +- template: run-remote-task.yml + parameters: + name: UnitTests + displayName: Run mscclpp unit tests + remoteScript: | + ./build/bin/unit_tests + +- template: run-remote-task.yml + parameters: + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + remoteScript: | + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests + +- template: run-remote-task.yml + parameters: + name: PyTests + displayName: Run pytests + remoteScript: | + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_fp8_accum.py -x + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index 960f3eae..6b8c9eda 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -37,17 +37,16 @@ jobs: cuda11: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 
cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut.yaml + - template: templates/ut.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem gpuArch: '80' - job: UnitTestWithNpKitA100 @@ -59,17 +58,16 @@ jobs: cuda11: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut-npkit.yaml + - template: templates/ut-npkit.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem gpuArch: '80' - job: UnitTestH100 @@ -79,17 +77,16 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut.yaml + - template: templates/ut.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem gpuArch: '90' - job: UnitTestWithNpKitH100 @@ -99,21 +96,20 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut-npkit.yaml + - template: templates/ut-npkit.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem gpuArch: '90' - job: UnitTestNoIBEnv - timeoutInMinutes: 40 + timeoutInMinutes: 60 displayName: Test No IB Environment pool: name: msccl-ci-h100 @@ -121,15 +117,55 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut-no-ib-env.yaml + - template: templates/ut-no-ib-env.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem gpuArch: '90' + +- job: UnitTestMI300X + timeoutInMinutes: 40 + pool: + name: msccl-ci-mi300x + strategy: + matrix: + rocm6_2: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 + + container: + image: $(containerImage) + + steps: + - template: templates/ut.yml + parameters: + subscription: mscclpp-ci-mi300x + vmssName: mscclpp-mi300x-ci + platform: rocm + gpuArch: gfx942 + +- job: UnitTestExecutor + timeoutInMinutes: 60 + displayName: Test DSL Executor + pool: + name: msccl-ci-h100 + + strategy: + matrix: + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + + container: + image: $(containerImage) + + steps: + - template: templates/ut-executor.yml + parameters: + subscription: mscclpp-ci-h100 + vmssName: mscclpp-h100-ci + gpuArch: '90' \ No newline at end of file diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 00000000..a98f1e89 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,24 @@ +codecov: + require_ci_to_pass: yes + +coverage: + status: + project: + default: + target: 68% + threshold: 1% + patch: + default: + target: 80% + +flag_management: + default_rules: + carryforward: true + +ignore: + - "test/" + - "examples/" + - "python/" + - "tools/" + - "docs/" + - "docker/" diff 
--git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 4cf9dbf8..9d7e7798 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -25,7 +25,7 @@ For C/C++/CUDA source code: ``` ## Formatting -If you have modified any code in the project, run `./tools/lint.sh` to automatically format the entire source code before finishing iterations. Note that this script formats only staged files. +If you have modified any code in the project, run `./tools/lint.sh` to automatically format the entire source code before finishing iterations. Note that this script formats only files that are tracked by git, so if you have added new files, make sure to `git add` them first. ## Building and Testing The following commands are commonly used for building and testing the project. See `docs/quickstart.md` for more detailed instructions. @@ -40,10 +40,10 @@ cd .. For testing after successful build: ```bash -# To run all tests +# To run tests with two GPUs - two is enough for most tests mpirun -np 2 ./build/bin/mp_unit_tests # To run tests excluding IB-related ones (when IB is not available) -mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib* +mpirun -np 2 ./build/bin/mp_unit_tests --filter=-*Ib* ``` For building a Python package: @@ -51,6 +51,12 @@ For building a Python package: python3 -m pip install -e . ``` +For Python tests after building the package: +```bash +# Run tests with 8 GPUs - adjust the number as needed +mpirun -np 8 python3 -m pytest ./python/test/test_mscclpp.py -vx +``` + For building documentation (see dependencies in `docs/requirements.txt`): ```bash cd docs diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index b423e326..fb065141 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -40,7 +40,7 @@ jobs: fail-fast: false matrix: language: [ 'cpp', 'python' ] - version: [ 'cuda11.8', 'cuda12.8' ] + version: [ 'cuda11.8', 'cuda12.9' ] steps: - name: Checkout repository @@ -51,7 +51,7 @@ jobs: df -h - name: Initialize CodeQL - uses: github/codeql-action/init@v3 + uses: github/codeql-action/init@v4 with: languages: ${{ matrix.language }} @@ -62,11 +62,11 @@ jobs: - name: Build run: | rm -rf build && mkdir build && cd build - cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON .. - make -j + cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=OFF .. + make -j4 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 + uses: github/codeql-action/analyze@v4 with: category: "/language:${{matrix.language}}/version:${{matrix.version}}" @@ -96,7 +96,7 @@ jobs: df -h - name: Initialize CodeQL - uses: github/codeql-action/init@v3 + uses: github/codeql-action/init@v4 with: languages: ${{ matrix.language }} @@ -107,10 +107,10 @@ jobs: - name: Build run: | rm -rf build && mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON .. - make -j + CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=OFF .. 
+ make -j4 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 + uses: github/codeql-action/analyze@v4 with: category: "/language:${{matrix.language}}/version:${{matrix.version}}" diff --git a/.github/workflows/doc-build.yaml b/.github/workflows/doc-build.yml similarity index 100% rename from .github/workflows/doc-build.yaml rename to .github/workflows/doc-build.yml diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml deleted file mode 100644 index 900e8aba..00000000 --- a/.github/workflows/integration-test-backup.yml +++ /dev/null @@ -1,69 +0,0 @@ -name: IntegrationTest - -on: workflow_dispatch - -jobs: - IntegrationTest: - runs-on: [ self-hosted, A100 ] - defaults: - run: - shell: bash - strategy: - matrix: - cuda: [ cuda11.8, cuda12.2 ] - - container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" - options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release .. - make -j - - - name: Lock GPU clock frequency - run: | - sudo nvidia-smi -pm 1 - for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i - done - - - name: Run mscclpp AllGather test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - - - name: Run mscclpp SendRecv test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl - - - name: Run mscclpp AllReduce test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl - - - name: Run mscclpp AllToAll test - run: | - set -e - mpirun --allow-run-as-root -np 8 
--bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - - - name: Check collective primitives performance - run: | - set -e - python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl diff --git a/.github/workflows/mscclpp-lang.yml b/.github/workflows/mscclpp-lang.yml index 5947b087..a9187e96 100644 --- a/.github/workflows/mscclpp-lang.yml +++ b/.github/workflows/mscclpp-lang.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ 'cuda11.8', 'cuda12.8' ] + version: [ 'cuda11.8', 'cuda12.9' ] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml deleted file mode 100644 index 8849c353..00000000 --- a/.github/workflows/ut-backup.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: UnitTest - -on: workflow_dispatch - -jobs: - UnitTest: - runs-on: [ self-hosted, A100 ] - defaults: - run: - shell: bash - timeout-minutes: 30 - strategy: - matrix: - cuda: [ cuda11.8, cuda12.2 ] - - container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" - options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release .. - make -j - working-directory: ${{ github.workspace }} - - - name: LockGPUClock - run: | - sudo nvidia-smi -pm 1 - for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i - done - - - name: UnitTests - run: | - ./build/bin/unit_tests - - - name: MpUnitTests - run: | - set -e - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests - mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests - mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests - - - name: PyTests - run: | - set -e - mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x diff --git a/.gitignore b/.gitignore index 9c4da143..74307e67 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,9 @@ .vscode/ -.hypothesis/ build/ -dist/ +build_coverage/ __pycache__ .*.swp -.idea/ *.so +.pytest_cache/ +_codeql_detected_source_root docs/_static/versions.js -_codeql_detected_source_root \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 6288dbb0..ef8b785a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ # Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. +# Licensed under the MIT License. cmake_minimum_required(VERSION 3.25) project(mscclpp LANGUAGES CXX) @@ -47,7 +47,7 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) # Options option(MSCCLPP_ENABLE_TRACE "Enable tracing" OFF) -option(MSCCLPP_BUILD_TESTS "Build tests" ON) +option(MSCCLPP_BUILD_TESTS "Build tests" OFF) option(MSCCLPP_BUILD_PYTHON_BINDINGS "Build Python bindings" ON) option(MSCCLPP_BUILD_EXT_NCCL "Build NCCL interfaces" ON) option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON) @@ -56,6 +56,8 @@ option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF) option(MSCCLPP_USE_IB "Use InfiniBand." ON) option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." 
OFF)
 option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
+option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF)
+option(MSCCLPP_DISABLE_NB_LEAK_WARNINGS "Disable Nanobind leak warnings" ON)
 set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).")
 if(MSCCLPP_BYPASS_GPU_CHECK)
@@ -98,6 +100,62 @@ else()
     message(FATAL_ERROR "No compatible GPU found. Set MSCCLPP_USE_CUDA or MSCCLPP_USE_ROCM to ON.")
   endif()
 endif()
+
+# Code coverage setup
+if(MSCCLPP_ENABLE_COVERAGE)
+  if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+    message(WARNING "Code coverage results with an optimized (non-Debug) build may be misleading")
+  endif()
+
+  if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+    message(STATUS "Code coverage enabled")
+
+    # Add coverage flags to C++ targets only (not CUDA)
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:--coverage>)
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-O0>)
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-g>)
+    add_link_options($<$<LINK_LANGUAGE:CXX>:--coverage>)
+
+    # Find lcov
+    find_program(LCOV_PATH lcov)
+
+    if(NOT LCOV_PATH)
+      message(WARNING "lcov not found. Install lcov to generate coverage reports.")
+    endif()
+
+    if(LCOV_PATH)
+      # Add coverage target
+      add_custom_target(coverage
+        COMMAND ${CMAKE_COMMAND} -E echo "Removing old coverage data..."
+        COMMAND ${LCOV_PATH} --directory . --zerocounters
+
+        COMMAND ${CMAKE_COMMAND} -E echo "Running tests..."
+        COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure
+
+        COMMAND ${CMAKE_COMMAND} -E echo "Collecting coverage data..."
+        COMMAND ${LCOV_PATH} --directory . --capture --output-file coverage.info
+
+        COMMAND ${CMAKE_COMMAND} -E echo "Filtering coverage data..."
+        COMMAND ${LCOV_PATH} --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info
+
+        COMMAND ${CMAKE_COMMAND} -E echo "Coverage report generated in coverage.info"
+
+        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+        COMMENT "Generating code coverage report"
+      )
+
+      # Add coverage clean target
+      add_custom_target(coverage-clean
+        COMMAND ${CMAKE_COMMAND} -E remove coverage.info
+        COMMAND ${LCOV_PATH} --directory . --zerocounters
+        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+        COMMENT "Cleaning coverage data"
+      )
+    endif()
+  else()
+    message(WARNING "Code coverage is only supported with GCC or Clang compilers")
+  endif()
+endif()
 if(MSCCLPP_GPU_ARCHS)
   string(STRIP "${MSCCLPP_GPU_ARCHS}" MSCCLPP_GPU_ARCHS)
   string(REPLACE " " ";" MSCCLPP_GPU_ARCHS "${MSCCLPP_GPU_ARCHS}")
@@ -166,12 +224,35 @@ if(MSCCLPP_USE_IB)
   if(NOT IBVERBS_FOUND)
     message(FATAL_ERROR "IBVerbs not found. Install libibverbs-dev or rdma-core-devel. If you want to disable InfiniBand, add `-DMSCCLPP_USE_IB=OFF` in your cmake command.")
   endif()
+  find_package(MLX5)
+  if(MLX5_FOUND)
+    message(STATUS "MLX5 Direct Verbs found: ${MLX5_LIBRARIES}")
+  else()
+    message(STATUS "MLX5 Direct Verbs not found, mlx5dv optimizations disabled")
+  endif()
 endif()
 
 find_package(NUMA REQUIRED)
 find_package(Threads REQUIRED)
 
+option(MSCCLPP_USE_GDRCOPY "Use GDRCopy for direct GPU memory access from host."
ON) +if(MSCCLPP_USE_ROCM) + set(MSCCLPP_USE_GDRCOPY OFF) +endif() +if(MSCCLPP_USE_GDRCOPY) + find_package(GDRCopy) + if(NOT GDRCOPY_FOUND) + message(STATUS "GDRCopy not found, disabling GDRCopy support") + set(MSCCLPP_USE_GDRCOPY OFF) + else() + message(STATUS "GDRCopy found: ${GDRCOPY_LIBRARIES}") + endif() +endif() + include(FetchContent) -FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz) +FetchContent_Declare(json + GIT_REPOSITORY https://github.com/nlohmann/json.git + GIT_TAG v3.12.0 +) FetchContent_MakeAvailable(json) if("${INSTALL_PREFIX}" STREQUAL "") diff --git a/README.md b/README.md index 69ae5add..58586a30 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,16 @@ [![Latest Release](https://img.shields.io/github/release/microsoft/mscclpp.svg)](https://github.com/microsoft/mscclpp/releases/latest) [![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE) [![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml) -[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yaml/badge.svg)](https://microsoft.github.io/mscclpp/) +[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yml/badge.svg)](https://microsoft.github.io/mscclpp/) +[![codecov](https://codecov.io/gh/microsoft/mscclpp/graph/badge.svg?token=DAV9DGHAY2)](https://codecov.io/gh/microsoft/mscclpp) | Testing Pipelines | Build Status | |--------------------------|-------------------| -| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | -| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) | -| Integration Tests (ROCm) | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test-rocm?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=399295&branchName=main) | +| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestH100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | +| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestMI300X)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | +| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main&jobName=Integration%20test%20H100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) | +| NCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?repoName=microsoft%2Fmscclpp&branchName=main&jobName=Run%20MSCCLPP%20over%20NCCL%20Test%20(H100))](https://msazure.visualstudio.com/One/_build/latest?definitionId=320665&repoName=microsoft%2Fmscclpp&branchName=main) | +| RCCL Tests | [![Build 
Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main&jobName=Run%20MSCCLPP%20over%20RCCL%20Test%20(MI300X))](https://msazure.visualstudio.com/One/_build/latest?definitionId=448013&branchName=main) | A GPU-driven communication stack for scalable AI applications. diff --git a/VERSION b/VERSION index a3df0a69..ac39a106 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.8.0 +0.9.0 diff --git a/cmake/FindGDRCopy.cmake b/cmake/FindGDRCopy.cmake new file mode 100644 index 00000000..54e0ba1c --- /dev/null +++ b/cmake/FindGDRCopy.cmake @@ -0,0 +1,50 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Find the GDRCopy libraries (>= 2.5 required for gdr_pin_buffer_v2 / GDR_PIN_FLAG_FORCE_PCIE) +# +# The following variables are optionally searched for defaults +# GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found +# GDRCOPY_INCLUDE_DIR: Directory where GDRCopy headers are found +# GDRCOPY_LIB_DIR: Directory where GDRCopy libraries are found + +# The following are set after configuration is done: +# GDRCOPY_FOUND +# GDRCOPY_INCLUDE_DIRS +# GDRCOPY_LIBRARIES + +find_path(GDRCOPY_INCLUDE_DIRS + NAMES gdrapi.h + HINTS + ${GDRCOPY_INCLUDE_DIR} + ${GDRCOPY_ROOT_DIR} + ${GDRCOPY_ROOT_DIR}/include + /usr/local/include + /usr/include) + +find_library(GDRCOPY_LIBRARIES + NAMES gdrapi + HINTS + ${GDRCOPY_LIB_DIR} + ${GDRCOPY_ROOT_DIR} + ${GDRCOPY_ROOT_DIR}/lib + /usr/local/lib + /usr/lib + /usr/lib/x86_64-linux-gnu) + +if(GDRCOPY_INCLUDE_DIRS) + include(CheckSymbolExists) + set(CMAKE_REQUIRED_INCLUDES ${GDRCOPY_INCLUDE_DIRS}) + set(CMAKE_REQUIRED_LIBRARIES ${GDRCOPY_LIBRARIES}) + check_symbol_exists(gdr_pin_buffer_v2 "gdrapi.h" GDRCOPY_HAS_PIN_BUFFER_V2) + unset(CMAKE_REQUIRED_LIBRARIES) + unset(CMAKE_REQUIRED_INCLUDES) + if(NOT GDRCOPY_HAS_PIN_BUFFER_V2) + message(STATUS "GDRCopy found but too old (gdr_pin_buffer_v2 not available). Requires >= 2.5.") + set(GDRCOPY_INCLUDE_DIRS GDRCOPY_INCLUDE_DIRS-NOTFOUND) + endif() +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES) +mark_as_advanced(GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES) diff --git a/cmake/FindMLX5.cmake b/cmake/FindMLX5.cmake new file mode 100644 index 00000000..9fd59127 --- /dev/null +++ b/cmake/FindMLX5.cmake @@ -0,0 +1,38 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +# Find the MLX5 Direct Verbs (mlx5dv) library +# +# The following variables are optionally searched for defaults +# MLX5_ROOT_DIR: Base directory where all MLX5 components are found +# MLX5_INCLUDE_DIR: Directory where MLX5 headers are found +# MLX5_LIB_DIR: Directory where MLX5 libraries are found + +# The following are set after configuration is done: +# MLX5_FOUND +# MLX5_INCLUDE_DIRS +# MLX5_LIBRARIES + +find_path(MLX5_INCLUDE_DIRS + NAMES infiniband/mlx5dv.h + HINTS + ${MLX5_INCLUDE_DIR} + ${MLX5_ROOT_DIR} + ${MLX5_ROOT_DIR}/include + /usr/local/include + /usr/include) + +find_library(MLX5_LIBRARIES + NAMES mlx5 + HINTS + ${MLX5_LIB_DIR} + ${MLX5_ROOT_DIR} + ${MLX5_ROOT_DIR}/lib + /usr/local/lib + /usr/lib + /usr/lib/x86_64-linux-gnu) + +include(FindPackageHandleStandardArgs) + +find_package_handle_standard_args(MLX5 DEFAULT_MSG MLX5_INCLUDE_DIRS MLX5_LIBRARIES) +mark_as_advanced(MLX5_INCLUDE_DIRS MLX5_LIBRARIES) diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile index 04ba1f03..47436202 100644 --- a/docker/base-dev-x.dockerfile +++ b/docker/base-dev-x.dockerfile @@ -7,13 +7,38 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp RUN apt-get update && \ apt-get install -y --no-install-recommends \ htop \ - lcov \ vim \ && \ apt-get autoremove -y && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/* +# Install lcov 2.2 +RUN LCOV_VERSION="2.2" && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + cpanminus \ + gcc \ + make \ + perl \ + && \ + cpanm --notest \ + Capture::Tiny \ + DateTime \ + JSON::XS \ + Memory::Process \ + TimeDate \ + && \ + cd /tmp && \ + curl -L https://github.com/linux-test-project/lcov/releases/download/v${LCOV_VERSION}/lcov-${LCOV_VERSION}.tar.gz -o lcov.tar.gz && \ + tar xzf lcov.tar.gz && \ + cd lcov-${LCOV_VERSION} && \ + make install && \ + cd / && rm -rf /tmp/lcov* && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* + # Install CMake 3.26.4 RUN OS_ARCH=$(uname -m) && \ CMAKE_VERSION="3.26.4" && \ @@ -24,6 +49,33 @@ RUN OS_ARCH=$(uname -m) && \ rm -rf ${CMAKE_HOME}.tar.gz && \ ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/ +# Install GDRCopy userspace library for CUDA targets +ARG TARGET="cuda13.0" +RUN if echo "$TARGET" | grep -q "^cuda"; then \ + GDRCOPY_VERSION="2.5.2" && \ + apt-get update -y && \ + apt-get install -y --no-install-recommends devscripts debhelper fakeroot pkg-config dkms && \ + cd /tmp && \ + curl -L https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -o gdrcopy.tar.gz && \ + tar xzf gdrcopy.tar.gz && \ + cd gdrcopy-${GDRCOPY_VERSION}/packages && \ + ./build-deb-packages.sh -k -t && \ + dpkg -i libgdrapi_*.deb && \ + cd / && rm -rf /tmp/gdrcopy* && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/*; \ + fi + +# Install ROCm-specific packages if building for ROCm +RUN if echo "$TARGET" | grep -q "^rocm"; then \ + apt-get update -y && \ + apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/*; \ + fi + # Create Python venv RUN python3 -m venv /root/venv && \ echo 'source /root/venv/bin/activate' >> /root/.bashrc @@ -32,10 +84,13 @@ ENV PATH="/root/venv/bin:${PATH}" # Install Python dependencies ADD . 
/tmp/mscclpp WORKDIR /tmp/mscclpp -ARG TARGET="cuda13.0" RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \ + if echo "$TARGET" | grep -q "^rocm"; then \ + export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \ + fi && \ pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r python/requirements_${target_type}.txt + pip install --no-cache-dir -r python/requirements_${target_type}.txt && \ + pip install --no-cache-dir coverage xlsxwriter # Cleanup RUN rm -rf /tmp/mscclpp diff --git a/docker/base-x-rocm.dockerfile b/docker/base-x-rocm.dockerfile deleted file mode 100644 index 525ba1d4..00000000 --- a/docker/base-x-rocm.dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp - -ENV DEBIAN_FRONTEND=noninteractive - -ENV RCCL_VERSION=rocm-6.2.0 -ARG GPU_ARCH=gfx942 -ENV ARCH_TARGET=${GPU_ARCH} -RUN cd /tmp && \ - git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \ - cd rccl && \ - ./install.sh --prefix=/opt/rocm --amdgpu_targets ${ARCH_TARGET} && \ - cd .. && \ - rm -rf /tmp/rccl - -WORKDIR / diff --git a/docker/build.sh b/docker/build.sh index e9b10c3a..89568e19 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -4,38 +4,39 @@ set -e declare -A baseImageTable baseImageTable=( - ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04" - ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04" - ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" - ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04" + ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu22.04" ["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04" ["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04" - ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04" + ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu24.04" ["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04" - ["rocm6.2"]="rocm/rocm-terminal:6.2.1" + ["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2" ) declare -A extraLdPathTable extraLdPathTable=( - ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64" - ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64" - ["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64" + ["cuda11.8"]="/usr/local/cuda-11.8/compat" + ["cuda12.4"]="/usr/local/cuda-12.4/compat" + ["cuda12.8"]="/usr/local/cuda-12.8/compat" + ["cuda12.9"]="/usr/local/cuda-12.9/compat" + ["cuda13.0"]="/usr/local/cuda-13.0/compat" ["rocm6.2"]="/opt/rocm/lib" ) declare -A ofedVersionTable ofedVersionTable=( + ["cuda11.8"]="23.07-0.5.1.2" ["cuda12.4"]="23.07-0.5.1.2" ["cuda12.8"]="24.10-1.1.4.0" ["cuda12.9"]="24.10-1.1.4.0" ["cuda13.0"]="24.10-3.2.5.0" + ["rocm6.2"]="24.10-1.1.4.0" ) TARGET=${1} OS_ARCH=$(uname -m) print_usage() { - echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]" + echo "Usage: $0 [cuda11.8|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]" } if [[ ! -v "baseImageTable[${TARGET}]" ]]; then @@ -68,18 +69,11 @@ docker build -t ${TAG_TMP} \ if [[ ${TARGET} == rocm* ]]; then echo "Building ROCm base image..." - docker build -t ${TAG_BASE} \ - -f docker/base-x-rocm.dockerfile \ - --build-arg BASE_IMAGE=${TAG_TMP} \ - --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \ - --build-arg TARGET=${TARGET} \ - --build-arg GPU_ARCH="gfx942" . - docker rmi ${TAG_TMP} else echo "Building CUDA base image..." 
- docker tag ${TAG_TMP} ${TAG_BASE} - docker rmi --no-prune ${TAG_TMP} fi +docker tag ${TAG_TMP} ${TAG_BASE} +docker rmi --no-prune ${TAG_TMP} docker build -t ${TAG_BASE_DEV} \ -f docker/base-dev-x.dockerfile \ diff --git a/docs/Makefile b/docs/Makefile index 5bc7422e..bf82c03a 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -5,7 +5,7 @@ # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build -SPHINXMULTIVERSION ?= sphinx-multiversion +SPHINXMULTIVERSION ?= python3 build_multiversion.py SOURCEDIR = . BUILDDIR = _build diff --git a/docs/_static/version-selector.js b/docs/_static/version-selector.js index 0efc47fe..7622aefd 100644 --- a/docs/_static/version-selector.js +++ b/docs/_static/version-selector.js @@ -26,27 +26,53 @@ * @returns {string} The base path (e.g., '/mscclpp' or '') */ function detectBasePath() { - const path = window.location.pathname; - // Match pattern: /base-path/vX.Y.Z/... or /base-path/main/... - // The base path is everything before the version or main directory - const match = path.match(/^(\/[^\/]+)?(?=\/(v\d+\.\d+\.\d+|main)\/)/); - if (match && match[1]) { - return match[1]; - } - // Check if we're at a root that's actually a project site - // Look for common indicators like the repository name in the path - const projectMatch = path.match(/^(\/[^\/]+)(?=\/)/); - if (projectMatch) { - // Verify this isn't a version path at root - const potentialBase = projectMatch[1]; - if (!potentialBase.match(/^\/v\d+\.\d+\.\d+$/) && potentialBase !== '/main') { - // Check if the remaining path contains version info - const remainingPath = path.substring(potentialBase.length); - if (remainingPath.match(/^\/(v\d+\.\d+\.\d+|main)\//)) { - return potentialBase; + // Most reliable method: detect from this script's own URL + // The script is always at {base}/_static/version-selector.js or {base}/vX.Y.Z/_static/version-selector.js + const scripts = document.getElementsByTagName('script'); + for (let i = 0; i < scripts.length; i++) { + const src = scripts[i].src; + if (src && (src.includes('/_static/version-selector.js') || src.endsWith('version-selector.js'))) { + try { + const url = new URL(src); + const scriptPath = url.pathname; + // Extract base path: everything before /_static/version-selector.js + // But also strip version directories like /v0.8.0/ or /main/ + const match = scriptPath.match(/^(.*?)\/_static\/version-selector\.js$/); + if (match) { + let basePath = match[1] || ''; + // Remove version suffix if present (e.g., /mscclpp/v0.8.0 -> /mscclpp) + basePath = basePath.replace(/\/(v\d+\.\d+\.\d+|main)$/, ''); + return basePath; + } + } catch (e) { + // URL parsing failed, continue to fallback + // Log a warning to aid debugging when the primary detection method fails. + if (typeof console !== 'undefined' && typeof console.warn === 'function') { + console.warn('version-selector: Failed to parse script URL for base path detection; falling back to location-based detection.', src, e); + } } } } + + // Fallback: try to detect from URL path + const path = window.location.pathname; + const segments = path.split('/').filter(s => s.length > 0); + + if (segments.length >= 1) { + const firstSegment = segments[0]; + // If first segment is not a version tag (vX.Y.Z), not 'main', and + // does not look like a file name (no '.' in the segment), then it's + // the GitHub Pages project base path (e.g., 'mscclpp'). 
+ // This handles both: + // /mscclpp/v0.8.0/index.html -> base is /mscclpp + // /mscclpp/index.html -> base is /mscclpp + // while avoiding treating root files like /index.html as a base path. + if (!firstSegment.match(/^v\d+\.\d+\.\d+$/) && firstSegment !== 'main' && !firstSegment.includes('.')) { + return '/' + firstSegment; + } + } + + // No base path (root site or local development) return ''; } diff --git a/docs/build_multiversion.py b/docs/build_multiversion.py new file mode 100644 index 00000000..ace20fc0 --- /dev/null +++ b/docs/build_multiversion.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Wrapper around sphinx-multiversion that patches copy_tree to generate +_version.py in each tag checkout. This is needed because setuptools_scm +generates _version.py at build time, but sphinx-multiversion uses +`git archive` which only contains committed files. + +Usage (called by Makefile): + python3 build_multiversion.py [sphinx-opts...] +""" + +import os +import re +import subprocess +import sys + +import sphinx_multiversion.git as smv_git +from sphinx_multiversion import main as smv_main + +# Save the original copy_tree +_original_copy_tree = smv_git.copy_tree + + +def _patched_copy_tree(gitroot, src, dst, reference, sourcepath="."): + """Call original copy_tree, then generate _version.py from the VERSION file.""" + _original_copy_tree(gitroot, src, dst, reference, sourcepath) + + # Extract version from the tag name (e.g., "v0.9.0" -> "0.9.0") + refname = getattr(reference, "refname", "") or "" + match = re.search(r"v(\d+\.\d+\.\d+)", refname) + if not match: + return + + version = match.group(1) + version_py_dir = os.path.join(dst, "python", "mscclpp") + if os.path.isdir(version_py_dir): + version_py = os.path.join(version_py_dir, "_version.py") + if not os.path.exists(version_py): + with open(version_py, "w") as f: + f.write(f'__version__ = "{version}"\n') + + +# Monkey-patch +smv_git.copy_tree = _patched_copy_tree + +if __name__ == "__main__": + sys.exit(smv_main(sys.argv[1:])) diff --git a/docs/conf.py b/docs/conf.py index fdfb8d66..52321465 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,6 +11,18 @@ import sys import importlib.util from pathlib import Path +from unittest.mock import MagicMock + + +class NamedMock(MagicMock): + def __getattr__(self, name): + attr = super().__getattr__(name) + if isinstance(attr, MagicMock): + # Assigns __name__ and __qualname__ to satisfy Sphinx autodoc inspection. + attr.__name__ = name + attr.__qualname__ = name + return attr + # Add the python package to sys.path so Sphinx can find it project_root = Path(__file__).parent.parent @@ -63,7 +75,7 @@ autodoc_default_options = { "show-inheritance": True, } # only mock the C-extension when using the source tree -autodoc_mock_imports = ["mscclpp._version", "mscclpp._mscclpp", "blake3", "cupy", "mpi4py", "numpy", "sortedcontainers"] +autodoc_mock_imports = ["mscclpp._version", "blake3", "cupy", "mpi4py", "numpy", "sortedcontainers"] autodoc_typehints = "description" napoleon_google_docstring = True napoleon_numpy_docstring = True @@ -71,6 +83,10 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "numpy": ("https://numpy.org/doc/stable/", None), } +mock_mscclpp = NamedMock() +# Set attributes to satisfy Sphinx autodoc inspection. 
+mock_mscclpp.env.return_value.cache_dir = "_mscclpp" +sys.modules["mscclpp._mscclpp"] = mock_mscclpp templates_path = ["_templates"] exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] diff --git a/docs/dsl/quick_start.md b/docs/dsl/quick_start.md index 6c32ec32..afccd48e 100644 --- a/docs/dsl/quick_start.md +++ b/docs/dsl/quick_start.md @@ -12,6 +12,10 @@ After finishing the installation in the quick start section, you can add the fol python3 -m mscclpp --install ``` +This installs bundled default execution plans into `~/.cache/mscclpp/default` by default. +If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed into `MSCCLPP_CACHE_DIR/default`. +`MSCCLPP_CACHE_DIR` specifies the cache root directory, so its value should not include `default`. + ## Your First Algorithm: AllGather Let's walk through a simple AllGather algorithm to understand the DSL basics. This example demonstrates the key concepts without diving into all the advanced features. diff --git a/docs/dsl/results.md b/docs/dsl/results.md index a34eae5b..a1adad2a 100644 --- a/docs/dsl/results.md +++ b/docs/dsl/results.md @@ -56,9 +56,12 @@ python3 -m mscclpp --install After installation, the generated JSON execution plan can be found at: ``` -~/.cache/mscclpp_default/ +~/.cache/mscclpp/default/ ``` +If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed under `MSCCLPP_CACHE_DIR/default/`. +`MSCCLPP_CACHE_DIR` specifies the cache root directory, so its value should not include `default`. + **Performance Results:** The figure below shows the performance characteristics for small message sizes in a two-node configuration: diff --git a/docs/guide/mscclpp-torch-integration.md b/docs/guide/mscclpp-torch-integration.md index 236dd8ef..b4e4fcdf 100644 --- a/docs/guide/mscclpp-torch-integration.md +++ b/docs/guide/mscclpp-torch-integration.md @@ -129,7 +129,7 @@ class CustomizedComm: self._algo_large = [ algo for algo in algorithms if algo.collective == "allreduce" - and algo.name == "default_allreduce_nvls_with_copy" + and algo.name == "default_allreduce_nvls_warp_pipeline" ][0] def all_reduce(self, tensor: torch.Tensor, stream=None): @@ -332,7 +332,8 @@ public: size_t inputSize, size_t outputSize, mscclpp::DataType dtype, mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) { + const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) { return self->kernelFunc(ctx, input, output, inputSize, dtype, stream); }, // Context initialization function @@ -343,8 +344,8 @@ public: }, // Context key generation function [self](const void* input, void* output, - size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { - return self->generateContextKey(input, output, inputSize, outputSize, dtype); + size_t inputSize, size_t outputSize, mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory); } ); } @@ -468,3 +469,196 @@ stream_handle = torch.cuda.current_stream().cuda_stream All examples are in [`examples/torch-integration/`](../../examples/torch-integration/). +--- + +## Performance Tuning + +By default, algorithms are selected by a fixed heuristic based on message size. For production workloads, you can achieve significantly better performance by **auto-tuning** — benchmarking every candidate algorithm, block count, and thread count for each message size at startup, then using the fastest configuration at runtime.
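+
+Tuned results are keyed by power-of-two size buckets. As a minimal illustration of the bucketing arithmetic used throughout this section (the full example additionally clamps sizes to a supported range), a message size maps to its bucket like this:
+
+```python
+def size_bucket(nbytes: int) -> int:
+    # Round up to the next power of two, e.g. 5 MB -> 8 MB.
+    return 1 << (nbytes - 1).bit_length()
+
+assert size_bucket(5 * 1024 * 1024) == 8 * 1024 * 1024
+```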
+ +**Full example:** [customized_comm_with_tuning.py](../../examples/torch-integration/customized_comm_with_tuning.py) + +### How It Works + +1. **Candidate selection** — For each power-of-two message size from 1 KB to 128 MB, the tuner picks the applicable algorithms: + - All sizes (when NVLS is supported): `default_allreduce_nvls_zero_copy` + - Small messages (≤ 4 MB): `default_allreduce_nvls_packet`, `default_allreduce_packet` + - Large messages (≥ 512 KB): `default_allreduce_rsag_zero_copy` + +2. **Grid search** — Each candidate is run with every combination of block counts (`4, 8, 16, … 128`) and thread counts (`512, 768, 1024`). Results are captured in a CUDA graph and timed. + +3. **Cross-rank consensus** — Elapsed times are averaged across all ranks with an allreduce so every GPU selects the same configuration. + +4. **Runtime dispatch** — `get_tuned_config()` rounds the actual message size up to the next power of two and returns the winning `(algorithm, nblocks, nthreads)` triple. + +### Symmetric Memory Allocation + +Algorithms like `default_allreduce_nvls_zero_copy` require **symmetric memory** — memory where the buffer offset is the same for each rank, allocated via `mscclpp.RawGpuBuffer` (`cuMemAlloc`). Regular `torch.rand()` or `torch.empty()` allocations cannot be used with these algorithms because they do not guarantee the same offset across ranks. Instead, allocate a single large buffer and reuse it for all message sizes: + +```python +# Allocate symmetric memory via RawGpuBuffer and wrap as a PyTorch tensor +tune_tensor = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16)) +tune_tensor = torch.utils.dlpack.from_dlpack(tune_tensor) +tune_tensor.normal_() +``` + +When executing an algorithm with symmetric memory, pass `symmetric_memory=True`: + +```python +def _run_algo(self, algo, tensor, size, nblocks, nthreads): + return algo.execute( + comm=self.comm.communicator, + input_buffer=tensor.data_ptr(), + output_buffer=tensor.data_ptr(), + input_size=size, + output_size=size, + dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype), + op=mscclpp.ReduceOp.SUM, + stream=torch.cuda.current_stream().cuda_stream, + nblocks=nblocks, + nthreads_per_block=nthreads, + symmetric_memory=True, + ) +``` + +### Loading Candidate Algorithms + +The same `load_algorithms` helper from Approach 1 is reused. The tuner extracts multiple algorithm objects: + +```python +algorithms = load_algorithms(scratch_buffer=self.scratch_buffer, rank=self.rank) + +self._algorithm_nvls_packet = [ + algo for algo in algorithms + if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_packet" +][0] + +self._algorithm_rsag_zero_copy = [ + algo for algo in algorithms + if algo.collective == "allreduce" and algo.name == "default_allreduce_rsag_zero_copy" +][0] + +self._algorithm_packet = [ + algo for algo in algorithms + if algo.collective == "allreduce" and algo.name == "default_allreduce_packet" +][0] + +# NVLS zero-copy is only available on supported hardware +if mscclpp.is_nvls_supported(): + self._algorithm_nvls_zero_copy = [ + algo for algo in algorithms + if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_zero_copy" + ][0] +``` + +### The Tuning Loop + +The tuning loop iterates over message sizes, candidate algorithms, and kernel launch parameters. CUDA graphs are used for accurate timing. 
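+
+The graph-capture details are elided in the tuning loop listed next; schematically, timing one candidate configuration works as in the sketch below, where `run_once()` is a stand-in for a single algorithm invocation (a simplified rendition of the logic in the full example):
+
+```python
+# Capture n_ops_per_graph invocations into one CUDA graph, then replay and time it.
+capture_stream = torch.cuda.Stream()
+capture_stream.wait_stream(torch.cuda.current_stream())
+graph = torch.cuda.CUDAGraph()
+with torch.cuda.graph(graph, stream=capture_stream):
+    for _ in range(n_ops_per_graph):
+        run_once()
+
+start = torch.cuda.Event(enable_timing=True)
+end = torch.cuda.Event(enable_timing=True)
+start.record(capture_stream)
+with torch.cuda.stream(capture_stream):
+    for _ in range(n_graph_launches):
+        graph.replay()
+end.record(capture_stream)
+end.synchronize()
+elapsed_ms = start.elapsed_time(end)
+```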
Note the use of `RawGpuBuffer` for symmetric memory: + +```python +def _tune(self, n_warmup, n_graph_launches, n_ops_per_graph): + sizes = [1 << i for i in range(10, 28)] + self.best_configs = {1024: (self._algorithm_nvls_packet, 0, 0)} + + # Use RawGpuBuffer for symmetric memory allocation + tune_tensor = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16)) + tune_tensor = torch.utils.dlpack.from_dlpack(tune_tensor) + tune_tensor.normal_() + candidates_nblocks = [4, 8, 16, 24, 32, 48, 64, 128] + candidates_nthreads = [512, 768, 1024] + + for size in sizes: + algos = [] + if mscclpp.is_nvls_supported(): + algos.append(self._algorithm_nvls_zero_copy) + if size <= 4 * 1024 * 1024: + algos.append(self._algorithm_nvls_packet) + algos.append(self._algorithm_packet) + if size >= 512 * 1024: + algos.append(self._algorithm_rsag_zero_copy) + + best_time = float("inf") + best_config = None + + for algo in algos: + for nb in candidates_nblocks: + for nt in candidates_nthreads: + if self._run_algo(algo, tune_tensor, size, nb, nt) != 0: + continue # skip unsupported configs + + # Warmup, then time with CUDA graphs + # ... (see full example for graph capture logic) + + # Average timing across ranks + time_tensor = torch.full( + (self.world_size,), elapsed, dtype=torch.float64, device="cuda" + ).to(dtype=torch.float32) + self.all_reduce(time_tensor, op=torch.distributed.ReduceOp.SUM) + avg_time = time_tensor[self.rank].item() / self.world_size + + if avg_time < best_time: + best_time = avg_time + best_config = (algo, nb, nt) + + if best_config: + self.best_configs[size] = best_config +``` + +### Dispatching with Tuned Configuration + +At runtime, round the message size to the next power of two and look up the best configuration. When the tensor is allocated from `RawGpuBuffer` (`cuMemAlloc`) and the buffer offset is the same for each rank, pass `symmetric_memory=True` to the `execute()` call (see the [Symmetric Memory Allocation](#symmetric-memory-allocation) section above): + +```python +def get_tuned_config(self, size): + if size < 1024: + target_size = 1024 + elif size > 256 * 1024 * 1024: + target_size = 256 * 1024 * 1024 + else: + target_size = 1 << (size - 1).bit_length() + return self.best_configs.get(target_size) + +def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None): + config = self.get_tuned_config(tensor.nbytes) + algo, nblocks, nthreads = config if config else (self._algorithm_nvls_packet, 0, 0) + algo.execute( + comm=self.comm.communicator, + input_buffer=tensor.data_ptr(), + output_buffer=tensor.data_ptr(), + input_size=tensor.nbytes, + output_size=tensor.nbytes, + dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype), + op=mscclpp.ReduceOp.SUM, + stream=stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream, + nblocks=nblocks, + nthreads_per_block=nthreads, + ) +``` + +### Benchmarking with Symmetric Memory + +When benchmarking tuned configurations, use the same `RawGpuBuffer` allocation pattern. 
Create one large buffer and slice it for each message size: + +```python +def benchmark(self, n_warmup=10, n_graph_launches=10, n_iter_per_graph=100): + # Allocate a single large RawGpuBuffer (symmetric memory) and reuse for all sizes + dtype = torch.float16 + bench_buf = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(dtype)) + bench_buf = torch.utils.dlpack.from_dlpack(bench_buf) + bench_buf.normal_() + + for size in sizes: + n_elements = size // bench_buf.element_size() + tensor = bench_buf[:n_elements] + + # Capture CUDA graph, warmup, and time... + with torch.cuda.graph(g, stream=capture_stream): + for _ in range(n_iter_per_graph): + self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM) +``` + +### Running the Tuning Example + +```bash +MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> \ + torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_tuning.py +``` diff --git a/docs/py_api.rst b/docs/py_api.rst index 5ea39bc3..7acc9273 100644 --- a/docs/py_api.rst +++ b/docs/py_api.rst @@ -7,6 +7,4 @@ This reference organizes the MSCCL++ Python API. :toctree: py_api :recursive: - mscclpp.comm - mscclpp.utils - mscclpp.language + mscclpp diff --git a/docs/quickstart.md b/docs/quickstart.md index 04a26466..c9c98128 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -31,6 +31,9 @@ ``` If you don't want to build Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)). * (Optional, for benchmarks) MPI + * (Optional, for NVIDIA platforms) [GDRCopy](https://github.com/NVIDIA/gdrcopy) >= 2.5.1 + * GDRCopy is required for IB `HostNoAtomic` mode, which uses CPU-side signal forwarding to GPU memory via BAR1 mappings. This mode is used on platforms where RDMA atomics are not available (e.g., when using Data Direct Virtual Functions). + * Install GDRCopy from source or via packages. See the [GDRCopy installation guide](https://github.com/NVIDIA/gdrcopy#installation). * Others * For RDMA (InfiniBand or RoCE) support on NVIDIA platforms, [GPUDirect RDMA](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#gpudirect-rdma-and-gpudirect-storage) should be supported by the system. See the detailed prerequisites from [this NVIDIA documentation](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#common-prerequisites). * For NVLink SHARP (NVLS) support on NVIDIA platforms, the Linux kernel version should be 5.6 or above. @@ -42,7 +45,7 @@ We provide docker images which package all prerequisites for MSCCL++. You can se ```bash # For NVIDIA platforms -$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8 bash +$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 bash # For AMD platforms $ docker run -it --privileged --net=host --ipc=host --security-opt=seccomp=unconfined --group-add=video --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 bash ``` @@ -171,7 +174,6 @@ We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/ap For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
```bash -export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` @@ -189,14 +191,12 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec Example 1, Allreduce will fall back to NCCL ncclAllReduce since allreduce is in the fallback list. ```bash -export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` Example 2, ReduceScatter will still use the MSCCL++ implementation since reducescatter is not in the fallback list. ```bash -export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; -mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/$PATH_TO_EXECUTION_PLANS/execution-files ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 100 +mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` On AMD platforms, you need to add `RCCL_MSCCL_ENABLE=0` to avoid conflicts with the fallback features. diff --git a/docs/tutorials/03-memory-channel.md b/docs/tutorials/03-memory-channel.md index 00e2192b..c6a8b9e1 100644 --- a/docs/tutorials/03-memory-channel.md +++ b/docs/tutorials/03-memory-channel.md @@ -78,7 +78,7 @@ mscclpp::GpuBuffer buffer(bufferBytes); mscclpp::RegisteredMemory localRegMem = comm.registerMemory(buffer.data(), buffer.bytes(), transport); ``` -Here, we first allocate GPU device memory using `mscclpp::GpuBuffer` and then register its memory region with the `registerMemory()` method of the `Communicator`. If you are using the `Context` interface as shown in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can use `context.registerMemory()` instead. The `transport` parameter specifies the transport types that this memory region can be accessed with. In this example, we use only `mscclpp::Transport::CudaIpc`, which allows the memory to be accessed by other processes using CUDA/HIP IPC. The `CudaIpc` transport type is typically used for intra-node communication, but with certain hardware configurations, it can also be used for inter-node communication (such as [NVL72](https://www.nvidia.com/en-us/data-center/gb300-nvl72) on NVIDIA Grace Blackwell platforms). We will introduce other transport types in later tutorials. +Here, we first allocate GPU device memory using `mscclpp::GpuBuffer` and then register its memory region with the `registerMemory()` method of the `Communicator`. If you are using the `Context` interface as shown in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can use `context.registerMemory()` instead. The `transport` parameter specifies the transport types that this memory region can be accessed with.
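+For instance, multiple transports can be combined with `|` when registering memory (a brief illustrative sketch, not part of this example; `IB0` denotes the first InfiniBand port):
+
+```cpp
+// Allow both CUDA/HIP IPC access and RDMA access over the first IB port.
+auto transports = mscclpp::Transport::CudaIpc | mscclpp::Transport::IB0;
+mscclpp::RegisteredMemory regMem = comm.registerMemory(buffer.data(), buffer.bytes(), transports);
+```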
In this example, we use only `mscclpp::Transport::CudaIpc`, which allows the memory to be accessed by other processes using CUDA/HIP IPC. The `CudaIpc` transport type is typically used for intra-node communication, but with certain hardware configurations, it can also be used for inter-node communication (will be explained in a later section: {ref}`mc-cross-node`). We will introduce other transport types in later tutorials. **GpuBuffer** is NOT required for creating a `RegisteredMemory`; you can register any pre-allocated GPU memory region with `registerMemory()`. However, it is the user's responsibility to ensure that the memory region is suitable for their communication operations. Depending on the hardware platform, some communication methods may require specific memory allocation to ensure data consistency and correctness. `GpuBuffer` is a convenient way to allocate GPU memory that is compatible with the communication methods that MSCCL++ supports. It provides a simple interface for allocating GPU memory and automatically handles memory deallocation when it goes out of scope. @@ -251,6 +251,37 @@ columns 2 Since the flags take 50% of the packet size, the goodput of communication using packets is only 50% compared to transferring raw data. However, this doesn't matter because packets are designed for small data transfers. Packets transfer small data efficiently because the integrity of the user data is guaranteed by only waiting for the correct flags (done by `unpackPackets()`); explicit memory synchronization (signal and wait) is not needed. +(mc-cross-node)= +## Cross-node Execution + +For **inter-node** communication, using `PortChannel` (will be explained in the following tutorial) is usually a more accessible option that leverages more widely-used networking interfaces. However, `MemoryChannel` can still be used as long as the underlying hardware allows memory mapping between the two GPUs, such as [Multi-Node NVLink (MNNVL)](https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/overview.html) on NVIDIA Grace Blackwell platforms. + +We can use the same example code to test inter-node `MemoryChannel`. Users can consult the [NVIDIA MNNVL verification guide](https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/verifying.html) for verification steps and detailed environment requirements for MNNVL. + +Run the program on two nodes with command line arguments: + +``` +./bidir_memory_channel <ip:port> <rank> <gpu_id> +``` + +For example, assume we use `192.168.0.1:50000` as the bootstrap IP address and port, and both nodes use GPU 0 locally. + +On Node 0 (Rank 0): +```bash +$ ./bidir_memory_channel 192.168.0.1:50000 0 0 +``` + +On Node 1 (Rank 1): +```bash +$ ./bidir_memory_channel 192.168.0.1:50000 1 0 +``` + +You should see output indicating successful data transfer. + +```{tip} +If your bootstrap IP address is not on the default network interface of your node, you can specify the network interface by passing `interface_name:ip:port` as the first argument (such as `eth1:192.168.0.1:50000`). +``` + ## Summary and Next Steps In this tutorial, you have learned how to use `MemoryChannel` for efficient data transfer between GPUs. You have also learned how to create communication buffers using `RegisteredMemory` and `GpuBuffer`, and how to use packets for small data transfers. You can find more complex usage of `MemoryChannel` in the {ref}`mscclpp-test`.
diff --git a/examples/customized-collective-algorithm/customized_allgather.cu b/examples/customized-collective-algorithm/customized_allgather.cu index 436a6a94..02df3685 100644 --- a/examples/customized-collective-algorithm/customized_allgather.cu +++ b/examples/customized-collective-algorithm/customized_allgather.cu @@ -101,15 +101,17 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { "allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); }, [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) { return self->allgatherKernelFunc(ctx, input, output, inputSize, stream); }, [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { + [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, + bool symmetricMemory) { return self->generateAllgatherContextKey(input, output, inputSize, outputSize, - static_cast<ncclDataType_t>(dtype)); + static_cast<ncclDataType_t>(dtype), symmetricMemory); }); return allgatherAlgo; } @@ -191,7 +193,7 @@ } mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize, - size_t outputSize, ncclDataType_t dtype) { + size_t outputSize, ncclDataType_t dtype, bool) { return {(void*)input, output, inputSize, outputSize, 0}; } }; diff --git a/examples/torch-integration/customized_allgather.cu b/examples/torch-integration/customized_allgather.cu index 10400ddc..907b3ada 100644 --- a/examples/torch-integration/customized_allgather.cu +++ b/examples/torch-integration/customized_allgather.cu @@ -69,14 +69,16 @@ "allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); }, [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) { return self->allgatherKernelFunc(ctx, input, output, inputSize, dtype, stream); }, [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { - return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype); + [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, + bool symmetricMemory) { + return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory); }); return allgatherAlgo; } @@ -159,7 +161,7 @@ } mscclpp::AlgorithmCtxKey
generateAllgatherContextKey(const void* input, void* output, size_t inputSize, - size_t outputSize, mscclpp::DataType dtype) { + size_t outputSize, mscclpp::DataType dtype, bool) { return {(void*)input, output, inputSize, outputSize, 0}; } }; diff --git a/examples/torch-integration/customized_comm_with_default_algo.py b/examples/torch-integration/customized_comm_with_default_algo.py index 78560f15..3e933107 100644 --- a/examples/torch-integration/customized_comm_with_default_algo.py +++ b/examples/torch-integration/customized_comm_with_default_algo.py @@ -15,7 +15,9 @@ import ipaddress def load_algorithms(scratch_buffer: torch.tensor, rank: int) -> mscclpp.AlgorithmCollection: collection_builder = mscclpp.ext.AlgorithmCollectionBuilder() return collection_builder.build_default_algorithms( - scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank + scratch_buffer=scratch_buffer.data_ptr(), + scratch_buffer_size=scratch_buffer.nbytes, + rank=rank, ) @@ -59,7 +61,7 @@ class CustomizedComm: self._algorithm_nvls_nonzero_copy = [ algo for algo in algorithms - if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_with_copy" + if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_warp_pipeline" ][0] def all_reduce(self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM, stream: torch.cuda.Stream = None): diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py new file mode 100644 index 00000000..060a0097 --- /dev/null +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -0,0 +1,476 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py + +import os +import ipaddress + +import netifaces as ni +import torch +import mscclpp +import mscclpp.ext +import mscclpp.utils as mscclpp_utils + +# -- Helpers ------------------------------------------------------------------ + + +def _make_tensor(size_bytes: int, dtype: torch.dtype) -> torch.Tensor: + """Allocate a tensor backed by RawGpuBuffer (symmetric memory).""" + # PyTorch's from_dlpack does not support certain float8 DLPack type codes. + # Work around by importing as uint8 and reinterpreting via .view(). 
+ _DLPACK_UNSUPPORTED = (torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz) + if dtype in _DLPACK_UNSUPPORTED: + dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(torch.uint8)) + return torch.utils.dlpack.from_dlpack(dlpack).view(dtype) + dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(dtype)) + return torch.utils.dlpack.from_dlpack(dlpack) + + +def _load_algorithms(scratch: torch.Tensor, rank: int): + return mscclpp.ext.AlgorithmCollectionBuilder().build_default_algorithms( + scratch_buffer=scratch.data_ptr(), + scratch_buffer_size=scratch.nbytes, + rank=rank, + ) + + +def _interfaces_for_ip(ip: str): + target = ipaddress.ip_address(ip) + for iface in ni.interfaces(): + addrs = ni.ifaddresses(iface) + if ni.AF_INET in addrs: + for link in addrs[ni.AF_INET]: + if "addr" in link and ipaddress.ip_address(link["addr"]) == target: + return iface + return None + + +def _to_mscclpp_op(op) -> mscclpp.ReduceOp: + if op == torch.distributed.ReduceOp.SUM: + return mscclpp.ReduceOp.SUM + if op == torch.distributed.ReduceOp.MIN: + return mscclpp.ReduceOp.MIN + raise ValueError(f"unsupported op: {op}") + + +def _round_pow2(size: int) -> int: + """Round up to next power-of-2, clamped to [1024, 256 MB].""" + size = max(size, 1024) + size = min(size, 256 << 20) + return 1 << (size - 1).bit_length() + + +# -- CustomizedComm ----------------------------------------------------------- + + +class CustomizedComm: + """Exposes all_reduce, all_gather, barrier with lazy per-size tuning.""" + + _TUNE_N_WARMUP = 5 + _TUNE_N_GRAPH_LAUNCHES = 10 + _TUNE_N_OPS_PER_GRAPH = 100 + _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 128] + _CANDIDATE_NTHREADS = [512, 768, 1024] + _NBLOCKS_LIMIT = { + "default_allreduce_nvls_packet": 16, + "default_allreduce_packet": 56, + "default_allreduce_allpair_packet": 56, + "default_allreduce_fullmesh": 64, + "default_allgather_fullmesh2": 32, + } + + def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False): + self.comm = comm + self.rank = comm.my_rank + self.world_size = comm.nranks + self.symmetric_memory = symmetric_memory + self._nvls = mscclpp.is_nvls_supported() + + self._scratch = _make_tensor(1 << 27, torch.float16) + self._barrier_tensor = _make_tensor(4096, torch.float32) + + algos = _load_algorithms(self._scratch, self.rank) + self._algos = {(a.collective, a.name): a for a in algos} + + # {collective: {rounded_size: (algo, nblocks, nthreads)}} + self._tune_cache: dict[str, dict[int, tuple]] = {"allreduce": {}, "allgather": {}} + self._tune_buf = None + self._time_buf = None + + def _algo(self, collective: str, name: str): + return self._algos.get((collective, name)) + + def _default_ar_config(self): + """Fallback allreduce config for barrier / timing sync.""" + pkt = self._algo("allreduce", "default_allreduce_nvls_packet") + if self._nvls and pkt: + return (pkt, 0, 0) + return (self._algo("allreduce", "default_allreduce_packet"), 0, 0) + + # -- low-level execute -- + + def _exec_ar(self, tensor, algo, nb, nt, op=mscclpp.ReduceOp.SUM, stream=None, accum_dtype=None, sym=True): + s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream + ret = algo.execute( + comm=self.comm.communicator, + input_buffer=tensor.data_ptr(), + output_buffer=tensor.data_ptr(), + input_size=tensor.nbytes, + output_size=tensor.nbytes, + dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype), + op=op, + stream=s, + nblocks=nb, + nthreads_per_block=nt, + symmetric_memory=sym, + 
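+            # accum_dtype is assumed to select the accumulation precision for the
+            # reduction (e.g. fp32 accumulation for fp16 data; see ACCUM_DTYPE in
+            # main()); None keeps the algorithm default.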
accum_dtype=accum_dtype, + ) + if ret != 0: + print(f"Rank {self.rank}: {algo.name} failed ({ret})") + return ret + + def _exec_ag(self, inp, out, algo, nb, nt, stream=None, sym=None): + if sym is None: + sym = self.symmetric_memory + s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream + ret = algo.execute( + comm=self.comm.communicator, + input_buffer=inp.data_ptr(), + output_buffer=out.data_ptr(), + input_size=inp.nbytes, + output_size=out.nbytes, + dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(inp.dtype), + op=mscclpp.ReduceOp.NOP, + stream=s, + nblocks=nb, + nthreads_per_block=nt, + symmetric_memory=sym, + ) + if ret != 0: + print(f"Rank {self.rank}: AG {algo.name} failed ({ret})") + return ret + + def _barrier_internal(self): + a, nb, nt = self._default_ar_config() + self._exec_ar(self._barrier_tensor, a, nb, nt, sym=True) + + # -- lazy tuning -- + + def _ensure_tune_bufs(self): + if self._tune_buf is None: + self._tune_buf = _make_tensor(1 << 27, torch.float16) + self._tune_buf.normal_() + self._time_buf = _make_tensor(4096, torch.float32) + return self._tune_buf + + def _ar_candidates(self, size: int): + out = [] + if size <= 4 << 20: + a = self._algo("allreduce", "default_allreduce_nvls_packet") + if self._nvls and a: + out.append(a) + a = self._algo("allreduce", "default_allreduce_packet") + if a: + out.append(a) + a = self._algo("allreduce", "default_allreduce_allpair_packet") + if a: + out.append(a) + if size >= 512 << 10: + a = self._algo("allreduce", "default_allreduce_nvls_zero_copy") + if self._nvls and self.symmetric_memory and a: + out.append(a) + a = self._algo("allreduce", "default_allreduce_rsag_zero_copy") + if a: + out.append(a) + if torch.version.hip is not None: + a = self._algo("allreduce", "default_allreduce_fullmesh") + if a: + out.append(a) + return out + + def _ag_candidates(self): + a = self._algo("allgather", "default_allgather_fullmesh2") + return [a] if a else [] + + def _run_tune(self, collective, algo, buf, size, nb, nt): + """Single tune invocation for either collective.""" + if collective == "allreduce": + return algo.execute( + comm=self.comm.communicator, + input_buffer=buf.data_ptr(), + output_buffer=buf.data_ptr(), + input_size=size, + output_size=size, + dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype), + op=mscclpp.ReduceOp.SUM, + stream=torch.cuda.current_stream().cuda_stream, + nblocks=nb, + nthreads_per_block=nt, + symmetric_memory=True, + ) + else: + total = size * self.world_size + out_ptr = buf.data_ptr() + return algo.execute( + comm=self.comm.communicator, + input_buffer=out_ptr + self.rank * size, + output_buffer=out_ptr, + input_size=size, + output_size=total, + dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype), + op=mscclpp.ReduceOp.NOP, + stream=torch.cuda.current_stream().cuda_stream, + nblocks=nb, + nthreads_per_block=nt, + symmetric_memory=False, + ) + + def _tune_size(self, collective: str, target_size: int): + """Auto-tune one (collective, target_size) pair and cache result.""" + buf = self._ensure_tune_bufs() + cands = self._ar_candidates(target_size) if collective == "allreduce" else self._ag_candidates() + + best_time, best_cfg = float("inf"), None + used = set() + run = lambda a, nb, nt: self._run_tune(collective, a, buf, target_size, nb, nt) + + for algo in cands: + nb_limit = self._NBLOCKS_LIMIT.get(algo.name, 128) + for nb in self._CANDIDATE_NBLOCKS: + if nb > nb_limit: + continue + for nt in self._CANDIDATE_NTHREADS: + # Feasibility — sync result across ranks so all agree + 
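+                    # A config unsupported on any one rank must be skipped on all ranks:
+                    # if ranks disagreed, some would skip a collective that others enter
+                    # and the job would hang, so the return code is allreduced first.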
ret = run(algo, nb, nt) + torch.cuda.synchronize() + self._time_buf[0] = float(ret) + self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=True) + if self._time_buf[0].item() != 0: + continue + used.add(algo) + + # Warmup + for _ in range(self._TUNE_N_WARMUP): + run(algo, nb, nt) + + # CUDA-graph timed benchmark + cs = torch.cuda.Stream() + cs.wait_stream(torch.cuda.current_stream()) + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g, stream=cs): + for _ in range(self._TUNE_N_OPS_PER_GRAPH): + run(algo, nb, nt) + + start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) + start.record(cs) + with torch.cuda.stream(cs): + for _ in range(self._TUNE_N_GRAPH_LAUNCHES): + g.replay() + end.record(cs) + end.synchronize() + elapsed = start.elapsed_time(end) + + # Cross-rank timing sync + self._time_buf.fill_(elapsed) + torch.cuda.current_stream().wait_stream(cs) + self._exec_ar(self._time_buf, *self._default_ar_config(), sym=True) + avg = self._time_buf[self.rank].item() / self.world_size + + if avg < best_time: + best_time, best_cfg = avg, (algo, nb, nt) + + if best_cfg: + self._tune_cache[collective][target_size] = best_cfg + if self.rank == 0: + n = self._TUNE_N_GRAPH_LAUNCHES * self._TUNE_N_OPS_PER_GRAPH + print( + f"[tune] {collective} size={target_size}: {best_cfg[0].name} " + f"nb={best_cfg[1]} nt={best_cfg[2]} time={best_time / n * 1000:.2f}us", + flush=True, + ) + else: + fb = ( + self._default_ar_config() + if collective == "allreduce" + else ((self._ag_candidates()[0], 32, 512) if self._ag_candidates() else None) + ) + self._tune_cache[collective][target_size] = fb + + torch.cuda.synchronize() + self._barrier_internal() + for a in used: + a.reset() + + # -- public API -- + + def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None, accum_dtype=None): + sz = _round_pow2(tensor.nbytes) + if sz not in self._tune_cache["allreduce"]: + self._tune_size("allreduce", sz) + a, nb, nt = self._tune_cache["allreduce"][sz] + self._exec_ar( + tensor, a, nb, nt, op=_to_mscclpp_op(op), stream=stream, accum_dtype=accum_dtype, sym=self.symmetric_memory + ) + + def all_gather(self, output_tensor, input_tensor, stream=None): + sz = _round_pow2(input_tensor.nbytes) + if sz not in self._tune_cache["allgather"]: + self._tune_size("allgather", sz) + a, nb, nt = self._tune_cache["allgather"][sz] + self._exec_ag(input_tensor, output_tensor, a, nb, nt, stream=stream, sym=self.symmetric_memory) + + def barrier(self): + self._barrier_internal() + + def destroy(self): + self._algos.clear() + self._tune_cache = {"allreduce": {}, "allgather": {}} + self._tune_buf = self._time_buf = self._barrier_tensor = self._scratch = self.comm = None + + +# -- Benchmarks (standalone) -------------------------------------------------- + + +def _bench_sizes(low=5 * 1024, high=80 << 20): + sizes, c = [], low + while c <= high: + sizes.append(c) + c *= 2 + return sizes + + +def benchmark_allreduce( + comm: CustomizedComm, dtype=torch.float16, accum_dtype=None, n_warmup=10, n_graph_launches=10, n_iter=100 +): + sizes = _bench_sizes() + if comm.rank == 0: + print(f"\n{'='*60}\nAllreduce Benchmark\n{'='*60}") + print(f"{'Nelements':<18} {'Size(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}") + + cs = torch.cuda.Stream() + buf = _make_tensor(1 << 27, dtype) + buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0) + + for size in sizes: + nelems = size // buf.element_size() + t = buf[: size // buf.element_size()] + comm.all_reduce(t, 
accum_dtype=accum_dtype) + torch.cuda.synchronize() + + cs.wait_stream(torch.cuda.current_stream()) + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g, stream=cs): + for _ in range(n_iter): + comm.all_reduce(t, accum_dtype=accum_dtype) + with torch.cuda.stream(cs): + for _ in range(n_warmup): + g.replay() + comm.barrier() + cs.synchronize() + + s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) + s.record(cs) + with torch.cuda.stream(cs): + for _ in range(n_graph_launches): + g.replay() + e.record(cs) + e.synchronize() + + ms = s.elapsed_time(e) / (n_graph_launches * n_iter) + if comm.rank == 0: + print(f"{nelems:<18} {size:<18} {ms*1000:<18.2f} {size/(ms*1e-3)/1e9:<18.2f}") + + +def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, n_graph_launches=10, n_iter=100): + sizes = _bench_sizes() + if comm.rank == 0: + print(f"\n{'='*60}\nAllgather Benchmark\n{'='*60}") + print(f"{'PerRank(B)':<18} {'Total(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}") + + cs = torch.cuda.Stream() + buf = _make_tensor(1 << 27, dtype) + buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0) + + for prs in sizes: + total = prs * comm.world_size + if total > buf.nbytes: + break + nt = total // buf.element_size() + npr = prs // buf.element_size() + out = buf[:nt] + inp = out[comm.rank * npr : (comm.rank + 1) * npr] + + comm.all_gather(out, inp) + torch.cuda.synchronize() + + cs.wait_stream(torch.cuda.current_stream()) + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g, stream=cs): + for _ in range(n_iter): + comm.all_gather(out, inp) + with torch.cuda.stream(cs): + for _ in range(n_warmup): + g.replay() + comm.barrier() + cs.synchronize() + + s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) + s.record(cs) + with torch.cuda.stream(cs): + for _ in range(n_graph_launches): + g.replay() + e.record(cs) + e.synchronize() + + ms = s.elapsed_time(e) / (n_graph_launches * n_iter) + if comm.rank == 0: + print(f"{prs:<18} {total:<18} {ms*1000:<18.2f} {total/(ms*1e-3)/1e9:<18.2f}") + + +# -- Bootstrap & main --------------------------------------------------------- + + +def init_dist() -> mscclpp.CommGroup: + addr = os.environ.get("MSCCLPP_MASTER_ADDR") + if addr: + rank, world = int(os.environ["RANK"]), int(os.environ["WORLD_SIZE"]) + port = os.environ["MSCCLPP_MASTER_PORT"] + iface = _interfaces_for_ip(addr) + if not iface: + raise ValueError(f"No interface for {addr}") + return mscclpp.CommGroup(interfaceIpPortTrio=f"{iface}:{addr}:{port}", rank=rank, size=world) + import torch.distributed as dist + + dist.init_process_group(backend="gloo") + return mscclpp.CommGroup(torch_group=dist.group.WORLD) + + +def main(): + local = int(os.environ["LOCAL_RANK"]) + torch.cuda.set_device(local) + + dtype_str = os.environ.get("DTYPE", "float16") + dtype = getattr(torch, dtype_str, torch.float16) + accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16} + accum_str = os.environ.get("ACCUM_DTYPE") + accum_dtype = accum_map.get(accum_str) if accum_str else None + + comm_group = init_dist() + cc = CustomizedComm(comm_group) + + print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...") + benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype) + cc.barrier() + torch.cuda.synchronize() + + benchmark_allgather(cc, dtype=dtype) + cc.barrier() + torch.cuda.synchronize() + + cc.destroy() + print(f"rank {local} completed successfully.") + + +if 
__name__ == "__main__": + main() diff --git a/examples/torch-integration/dsl_with_nccl_api.py b/examples/torch-integration/dsl_with_nccl_api.py index 975d3749..5a4dd1c4 100644 --- a/examples/torch-integration/dsl_with_nccl_api.py +++ b/examples/torch-integration/dsl_with_nccl_api.py @@ -1,19 +1,20 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -# LD_PRELOAD=/build/lib/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py +# LD_PRELOAD=/build/lib/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py import os from typing import Any, Dict import torch, torch.distributed as dist -import mscclpp +import mscclpp.ext from mscclpp.language.collectives import AllReduce from mscclpp.language.channel import SwitchChannel, MemoryChannel, BufferType, SyncType from mscclpp.language.program import CollectiveProgram from mscclpp.language.rank import Rank +from mscclpp.language.utils import AlgoSpec -def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram: +def allreduce_nvls(spec: AlgoSpec) -> CollectiveProgram: gpu_size = spec.world_size with CollectiveProgram.from_spec(spec) as program: # Creating Channels @@ -63,8 +64,8 @@ def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram: return program -def setup_plan(algo_collection_builder: mscclpp.AlgorithmCollectionBuilder, rank: int, world_size: int): - spec = mscclpp.AlgoSpec( +def setup_plan(algo_collection_builder: mscclpp.ext.AlgorithmCollectionBuilder, rank: int, world_size: int): + spec = AlgoSpec( name="allreduce_nvls", collective=AllReduce(8, 1, True), nranks_per_node=8, @@ -94,10 +95,10 @@ def init_dist(): rank = int(os.environ["RANK"]) world = int(os.environ["WORLD_SIZE"]) local = int(os.environ["LOCAL_RANK"]) - algorithm_collection_builder = mscclpp.AlgorithmCollectionBuilder() + algorithm_collection_builder = mscclpp.ext.AlgorithmCollectionBuilder() setup_plan(algorithm_collection_builder, rank, world) algorithm_collection_builder.set_algorithm_selector(selector) - dist.init_process_group(backend="nccl", device_id=local) + dist.init_process_group(backend="nccl", device_id=torch.device("cuda", local)) return rank, world, local diff --git a/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu b/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu index 0e2ab5ad..f3c69b72 100644 --- a/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu +++ b/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu @@ -9,7 +9,7 @@ #include template <typename... Args> -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ...
<< args); ss << std::endl; @@ -23,7 +23,7 @@ __device__ void spin_cycles(unsigned long long cycles) { } } -__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedWait(); @@ -34,7 +34,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in } } -__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedSignal(); @@ -88,7 +88,7 @@ int main() { mscclpp::Semaphore sema0(/*localSemaphoreStub*/ semaStub0, /*remoteSemaphoreStub*/ semaStub1); mscclpp::BaseMemoryChannel memChan0(sema0); mscclpp::BaseMemoryChannelDeviceHandle memChanHandle0 = memChan0.deviceHandle(); - void *devHandle0; + void* devHandle0; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle0, sizeof(mscclpp::BaseMemoryChannelDeviceHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle0, &memChanHandle0, sizeof(memChanHandle0), cudaMemcpyHostToDevice)); @@ -98,14 +98,14 @@ int main() { mscclpp::Semaphore sema1(/*localSemaphoreStub*/ semaStub1, /*remoteSemaphoreStub*/ semaStub0); mscclpp::BaseMemoryChannel memChan1(sema1); mscclpp::BaseMemoryChannelDeviceHandle memChanHandle1 = memChan1.deviceHandle(); - void *devHandle1; + void* devHandle1; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle1, sizeof(mscclpp::BaseMemoryChannelDeviceHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle1, &memChanHandle1, sizeof(memChanHandle1), cudaMemcpyHostToDevice)); log("GPU 0: Launching gpuKernel0 ..."); MSCCLPP_CUDATHROW(cudaSetDevice(0)); - gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle0), iter); + gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle0), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); log("GPU 1: Launching gpuKernel1 ..."); @@ -115,7 +115,7 @@ MSCCLPP_CUDATHROW(cudaEventCreate(&start)); MSCCLPP_CUDATHROW(cudaEventCreate(&end)); MSCCLPP_CUDATHROW(cudaEventRecord(start)); - gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle1), iter); + gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle1), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); MSCCLPP_CUDATHROW(cudaEventRecord(end)); MSCCLPP_CUDATHROW(cudaEventSynchronize(end)); diff --git a/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu b/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu index 05eb1b25..0526407e 100644 --- a/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu +++ b/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu @@ -14,7 +14,7 @@ #define PORT_NUMBER "50505" template <typename... Args> -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ...
<< args); ss << std::endl; @@ -50,7 +50,7 @@ __device__ void spin_cycles(unsigned long long cycles) { } } -__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedWait(); @@ -61,7 +61,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in } } -__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedSignal(); @@ -115,14 +115,14 @@ void worker(int gpuId) { mscclpp::BaseMemoryChannel memChan(sema); auto memChanHandle = memChan.deviceHandle(); - void *devHandle; + void* devHandle; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice)); log("GPU ", gpuId, ": Launching a GPU kernel ..."); if (gpuId == 0) { - gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter); + gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); } else { @@ -130,7 +130,7 @@ void worker(int gpuId) { MSCCLPP_CUDATHROW(cudaEventCreate(&start)); MSCCLPP_CUDATHROW(cudaEventCreate(&end)); MSCCLPP_CUDATHROW(cudaEventRecord(start)); - gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter); + gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); MSCCLPP_CUDATHROW(cudaEventRecord(end)); MSCCLPP_CUDATHROW(cudaEventSynchronize(end)); diff --git a/examples/tutorials/03-memory-channel/bidir_memory_channel.cu b/examples/tutorials/03-memory-channel/bidir_memory_channel.cu index e9007612..a1be59f2 100644 --- a/examples/tutorials/03-memory-channel/bidir_memory_channel.cu +++ b/examples/tutorials/03-memory-channel/bidir_memory_channel.cu @@ -16,7 +16,7 @@ #define PORT_NUMBER "50505" template <typename... Args> -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ...
<< args); ss << std::endl; @@ -47,7 +47,7 @@ int wait_process(int pid) { __device__ mscclpp::DeviceSyncer devSyncer; -__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) { +__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { devHandle->relaxedSignal(); @@ -65,7 +65,7 @@ __global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si } } -__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) { +__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { devHandle->relaxedSignal(); @@ -79,7 +79,7 @@ __global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si devHandle->get(srcOffset, dstOffset, copyBytes, /*threadId*/ tid, /*numThreads*/ blockDim.x * gridDim.x); } -__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank, +__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank, uint32_t flag) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { @@ -95,9 +95,8 @@ __global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHand devHandle->unpackPackets(pktBufOffset, dstOffset, copyBytes, tid, blockDim.x * gridDim.x, flag); } -void worker(int gpuId) { +void worker(int myRank, int gpuId, const std::string& ipPort) { MSCCLPP_CUDATHROW(cudaSetDevice(gpuId)); - const int myRank = gpuId; const int remoteRank = myRank == 0 ? 
1 : 0; const int nRanks = 2; const int iter = 1000; @@ -105,11 +104,11 @@ void worker(int gpuId) { const size_t bufferBytes = 256 * 1024 * 1024; const size_t pktBufferBytes = 256 * 1024 * 1024; - log("GPU ", gpuId, ": Preparing for tests ..."); + log("Rank ", myRank, " (GPU ", gpuId, "): Preparing for tests ..."); // Build a connection and a semaphore auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks); - bootstrap->initialize("lo:127.0.0.1:" PORT_NUMBER); + bootstrap->initialize(ipPort); mscclpp::Communicator comm(bootstrap); auto conn = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get(); auto sema = comm.buildSemaphore(conn, remoteRank).get(); @@ -133,8 +132,8 @@ void worker(int gpuId) { auto memChanHandle = memChan.deviceHandle(); auto memPktChanHandle = memPktChan.deviceHandle(); - void *devHandle; - void *devPktHandle; + void* devHandle; + void* devPktHandle; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle))); MSCCLPP_CUDATHROW(cudaMalloc(&devPktHandle, sizeof(memPktChanHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice)); @@ -146,23 +145,23 @@ std::function<void(size_t)> kernels[3]; kernels[0] = [&](size_t copyBytes) { - bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle), - copyBytes, myRank); + bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devHandle), copyBytes, + myRank); }; kernels[1] = [&](size_t copyBytes) { - bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle), - copyBytes, myRank); + bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devHandle), copyBytes, + myRank); }; kernels[2] = [&](size_t copyBytes) { static uint32_t flag = 1; - bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devPktHandle), + bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devPktHandle), copyBytes, myRank, flag++); }; cudaEvent_t start, end; - if (gpuId == 0) { + if (myRank == 0) { MSCCLPP_CUDATHROW(cudaEventCreate(&start)); MSCCLPP_CUDATHROW(cudaEventCreate(&end)); } @@ -189,13 +188,13 @@ MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); bootstrap->barrier(); - if (gpuId == 0) { + if (myRank == 0) { MSCCLPP_CUDATHROW(cudaEventRecord(start, stream)); } MSCCLPP_CUDATHROW(cudaGraphLaunch(graphExec, stream)); - if (gpuId == 0) { + if (myRank == 0) { MSCCLPP_CUDATHROW(cudaEventRecord(end, stream)); MSCCLPP_CUDATHROW(cudaEventSynchronize(end)); float elapsedTime; @@ -204,8 +203,8 @@ MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end)); elapsedTimePerIter = elapsedTime / iter; gbps = float(copyBytes) / elapsedTimePerIter * 1e-6f; - log("GPU ", gpuId, ": [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ", - gbps, " GB/s"); + log("Rank ", myRank, " (GPU ", gpuId, "): [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter, + " ms/iter, BW ", gbps, " GB/s"); } MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); MSCCLPP_CUDATHROW(cudaGraphExecDestroy(graphExec)); @@ -216,23 +215,47 @@ bootstrap->barrier(); } -int main() { - int pid0 = spawn_process([]() { worker(0); }); - int pid1 = spawn_process([]() { worker(1); }); - if (pid0 < 0 || pid1 < 0) { - log("Failed to spawn processes."); +int main(int argc, char** argv) { + if (argc == 1) { + int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); }); + int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); }); +
diff --git a/examples/tutorials/04-port-channel/bidir_port_channel.cu b/examples/tutorials/04-port-channel/bidir_port_channel.cu
index 46064581..9e6d61dd 100644
--- a/examples/tutorials/04-port-channel/bidir_port_channel.cu
+++ b/examples/tutorials/04-port-channel/bidir_port_channel.cu
@@ -16,7 +16,7 @@ #define PORT_NUMBER "50505"
 template <typename... Args>
-void log(Args &&...args) {
+void log(Args&&... args) {
   std::stringstream ss;
   (ss << ... << args);
   ss << std::endl;
   std::cout << ss.str();
 }
@@ -45,7 +45,7 @@ int wait_process(int pid) {
   return -1;
 }

-__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
+__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid == 0) {
     devHandle->signal();
@@ -58,7 +58,7 @@ __global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size
   }
 }

-void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport transport) {
+void worker(int rank, int gpuId, const std::string& ipPort, mscclpp::Transport transport) {
   MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
   const int myRank = rank;
   const int remoteRank = myRank == 0 ? 1 : 0;
@@ -90,7 +90,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t

   auto portChanHandle = portChan.deviceHandle();

-  void *devHandle;
+  void* devHandle;
   MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(portChanHandle)));
   MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &portChanHandle, sizeof(portChanHandle), cudaMemcpyHostToDevice));
@@ -100,7 +100,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t

   std::function<void(size_t)> kernels[1];
   kernels[0] = [&](size_t copyBytes) {
-    bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast<mscclpp::PortChannelDeviceHandle *>(devHandle), copyBytes,
+    bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast<mscclpp::PortChannelDeviceHandle*>(devHandle), copyBytes,
                                         myRank);
   };
@@ -166,7 +166,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t
   bootstrap->barrier();
 }

-mscclpp::Transport parseTransport(const std::string &transportStr) {
+mscclpp::Transport parseTransport(const std::string& transportStr) {
   if (transportStr == "CudaIpc") return mscclpp::Transport::CudaIpc;
   if (transportStr == "IB0") return mscclpp::Transport::IB0;
   if (transportStr == "IB1") return mscclpp::Transport::IB1;
@@ -180,7 +180,7 @@ mscclpp::Transport parseTransport(const std::string &transportStr) {
   throw std::runtime_error("Unknown transport: " + transportStr);
 }

-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   if (argc == 1) {
     int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); });
     int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); });
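The port-channel hunks above only touch formatting, so most of the kernel body is elided by the diff; for orientation, a hedged sketch of the single-thread pattern bidirPutKernel follows. It assumes the device-side header mscclpp/port_channel_device.hpp and that the handle also exposes put and flush as in the full tutorial source; the offsets are illustrative:

#include <mscclpp/port_channel_device.hpp>

#include <cstddef>

// One thread drives the whole exchange: announce readiness, wait for the
// peer's signal, then ask the host-side proxy to write copyBytes and flush.
__global__ void putOnceKernel(mscclpp::PortChannelDeviceHandle* chan, size_t copyBytes) {
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    chan->signal();              // tell the remote rank our buffer is ready
    chan->wait();                // wait until the remote rank says the same
    chan->put(0, 0, copyBytes);  // dstOffset, srcOffset, bytes
    chan->flush();               // block until the proxy has issued the write
  }
}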
diff --git a/examples/tutorials/05-switch-channel/Makefile b/examples/tutorials/05-switch-channel/Makefile
new file mode 100644
index 00000000..1a211f64
--- /dev/null
+++ b/examples/tutorials/05-switch-channel/Makefile
@@ -0,0 +1,15 @@
+CUDA_HOME ?= /usr/local/cuda
+
+COMPILER := $(CUDA_HOME)/bin/nvcc
+ARCH_FLAG := -arch=native
+
+TARGET = bidir_switch_channel
+SRC = bidir_switch_channel.cu
+
+all: $(TARGET)
+
+$(TARGET): $(SRC)
+	$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp
+
+clean:
+	rm -f $(TARGET)
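Before the new source file below, a quick worked check of the index partitioning its kernelSwitchReduce uses (plain host C++; all names are local to this sketch): with two ranks, rank 0 covers [0, n/2) and rank 1 covers [n/2, n), so every element is reduced and broadcast exactly once across the pair.

#include <cstdio>

int main() {
  const int numElements = 8;
  for (int rank = 0; rank < 2; ++rank) {
    int lo = rank * (numElements / 2);        // rank 0 -> 0, rank 1 -> 4
    int hi = (rank + 1) * (numElements / 2);  // rank 0 -> 4, rank 1 -> 8
    std::printf("rank %d handles [%d, %d)\n", rank, lo, hi);
  }
  return 0;
}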
diff --git a/examples/tutorials/05-switch-channel/bidir_switch_channel.cu b/examples/tutorials/05-switch-channel/bidir_switch_channel.cu
new file mode 100644
index 00000000..658e6f05
--- /dev/null
+++ b/examples/tutorials/05-switch-channel/bidir_switch_channel.cu
@@ -0,0 +1,177 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define PORT_NUMBER "50505"
+
+template <typename... Args>
+void log(Args &&...args) {
+  std::stringstream ss;
+  (ss << ... << args);
+  ss << std::endl;
+  std::cout << ss.str();
+}
+
+int spawn_process(std::function<void()> func) {
+  pid_t pid = fork();
+  if (pid < 0) return -1;
+  if (pid == 0) {
+    // Child process
+    func();
+    exit(0);
+  }
+  return pid;
+}
+
+int wait_process(int pid) {
+  int status;
+  if (waitpid(pid, &status, 0) < 0) {
+    return -1;
+  }
+  if (WIFEXITED(status)) {
+    return WEXITSTATUS(status);
+  }
+  return -1;
+}
+
+__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan;
+
+__device__ mscclpp::DeviceSyncer devSyncer;
+
+__global__ void kernelSwitchReduce(int rank, int numElements) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+
+  // rank 0 performs on first half of data and rank 1 on second half
+  int min = rank * (numElements / 2);
+  int max = (rank + 1) * (numElements / 2);
+
+  for (int i = tid + min; i < max; i += stride) {
+    auto val = gConstSwitchChan.reduce(i);
+    gConstSwitchChan.broadcast(i, val);
+  }
+}
+
+void worker(int myRank, int gpuId, const std::string &ipPort) {
+  MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
+  const int nRanks = 2;
+  const int iter = 1000;
+  const size_t bufferBytes = 128 * 1024 * 1024;
+
+  log("Rank ", myRank, " (GPU ", gpuId, "): Preparing for tests ...");
+
+  // Build a connection and a semaphore
+  auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks);
+  bootstrap->initialize(ipPort);
+  std::shared_ptr<mscclpp::Communicator> comm = std::make_shared<mscclpp::Communicator>(bootstrap);
+
+  std::vector<int> ranks;
+  ranks.reserve(nRanks);
+  for (int i = 0; i < nRanks; i++) ranks.push_back(i);
+
+  auto buffer = mscclpp::GpuBuffer(bufferBytes);
+
+  auto nvlsConnection = mscclpp::connectNvlsCollective(comm, ranks, bufferBytes);
+
+  auto switchChannel = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer.data()), bufferBytes);
+
+  auto deviceHandle = switchChannel.deviceHandle();
+
+  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gConstSwitchChan, &deviceHandle, sizeof(deviceHandle)));
+  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+
+  // Call the kernel in a loop for perf evaluation
+
+  for (size_t numElements : {1024, 1024 * 1024, 32 * 1024 * 1024}) {
+    cudaEvent_t start, end;
+    if (myRank == 0) {
+      MSCCLPP_CUDATHROW(cudaEventCreate(&start));
+      MSCCLPP_CUDATHROW(cudaEventCreate(&end));
+    }
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    bootstrap->barrier();
+
+    if (myRank == 0) {
+      MSCCLPP_CUDATHROW(cudaEventRecord(start, 0));
+    }
+
+    for (int i = 0; i < iter; ++i) {
+      kernelSwitchReduce<<<256, 1024>>>(myRank, numElements);
+    }
+
+    MSCCLPP_CUDATHROW(cudaGetLastError());
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+
+    comm->bootstrap()->barrier();
+
+    if (myRank == 0) {
+      MSCCLPP_CUDATHROW(cudaEventRecord(end, 0));
+      MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
+      float elapsedTime;
+      float elapsedTimePerIter;
+      float gbps;
+      MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end));
+      elapsedTimePerIter = elapsedTime / iter;
+      float dataSize = numElements * 4;
+      gbps = dataSize / elapsedTimePerIter * 1e-6f;
+      log("Rank ", myRank, " (GPU ", gpuId, "): bytes ", dataSize, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ",
+          gbps, " GB/s");
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  if (argc == 1) {
+    int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); });
+    int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); });
+    if (pid0 < 0 || pid1 < 0) {
+      log("Failed to spawn processes.");
+      return -1;
+    }
+    int status0 = wait_process(pid0);
+    int status1 = wait_process(pid1);
+    if (status0 < 0 || status1 < 0) {
+      log("Failed to wait for processes.");
+      return -1;
+    }
+    if (status0 != 0 || status1 != 0) {
+      log("One of the processes failed.");
+      return -1;
+    }
+    log("Succeed!");
+    return 0;
+  } else if (argc == 4) {
+    std::string ipPort = argv[1];
+    int rank, gpuId;
+    try {
+      rank = std::stoi(argv[2]);
+      gpuId = std::stoi(argv[3]);
+    } catch (const std::exception &) {
+      log("Error: rank and gpu_id must be valid integers.");
+      return -1;
+    }
+    if (rank < 0 || rank > 1 || gpuId < 0) {
+      log("Error: rank must be between 0 and 1 and gpu_id must be non-negative.");
+      return -1;
+    }
+    worker(rank, gpuId, ipPort);
+    log("Rank ", rank, ": Succeed!");
+    return 0;
+  } else {
+    std::cerr << "Usage:\n"
+              << "  " << argv[0] << "                            Run in intra-node mode\n"
+              << "  " << argv[0] << " <ip_port> <rank> <gpu_id>  Run in inter-node mode\n";
+    return -1;
+  }
+}
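The perf loop above converts the cudaEventElapsedTime result (milliseconds for iter launches) into GB/s as bytes / msPerIter * 1e-6: bytes per millisecond times 1e3 gives bytes per second, and dividing by 1e9 yields GB/s. A self-contained sketch of the same timing pattern, with a plain CUDA-runtime error check standing in for MSCCLPP_CUDATHROW and a stand-in kernel:

#include <cuda_runtime.h>

#include <cstdio>

__global__ void emptyKernel() {}

// Returns milliseconds per launch over `iter` launches, or a negative value
// on error; bandwidth then follows as bytes / result * 1e-6 (GB/s).
float msPerIter(int iter) {
  cudaEvent_t start, end;
  if (cudaEventCreate(&start) != cudaSuccess) return -1.f;
  if (cudaEventCreate(&end) != cudaSuccess) return -1.f;
  cudaEventRecord(start, 0);
  for (int i = 0; i < iter; ++i) emptyKernel<<<1, 1>>>();
  cudaEventRecord(end, 0);
  cudaEventSynchronize(end);  // wait until the last launch has finished
  float ms = 0.f;
  cudaEventElapsedTime(&ms, start, end);
  cudaEventDestroy(start);
  cudaEventDestroy(end);
  return ms / iter;
}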
wait for processes."); + return -1; + } + if (status0 != 0 || status1 != 0) { + log("One of the processes failed."); + return -1; + } + log("Succeed!"); + return 0; + } else if (argc == 4) { + std::string ipPort = argv[1]; + int rank, gpuId; + try { + rank = std::stoi(argv[2]); + gpuId = std::stoi(argv[3]); + } catch (const std::exception &) { + log("Error: rank and gpu_id must be valid integers."); + return -1; + } + if (rank < 0 || rank > 2 || gpuId < 0) { + log("Error: rank must be between 0 and 1 and gpu_id must be non-negative."); + return -1; + } + worker(rank, gpuId, ipPort); + log("Rank ", rank, ": Succeed!"); + return 0; + } else { + std::cerr << "Usage:\n" + << " " << argv[0] << " Run in intra-node mode\n" + << " " << argv[0] << " Run in inter-node mode\n"; + return -1; + } +} diff --git a/include/mscclpp/algorithm.hpp b/include/mscclpp/algorithm.hpp index 7acdb8b8..531cb857 100644 --- a/include/mscclpp/algorithm.hpp +++ b/include/mscclpp/algorithm.hpp @@ -84,6 +84,11 @@ class Algorithm { /// @return The Constraint struct specifying worldSize and nRanksPerNode requirements. virtual Constraint constraint() const = 0; + /// Set the valid message size range for this algorithm. + /// @param minMessageSize Minimum supported message size in bytes. + /// @param maxMessageSize Maximum supported message size in bytes. + virtual void setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) = 0; + /// Execute the algorithm. /// @param comm The communicator to use. /// @param input Pointer to the input buffer. @@ -96,12 +101,16 @@ class Algorithm { /// @param executor The executor for DSL algorithms (may be nullptr for native). /// @param nBlocks Number of CUDA blocks (0 for auto-selection). /// @param nThreadsPerBlock Number of threads per block (0 for auto-selection). + /// @param symmetricMemory Whether to use symmetric memory optimization. /// @param extras Additional parameters for algorithm-specific customization. + /// @param accumDtype Data type for accumulation during reduction. DataType::AUTO resolves to dtype. /// @return The result of the operation. virtual CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, - const std::unordered_map& extras = {}) = 0; + bool symmetricMemory = false, + const std::unordered_map& extras = {}, + DataType accumDtype = DataType::AUTO) = 0; /// Reset the algorithm state, clearing any cached contexts. virtual void reset() = 0; @@ -179,10 +188,11 @@ class NativeAlgorithm : public Algorithm { /// @param nBlocks Number of CUDA blocks. /// @param nThreadsPerBlock Number of threads per block. /// @param extras Additional algorithm-specific parameters. + /// @param accumDtype Data type for accumulation (resolved from input dtype if sentinel). /// @return The result of the operation. using KernelFunc = std::function, const void*, void*, size_t, size_t, DataType, ReduceOp, - cudaStream_t, int, int, const std::unordered_map&)>; + cudaStream_t, int, int, const std::unordered_map&, DataType)>; /// Function type for creating algorithm contexts. /// @param comm The communicator. @@ -201,9 +211,10 @@ class NativeAlgorithm : public Algorithm { /// @param inputSize Size of the input buffer. /// @param outputSize Size of the output buffer. /// @param dtype Data type of the elements. + /// @param symmetricMemory Whether symmetric memory is enabled. 
/// @return A key uniquely identifying this buffer configuration. using ContextKeyGenFunc = std::function; + size_t outputSize, DataType dtype, bool symmetricMemory)>; /// Construct a NativeAlgorithm. /// @param name Human-readable name of the algorithm. @@ -225,10 +236,12 @@ class NativeAlgorithm : public Algorithm { CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, - const std::unordered_map& extras = {}) override; + bool symmetricMemory = false, const std::unordered_map& extras = {}, + DataType accumDtype = DataType::AUTO) override; const std::string& name() const override; const std::string& collective() const override; const std::pair& messageRange() const override; + void setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) override; const std::unordered_map& tags() const override; const CollectiveBufferMode& bufferMode() const override; AlgorithmType type() const override { return AlgorithmType::Native; } @@ -269,12 +282,14 @@ class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enab const std::string& name() const override; const std::string& collective() const override; const std::pair& messageRange() const override; + void setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) override; const std::unordered_map& tags() const override; const CollectiveBufferMode& bufferMode() const override; CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, - const std::unordered_map& extras = {}) override; + bool symmetricMemory = false, const std::unordered_map& extras = {}, + DataType accumDtype = DataType::AUTO) override; AlgorithmType type() const override { return AlgorithmType::DSL; } Constraint constraint() const override; void reset() override; @@ -299,6 +314,7 @@ struct CollectiveRequest { const void* inputBuffer; void* outputBuffer; size_t messageSize; + cudaStream_t stream; const std::string& collective; const DataType dtype; const std::unordered_map>& hints; @@ -358,6 +374,10 @@ class AlgorithmCollection { AlgoSelectFunc fallbackAlgoSelector_ = nullptr; }; +/// Get a default GPU flag buffer (allocated once and reused). +/// @return A pair of (shared_ptr to the flag buffer, size in bytes). 
+std::pair, size_t> getFlagBuffer(); + } // namespace mscclpp #endif // MSCCLPP_ALGORITHM_HPP_ \ No newline at end of file diff --git a/include/mscclpp/assert_device.hpp b/include/mscclpp/assert_device.hpp index bf982ba6..1b9cb611 100644 --- a/include/mscclpp/assert_device.hpp +++ b/include/mscclpp/assert_device.hpp @@ -19,11 +19,11 @@ #else // defined(DEBUG_BUILD) #if defined(MSCCLPP_DEVICE_HIP) -extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, - const char *__function); +extern "C" __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line, + const char* __function); #else // !defined(MSCCLPP_DEVICE_HIP) -extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, - const char *__function) __THROW; +extern "C" __host__ __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line, + const char* __function) __THROW; #endif // !defined(MSCCLPP_DEVICE_HIP) /// Assert a condition on the device and print a message if the condition is false. diff --git a/include/mscclpp/atomic_device.hpp b/include/mscclpp/atomic_device.hpp index 74f6122f..d00bb50c 100644 --- a/include/mscclpp/atomic_device.hpp +++ b/include/mscclpp/atomic_device.hpp @@ -38,7 +38,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_o return cuda::atomic_ref{*ptr}.fetch_add(val, memoryOrder); } -#elif defined(MSCCLPP_DEVICE_HIP) +#else // !defined(MSCCLPP_DEVICE_CUDA) constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED; constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE; @@ -46,7 +46,6 @@ constexpr auto memoryOrderRelease = __ATOMIC_RELEASE; constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL; constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST; -// HIP does not have thread scope enums like CUDA constexpr auto scopeSystem = 0; constexpr auto scopeDevice = 0; @@ -65,7 +64,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrde return __atomic_fetch_add(ptr, val, memoryOrder); } -#endif // defined(MSCCLPP_DEVICE_HIP) +#endif // !defined(MSCCLPP_DEVICE_CUDA) } // namespace mscclpp diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 38b05ccf..ca2fc34f 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -381,11 +381,19 @@ struct EndpointConfig { /// These settings are only used when the transport is an InfiniBand type (IB0-IB7); they are ignored for other /// transports. struct Ib { + /// IB mode for signaling, used to select between different implementations. + enum class Mode { + Default, // Use the MSCCLPP_IBV_MODE environment variable (or "host" if unset). + Host, // Use the host stack with RDMA atomics. + HostNoAtomic // Use the host stack with write-with-immediate signaling (no RDMA atomics). + }; + static constexpr int DefaultPort = -1; - static constexpr int DefaultGidIndex = 0; + static constexpr int DefaultGidIndex = -1; static constexpr int DefaultMaxCqSize = 1024; static constexpr int DefaultMaxCqPollNum = 1; static constexpr int DefaultMaxSendWr = 8192; + static constexpr int DefaultMaxRecvWr = 16; static constexpr int DefaultMaxWrPerSend = 64; /// Device index. Currently ignored; use transport type (IB0-IB7) to select device. @@ -394,32 +402,41 @@ struct EndpointConfig { int port; /// GID index. int gidIndex; - /// Maximum size of the completion queue. + /// Maximum size of the send completion queue. 
int maxCqSize; - /// Maximum number of completion queue polls per operation. + /// Maximum number of send completion queue polls per operation. int maxCqPollNum; /// Maximum number of outstanding send work requests. int maxSendWr; + /// Maximum number of outstanding receive work requests (used in HostNoAtomic mode for write-with-immediate). + int maxRecvWr; /// Maximum number of work requests per send operation. int maxWrPerSend; + /// IB mode for signaling. When set to Default, uses the MSCCLPP_IBV_MODE environment variable. + Mode mode; /// Constructor. /// @param deviceIndex Device index. /// @param port Port number. - /// @param gidIndex GID index. - /// @param maxCqSize Maximum completion queue size. - /// @param maxCqPollNum Maximum completion queue poll count. + /// @param gidIndex GID index. If -1 (default), uses `MSCCLPP_IB_GID_INDEX` env variable. + /// @param maxCqSize Maximum send completion queue size. + /// @param maxCqPollNum Maximum send completion queue poll count. /// @param maxSendWr Maximum outstanding send work requests. + /// @param maxRecvWr Maximum outstanding receive work requests (for HostNoAtomic mode). /// @param maxWrPerSend Maximum work requests per send operation. + /// @param mode IB mode for signaling (Default uses MSCCLPP_IBV_MODE env variable). Ib(int deviceIndex = -1, int port = DefaultPort, int gidIndex = DefaultGidIndex, int maxCqSize = DefaultMaxCqSize, - int maxCqPollNum = DefaultMaxCqPollNum, int maxSendWr = DefaultMaxSendWr, int maxWrPerSend = DefaultMaxWrPerSend) + int maxCqPollNum = DefaultMaxCqPollNum, int maxSendWr = DefaultMaxSendWr, int maxRecvWr = DefaultMaxRecvWr, + int maxWrPerSend = DefaultMaxWrPerSend, Mode mode = Mode::Default) : deviceIndex(deviceIndex), port(port), gidIndex(gidIndex), maxCqSize(maxCqSize), maxCqPollNum(maxCqPollNum), maxSendWr(maxSendWr), - maxWrPerSend(maxWrPerSend) {} + maxRecvWr(maxRecvWr), + maxWrPerSend(maxWrPerSend), + mode(mode) {} }; /// Communication transport type (e.g., CudaIpc, IB0-IB7, Ethernet). @@ -658,6 +675,7 @@ class Connection { friend class SemaphoreStub; friend class Semaphore; friend class ProxyService; + friend class BaseConnection; }; /// SemaphoreStub object only used for constructing Semaphore, not for direct use by the user. diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index 5972234b..a6dd306b 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -54,6 +54,12 @@ class Env { /// default libibverbs library found in the system. const std::string ibvSo; + /// Env name: `MSCCLPP_IBV_MODE`. Selects the IB stack implementation for PortChannel. + /// Allowed values: + /// - "host": use the host stack with RDMA atomics (default). + /// - "host-no-atomic": use the host stack with write-with-immediate signaling (no RDMA atomics). + const std::string ibvMode; + /// Env name: `MSCCLPP_HOSTID`. A string that uniquely identifies the host. If unset, it will use the hostname. /// This is used to determine whether the host is the same across different processes. const std::string hostid; @@ -70,9 +76,9 @@ class Env { /// Env name: `MSCCLPP_COMM_ID`. To be deprecated; don't use this. const std::string commId; - /// Env name: `MSCCLPP_EXECUTION_PLAN_DIR`. The directory to find execution plans from. This should be set to - /// use execution plans for the NCCL API. Unset by default. - const std::string executionPlanDir; + /// Env name: `MSCCLPP_CACHE_DIR`. The directory to use for caching execution plans and other temporary files. 
+ /// If unset, it defaults to `~/.cache/mscclpp`. + const std::string cacheDir; /// Env name: `MSCCLPP_NPKIT_DUMP_DIR`. The directory to dump NPKIT traces to. If this is set, NPKIT will be /// enabled and will dump traces to this directory. Unset by default. @@ -92,17 +98,27 @@ class Env { /// debugging purposes. Currently supports `all`, `broadcast`, `allreduce`, `reducescatter`, and `allgather`. const std::string forceNcclFallbackOperation; - /// Env name: `MSCCLPP_DISABLE_CHANNEL_CACHE`. If set to true, it will disable the channel cache for NCCL APIs. - /// Currently, this should be set to true if the application may call NCCL APIs on the same local buffer with - /// different remote buffers, e.g., in the case of a dynamic communicator. If CUDA/HIP graphs are used, disabling - /// the channel cache won't affect the performance, but otherwise it may lead to performance degradation. + /// Env name: `MSCCLPP_NCCL_SYMMETRIC_MEMORY`. If set to true, it indicates that the application uses symmetric memory + /// allocation across all ranks, making it safe to cache memory handles for all NCCL algorithms. If set to false, the + /// system will either use non-zero-copy algorithms (when CUDA/HIP graphs are not enabled) or set up new connections + /// every time (when CUDA/HIP graphs are enabled). This should be set to false if the application may call NCCL APIs + /// on the same local buffer with different remote buffers, e.g., in the case of a dynamic communicator. /// Default is false. - const bool disableChannelCache; + const bool ncclSymmetricMemory; /// Env name: `MSCCLPP_FORCE_DISABLE_NVLS`. If set to true, it will disable the NVLS support in MSCCL++. /// Default is false. const bool forceDisableNvls; + /// Env name: `MSCCLPP_FORCE_DISABLE_GDR`. If set to true, it will disable the GDRCopy support in MSCCL++. + /// When false (default), GDRCopy is auto-detected and enabled if the gdrcopy driver is loaded. + /// Default is false. + const bool forceDisableGdr; + + /// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport. + /// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified). + const int ibGidIndex; + private: Env(); diff --git a/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp b/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp index 201d7440..394e8014 100644 --- a/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp +++ b/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp @@ -47,7 +47,8 @@ class AlgorithmCollectionBuilder { /// @return The built AlgorithmCollection containing all registered algorithms. 
AlgorithmCollection build(); - AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, int rank); + AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, + size_t flagBufferSize, int rank); private: AlgorithmCollectionBuilder() = default; @@ -55,7 +56,8 @@ class AlgorithmCollectionBuilder { AlgoSelectFunc algoSelector_ = nullptr; AlgoSelectFunc fallbackAlgoSelector_ = nullptr; - AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize); + AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, + uintptr_t flagBuffer, size_t flagBufferSize); AlgorithmCollection buildDefaultDslAlgorithms(int rank); static std::shared_ptr gAlgorithmCollectionBuilder_; diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index 6a0929aa..b8d096e2 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -15,6 +15,7 @@ using cudaGraphExec_t = hipGraphExec_t; using cudaDeviceProp = hipDeviceProp_t; using cudaStream_t = hipStream_t; using cudaStreamCaptureMode = hipStreamCaptureMode; +using cudaStreamCaptureStatus = hipStreamCaptureStatus; using cudaMemcpyKind = hipMemcpyKind; using cudaIpcMemHandle_t = hipIpcMemHandle_t; @@ -35,6 +36,9 @@ constexpr auto cudaErrorNotSupported = hipErrorNotSupported; constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking; constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal; constexpr auto cudaStreamCaptureModeRelaxed = hipStreamCaptureModeRelaxed; +constexpr auto cudaStreamCaptureStatusNone = hipStreamCaptureStatusNone; +constexpr auto cudaStreamCaptureStatusActive = hipStreamCaptureStatusActive; +constexpr auto cudaStreamCaptureStatusInvalidated = hipStreamCaptureStatusInvalidated; constexpr auto cudaHostAllocMapped = hipHostMallocMapped; constexpr auto cudaHostAllocWriteCombined = hipHostMallocWriteCombined; constexpr auto cudaMemcpyDefault = hipMemcpyDefault; @@ -98,6 +102,7 @@ constexpr auto CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = HIP_POINTER_ATTRIBUTE_DEVIC #define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__) #define cudaStreamEndCapture(...) hipStreamEndCapture(__VA_ARGS__) #define cudaStreamDestroy(...) hipStreamDestroy(__VA_ARGS__) +#define cudaStreamIsCapturing(...) hipStreamIsCapturing(__VA_ARGS__) #define cudaGraphCreate(...) hipGraphCreate(__VA_ARGS__) #define cudaGraphInstantiate(...) hipGraphInstantiate(__VA_ARGS__) #define cudaGraphLaunch(...) hipGraphLaunch(__VA_ARGS__) diff --git a/include/mscclpp/gpu_data_types.hpp b/include/mscclpp/gpu_data_types.hpp index 99b95d9a..41bd5928 100644 --- a/include/mscclpp/gpu_data_types.hpp +++ b/include/mscclpp/gpu_data_types.hpp @@ -16,20 +16,27 @@ using __bfloat16 = __hip_bfloat16; using __bfloat162 = __hip_bfloat162; #define __CUDA_BF16_TYPES_EXIST__ -// AMD FP8 support - hip_fp8.h provides __hip_fp8_e4m3_fnuz and __hip_fp8_e5m2_fnuz -// Only available on gfx942 and newer architectures (ROCm 6.0+) +// AMD FP8 support - Use fnuz types for HIP 6.0 or when HIP_FP8_TYPE_FNUZ is enabled and HIP_FP8_TYPE_OCP is not +// enabled. Otherwise, use the standard FP8 types. 
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 6) #include // Create aliases matching CUDA naming convention for cross-platform compatibility +#if (HIP_VERSION_MAJOR == 6) || (HIP_VERSION_MAJOR > 6 && HIP_FP8_TYPE_FNUZ && !HIP_FP8_TYPE_OCP) using __fp8_e4m3 = __hip_fp8_e4m3_fnuz; using __fp8_e5m2 = __hip_fp8_e5m2_fnuz; - -// HIP FP8 vector types use storage types (from hip/amd_detail/amd_hip_fp8.h): -using __fp8x2_e4m3 = __hip_fp8x2_storage_t; // uint16_t -using __fp8x2_e5m2 = __hip_fp8x2_storage_t; // uint16_t -using __fp8x4_e4m3 = __hip_fp8x4_storage_t; // uint32_t -using __fp8x4_e5m2 = __hip_fp8x4_storage_t; // uint32_t +using __fp8x2_e4m3 = __hip_fp8x2_e4m3_fnuz; +using __fp8x2_e5m2 = __hip_fp8x2_e5m2_fnuz; +using __fp8x4_e4m3 = __hip_fp8x4_e4m3_fnuz; +using __fp8x4_e5m2 = __hip_fp8x4_e5m2_fnuz; +#else +using __fp8_e4m3 = __hip_fp8_e4m3; +using __fp8_e5m2 = __hip_fp8_e5m2; +using __fp8x2_e4m3 = __hip_fp8x2_e4m3; +using __fp8x2_e5m2 = __hip_fp8x2_e5m2; +using __fp8x4_e4m3 = __hip_fp8x4_e4m3; +using __fp8x4_e5m2 = __hip_fp8x4_e5m2; +#endif #define __FP8_TYPES_EXIST__ #endif // HIP_VERSION_MAJOR >= 6 @@ -57,24 +64,156 @@ using __bfloat162 = __nv_bfloat162; #endif +/// Software float8 with 4 exponent bits, 3 mantissa bits, exponent bias = 15. +/// Format (MSB first): [sign:1][exponent:4][mantissa:3] +/// No infinities; exp=15 is NaN. Negative zero is NaN (fnuz convention). +/// Max finite value: 0.9375, min normal: ~6.1e-5, min subnormal: ~7.6e-6. +struct alignas(1) __fp8_e4m3b15 { + uint8_t __x; + + __fp8_e4m3b15() = default; + + /// Construct from raw bits (use __fp8_e4m3b15::fromRaw() for clarity). + MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(uint8_t raw) : __x(raw) {} + + /// Construct from float32 (explicit to avoid ambiguous conversion chains). + MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(float val) : __x(fromFloat(val)) {} + + /// Convert to float32. + MSCCLPP_HOST_DEVICE_INLINE operator float() const { return toFloat(__x); } + + /// Construct from a raw bit pattern without conversion. + static MSCCLPP_HOST_DEVICE_INLINE __fp8_e4m3b15 fromRaw(uint8_t bits) { + __fp8_e4m3b15 r; + r.__x = bits; + return r; + } + + private: + /// Decode fp8_e4m3b15 bits → float32. + /// + /// Uses bit manipulation through fp16 as intermediate, adapted from the Triton compiler. + /// fp8_e4m3b15 is identical to fp8_e4m3fn (NVIDIA) except exponent bias is 15 vs 7. + /// Algorithm: reinterpret fp8 bits into an fp16 bit pattern with exponent shifted by -8, + /// then convert fp16 → float32. + static MSCCLPP_HOST_DEVICE_INLINE float toFloat(uint8_t bits) { + // Handle special values: negative zero (0x80) → NaN, exponent=15 → NaN. + uint32_t exp = (bits >> 3) & 0xFu; + if (bits == 0x80 || exp == 15) { + union { + uint32_t u; + float f; + } nan_val = {0x7FC00000u}; + return nan_val.f; + } + if (bits == 0) return 0.0f; + + // Triton-style bit manipulation: fp8 → fp16 → fp32. + // fp8 layout: [S:1][E:4][M:3] (bias=15) + // fp16 layout: [S:1][E:5][M:10] (bias=15) + // + // Place fp8 in upper byte of fp16, then right-shift exponent+mantissa by 1 + // to convert E4 → E5 (both share bias=15). Sign bit stays at bit 15. 
+ // Refer: + // https://github.com/triton-lang/triton/blob/cf34004b8a67d290a962da166f5aa2fc66751326/python/triton/language/extra/cuda/utils.py#L34 + uint16_t h = (uint16_t)bits << 8; // place fp8 in upper byte of fp16 + uint16_t sign16 = h & 0x8000u; // extract sign at fp16 position + uint16_t nosign = h & 0x7F00u; // exponent + mantissa (no sign) + uint16_t fp16_bits = sign16 | (nosign >> 1); // shift exponent right by 1 + + // For subnormals: when fp8 exponent=0, the above gives fp16 exponent=0 + // and fp16 mantissa = (fp8_mantissa << 7), which correctly represents + // the subnormal fp16 value since both share bias=15. + + // Convert fp16 bits to float via __half (works on host and device, CUDA and HIP). + union { + uint16_t u; + __half h; + } cvt = {fp16_bits}; + return __half2float(cvt.h); + } + + /// Encode float32 → fp8_e4m3b15 bits. + /// + /// Algorithm adapted from Triton: float32 → fp16 → bit-manipulate → fp8. + /// The key insight is to convert to fp16 first (which shares bias=15 with e4m3b15), + /// then pack the fp16 bits back into 8 bits by shifting the exponent left by 1. + static MSCCLPP_HOST_DEVICE_INLINE uint8_t fromFloat(float val) { + union { + float f; + uint32_t u; + } in = {val}; + + // NaN → 0x80 (negative-zero bit pattern = NaN in fnuz). + if ((in.u & 0x7F800000u) == 0x7F800000u && (in.u & 0x007FFFFFu) != 0) return 0x80u; + + // Convert float32 → fp16 bits via __half (works on host and device, CUDA and HIP). + __half h_val = __float2half_rn(val); + union { + __half h; + uint16_t u; + } cvt = {h_val}; + uint16_t fp16_bits = cvt.u; + + // Clamp absolute value to max finite e4m3b15: 0.9375 → fp16 = 0x3B80. + uint16_t abs_fp16 = fp16_bits & 0x7FFFu; + if (abs_fp16 > 0x3B80u) abs_fp16 = 0x3B80u; + + // Reconstruct with sign. + uint16_t sign16 = fp16_bits & 0x8000u; + + // Triton-style: fp16 → fp8. + // fp16 layout: [S:1][E:5][M:10] (bias=15) + // fp8 layout: [S:1][E:4][M:3] (bias=15) + // + // mad.lo.u32 a0, a0, 2, 0x00800080 → (abs_fp16 * 2 + 0x0080) + // This shifts left by 1 (undoing the right-shift in decode) and adds rounding bias. + // Then: lop3.b32 b0, $1, 0x80008000, a0, 0xea → (sign & 0x8000) | a0 + // Finally: prmt for byte extraction. + // + // Simplified for scalar: shift abs_fp16 left by 1, add rounding bias, take upper byte. + uint16_t adjusted = (uint16_t)(abs_fp16 * 2u + 0x0080u); + // The upper byte now contains [E:4][M:3][round_bit]. + // Combine with sign and extract. + uint16_t with_sign = sign16 | adjusted; + uint8_t result = (uint8_t)(with_sign >> 8); + + // Zero → 0x00 (ensure positive zero, not negative zero which is NaN). + if ((result & 0x7Fu) == 0) result = 0x00u; + + return result; + } +}; + +/// Packed 2x fp8_e4m3b15 storage. +struct alignas(2) __fp8x2_e4m3b15 { + uint16_t __x; +}; + +/// Packed 4x fp8_e4m3b15 storage. +struct alignas(4) __fp8x4_e4m3b15 { + uint32_t __x; +}; + namespace mscclpp { /// Data types supported by mscclpp operations. enum class DataType { - INT32, // 32-bit signed integer. - UINT32, // 32-bit unsigned integer. - FLOAT16, // IEEE 754 half precision. - FLOAT32, // IEEE 754 single precision. - BFLOAT16, // bfloat16 precision. - FP8_E4M3, // FP8 with E4M3 layout. - FP8_E5M2, // FP8 with E5M2 layout. + INT32, // 32-bit signed integer. + UINT32, // 32-bit unsigned integer. + FLOAT16, // IEEE 754 half precision. + FLOAT32, // IEEE 754 single precision. + BFLOAT16, // bfloat16 precision. + FLOAT8_E4M3, // float8 with E4M3 layout. + FLOAT8_E5M2, // float8 with E5M2 layout. + UINT8, // 8-bit unsigned integer. 
+ FLOAT8_E4M3B15, // float8 with E4M3 layout, bias=15 (software, no HW accel). + AUTO = 255, // Sentinel: resolve to the input dtype at runtime. }; /// Word array. -template +template = 4 && Bytes % 4 == 0)> struct alignas(Bytes) Words { - static_assert(Bytes > 0, "Bytes must be greater than 0"); - static_assert(Bytes % 4 == 0, "Bytes must be multiple of 4"); uint32_t w[Bytes / 4]; MSCCLPP_HOST_DEVICE_INLINE Words() {} @@ -84,18 +223,34 @@ struct alignas(Bytes) Words { MSCCLPP_HOST_DEVICE_INLINE const uint32_t& operator[](int i) const { return w[i]; } }; -/// Vector type. -template -union alignas(sizeof(T) * N) VectorType { +template +struct alignas(Bytes) Words {}; + +/// Vector type implementation (internal). +template +union alignas(sizeof(T) * N) VectorTypeImpl { static_assert(N > 0, "N must be greater than 0"); + static_assert(sizeof(StorageT) >= sizeof(T) * N, "StorageT must cover the full vector size"); T data[N]; Words words; + StorageT storage; using ElementType = T; constexpr static int Size = N; - MSCCLPP_HOST_DEVICE_INLINE VectorType() {} + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl() {} + + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl(const StorageT& value) : storage(value) {} + + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl(const VectorTypeImpl& other) { storage = other.storage; } + + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl& operator=(const VectorTypeImpl& other) { + storage = other.storage; + return *this; + } + + MSCCLPP_HOST_DEVICE_INLINE operator StorageT() const { return storage; } MSCCLPP_HOST_DEVICE_INLINE operator T*() { return data; } @@ -106,38 +261,1109 @@ union alignas(sizeof(T) * N) VectorType { MSCCLPP_HOST_DEVICE_INLINE const T& operator[](int i) const { return data[i]; } }; -using i32x1 = VectorType; -using u32x1 = VectorType; -using f64x1 = VectorType; -using f32x1 = VectorType; +// Helper template to get the appropriate vector type for a given element type and count. 
+template +struct VectorTypeHelper { + static constexpr int Bytes = N * sizeof(T); + using type = VectorTypeImpl< + T, N, + std::conditional_t>>>>; +}; -using i32x2 = VectorType; -using u32x2 = VectorType; -using f32x2 = VectorType; -using f16x2 = VectorType<__half, 2>; -using bf16x2 = VectorType<__bfloat16, 2>; +/// Vector type - clean user interface (automatically selects appropriate storage type) +template +using VectorType = typename VectorTypeHelper::type; -using i32x4 = VectorType; -using u32x4 = VectorType; -using f32x4 = VectorType; -using f16x4 = VectorType<__half, 4>; -using bf16x4 = VectorType<__bfloat16, 4>; +// Macro to define specialization AND alias in one go +#define DEFINE_VEC(Alias, T, N, Storage) \ + template <> \ + struct VectorTypeHelper { \ + using type = VectorTypeImpl; \ + }; \ + using Alias = VectorType -using f16x8 = VectorType<__half, 8>; -using bf16x8 = VectorType<__bfloat16, 8>; +DEFINE_VEC(i32x1, int32_t, 1, int32_t); +DEFINE_VEC(u32x1, uint32_t, 1, uint32_t); +DEFINE_VEC(f32x1, float, 1, float); +DEFINE_VEC(f64x1, double, 1, double); + +DEFINE_VEC(i32x2, int32_t, 2, int2); +DEFINE_VEC(u32x2, uint32_t, 2, uint2); +DEFINE_VEC(u8x2, uint8_t, 2, uint16_t); +DEFINE_VEC(f32x2, float, 2, float2); +DEFINE_VEC(f16x2, __half, 2, __half2); +DEFINE_VEC(bf16x2, __bfloat16, 2, __bfloat162); + +DEFINE_VEC(i32x4, int32_t, 4, int4); +DEFINE_VEC(u32x4, uint32_t, 4, uint4); +DEFINE_VEC(u8x4, uint8_t, 4, uint32_t); +DEFINE_VEC(f32x4, float, 4, float4); +DEFINE_VEC(f16x4, __half, 4, uint2); +DEFINE_VEC(bf16x4, __bfloat16, 4, uint2); + +DEFINE_VEC(f16x8, __half, 8, uint4); +DEFINE_VEC(bf16x8, __bfloat16, 8, uint4); + +// Aliases for large vector types (>16 bytes) where no native CUDA storage type exists. +using f32x8 = VectorType; +using f32x16 = VectorType; +using f16x16 = VectorType<__half, 16>; #if defined(__FP8_TYPES_EXIST__) -// FP8 vector types -using fp8_e4m3x2 = VectorType<__fp8_e4m3, 2>; -using fp8_e4m3x4 = VectorType<__fp8_e4m3, 4>; -using fp8_e4m3x8 = VectorType<__fp8_e4m3, 8>; -using fp8_e4m3x16 = VectorType<__fp8_e4m3, 16>; -using fp8_e5m2x2 = VectorType<__fp8_e5m2, 2>; -using fp8_e5m2x4 = VectorType<__fp8_e5m2, 4>; -using fp8_e5m2x8 = VectorType<__fp8_e5m2, 8>; -using fp8_e5m2x16 = VectorType<__fp8_e5m2, 16>; +DEFINE_VEC(f8_e4m3x2, __fp8_e4m3, 2, __fp8x2_e4m3); +DEFINE_VEC(f8_e4m3x4, __fp8_e4m3, 4, __fp8x4_e4m3); +DEFINE_VEC(f8_e4m3x8, __fp8_e4m3, 8, uint2); +DEFINE_VEC(f8_e4m3x16, __fp8_e4m3, 16, uint4); + +DEFINE_VEC(f8_e5m2x2, __fp8_e5m2, 2, __fp8x2_e5m2); +DEFINE_VEC(f8_e5m2x4, __fp8_e5m2, 4, __fp8x4_e5m2); +DEFINE_VEC(f8_e5m2x8, __fp8_e5m2, 8, uint2); +DEFINE_VEC(f8_e5m2x16, __fp8_e5m2, 16, uint4); #endif +// fp8_e4m3b15 vectors (always available — software type, no HW dependency) +DEFINE_VEC(f8_e4m3b15x2, __fp8_e4m3b15, 2, __fp8x2_e4m3b15); +DEFINE_VEC(f8_e4m3b15x4, __fp8_e4m3b15, 4, __fp8x4_e4m3b15); +DEFINE_VEC(f8_e4m3b15x8, __fp8_e4m3b15, 8, uint2); +DEFINE_VEC(f8_e4m3b15x16, __fp8_e4m3b15, 16, uint4); +#undef DEFINE_VEC + +#if defined(MSCCLPP_DEVICE_COMPILE) +template +MSCCLPP_DEVICE_INLINE To bit_cast(const From& src) { + static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); + + union { + From f; + To t; + } u{.f = src}; + return u.t; +} + +template +MSCCLPP_DEVICE_INLINE T clip(T val) { + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __half clip(__half val) { + val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff)); + val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff)); + + return val; +} + +template <> 
+MSCCLPP_DEVICE_INLINE __half2 clip(__half2 val) { + val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff)); + val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff)); + val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff)); + val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff)); + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __bfloat16 clip(__bfloat16 val) { + val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80)); + val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80)); + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __bfloat162 clip(__bfloat162 val) { + val.x = __hmax(val.x, bit_cast<__bfloat16, unsigned short>(0xff80)); + val.x = __hmin(val.x, bit_cast<__bfloat16, unsigned short>(0x7f80)); + val.y = __hmax(val.y, bit_cast<__bfloat16, unsigned short>(0xff80)); + val.y = __hmin(val.y, bit_cast<__bfloat16, unsigned short>(0x7f80)); + return val; +} + +// FP8 E4M3 clipping function +#if defined(__FP8_TYPES_EXIST__) +template <> +MSCCLPP_DEVICE_INLINE __fp8_e4m3 clip(__fp8_e4m3 val) { + // FP8 E4M3 has range [-448, 448], no infinities + // Built-in saturation in FP8 arithmetic + return val; +} + +// FP8 E5M2 clipping function - prevent infinities by clamping to max finite value +template <> +MSCCLPP_DEVICE_INLINE __fp8_e5m2 clip(__fp8_e5m2 val) { + // FP8 E5M2 has infinities - clamp to max finite value to prevent overflow + // Max finite value for E5M2 is 57344.0f (0x7B), min is -57344.0f (0xFB) + float fval = float(val); + fval = fmaxf(fval, -57344.0f); + fval = fminf(fval, 57344.0f); + return __fp8_e5m2(fval); +} +#endif + +// --- f32x2 arithmetic --- + +template +MSCCLPP_DEVICE_INLINE f32x2 operator+(const f32x2& a, const f32x2& b) { +#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ >= 1000) + // Blackwell (SM 10.0+): packed float2 add in a single instruction. + return __fadd2_rn(a.storage, b.storage); +#else + f32x2 result; + result.data[0] = a.data[0] + b.data[0]; + result.data[1] = a.data[1] + b.data[1]; + return result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) { + __half2 result; + if constexpr (UseClip) { + result = clip(__hadd2(a, b)); + } else { + result = __hadd2(a, b); + } + return result; +} + +template +MSCCLPP_DEVICE_INLINE f16x4 operator+(const f16x4& a, const f16x4& b) { + // Decompose into 2× packed __hadd2 (2 instructions instead of 4 scalar __hadd). + const f16x2* a2 = reinterpret_cast(&a); + const f16x2* b2 = reinterpret_cast(&b); + f16x4 result; + f16x2* r2 = reinterpret_cast(&result); + r2[0] = a2[0] + b2[0]; + r2[1] = a2[1] + b2[1]; + return result; +} + +template +MSCCLPP_DEVICE_INLINE bf16x2 operator+(const bf16x2& a, const bf16x2& b) { + __bfloat162 result; + if constexpr (UseClip) { + result = clip(__hadd2(a, b)); + } else { + result = __hadd2(a, b); + } + return result; +} + +#if defined(__FP8_TYPES_EXIST__) +template +MSCCLPP_DEVICE_INLINE __fp8_e4m3 operator+(const __fp8_e4m3& a, const __fp8_e4m3& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + // Optimized assembly for gfx942 + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0))); + return static_cast<__hip_fp8_storage_t>(__builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false)); +#elif defined(MSCCLPP_DEVICE_CUDA) + // NVIDIA CUDA FP8 addition (CUDA 11.8+) + __fp8_e4m3 result = __fp8_e4m3(__hadd(__half(a), __half(b))); + return UseClip ? 
clip(result) : result; +#else + // Fallback for other devices + __fp8_e4m3 result = __fp8_e4m3(float(a) + float(b)); + return UseClip ? clip(result) : result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e4m3x2 operator+(const f8_e4m3x2& a, const f8_e4m3x2& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.storage.__x, 0)), + "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.storage.__x, 0))); + return bit_cast( + static_cast<__hip_fp8x2_storage_t>(__builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false))); +#elif defined(MSCCLPP_DEVICE_CUDA) + // CUDA: Convert to half2, add using optimized __hadd2, convert back + return __fp8x2_e4m3(__hadd2(__half2(static_cast<__fp8x2_e4m3>(a)), __half2(static_cast<__fp8x2_e4m3>(b)))); +#else + // Fallback for other devices: element-wise using single-element operations + f8_e4m3x2 result; + result.data[0] = a.data[0] + b.data[0]; + result.data[1] = a.data[1] + b.data[1]; + return result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e4m3x4 operator+(const f8_e4m3x4& a, const f8_e4m3x4& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float2 v_low, v_high; + // E4M3 using fp8 conversion - process low word (false) and high word (true) + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_low) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.storage.__x, false)), + "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.storage.__x, false))); + uint32_t result_packed = __builtin_amdgcn_cvt_pk_fp8_f32(v_low.x, v_low.y, 0, false); + + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_high) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.storage.__x, true)), + "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.storage.__x, true))); + result_packed = __builtin_amdgcn_cvt_pk_fp8_f32(v_high.x, v_high.y, result_packed, true); + return bit_cast(result_packed); +#else + // Process as two f8_e4m3x2 using operator+ for 2 elements + const f8_e4m3x2* a_pair = reinterpret_cast(&a); + const f8_e4m3x2* b_pair = reinterpret_cast(&b); + + f8_e4m3x2 result[2]; + result[0] = a_pair[0] + b_pair[0]; + result[1] = a_pair[1] + b_pair[1]; + + return *reinterpret_cast(result); +#endif +} + +template +MSCCLPP_DEVICE_INLINE __fp8_e5m2 operator+(const __fp8_e5m2& a, const __fp8_e5m2& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + // Optimized assembly for gfx942 (bfloat8) + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0))); + return static_cast<__hip_fp8_storage_t>(__builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false)); +#elif defined(MSCCLPP_DEVICE_CUDA) + // NVIDIA CUDA FP8 addition + __fp8_e5m2 result = __fp8_e5m2(__hadd(__half(a), __half(b))); + return UseClip ? clip(result) : result; +#else + __fp8_e5m2 result = __fp8_e5m2(float(a) + float(b)); + return UseClip ? 
clip(result) : result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e5m2x2 operator+(const f8_e5m2x2& a, const f8_e5m2x2& b) { +#if defined(MSCCLPP_DEVICE_CUDA) + // CUDA: Convert to half2, add using optimized __hadd2, convert back + f8_e5m2x2 result = + __fp8x2_e5m2(__hadd2(__half2(static_cast<__fp8x2_e5m2>(a)), __half2(static_cast<__fp8x2_e5m2>(b)))); + if constexpr (UseClip) { + result = clip(result); + } + return result; +#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + // HIP gfx942: Use BF8 assembly instructions + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.data[0].__x, 0)), + "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.data[0].__x, 0))); + return bit_cast( + static_cast<__hip_fp8x2_storage_t>(__builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, ival, false))); +#else + // Fallback: element-wise using single-element operations + f8_e5m2x2 result; + result.data[0] = a.data[0] + b.data[0]; + result.data[1] = a.data[1] + b.data[1]; + return result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e5m2x4 operator+(const f8_e5m2x4& a, const f8_e5m2x4& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float2 v_low, v_high; + // E5M2 using bf8 conversion - process low word (false) and high word (true) + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_low) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.storage.__x, false)), + "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.storage.__x, false))); + uint32_t result_packed = __builtin_amdgcn_cvt_pk_bf8_f32(v_low.x, v_low.y, 0, false); + + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_high) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.storage.__x, true)), + "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.storage.__x, true))); + result_packed = __builtin_amdgcn_cvt_pk_bf8_f32(v_high.x, v_high.y, result_packed, true); + return bit_cast(result_packed); +#else + // Process as two f8_e5m2x2 using operator+ for 2 elements + const f8_e5m2x2* a_pair = reinterpret_cast(&a); + const f8_e5m2x2* b_pair = reinterpret_cast(&b); + f8_e5m2x2 result[2]; + result[0] = a_pair[0] + b_pair[0]; + result[1] = a_pair[1] + b_pair[1]; + + return *reinterpret_cast(result); +#endif +} +#endif // defined(__FP8_TYPES_EXIST__) + +MSCCLPP_DEVICE_INLINE u8x4 operator+(const u8x4& a, const u8x4& b) { +#if defined(MSCCLPP_DEVICE_HIP) + // Optimized uint8_t x 4 sum using byte permute to avoid overflow between adjacent bytes + constexpr uint32_t even = 0x00ff00ffu; + uint32_t ua = a.storage; + uint32_t ub = b.storage; + uint32_t x = (ua & even) + (ub & even); + uint32_t y = (ua & ~even) + (ub & ~even); + return __byte_perm(x, y, 0x7250); +#else + return __vadd4(a.storage, b.storage); +#endif +} + +template +MSCCLPP_DEVICE_INLINE T min(const T& a, const T& b) { + return (a < b ? 
a : b); +} + +template <> +MSCCLPP_DEVICE_INLINE f32x2 min(const f32x2& a, const f32x2& b) { + f32x2 result; + result.data[0] = fminf(a.data[0], b.data[0]); + result.data[1] = fminf(a.data[1], b.data[1]); + return result; +} + +template <> +MSCCLPP_DEVICE_INLINE f16x2 min(const f16x2& a, const f16x2& b) { +#if defined(MSCCLPP_DEVICE_HIP) + f16x2 val; + val[0] = __hmin(a[0], b[0]); + val[1] = __hmin(a[1], b[1]); + return val; +#else + __half2 ret = __hmin2(a, b); + return ret; +#endif +} + +template <> +MSCCLPP_DEVICE_INLINE bf16x2 min(const bf16x2& a, const bf16x2& b) { + return __hmin2(a, b); +} + +template <> +MSCCLPP_DEVICE_INLINE u8x4 min(const u8x4& a, const u8x4& b) { +#if defined(MSCCLPP_DEVICE_HIP) + // Optimized uint8_t x 4 min using 9-bit arithmetic + constexpr uint32_t ones = 0x01010101u; + constexpr uint32_t even = 0x00ff00ffu; // even byte mask + uint32_t ua = a.storage; + uint32_t ub = b.storage; + // Use 9-bit arithmetic to compute d=a-b for each byte + uint32_t d0 = (ua & even) + (~ub & even) + ones; + uint32_t d1 = ((ua >> 8) & even) + (~(ub >> 8) & even) + ones; + // Move sign bit of each 9-bit delta into the least bit of origin byte + uint32_t s = __byte_perm(d0, d1, 0x7351) & ones; + // Broadcast least bit across whole byte + s *= 0xffu; + // Compose result by selecting bytes via: signbit(a-b)==1 ? a : b + return (ua & s) | (ub & ~s); +#else + return __vminu4(a.storage, b.storage); +#endif +} + +/// Convert a vector type From to vector type To. +/// Primary template with auto-decomposition: vectors with N > 4 elements decompose into x4 chunks, +/// vectors with N == 4 decompose into x2 chunks, enabling optimized x2/x4 specializations to be reached. +/// Specialized below for optimized FP8 conversion paths at x2/x4 level. +template +MSCCLPP_DEVICE_INLINE To to(const From& v) { + static_assert(To::Size == From::Size, "to: vector sizes must match"); + constexpr int N = From::Size; + + // Auto-decompose: N > 4 → split into x4 chunks + if constexpr (N > 4 && N % 4 == 0) { + constexpr int nChunks = N / 4; + using FromChunk = VectorType; + using ToChunk = VectorType; + const FromChunk* in = reinterpret_cast(&v); + To result; + ToChunk* out = reinterpret_cast(&result); +#pragma unroll + for (int c = 0; c < nChunks; ++c) { + out[c] = to(in[c]); + } + return result; + } + // Auto-decompose: N == 4 → split into 2x x2 chunks + else if constexpr (N == 4) { + using FromChunk = VectorType; + using ToChunk = VectorType; + const FromChunk* in = reinterpret_cast(&v); + To result; + ToChunk* out = reinterpret_cast(&result); + out[0] = to(in[0]); + out[1] = to(in[1]); + return result; + } + // Base case: element-wise conversion + else { + To result; +#pragma unroll + for (int i = 0; i < N; ++i) { + result.data[i] = static_cast(v.data[i]); + } + return result; + } +} + +#if defined(__FP8_TYPES_EXIST__) +template <> +MSCCLPP_DEVICE_INLINE __fp8_e4m3 min(const __fp8_e4m3& a, const __fp8_e4m3& b) { +#if defined(MSCCLPP_DEVICE_HIP) + return __fp8_e4m3(fminf(float(a), float(b))); +#else + return __fp8_e4m3(__hmin(__half(a), __half(b))); +#endif +} + +MSCCLPP_DEVICE_INLINE f8_e4m3x2 min(const f8_e4m3x2& a, const f8_e4m3x2& b) { + // Process element-wise using single-element operations + f8_e4m3x2 result; + result.data[0] = mscclpp::min(a.data[0], b.data[0]); + result.data[1] = mscclpp::min(a.data[1], b.data[1]); + return result; +} + +MSCCLPP_DEVICE_INLINE f8_e4m3x4 min(const f8_e4m3x4& a, const f8_e4m3x4& b) { + // Process as two f8_e4m3x2 using min for 2 elements + const f8_e4m3x2* a_ptr = 
reinterpret_cast(&a); + const f8_e4m3x2* b_ptr = reinterpret_cast(&b); + + f8_e4m3x4 result; + f8_e4m3x2* result_ptr = reinterpret_cast(&result); + + result_ptr[0] = mscclpp::min(a_ptr[0], b_ptr[0]); + result_ptr[1] = mscclpp::min(a_ptr[1], b_ptr[1]); + + return result; +} + +template <> +MSCCLPP_DEVICE_INLINE __fp8_e5m2 min(const __fp8_e5m2& a, const __fp8_e5m2& b) { +#if defined(MSCCLPP_DEVICE_HIP) + return __fp8_e5m2(fminf(float(a), float(b))); +#else + return __fp8_e5m2(__hmin(__half(a), __half(b))); +#endif +} + +MSCCLPP_DEVICE_INLINE f8_e5m2x2 min(const f8_e5m2x2& a, const f8_e5m2x2& b) { + // Process element-wise using single-element operations + f8_e5m2x2 result; + result.data[0] = mscclpp::min(a.data[0], b.data[0]); + result.data[1] = mscclpp::min(a.data[1], b.data[1]); + return result; +} + +MSCCLPP_DEVICE_INLINE f8_e5m2x4 min(const f8_e5m2x4& a, const f8_e5m2x4& b) { + // Process as two f8_e5m2x2 using min for 2 elements + const f8_e5m2x2* a_ptr = reinterpret_cast(&a); + const f8_e5m2x2* b_ptr = reinterpret_cast(&b); + + f8_e5m2x4 result; + f8_e5m2x2* result_ptr = reinterpret_cast(&result); + + result_ptr[0] = mscclpp::min(a_ptr[0], b_ptr[0]); + result_ptr[1] = mscclpp::min(a_ptr[1], b_ptr[1]); + + return result; +} + +// --- f8_e4m3 -> f32 specializations --- + +/// f8_e4m3x2 -> f32x2. +/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float. +/// HIP gfx942: fp8 -> float (via __builtin_amdgcn_cvt_pk_f32_fp8). +template <> +MSCCLPP_DEVICE_INLINE f32x2 to(const f8_e4m3x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0); + f32x2 result; + result.data[0] = f[0]; + result.data[1] = f[1]; + return result; +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3); + f32x2 result; + result.data[0] = __half2float(bit_cast<__half>(h2.x)); + result.data[1] = __half2float(bit_cast<__half>(h2.y)); + return result; +#else + f32x2 result; + result.data[0] = float(v.data[0]); + result.data[1] = float(v.data[1]); + return result; +#endif +} + +/// f8_e4m3x4 -> f32x4. +template <> +MSCCLPP_DEVICE_INLINE f32x4 to(const f8_e4m3x4& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto lo = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, false); + auto hi = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, true); + f32x4 result; + result.data[0] = lo[0]; + result.data[1] = lo[1]; + result.data[2] = hi[0]; + result.data[3] = hi[1]; + return result; +#else + const f8_e4m3x2* pair = reinterpret_cast(&v); + f32x2 lo = to(pair[0]); + f32x2 hi = to(pair[1]); + f32x4 result; + result.data[0] = lo.data[0]; + result.data[1] = lo.data[1]; + result.data[2] = hi.data[0]; + result.data[3] = hi.data[1]; + return result; +#endif +} + +// --- f8_e5m2 -> f32 specializations --- + +/// f8_e5m2x2 -> f32x2. +/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float. +/// HIP gfx942: bf8 -> float (via __builtin_amdgcn_cvt_pk_f32_bf8). 
+template <> +MSCCLPP_DEVICE_INLINE f32x2 to(const f8_e5m2x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto f = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, 0); + f32x2 result; + result.data[0] = f[0]; + result.data[1] = f[1]; + return result; +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E5M2); + f32x2 result; + result.data[0] = __half2float(bit_cast<__half>(h2.x)); + result.data[1] = __half2float(bit_cast<__half>(h2.y)); + return result; +#else + f32x2 result; + result.data[0] = float(v.data[0]); + result.data[1] = float(v.data[1]); + return result; +#endif +} + +/// f8_e5m2x4 -> f32x4. +template <> +MSCCLPP_DEVICE_INLINE f32x4 to(const f8_e5m2x4& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto lo = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, false); + auto hi = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, true); + f32x4 result; + result.data[0] = lo[0]; + result.data[1] = lo[1]; + result.data[2] = hi[0]; + result.data[3] = hi[1]; + return result; +#else + const f8_e5m2x2* pair = reinterpret_cast(&v); + f32x2 lo = to(pair[0]); + f32x2 hi = to(pair[1]); + f32x4 result; + result.data[0] = lo.data[0]; + result.data[1] = lo.data[1]; + result.data[2] = hi.data[0]; + result.data[3] = hi.data[1]; + return result; +#endif +} + +// --- f32 -> f8_e4m3 specializations (downcast) --- + +/// f32x2 -> f8_e4m3x2. +/// HIP gfx942: float -> fp8 (via __builtin_amdgcn_cvt_pk_fp8_f32). +/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2). +/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise). +template <> +MSCCLPP_DEVICE_INLINE f8_e4m3x2 to(const f32x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false); + return bit_cast(static_cast<__hip_fp8x2_storage_t>(packed)); +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2; + h2.x = bit_cast(__float2half_rn(v.data[0])); + h2.y = bit_cast(__float2half_rn(v.data[1])); + __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3); + return bit_cast(fp8x2); +#elif defined(MSCCLPP_DEVICE_CUDA) + __half_raw h0, h1; + h0.x = bit_cast(__float2half_rn(v.data[0])); + h1.x = bit_cast(__float2half_rn(v.data[1])); + f8_e4m3x2 result; + result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3)); + result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3)); + return result; +#else + f8_e4m3x2 result; + result.data[0] = static_cast<__fp8_e4m3>(v.data[0]); + result.data[1] = static_cast<__fp8_e4m3>(v.data[1]); + return result; +#endif +} + +/// f32x4 -> f8_e4m3x4. 
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3x4 to<f8_e4m3x4>(const f32x4& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false);
+  packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[2], v.data[3], packed, true);
+  return bit_cast<f8_e4m3x4>(packed);
+#else
+  f32x2 lo, hi;
+  lo.data[0] = v.data[0];
+  lo.data[1] = v.data[1];
+  hi.data[0] = v.data[2];
+  hi.data[1] = v.data[3];
+  f8_e4m3x2 lo_fp8 = to<f8_e4m3x2>(lo);
+  f8_e4m3x2 hi_fp8 = to<f8_e4m3x2>(hi);
+  f8_e4m3x4 result;
+  result.data[0] = lo_fp8.data[0];
+  result.data[1] = lo_fp8.data[1];
+  result.data[2] = hi_fp8.data[0];
+  result.data[3] = hi_fp8.data[1];
+  return result;
+#endif
+}
+
+// --- f32 -> f8_e5m2 specializations (downcast) ---
+
+/// f32x2 -> f8_e5m2x2.
+/// HIP gfx942: float -> bf8 (via __builtin_amdgcn_cvt_pk_bf8_f32).
+/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2 with __NV_E5M2).
+/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise).
+template <>
+MSCCLPP_DEVICE_INLINE f8_e5m2x2 to<f8_e5m2x2>(const f32x2& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false);
+  return bit_cast<f8_e5m2x2>(static_cast<__hip_fp8x2_storage_t>(packed));
+#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
+  __half2_raw h2;
+  h2.x = bit_cast<uint16_t>(__float2half_rn(v.data[0]));
+  h2.y = bit_cast<uint16_t>(__float2half_rn(v.data[1]));
+  __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E5M2);
+  return bit_cast<f8_e5m2x2>(fp8x2);
+#elif defined(MSCCLPP_DEVICE_CUDA)
+  __half_raw h0, h1;
+  h0.x = bit_cast<uint16_t>(__float2half_rn(v.data[0]));
+  h1.x = bit_cast<uint16_t>(__float2half_rn(v.data[1]));
+  f8_e5m2x2 result;
+  result.data[0] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E5M2));
+  result.data[1] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E5M2));
+  return result;
+#else
+  f8_e5m2x2 result;
+  result.data[0] = static_cast<__fp8_e5m2>(v.data[0]);
+  result.data[1] = static_cast<__fp8_e5m2>(v.data[1]);
+  return result;
+#endif
+}
+
+/// f32x4 -> f8_e5m2x4.
+template <>
+MSCCLPP_DEVICE_INLINE f8_e5m2x4 to<f8_e5m2x4>(const f32x4& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false);
+  packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[2], v.data[3], packed, true);
+  return bit_cast<f8_e5m2x4>(packed);
+#else
+  f32x2 lo, hi;
+  lo.data[0] = v.data[0];
+  lo.data[1] = v.data[1];
+  hi.data[0] = v.data[2];
+  hi.data[1] = v.data[3];
+  f8_e5m2x2 lo_fp8 = to<f8_e5m2x2>(lo);
+  f8_e5m2x2 hi_fp8 = to<f8_e5m2x2>(hi);
+  f8_e5m2x4 result;
+  result.data[0] = lo_fp8.data[0];
+  result.data[1] = lo_fp8.data[1];
+  result.data[2] = hi_fp8.data[0];
+  result.data[3] = hi_fp8.data[1];
+  return result;
+#endif
+}
+
+// --- f8_e4m3 <-> f16 conversion specializations ---
+
+/// f8_e4m3x2 -> f16x2.
+/// NVIDIA SM90+: packed intrinsic (1 instruction).
+/// HIP gfx942: fp8 -> float -> half (via AMD builtin).
+/// Pre-SM90 / fallback: element-wise scalar conversion.
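+/// (e4m3 bit layout: 1 sign, 4 exponent (bias 7), 3 mantissa bits; e.g. 0x38 = 0b0'0111'000
+/// encodes 2^(7-7) * 1.0 = 1.0.)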
+template <>
+MSCCLPP_DEVICE_INLINE f16x2 to<f16x2>(const f8_e4m3x2& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0);
+  f16x2 result;
+  result.data[0] = __float2half(f[0]);
+  result.data[1] = __float2half(f[1]);
+  return result;
+#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
+  __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3);
+  return bit_cast<f16x2>(h2);
+#else
+  f16x2 result;
+  result.data[0] = static_cast<__half>(v.data[0]);
+  result.data[1] = static_cast<__half>(v.data[1]);
+  return result;
+#endif
+}
+
+/// f16x2 -> f8_e4m3x2.
+/// NVIDIA SM90+: packed intrinsic (1 instruction).
+/// HIP gfx942: half -> float -> fp8 (via AMD builtin).
+/// Pre-SM90: element-wise scalar conversion.
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3x2 to<f8_e4m3x2>(const f16x2& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  float f0 = __half2float(v.data[0]);
+  float f1 = __half2float(v.data[1]);
+  uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(f0, f1, 0, false);
+  return bit_cast<f8_e4m3x2>(static_cast<__hip_fp8x2_storage_t>(packed));
+#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
+  __half2_raw h2 = bit_cast<__half2_raw>(v);
+  __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3);
+  return bit_cast<f8_e4m3x2>(fp8x2);
+#elif defined(MSCCLPP_DEVICE_CUDA)
+  __half_raw h0, h1;
+  h0.x = bit_cast<uint16_t>(v.data[0]);
+  h1.x = bit_cast<uint16_t>(v.data[1]);
+  f8_e4m3x2 result;
+  result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3));
+  result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3));
+  return result;
+#else
+  f8_e4m3x2 result;
+  result.data[0] = static_cast<__fp8_e4m3>(v.data[0]);
+  result.data[1] = static_cast<__fp8_e4m3>(v.data[1]);
+  return result;
+#endif
+}
+
+#endif  // defined(__FP8_TYPES_EXIST__)
+
+// --- fp8_e4m3b15 <-> fp16 direct conversion specializations ---
+// These are the PRIMARY conversions: fp8_b15 <-> fp16 is just a 1-bit exponent shift
+// (E4 bias=15 <-> E5 bias=15), no precision loss since fp16 has 10 mantissa bits
+// vs fp8's 3. fp32 conversions are derived by routing through fp16.
+
+/// f8_e4m3b15x2 -> f16x2.
+/// Direct fp8 -> fp16 via branch-free bit manipulation.
+template <>
+MSCCLPP_DEVICE_INLINE f16x2 to<f16x2>(const f8_e4m3b15x2& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  uint16_t in = v.storage.__x;
+  // Spread 2 fp8 bytes into packed fp16 pair, adjust exponent E4->E5.
+  uint32_t a0 = ((uint32_t)(in & 0xFFu) << 8) | ((uint32_t)(in >> 8) << 24);
+  uint32_t b0 = (a0 & 0x7f007f00u) >> 1;
+  uint32_t out0 = b0 | (a0 & 0x80008000u);
+  __half2 h;
+  asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h)) : "r"(out0));
+  return h;
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  // gfx942: same bit manipulation as CUDA, store packed fp16 bits via words[].
+  uint16_t in = v.storage.__x;
+  uint32_t a0 = ((uint32_t)(in & 0xFFu) << 8) | ((uint32_t)(in >> 8) << 24);
+  uint32_t b0 = (a0 & 0x7f007f00u) >> 1;
+  uint32_t out0 = b0 | (a0 & 0x80008000u);
+  f16x2 result;
+  result.words[0] = out0;
+  return result;
+#else
+  f16x2 result;
+  result.data[0] = __float2half(float(v.data[0]));
+  result.data[1] = __float2half(float(v.data[1]));
+  return result;
+#endif
+}
+
+/// f8_e4m3b15x4 -> f16x4.
+/// Uses __byte_perm + lop3 for branch-free vectorized conversion.
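+/// Worked example of the shift: b15 byte 0x78 (sign 0, exp 1111 = 15, mantissa 000, i.e.
+/// 1.0) sits in an fp16 high byte as 0x7800; shifting the magnitude right by one gives
+/// 0x3C00, which is exactly 1.0 in fp16.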
+template <>
+MSCCLPP_DEVICE_INLINE f16x4 to<f16x4>(const f8_e4m3b15x4& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  uint32_t in = v.storage.__x;
+  uint32_t a0 = __byte_perm(0u, in, 0x5746u);
+  uint32_t a0_shr = a0 >> 1;
+  uint32_t a0_sign = a0 & 0x80008000u;
+  uint32_t out0;
+  asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out0) : "r"(a0_shr), "r"(0x3f803f80u), "r"(a0_sign));
+  uint32_t a1 = __byte_perm(a0, 0u, 0x2301u);
+  uint32_t a1_shr = a1 >> 1;
+  uint32_t a1_sign = a1 & 0x80008000u;
+  uint32_t out1;
+  asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out1) : "r"(a1_shr), "r"(0x3f803f80u), "r"(a1_sign));
+  f16x4 result;
+  asm("mov.b32 %0, %1;" : "=r"(result.words[0]) : "r"(out0));
+  asm("mov.b32 %0, %1;" : "=r"(result.words[1]) : "r"(out1));
+  return result;
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  // gfx942: __byte_perm + bitwise E4→E5 shift (no lop3), store via words[].
+  uint32_t in = v.storage.__x;
+  uint32_t a0 = __byte_perm(0u, in, 0x5746u);
+  uint32_t out0 = ((a0 >> 1) & 0x3f803f80u) | (a0 & 0x80008000u);
+  uint32_t a1 = __byte_perm(a0, 0u, 0x2301u);
+  uint32_t out1 = ((a1 >> 1) & 0x3f803f80u) | (a1 & 0x80008000u);
+  f16x4 result;
+  result.words[0] = out0;
+  result.words[1] = out1;
+  return result;
+#else
+  f16x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = __float2half(float(v.data[i]));
+  }
+  return result;
+#endif
+}
+
+/// f16x2 -> f8_e4m3b15x2.
+/// Direct fp16 -> fp8 via clamp + exponent shift E5->E4 + pack.
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2>(const f16x2& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  uint32_t in0;
+  asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(*reinterpret_cast<const uint32_t*>(&v)));
+  // Clamp abs to max finite e4m3b15 (0x3B80 = 0.9375 in fp16).
+  uint32_t lo = in0 & 0xFFFFu, hi = in0 >> 16;
+  uint32_t alo = lo & 0x7FFFu, ahi = hi & 0x7FFFu;
+  alo = alo < 0x3B80u ? alo : 0x3B80u;
+  ahi = ahi < 0x3B80u ? ahi : 0x3B80u;
+  uint32_t a0 = alo | (ahi << 16);
+  a0 = a0 * 2u + 0x00800080u;
+  uint32_t b0 = a0 | (in0 & 0x80008000u);
+  uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u));
+  return bit_cast<f8_e4m3b15x2>(packed);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  // gfx942: read packed fp16 bits, clamp via v_pk_min_u16, shift E5→E4, pack.
+  uint32_t in0 = v.words[0];
+  uint32_t abs0 = in0 & 0x7fff7fffu;
+  uint32_t a0;
+  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
+  a0 = a0 * 2u + 0x00800080u;
+  uint32_t b0 = a0 | (in0 & 0x80008000u);
+  uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u));
+  return bit_cast<f8_e4m3b15x2>(packed);
+#else
+  f8_e4m3b15x2 result;
+  result.data[0] = __fp8_e4m3b15(__half2float(v.data[0]));
+  result.data[1] = __fp8_e4m3b15(__half2float(v.data[1]));
+  return result;
+#endif
+}
+
+/// f16x4 -> f8_e4m3b15x4.
+/// Uses __vminu2 + lop3 + __byte_perm for branch-free vectorized conversion.
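+/// After the clamp, "a * 2 + 0x0080" (per 16-bit lane) shifts the magnitude from E5 to E4
+/// position and adds half of the soon-discarded low byte, so keeping only the high byte
+/// rounds half-up to the nearest representable fp8 value.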
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4>(const f16x4& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  uint32_t in0, in1;
+  asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(v.words[0]));
+  asm("mov.b32 %0, %1;" : "=r"(in1) : "r"(v.words[1]));
+  uint32_t abs0 = in0 & 0x7fff7fffu;
+  uint32_t abs1 = in1 & 0x7fff7fffu;
+  uint32_t a0 = __vminu2(abs0, 0x3B803B80u);
+  uint32_t a1 = __vminu2(abs1, 0x3B803B80u);
+  a0 = a0 * 2u + 0x00800080u;
+  a1 = a1 * 2u + 0x00800080u;
+  uint32_t b0, b1;
+  asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b0) : "r"(a0), "r"(in0), "r"(0x80008000u));
+  asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b1) : "r"(a1), "r"(in1), "r"(0x80008000u));
+  uint32_t packed = __byte_perm(b0, b1, 0x7531u);
+  return bit_cast<f8_e4m3b15x4>(packed);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  // gfx942: read packed fp16 bits, clamp via v_pk_min_u16, shift E5→E4, __byte_perm pack.
+  uint32_t in0 = v.words[0], in1 = v.words[1];
+  uint32_t abs0 = in0 & 0x7fff7fffu, abs1 = in1 & 0x7fff7fffu;
+  uint32_t a0, a1;
+  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
+  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a1) : "v"(abs1), "v"(0x3B803B80u));
+  a0 = a0 * 2u + 0x00800080u;
+  a1 = a1 * 2u + 0x00800080u;
+  uint32_t b0 = a0 | (in0 & 0x80008000u);
+  uint32_t b1 = a1 | (in1 & 0x80008000u);
+  uint32_t packed = __byte_perm(b0, b1, 0x7531u);
+  return bit_cast<f8_e4m3b15x4>(packed);
+#else
+  f8_e4m3b15x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = __fp8_e4m3b15(__half2float(v.data[i]));
+  }
+  return result;
+#endif
+}
+
+// --- fp8_e4m3b15 <-> f32 conversion specializations (software, always available) ---
+
+/// f8_e4m3b15x2 -> f32x2.
+/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32.
+template <>
+MSCCLPP_DEVICE_INLINE f32x2 to<f32x2>(const f8_e4m3b15x2& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  f16x2 h = to<f16x2>(v);
+  float2 f2 = __half22float2(h);
+  return bit_cast<f32x2>(f2);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  f16x2 h = to<f16x2>(v);
+  f32x2 result;
+  result.data[0] = __half2float(h.data[0]);
+  result.data[1] = __half2float(h.data[1]);
+  return result;
+#else
+  f32x2 result;
+  result.data[0] = float(v.data[0]);
+  result.data[1] = float(v.data[1]);
+  return result;
+#endif
+}
+
+/// f8_e4m3b15x4 -> f32x4.
+/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32.
+template <>
+MSCCLPP_DEVICE_INLINE f32x4 to<f32x4>(const f8_e4m3b15x4& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  f16x4 h = to<f16x4>(v);
+  __half2 h0, h1;
+  asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h0)) : "r"(h.words[0]));
+  asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h1)) : "r"(h.words[1]));
+  float2 f0 = __half22float2(h0);
+  float2 f1 = __half22float2(h1);
+  f32x4 result;
+  result.data[0] = f0.x;
+  result.data[1] = f0.y;
+  result.data[2] = f1.x;
+  result.data[3] = f1.y;
+  return result;
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  f16x4 h = to<f16x4>(v);
+  f32x4 result;
+  result.data[0] = __half2float(h.data[0]);
+  result.data[1] = __half2float(h.data[1]);
+  result.data[2] = __half2float(h.data[2]);
+  result.data[3] = __half2float(h.data[3]);
+  return result;
+#else
+  f32x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = float(v.data[i]);
+  }
+  return result;
+#endif
+}
+
+/// f32x2 -> f8_e4m3b15x2.
+/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack).
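+/// The two-step rounding is benign: fp16 carries 11 significand bits and e4m3b15 carries 4,
+/// and 11 >= 2*4 + 2, so round-to-nearest f32 -> fp16 -> fp8 matches a direct f32 -> fp8
+/// round (the standard double-rounding bound).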
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2>(const f32x2& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  float2 f2 = {v.data[0], v.data[1]};
+  __half2 h = __float22half2_rn(f2);
+  return to<f8_e4m3b15x2>(h);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  f16x2 h;
+  h.data[0] = __float2half_rn(v.data[0]);
+  h.data[1] = __float2half_rn(v.data[1]);
+  return to<f8_e4m3b15x2>(h);
+#else
+  f8_e4m3b15x2 result;
+  result.data[0] = __fp8_e4m3b15(v.data[0]);
+  result.data[1] = __fp8_e4m3b15(v.data[1]);
+  return result;
+#endif
+}
+
+/// f32x4 -> f8_e4m3b15x4.
+/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack).
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4>(const f32x4& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  float2 f01 = {v.data[0], v.data[1]};
+  float2 f23 = {v.data[2], v.data[3]};
+  __half2 h01 = __float22half2_rn(f01);
+  __half2 h23 = __float22half2_rn(f23);
+  f16x4 h;
+  asm("mov.b32 %0, %1;" : "=r"(h.words[0]) : "r"(*reinterpret_cast<uint32_t*>(&h01)));
+  asm("mov.b32 %0, %1;" : "=r"(h.words[1]) : "r"(*reinterpret_cast<uint32_t*>(&h23)));
+  return to<f8_e4m3b15x4>(h);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  f16x4 h;
+  h.words[0] = __builtin_bit_cast(uint32_t, __builtin_amdgcn_cvt_pkrtz(v.data[0], v.data[1]));
+  h.words[1] = __builtin_bit_cast(uint32_t, __builtin_amdgcn_cvt_pkrtz(v.data[2], v.data[3]));
+  return to<f8_e4m3b15x4>(h);
+#else
+  f8_e4m3b15x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = __fp8_e4m3b15(v.data[i]);
+  }
+  return result;
+#endif
+}
+
+// --- fp8_e4m3b15 arithmetic (software, always available) ---
+
+template
+MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 operator+(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) {
+  return __fp8_e4m3b15(float(a) + float(b));
+}
+
+template
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 operator+(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) {
+  f8_e4m3b15x2 result;
+  result.data[0] = __fp8_e4m3b15(float(a.data[0]) + float(b.data[0]));
+  result.data[1] = __fp8_e4m3b15(float(a.data[1]) + float(b.data[1]));
+  return result;
+}
+
+template
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 operator+(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) {
+  f8_e4m3b15x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = __fp8_e4m3b15(float(a.data[i]) + float(b.data[i]));
+  }
+  return result;
+}
+
+// --- fp8_e4m3b15 min (software) ---
+
+template <>
+MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 min(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) {
+  return __fp8_e4m3b15(fminf(float(a), float(b)));
+}
+
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 min(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) {
+  f8_e4m3b15x2 result;
+  result.data[0] = mscclpp::min(a.data[0], b.data[0]);
+  result.data[1] = mscclpp::min(a.data[1], b.data[1]);
+  return result;
+}
+
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 min(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) {
+  f8_e4m3b15x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = mscclpp::min(a.data[i], b.data[i]);
+  }
+  return result;
+}
+
+#endif  // MSCCLPP_DEVICE_COMPILE
 
 }  // namespace mscclpp
 
 #endif  // MSCCLPP_GPU_DATA_TYPES_HPP_
diff --git a/include/mscclpp/proxy.hpp b/include/mscclpp/proxy.hpp
index 36a56a90..990deabb 100644
--- a/include/mscclpp/proxy.hpp
+++ b/include/mscclpp/proxy.hpp
@@ -29,7 +29,9 @@ class Proxy {
  public:
   /// Constructor.
   /// @param handler Handler for each FIFO trigger.
-  /// @param threadInit Optional function run in proxy thread before FIFO consumption.
+  /// @param threadInit Optional function run once in the proxy thread before FIFO consumption.
+  /// Use it to initialize per-thread runtime state before the proxy makes any CUDA API call
+  /// (for example, set the CUDA device and optionally bind NUMA affinity).
   /// @param fifoSize FIFO size (default: DEFAULT_FIFO_SIZE).
   Proxy(ProxyHandler handler, std::function<void()> threadInit, int fifoSize = DEFAULT_FIFO_SIZE);
diff --git a/include/mscclpp/semaphore.hpp b/include/mscclpp/semaphore.hpp
index 27f9aefa..4d1f2e32 100644
--- a/include/mscclpp/semaphore.hpp
+++ b/include/mscclpp/semaphore.hpp
@@ -16,6 +16,7 @@ namespace mscclpp {
 class Host2DeviceSemaphore {
  private:
   Semaphore semaphore_;
+  std::shared_ptr<uint64_t> inboundToken_;
   detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
   std::unique_ptr<uint64_t> outboundToken_;
@@ -29,6 +30,15 @@ class Host2DeviceSemaphore {
   /// @param connection The connection associated with this semaphore.
   Host2DeviceSemaphore(Communicator& communicator, const Connection& connection);
 
+  /// Destructor.
+  ~Host2DeviceSemaphore();
+
+  /// Move constructor.
+  Host2DeviceSemaphore(Host2DeviceSemaphore&&) noexcept = default;
+
+  /// Move assignment operator.
+  Host2DeviceSemaphore& operator=(Host2DeviceSemaphore&&) noexcept = default;
+
   /// Returns the connection.
   /// @return The connection associated with this semaphore.
   Connection& connection();
@@ -82,7 +92,6 @@ class MemoryDevice2DeviceSemaphore {
  private:
   Semaphore semaphore_;
   detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
-  detail::UniqueGpuPtr<uint64_t> outboundToken_;
 
  public:
   /// Constructor.
diff --git a/include/mscclpp/semaphore_device.hpp b/include/mscclpp/semaphore_device.hpp
index f1b01e89..a790a6e1 100644
--- a/include/mscclpp/semaphore_device.hpp
+++ b/include/mscclpp/semaphore_device.hpp
@@ -82,19 +82,20 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
 
   /// Signal remote device, ensures prior memory ops complete.
   MSCCLPP_DEVICE_INLINE void signal() {
-    auto outbound = incOutbound();
-#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ == 800)
-    // Using memoryOrderSeqCst is faster for A100.
-    atomicStore(remoteInboundToken, outbound, memoryOrderSeqCst);
-#else
-    atomicStore(remoteInboundToken, outbound, memoryOrderRelease);
+#if defined(MSCCLPP_DEVICE_CUDA)
+    asm volatile("red.release.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory");
+#elif defined(MSCCLPP_DEVICE_HIP)
+    (void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelease);
 #endif
   }
 
   /// Relaxed signal; no memory completion guarantee. Use it only for synchronizing execution, not data.
   MSCCLPP_DEVICE_INLINE void relaxedSignal() {
-    auto outbound = incOutbound();
-    atomicStore(remoteInboundToken, outbound, memoryOrderRelaxed);
+#if defined(MSCCLPP_DEVICE_CUDA)
+    asm volatile("red.relaxed.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory");
+#elif defined(MSCCLPP_DEVICE_HIP)
+    (void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelaxed);
+#endif
   }
 
   /// Thread-safe read of expected inbound value.
@@ -121,27 +122,12 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
     return atomicLoad(inboundToken, memoryOrderRelaxed);
   }
 
-  /// Thread-safe read of outbound value.
-  /// @return The outbound value.
-  MSCCLPP_DEVICE_INLINE uint64_t loadOutbound() {
-    return atomicLoad(outboundToken, memoryOrderRelaxed);
-  }
-
-  /// Thread-safe increment of outbound value.
-  /// @return The incremented outbound value.
- MSCCLPP_DEVICE_INLINE uint64_t incOutbound() { - return atomicFetchAdd(outboundToken, 1, memoryOrderRelaxed) + 1; - } #endif // defined(MSCCLPP_DEVICE_COMPILE) /// A local memory space where the remote device will write its semaphore value and the local device will read it. uint64_t* inboundToken; - /// A local memory space where the local device stores the semaphore value to be written to the remote device. - uint64_t* outboundToken; - - /// A remote memory space where the local device writes its outboundToken on. This is inboundToken of the - /// remote device. + /// A remote memory space where the local device atomically increments. This is inboundToken of the remote device. uint64_t* remoteInboundToken; /// A local memory space where the local device stores the expected value of the inboundToken to wait for. diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp index 5f8a1608..b52b6572 100644 --- a/include/mscclpp/switch_channel_device.hpp +++ b/include/mscclpp/switch_channel_device.hpp @@ -80,26 +80,26 @@ struct SwitchChannelDeviceHandle { : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];" : "=r"(val.words[0]), "=r"(val.words[1]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];" : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];" : "=r"(val.words[0]), "=r"(val.words[1]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];" : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : "l"(ptr) @@ -148,23 +148,23 @@ struct SwitchChannelDeviceHandle { asm volatile("multimem.st.relaxed.sys.global.v4.bf16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.e4m3x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v2.e4m3x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v4.e4m3x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm 
volatile("multimem.st.relaxed.sys.global.e5m2x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v2.e5m2x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v4.e5m2x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b84cea3a..5e784e92 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -4,6 +4,10 @@ add_subdirectory(csrc) add_subdirectory(test) +target_compile_definitions(mscclpp_py PRIVATE + $<$:MSCCLPP_DISABLE_NB_LEAK_WARNINGS> +) + add_custom_target(pytest_lib_copy ALL COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_mscclpp.*.so @@ -12,4 +16,4 @@ add_custom_target(pytest_lib_copy ALL ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_ext.*.so ${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp DEPENDS mscclpp_py mscclpp_py_test -) +) \ No newline at end of file diff --git a/python/csrc/CMakeLists.txt b/python/csrc/CMakeLists.txt index 8759201f..44fb150f 100644 --- a/python/csrc/CMakeLists.txt +++ b/python/csrc/CMakeLists.txt @@ -24,4 +24,7 @@ set_target_properties(mscclpp_py PROPERTIES OUTPUT_NAME _mscclpp) set_target_properties(mscclpp_py PROPERTIES INSTALL_RPATH "\$ORIGIN/lib") target_link_libraries(mscclpp_py PRIVATE dlpack mscclpp mscclpp_collectives ${GPU_LIBRARIES}) target_include_directories(mscclpp_py SYSTEM PRIVATE ${GPU_INCLUDE_DIRS}) +if(MSCCLPP_USE_ROCM) + target_compile_definitions(mscclpp_py PRIVATE MSCCLPP_USE_ROCM) +endif() install(TARGETS mscclpp_py LIBRARY DESTINATION .) 
diff --git a/python/csrc/algorithm.cpp b/python/csrc/algorithm.cpp
index 5a9c4bd6..a9aa2727 100644
--- a/python/csrc/algorithm.cpp
+++ b/python/csrc/algorithm.cpp
@@ -16,14 +16,16 @@ namespace nb = nanobind;
 using namespace mscclpp;
 
 void register_algorithm(nb::module_& m) {
-  nb::enum_<CollectiveBufferMode>(m, "CollectiveBufferMode")
+  nb::enum_<CollectiveBufferMode>(m, "CppCollectiveBufferMode")
       .value("ANY", CollectiveBufferMode::Any)
       .value("IN_PLACE", CollectiveBufferMode::InPlace)
       .value("OUT_OF_PLACE", CollectiveBufferMode::OutOfPlace);
 
-  nb::enum_<AlgorithmType>(m, "AlgorithmType").value("NATIVE", AlgorithmType::Native).value("DSL", AlgorithmType::DSL);
+  nb::enum_<AlgorithmType>(m, "CppAlgorithmType")
+      .value("NATIVE", AlgorithmType::Native)
+      .value("DSL", AlgorithmType::DSL);
 
-  nb::enum_<CommResult>(m, "CommResult")
+  nb::enum_<CommResult>(m, "CppCommResult")
       .value("COMM_SUCCESS", CommResult::CommSuccess)
       .value("COMM_UNHANDLED_CUDA_ERROR", CommResult::CommUnhandledCudaError)
       .value("COMM_SYSTEM_ERROR", CommResult::CommSystemError)
@@ -34,13 +36,13 @@ void register_algorithm(nb::module_& m) {
       .value("COMM_IN_PROGRESS", CommResult::CommInProgress)
       .value("COMM_NUM_RESULTS", CommResult::CommNumResults);
 
-  nb::enum_<ReduceOp>(m, "ReduceOp")
+  nb::enum_<ReduceOp>(m, "CppReduceOp")
      .value("SUM", ReduceOp::SUM)
      .value("MIN", ReduceOp::MIN)
      .value("NOP", ReduceOp::NOP);
 
   auto algorithmClass =
-      nb::class_<Algorithm>(m, "Algorithm")
+      nb::class_<Algorithm>(m, "CppAlgorithm")
          .def_static(
              "from_native_capsule",
              [](nb::capsule cap) {
@@ -58,6 +60,12 @@ void register_algorithm(nb::module_& m) {
          .def_prop_ro("name", &Algorithm::name)
          .def_prop_ro("collective", &Algorithm::collective)
          .def_prop_ro("message_range", &Algorithm::messageRange)
+         .def(
+             "set_message_size_range",
+             [](Algorithm& self, size_t minMessageSize, size_t maxMessageSize) {
+               self.setMessageSizeRange(minMessageSize, maxMessageSize);
+             },
+             nb::arg("min_message_size"), nb::arg("max_message_size"))
          .def_prop_ro("tags", &Algorithm::tags)
          .def_prop_ro("buffer_mode", &Algorithm::bufferMode)
          .def_prop_ro("constraint", &Algorithm::constraint)
@@ -67,16 +75,19 @@ void register_algorithm(nb::module_& m) {
              "execute",
              [](Algorithm& self, std::shared_ptr<Communicator> comm, uintptr_t input, uintptr_t output,
                 size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream,
-                std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock,
-                std::unordered_map<std::string, uintptr_t> extras) {
+                std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock, bool symmetricMemory,
+                std::unordered_map<std::string, uintptr_t> extras, int32_t accumDtype) {
                return self.execute(comm, reinterpret_cast<void*>(input), reinterpret_cast<void*>(output), inputSize,
                                    outputSize, dtype, op, reinterpret_cast<cudaStream_t>(stream), executor,
-                                   nBlocks, nThreadsPerBlock, extras);
+                                   nBlocks, nThreadsPerBlock, symmetricMemory, extras,
+                                   static_cast<DataType>(accumDtype));
              },
              nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"),
              nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr,
-             nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0,
-             nb::arg("extras") = std::unordered_map<std::string, uintptr_t>());
+             nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, nb::arg("symmetric_memory") = false,
+             nb::arg("extras") = std::unordered_map<std::string, uintptr_t>(),
+             nb::arg("accum_dtype") = static_cast<int32_t>(DataType::AUTO))
+         .def("reset", &Algorithm::reset);
 
   nb::class_<Algorithm::Constraint>(algorithmClass, "Constraint")
      .def(nb::init<>())
@@ -84,21 +95,21 @@ void register_algorithm(nb::module_& m) {
      .def_rw("world_size", &Algorithm::Constraint::worldSize)
      .def_rw("n_ranks_per_node", &Algorithm::Constraint::nRanksPerNode);
 
-  nb::class_<AlgorithmBuilder>(m, "AlgorithmBuilder").def("build", &AlgorithmBuilder::build);
"AlgorithmBuilder").def("build", &AlgorithmBuilder::build); + nb::class_(m, "CppAlgorithmBuilder").def("build", &AlgorithmBuilder::build); - nb::class_(m, "DslAlgorithm") + nb::class_(m, "CppDslAlgorithm") .def(nb::init, Algorithm::Constraint>(), nb::arg("id"), nb::arg("plan"), nb::arg("tags") = std::unordered_map(), nb::arg("constraint") = Algorithm::Constraint()) .def("build", &DslAlgorithm::build); - nb::class_(m, "AlgorithmCollection") + nb::class_(m, "CppAlgorithmCollection") .def("register_algorithm", &AlgorithmCollection::registerAlgorithm, nb::arg("collective"), nb::arg("algo_name"), nb::arg("algorithm")) .def("get_algorithms_by_collective", &AlgorithmCollection::getAlgorithmsByCollective, nb::arg("collective")) .def("to_list", &AlgorithmCollection::getAllAlgorithms); - nb::class_(m, "CollectiveRequest") + nb::class_(m, "CppCollectiveRequest") .def_ro("world_size", &CollectiveRequest::worldSize) .def_ro("n_ranks_per_node", &CollectiveRequest::nRanksPerNode) .def_ro("rank", &CollectiveRequest::rank) @@ -107,8 +118,22 @@ void register_algorithm(nb::module_& m) { .def_prop_ro("output_buffer", [](const CollectiveRequest& self) { return reinterpret_cast(self.outputBuffer); }) .def_ro("message_size", &CollectiveRequest::messageSize) + .def_prop_ro("stream", [](const CollectiveRequest& self) { return reinterpret_cast(self.stream); }) .def_prop_ro("collective", [](const CollectiveRequest& self) { return self.collective; }) .def_ro("dtype", &CollectiveRequest::dtype) .def_prop_ro("hints", [](const CollectiveRequest& self) { return self.hints; }) .def("buffer_mode", &CollectiveRequest::bufferMode); + + m.def( + "cpp_get_flag_buffer", + []() { + auto [buffer, size] = getFlagBuffer(); + uintptr_t ptr = reinterpret_cast(buffer.get()); + // Transfer shared_ptr ownership into a capsule so Python's GC manages the lifetime. + auto prevent = std::make_unique>(std::move(buffer)); + nb::capsule owner(prevent.get(), [](void* p) noexcept { delete static_cast*>(p); }); + prevent.release(); // capsule now owns the pointer + return nb::make_tuple(ptr, size, owner); + }, + "Get the default flag buffer. 
 }
\ No newline at end of file
diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp
index c1462a11..ec64d744 100644
--- a/python/csrc/core_py.cpp
+++ b/python/csrc/core_py.cpp
@@ -32,21 +32,25 @@ extern void register_algorithm_collection_builder(nb::module_& m);
 
 template <typename T>
 void def_shared_future(nb::handle& m, const std::string& typestr) {
-  std::string pyclass_name = std::string("shared_future_") + typestr;
+  std::string pyclass_name = std::string("CppSharedFuture_") + typestr;
   nb::class_<std::shared_future<T>>(m, pyclass_name.c_str()).def("get", &std::shared_future<T>::get);
 }
 
 void register_core(nb::module_& m) {
   m.def("version", &version);
 
-  nb::enum_<DataType>(m, "DataType")
+  nb::enum_<DataType>(m, "CppDataType")
      .value("int32", DataType::INT32)
      .value("uint32", DataType::UINT32)
      .value("float16", DataType::FLOAT16)
      .value("float32", DataType::FLOAT32)
-     .value("bfloat16", DataType::BFLOAT16);
+     .value("bfloat16", DataType::BFLOAT16)
+     .value("float8_e4m3", DataType::FLOAT8_E4M3)
+     .value("float8_e5m2", DataType::FLOAT8_E5M2)
+     .value("uint8", DataType::UINT8)
+     .value("float8_e4m3b15", DataType::FLOAT8_E4M3B15);
 
-  nb::class_<Bootstrap>(m, "Bootstrap")
+  nb::class_<Bootstrap>(m, "CppBootstrap")
      .def("get_rank", &Bootstrap::getRank)
      .def("get_n_ranks", &Bootstrap::getNranks)
      .def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode)
@@ -71,7 +75,7 @@ void register_core(nb::module_& m) {
      .def("recv", static_cast<void (Bootstrap::*)(std::vector<char>&, int, int)>(&Bootstrap::recv), nb::arg("data"),
           nb::arg("peer"), nb::arg("tag"));
 
-  nb::class_<UniqueId>(m, "UniqueId")
+  nb::class_<UniqueId>(m, "CppUniqueId")
      .def(nb::init<>())
      .def("__setstate__",
           [](UniqueId& self, nb::bytes b) {
@@ -81,7 +85,7 @@ void register_core(nb::module_& m) {
      .def("__getstate__",
           [](const UniqueId& self) { return nb::bytes(reinterpret_cast<const char*>(self.data()), UniqueIdBytes); });
 
-  nb::class_<TcpBootstrap, Bootstrap>(m, "TcpBootstrap")
+  nb::class_<TcpBootstrap, Bootstrap>(m, "CppTcpBootstrap")
      .def(nb::init<int, int>(), "Do not use this constructor. 
Use create instead.") .def_static( "create", [](int rank, int nRanks) { return std::make_shared(rank, nRanks); }, nb::arg("rank"), @@ -93,7 +97,7 @@ void register_core(nb::module_& m) { .def("initialize", static_cast(&TcpBootstrap::initialize), nb::call_guard(), nb::arg("if_ip_port_trio"), nb::arg("timeout_sec") = 30); - nb::enum_(m, "Transport") + nb::enum_(m, "CppTransport") .value("Unknown", Transport::Unknown) .value("CudaIpc", Transport::CudaIpc) .value("IB0", Transport::IB0) @@ -106,7 +110,7 @@ void register_core(nb::module_& m) { .value("IB7", Transport::IB7) .value("NumTransports", Transport::NumTransports); - nb::class_(m, "TransportFlags") + nb::class_(m, "CppTransportFlags") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("transport")) .def("has", &TransportFlags::has, nb::arg("transport")) @@ -130,12 +134,12 @@ void register_core(nb::module_& m) { .def(nb::self == nb::self) .def(nb::self != nb::self); - nb::enum_(m, "DeviceType") + nb::enum_(m, "CppDeviceType") .value("Unknown", DeviceType::Unknown) .value("CPU", DeviceType::CPU) .value("GPU", DeviceType::GPU); - nb::class_(m, "Device") + nb::class_(m, "CppDevice") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("type")) .def(nb::init(), nb::arg("type"), nb::arg("id") = -1) @@ -147,24 +151,33 @@ void register_core(nb::module_& m) { return ss.str(); }); - nb::class_(m, "EndpointConfigIb") + nb::enum_(m, "CppIbMode") + .value("Default", EndpointConfig::Ib::Mode::Default) + .value("Host", EndpointConfig::Ib::Mode::Host) + .value("HostNoAtomic", EndpointConfig::Ib::Mode::HostNoAtomic); + + nb::class_(m, "CppEndpointConfigIb") .def(nb::init<>()) - .def(nb::init(), nb::arg("device_index") = -1, + .def(nb::init(), nb::arg("device_index") = -1, nb::arg("port") = EndpointConfig::Ib::DefaultPort, nb::arg("gid_index") = EndpointConfig::Ib::DefaultGidIndex, nb::arg("max_cq_size") = EndpointConfig::Ib::DefaultMaxCqSize, nb::arg("max_cq_poll_num") = EndpointConfig::Ib::DefaultMaxCqPollNum, nb::arg("max_send_wr") = EndpointConfig::Ib::DefaultMaxSendWr, - nb::arg("max_wr_per_send") = EndpointConfig::Ib::DefaultMaxWrPerSend) + nb::arg("max_recv_wr") = EndpointConfig::Ib::DefaultMaxRecvWr, + nb::arg("max_wr_per_send") = EndpointConfig::Ib::DefaultMaxWrPerSend, + nb::arg("mode") = EndpointConfig::Ib::Mode::Default) .def_rw("device_index", &EndpointConfig::Ib::deviceIndex) .def_rw("port", &EndpointConfig::Ib::port) .def_rw("gid_index", &EndpointConfig::Ib::gidIndex) .def_rw("max_cq_size", &EndpointConfig::Ib::maxCqSize) .def_rw("max_cq_poll_num", &EndpointConfig::Ib::maxCqPollNum) .def_rw("max_send_wr", &EndpointConfig::Ib::maxSendWr) - .def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend); + .def_rw("max_recv_wr", &EndpointConfig::Ib::maxRecvWr) + .def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend) + .def_rw("mode", &EndpointConfig::Ib::mode); - nb::class_(m, "RegisteredMemory") + nb::class_(m, "CppRegisteredMemory") .def(nb::init<>()) .def("data", [](RegisteredMemory& self) { return reinterpret_cast(self.data()); }) .def("size", &RegisteredMemory::size) @@ -172,7 +185,7 @@ void register_core(nb::module_& m) { .def("serialize", &RegisteredMemory::serialize) .def_static("deserialize", &RegisteredMemory::deserialize, nb::arg("data")); - nb::class_(m, "Endpoint") + nb::class_(m, "CppEndpoint") .def("config", &Endpoint::config) .def("transport", &Endpoint::transport) .def("device", &Endpoint::device) @@ -180,7 +193,7 @@ void register_core(nb::module_& m) { .def("serialize", &Endpoint::serialize) .def_static("deserialize", 
&Endpoint::deserialize, nb::arg("data")); - nb::class_(m, "Connection") + nb::class_(m, "CppConnection") .def("write", &Connection::write, nb::arg("dst"), nb::arg("dstOffset"), nb::arg("src"), nb::arg("srcOffset"), nb::arg("size")) .def( @@ -197,7 +210,7 @@ void register_core(nb::module_& m) { .def("local_device", &Connection::localDevice) .def("get_max_write_queue_size", &Connection::getMaxWriteQueueSize); - nb::class_(m, "EndpointConfig") + nb::class_(m, "CppEndpointConfig") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("transport")) .def(nb::init(), nb::arg("transport"), nb::arg("device"), @@ -223,12 +236,18 @@ void register_core(nb::module_& m) { .def_prop_rw( "ib_max_send_wr", [](EndpointConfig& self) { return self.ib.maxSendWr; }, [](EndpointConfig& self, int v) { self.ib.maxSendWr = v; }) + .def_prop_rw( + "ib_max_recv_wr", [](EndpointConfig& self) { return self.ib.maxRecvWr; }, + [](EndpointConfig& self, int v) { self.ib.maxRecvWr = v; }) .def_prop_rw( "ib_max_wr_per_send", [](EndpointConfig& self) { return self.ib.maxWrPerSend; }, [](EndpointConfig& self, int v) { self.ib.maxWrPerSend = v; }) + .def_prop_rw( + "ib_mode", [](EndpointConfig& self) { return self.ib.mode; }, + [](EndpointConfig& self, EndpointConfig::Ib::Mode v) { self.ib.mode = v; }) .def_rw("max_write_queue_size", &EndpointConfig::maxWriteQueueSize); - nb::class_(m, "Context") + nb::class_(m, "CppContext") .def_static("create", &Context::create) .def( "register_memory", @@ -239,13 +258,13 @@ void register_core(nb::module_& m) { .def("create_endpoint", &Context::createEndpoint, nb::arg("config")) .def("connect", &Context::connect, nb::arg("local_endpoint"), nb::arg("remote_endpoint")); - nb::class_(m, "SemaphoreStub") + nb::class_(m, "CppSemaphoreStub") .def(nb::init(), nb::arg("connection")) .def("memory", &SemaphoreStub::memory) .def("serialize", &SemaphoreStub::serialize) .def_static("deserialize", &SemaphoreStub::deserialize, nb::arg("data")); - nb::class_(m, "Semaphore") + nb::class_(m, "CppSemaphore") .def(nb::init<>()) .def(nb::init(), nb::arg("local_stub"), nb::arg("remote_stub")) .def("connection", &Semaphore::connection) @@ -256,7 +275,7 @@ void register_core(nb::module_& m) { def_shared_future(m, "Connection"); def_shared_future(m, "Semaphore"); - nb::class_(m, "Communicator") + nb::class_(m, "CppCommunicator") .def(nb::init, std::shared_ptr>(), nb::arg("bootstrap"), nb::arg("context") = nullptr) .def("bootstrap", &Communicator::bootstrap) @@ -289,6 +308,9 @@ void register_core(nb::module_& m) { } NB_MODULE(_mscclpp, m) { +#ifdef MSCCLPP_DISABLE_NB_LEAK_WARNINGS + nb::set_leak_warnings(false); +#endif register_env(m); register_error(m); register_port_channel(m); @@ -306,4 +328,4 @@ NB_MODULE(_mscclpp, m) { // ext register_algorithm_collection_builder(m); -} +} \ No newline at end of file diff --git a/python/csrc/env_py.cpp b/python/csrc/env_py.cpp index a0ba4a4e..d4b2f5da 100644 --- a/python/csrc/env_py.cpp +++ b/python/csrc/env_py.cpp @@ -11,7 +11,7 @@ namespace nb = nanobind; using namespace mscclpp; void register_env(nb::module_& m) { - nb::class_(m, "Env") + nb::class_(m, "CppEnv") .def_ro("debug", &Env::debug) .def_ro("debug_subsys", &Env::debugSubsys) .def_ro("debug_file", &Env::debugFile) @@ -20,9 +20,11 @@ void register_env(nb::module_& m) { .def_ro("socket_family", &Env::socketFamily) .def_ro("socket_ifname", &Env::socketIfname) .def_ro("comm_id", &Env::commId) - .def_ro("execution_plan_dir", &Env::executionPlanDir) + .def_ro("ibv_mode", &Env::ibvMode) + .def_ro("cache_dir", &Env::cacheDir) 
.def_ro("npkit_dump_dir", &Env::npkitDumpDir) - .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream); + .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream) + .def_ro("ib_gid_index", &Env::ibGidIndex); m.def("env", &env); } diff --git a/python/csrc/error_py.cpp b/python/csrc/error_py.cpp index ff532d10..c19a3b15 100644 --- a/python/csrc/error_py.cpp +++ b/python/csrc/error_py.cpp @@ -11,18 +11,18 @@ using namespace mscclpp; #define REGISTER_EXCEPTION_TRANSLATOR(name_) \ nb::register_exception_translator( \ - [](const std::exception_ptr &p, void *payload) { \ + [](const std::exception_ptr& p, void* payload) { \ try { \ std::rethrow_exception(p); \ - } catch (const name_ &e) { \ - PyErr_SetObject(reinterpret_cast(payload), \ + } catch (const name_& e) { \ + PyErr_SetObject(reinterpret_cast(payload), \ PyTuple_Pack(2, PyLong_FromLong(long(e.getErrorCode())), PyUnicode_FromString(e.what()))); \ } \ }, \ m.attr(#name_).ptr()); -void register_error(nb::module_ &m) { - nb::enum_(m, "ErrorCode") +void register_error(nb::module_& m) { + nb::enum_(m, "CppErrorCode") .value("SystemError", ErrorCode::SystemError) .value("InternalError", ErrorCode::InternalError) .value("RemoteError", ErrorCode::RemoteError) diff --git a/python/csrc/executor_py.cpp b/python/csrc/executor_py.cpp index 0a196f37..350a1e7a 100644 --- a/python/csrc/executor_py.cpp +++ b/python/csrc/executor_py.cpp @@ -15,16 +15,16 @@ namespace nb = nanobind; using namespace mscclpp; void register_executor(nb::module_& m) { - nb::enum_(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16); + nb::enum_(m, "CppPacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16); - nb::class_(m, "ExecutionPlan") + nb::class_(m, "CppExecutionPlan") .def(nb::init(), nb::arg("planPath"), nb::arg("rank")) .def_prop_ro("name", [](const ExecutionPlan& self) -> std::string { return self.name(); }) .def_prop_ro("collective", [](const ExecutionPlan& self) -> std::string { return self.collective(); }) .def_prop_ro("min_message_size", [](const ExecutionPlan& self) -> size_t { return self.minMessageSize(); }) .def_prop_ro("max_message_size", [](const ExecutionPlan& self) -> size_t { return self.maxMessageSize(); }); - nb::class_(m, "Executor") + nb::class_(m, "CppExecutor") .def(nb::init>(), nb::arg("comm")) .def( "execute", diff --git a/python/csrc/ext/algorithm_collection_builder_py.cpp b/python/csrc/ext/algorithm_collection_builder_py.cpp index 2756edb7..4a3563d9 100644 --- a/python/csrc/ext/algorithm_collection_builder_py.cpp +++ b/python/csrc/ext/algorithm_collection_builder_py.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -15,7 +16,7 @@ using namespace mscclpp; using namespace mscclpp::collective; void register_algorithm_collection_builder(nb::module_& m) { - nb::class_(m, "AlgorithmCollectionBuilder") + nb::class_(m, "CppAlgorithmCollectionBuilder") .def_static("get_instance", &AlgorithmCollectionBuilder::getInstance) .def("add_algorithm_builder", &AlgorithmCollectionBuilder::addAlgorithmBuilder, nb::arg("builder")) .def( @@ -29,6 +30,6 @@ void register_algorithm_collection_builder(nb::module_& m) { nb::arg("selector")) .def("build", &AlgorithmCollectionBuilder::build) .def("build_default_algorithms", &AlgorithmCollectionBuilder::buildDefaultAlgorithms, nb::arg("scratch_buffer"), - nb::arg("scratch_buffer_size"), nb::arg("rank")) + nb::arg("scratch_buffer_size"), nb::arg("flag_buffer"), nb::arg("flag_buffer_size"), nb::arg("rank")) .def_static("reset", 
&AlgorithmCollectionBuilder::reset); } \ No newline at end of file diff --git a/python/csrc/fifo_py.cpp b/python/csrc/fifo_py.cpp index 63be4a33..e8b6a3e2 100644 --- a/python/csrc/fifo_py.cpp +++ b/python/csrc/fifo_py.cpp @@ -9,7 +9,7 @@ namespace nb = nanobind; using namespace mscclpp; void register_fifo(nb::module_& m) { - nb::class_(m, "ProxyTrigger") + nb::class_(m, "CppProxyTrigger") .def_prop_rw( "fst", [](const ProxyTrigger& self) { return self.fst; }, [](ProxyTrigger& self, uint64_t v) { self.fst = v; }) @@ -17,7 +17,7 @@ void register_fifo(nb::module_& m) { "snd", [](const ProxyTrigger& self) { return self.snd; }, [](ProxyTrigger& self, uint64_t v) { self.snd = v; }); - nb::class_(m, "FifoDeviceHandle") + nb::class_(m, "CppFifoDeviceHandle") .def_rw("triggers", &FifoDeviceHandle::triggers) .def_rw("tail", &FifoDeviceHandle::tail) .def_rw("head", &FifoDeviceHandle::head) @@ -26,7 +26,7 @@ void register_fifo(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "Fifo") + nb::class_(m, "CppFifo") .def(nb::init(), nb::arg("size") = DEFAULT_FIFO_SIZE) .def("poll", &Fifo::poll) .def("pop", &Fifo::pop) diff --git a/python/csrc/gpu_utils_py.cpp b/python/csrc/gpu_utils_py.cpp index 66f036e2..60880456 100644 --- a/python/csrc/gpu_utils_py.cpp +++ b/python/csrc/gpu_utils_py.cpp @@ -34,6 +34,19 @@ static DLDataType getDlType(std::string type) { return DLDataType{kDLBfloat, 16, 1}; } else if (type == "torch.float16") { return DLDataType{kDLFloat, 16, 1}; + } else if (type == "torch.float8_e4m3fn") { + return DLDataType{kDLFloat8_e4m3fn, 8, 1}; + } else if (type == "torch.float8_e4m3fnuz") { + return DLDataType{kDLFloat8_e4m3fnuz, 8, 1}; + } else if (type == "torch.float8_e5m2") { + return DLDataType{kDLFloat8_e5m2, 8, 1}; + } else if (type == "torch.float8_e5m2fnuz") { + return DLDataType{kDLFloat8_e5m2fnuz, 8, 1}; + } else if (type == "torch.uint8") { + return DLDataType{kDLUInt, 8, 1}; + } else if (type == "fp8_e4m3b15") { + // No standard DLPack code for fp8_e4m3b15; store as raw uint8 bytes. 
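+    // Consumers therefore receive an ordinary uint8 tensor and must reinterpret the raw
+    // bytes as e4m3b15 themselves.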
+ return DLDataType{kDLUInt, 8, 1}; } else { throw Error("Unsupported type: " + type, ErrorCode::InvalidUsage); } @@ -101,7 +114,7 @@ static nb::capsule toDlpack(GpuBuffer buffer, std::string dataType, std::v void register_gpu_utils(nb::module_& m) { m.def("is_nvls_supported", &isNvlsSupported); - nb::class_>(m, "RawGpuBuffer") + nb::class_>(m, "CppRawGpuBuffer") .def(nb::init(), nb::arg("nelems")) .def("nelems", &GpuBuffer::nelems) .def("bytes", &GpuBuffer::bytes) diff --git a/python/csrc/memory_channel_py.cpp b/python/csrc/memory_channel_py.cpp index 4f9d90a0..ecccb1a0 100644 --- a/python/csrc/memory_channel_py.cpp +++ b/python/csrc/memory_channel_py.cpp @@ -11,20 +11,20 @@ namespace nb = nanobind; using namespace mscclpp; void register_memory_channel(nb::module_& m) { - nb::class_(m, "BaseMemoryChannel") + nb::class_(m, "CppBaseMemoryChannel") .def(nb::init<>()) .def(nb::init>(), nb::arg("semaphore")) .def(nb::init(), nb::arg("semaphore")) .def("device_handle", &BaseMemoryChannel::deviceHandle); - nb::class_(m, "BaseMemoryChannelDeviceHandle") + nb::class_(m, "CppBaseMemoryChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_", &BaseMemoryChannel::DeviceHandle::semaphore_) .def_prop_ro("raw", [](const BaseMemoryChannel::DeviceHandle& self) -> nb::bytes { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "MemoryChannel") + nb::class_(m, "CppMemoryChannel") .def(nb::init<>()) .def( "__init__", @@ -42,7 +42,7 @@ void register_memory_channel(nb::module_& m) { nb::arg("semaphore"), nb::arg("dst"), nb::arg("src"), nb::arg("packet_buffer") = 0) .def("device_handle", &MemoryChannel::deviceHandle); - nb::class_(m, "MemoryChannelDeviceHandle") + nb::class_(m, "CppMemoryChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_", &MemoryChannel::DeviceHandle::semaphore_) .def_rw("dst_", &MemoryChannel::DeviceHandle::dst_) diff --git a/python/csrc/npkit_py.cpp b/python/csrc/npkit_py.cpp index 0557b72d..8c158354 100644 --- a/python/csrc/npkit_py.cpp +++ b/python/csrc/npkit_py.cpp @@ -8,8 +8,8 @@ namespace nb = nanobind; -void register_npkit(nb::module_ &m) { - nb::module_ sub_m = m.def_submodule("npkit", "NPKit functions"); +void register_npkit(nb::module_& m) { + nb::module_ sub_m = m.def_submodule("cpp_npkit", "NPKit functions"); sub_m.def("init", &NpKit::Init); sub_m.def("dump", &NpKit::Dump); sub_m.def("shutdown", &NpKit::Shutdown); diff --git a/python/csrc/numa_py.cpp b/python/csrc/numa_py.cpp index 2489a479..fadc0f69 100644 --- a/python/csrc/numa_py.cpp +++ b/python/csrc/numa_py.cpp @@ -6,8 +6,8 @@ int getDeviceNumaNode(int cudaDev); void numaBind(int node); }; // namespace mscclpp -void register_numa(nb::module_ &m) { - nb::module_ sub_m = m.def_submodule("numa", "numa functions"); +void register_numa(nb::module_& m) { + nb::module_ sub_m = m.def_submodule("cpp_numa", "numa functions"); sub_m.def("get_device_numa_node", &mscclpp::getDeviceNumaNode); sub_m.def("numa_bind", &mscclpp::numaBind); } diff --git a/python/csrc/port_channel_py.cpp b/python/csrc/port_channel_py.cpp index 4b1aa289..e3dd98f1 100644 --- a/python/csrc/port_channel_py.cpp +++ b/python/csrc/port_channel_py.cpp @@ -11,11 +11,11 @@ namespace nb = nanobind; using namespace mscclpp; void register_port_channel(nb::module_& m) { - nb::class_(m, "BaseProxyService") + nb::class_(m, "CppBaseProxyService") .def("start_proxy", &BaseProxyService::startProxy, nb::arg("blocking") = false) .def("stop_proxy", &BaseProxyService::stopProxy); - nb::class_(m, "ProxyService") + nb::class_(m, "CppProxyService") 
.def(nb::init(), nb::arg("fifo_size") = DEFAULT_FIFO_SIZE) .def("start_proxy", &ProxyService::startProxy, nb::arg("blocking") = false) .def("stop_proxy", &ProxyService::stopProxy) @@ -31,13 +31,13 @@ void register_port_channel(nb::module_& m) { .def("base_port_channel", &ProxyService::basePortChannel, nb::arg("id")) .def("port_channel", &ProxyService::portChannel, nb::arg("id"), nb::arg("dst"), nb::arg("src")); - nb::class_(m, "BasePortChannel") + nb::class_(m, "CppBasePortChannel") .def(nb::init<>()) .def(nb::init, std::shared_ptr>(), nb::arg("semaphore_id"), nb::arg("semaphore"), nb::arg("proxy")) .def("device_handle", &BasePortChannel::deviceHandle); - nb::class_(m, "BasePortChannelDeviceHandle") + nb::class_(m, "CppBasePortChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_id_", &BasePortChannel::DeviceHandle::semaphoreId_) .def_rw("semaphore_", &BasePortChannel::DeviceHandle::semaphore_) @@ -46,13 +46,13 @@ void register_port_channel(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "PortChannel") + nb::class_(m, "CppPortChannel") .def(nb::init<>()) .def(nb::init, std::shared_ptr, MemoryId, MemoryId>(), nb::arg("semaphore_id"), nb::arg("semaphore"), nb::arg("proxy"), nb::arg("dst"), nb::arg("src")) .def("device_handle", &PortChannel::deviceHandle); - nb::class_(m, "PortChannelDeviceHandle") + nb::class_(m, "CppPortChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_id_", &PortChannel::DeviceHandle::semaphoreId_) .def_rw("semaphore_", &PortChannel::DeviceHandle::semaphore_) diff --git a/python/csrc/semaphore_py.cpp b/python/csrc/semaphore_py.cpp index 665d395e..17c06a7d 100644 --- a/python/csrc/semaphore_py.cpp +++ b/python/csrc/semaphore_py.cpp @@ -10,7 +10,7 @@ namespace nb = nanobind; using namespace mscclpp; void register_semaphore(nb::module_& m) { - nb::class_ host2DeviceSemaphore(m, "Host2DeviceSemaphore"); + nb::class_ host2DeviceSemaphore(m, "CppHost2DeviceSemaphore"); host2DeviceSemaphore.def(nb::init(), nb::arg("semaphore")) .def(nb::init(), nb::arg("communicator"), nb::arg("connection")) .def("connection", &Host2DeviceSemaphore::connection) @@ -25,7 +25,7 @@ void register_semaphore(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "Host2HostSemaphore") + nb::class_(m, "CppHost2HostSemaphore") .def(nb::init(), nb::arg("semaphore")) .def(nb::init(), nb::arg("communicator"), nb::arg("connection")) .def("connection", &Host2HostSemaphore::connection) @@ -34,7 +34,7 @@ void register_semaphore(nb::module_& m) { .def("wait", &Host2HostSemaphore::wait, nb::call_guard(), nb::arg("max_spin_count") = 10000000); - nb::class_ memoryDevice2DeviceSemaphore(m, "MemoryDevice2DeviceSemaphore"); + nb::class_ memoryDevice2DeviceSemaphore(m, "CppMemoryDevice2DeviceSemaphore"); memoryDevice2DeviceSemaphore.def(nb::init(), nb::arg("semaphore")) .def(nb::init(), nb::arg("communicator"), nb::arg("connection")) .def("connection", &MemoryDevice2DeviceSemaphore::connection) @@ -43,7 +43,6 @@ void register_semaphore(nb::module_& m) { nb::class_(memoryDevice2DeviceSemaphore, "DeviceHandle") .def(nb::init<>()) .def_rw("inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::inboundToken) - .def_rw("outbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::outboundToken) .def_rw("remote_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::remoteInboundToken) .def_rw("expected_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::expectedInboundToken) .def_prop_ro("raw", 
[](const MemoryDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes { diff --git a/python/csrc/switch_channel_py.cpp b/python/csrc/switch_channel_py.cpp index dd72c97e..2d0340dd 100644 --- a/python/csrc/switch_channel_py.cpp +++ b/python/csrc/switch_channel_py.cpp @@ -15,11 +15,11 @@ namespace nb = nanobind; using namespace mscclpp; void register_nvls(nb::module_& m) { - nb::class_(m, "SwitchChannel") + nb::class_(m, "CppSwitchChannel") .def("get_device_ptr", [](SwitchChannel* self) { return (uintptr_t)self->getDevicePtr(); }) .def("device_handle", &SwitchChannel::deviceHandle); - nb::class_(m, "DeviceHandle") + nb::class_(m, "CppSwitchChannelDeviceHandle") .def(nb::init<>()) .def_rw("device_ptr", &SwitchChannel::DeviceHandle::devicePtr) .def_rw("mc_ptr", &SwitchChannel::DeviceHandle::mcPtr) @@ -28,7 +28,7 @@ void register_nvls(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "NvlsConnection") + nb::class_(m, "CppNvlsConnection") .def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("device_ptr"), nb::arg("size")); m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("all_ranks"), diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 58233a7c..5f3a2302 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -23,35 +23,37 @@ version = { from ._core import * from ._mscclpp import ( - Device, - DeviceType, - Communicator, - Connection, + CppDevice as Device, + CppDeviceType as DeviceType, + CppCommunicator as Communicator, + CppConnection as Connection, connect_nvls_collective, - EndpointConfig, - Fifo, - Semaphore, - Host2DeviceSemaphore, - Host2HostSemaphore, - numa, - ProxyService, - RegisteredMemory, - PortChannel, - MemoryChannel, - MemoryDevice2DeviceSemaphore, - TcpBootstrap, - Transport, - TransportFlags, - DataType, - ErrorCode, - Executor, - ExecutionPlan, - PacketType, - RawGpuBuffer, - ReduceOp, + CppEndpointConfig as EndpointConfig, + CppEndpointConfigIb as EndpointConfigIb, + CppIbMode as IbMode, + CppFifo as Fifo, + CppSemaphore as Semaphore, + CppHost2DeviceSemaphore as Host2DeviceSemaphore, + CppHost2HostSemaphore as Host2HostSemaphore, + cpp_numa as numa, + CppProxyService as ProxyService, + CppRegisteredMemory as RegisteredMemory, + CppPortChannel as PortChannel, + CppMemoryChannel as MemoryChannel, + CppMemoryDevice2DeviceSemaphore as MemoryDevice2DeviceSemaphore, + CppTcpBootstrap as TcpBootstrap, + CppTransport as Transport, + CppTransportFlags as TransportFlags, + CppDataType as DataType, + CppErrorCode as ErrorCode, + CppExecutor as Executor, + CppExecutionPlan as ExecutionPlan, + CppPacketType as PacketType, + CppRawGpuBuffer as RawGpuBuffer, + CppReduceOp as ReduceOp, env, is_nvls_supported, - npkit, + cpp_npkit as npkit, ) __all__ = [ @@ -61,6 +63,8 @@ __all__ = [ "Connection", "connect_nvls_collective", "EndpointConfig", + "EndpointConfigIb", + "IbMode", "ErrorCode", "Fifo", "Semaphore", diff --git a/python/mscclpp/__main__.py b/python/mscclpp/__main__.py index 6d0e0108..6a6f5f28 100644 --- a/python/mscclpp/__main__.py +++ b/python/mscclpp/__main__.py @@ -6,7 +6,7 @@ import shutil import argparse from pathlib import Path -from mscclpp.language import default_algos as def_algo +from mscclpp import default_algos as def_algo from mscclpp.language.collectives import * from mscclpp.language.utils import AlgoSpec @@ -57,7 +57,7 @@ default_algo_configs = [ def create_default_plans(): - plan_dir = 
os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp_default") + plan_dir = os.path.join(os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp"), "default") plan_path = Path(plan_dir) if plan_path.exists(): shutil.rmtree(plan_path) diff --git a/python/mscclpp/_core/__init__.py b/python/mscclpp/_core/__init__.py index e9d886f3..a97c91a0 100644 --- a/python/mscclpp/_core/__init__.py +++ b/python/mscclpp/_core/__init__.py @@ -5,9 +5,3 @@ from .algorithm import * from .comm import * from .compiler import * from .buffer import * - -__all__ = [] -__all__ += algorithm.__all__ -__all__ += comm.__all__ -__all__ += compiler.__all__ -__all__ += buffer.__all__ diff --git a/python/mscclpp/_core/algorithm.py b/python/mscclpp/_core/algorithm.py index abaac60c..f12a3027 100644 --- a/python/mscclpp/_core/algorithm.py +++ b/python/mscclpp/_core/algorithm.py @@ -4,18 +4,22 @@ from __future__ import annotations from typing import Optional, Tuple, Dict from functools import cached_property +import cupy as cp from mscclpp._mscclpp import ( - Algorithm as _Algorithm, - DslAlgorithm as _DslAlgorithm, - AlgorithmType as _AlgorithmType, - Communicator, - CollectiveBufferMode, - DataType, - Executor, - ExecutionPlan, - ReduceOp, + CppAlgorithm, + CppDslAlgorithm, + CppAlgorithmType, + CppCommunicator, + CppCollectiveBufferMode, + CppDataType, + CppExecutor, + CppExecutionPlan, + CppReduceOp, + CppAlgorithmBuilder, + CppAlgorithmCollection, + cpp_get_flag_buffer, ) __all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"] @@ -45,7 +49,7 @@ class Algorithm: """ def __init__(self, world_size: int = 0, n_ranks_per_node: int = 0): - self._constraint = _Algorithm.Constraint(world_size, n_ranks_per_node) + self._constraint = CppAlgorithm.Constraint(world_size, n_ranks_per_node) @property def world_size(self) -> int: @@ -58,23 +62,23 @@ class Algorithm: def __init__( self, id: Optional[str] = None, - execution_plan: Optional[ExecutionPlan] = None, - native_handle: Optional[_Algorithm] = None, + execution_plan: Optional[CppExecutionPlan] = None, + native_handle: Optional[CppAlgorithm] = None, tags: Optional[Dict[str, int]] = None, constraint: Optional[Constraint] = None, ): if execution_plan is not None: - self._algorithm = _DslAlgorithm( + self._algorithm = CppDslAlgorithm( id, execution_plan, tags=tags if tags is not None else {}, - constraint=constraint._constraint if constraint is not None else _Algorithm.Constraint(), + constraint=constraint._constraint if constraint is not None else CppAlgorithm.Constraint(), ) elif native_handle is not None: self._algorithm = native_handle @classmethod - def create_from_native_handle(cls, handle: _Algorithm): + def create_from_native_handle(cls, handle: CppAlgorithm): """Create an Algorithm instance from a native C++ algorithm handle. Args: @@ -97,7 +101,7 @@ class Algorithm: Returns: A new Algorithm instance wrapping the algorithm from the capsule. 
""" - handle = _Algorithm.from_native_capsule(obj) + handle = CppAlgorithm.from_native_capsule(obj) return cls(native_handle=handle) @cached_property @@ -110,18 +114,31 @@ class Algorithm: """The collective operation this algorithm implements (e.g., "allreduce", "allgather").""" return self._algorithm.collective - @cached_property + @property def message_size_range(self) -> Tuple[int, int]: """The valid message size range (min_size, max_size) in bytes.""" return (self._algorithm.message_range[0], self._algorithm.message_range[1]) + def set_message_size_range(self, min_message_size: int, max_message_size: int): + """Set the valid message size range in bytes. + + Args: + min_message_size: Minimum supported message size in bytes. + max_message_size: Maximum supported message size in bytes. + + Only supported for native algorithms. Raises TypeError for DSL algorithms. + """ + if self.is_dsl_algorithm(): + raise TypeError("set_message_size_range is only supported for native algorithms") + self._algorithm.set_message_size_range(min_message_size, max_message_size) + @cached_property def tags(self) -> Dict[str, int]: """Dictionary of tag names to tag values for algorithm selection hints.""" return self._algorithm.tags @cached_property - def buffer_mode(self) -> CollectiveBufferMode: + def buffer_mode(self) -> CppCollectiveBufferMode: """The buffer mode supported by this algorithm (IN_PLACE, OUT_OF_PLACE, or ANY).""" return self._algorithm.buffer_mode @@ -131,7 +148,7 @@ class Algorithm: Returns: True if this algorithm is defined using DSL/execution plan, False otherwise. """ - if self._algorithm.type == _AlgorithmType.DSL: + if self._algorithm.type == CppAlgorithmType.DSL: return True return False @@ -141,24 +158,26 @@ class Algorithm: Returns: True if this algorithm is implemented natively, False otherwise. """ - if self._algorithm.type == _AlgorithmType.NATIVE: + if self._algorithm.type == CppAlgorithmType.NATIVE: return True return False def execute( self, - comm: Communicator, + comm: CppCommunicator, input_buffer: int, output_buffer: int, input_size: int, output_size: int, - dtype: DataType, - op: ReduceOp = ReduceOp.NOP, + dtype: CppDataType, + op: CppReduceOp = CppReduceOp.NOP, stream: int = 0, - executor: Optional[Executor] = None, + executor: Optional[CppExecutor] = None, nblocks=0, nthreads_per_block=0, + symmetric_memory: bool = False, extras: Optional[Dict[str, int]] = None, + accum_dtype: Optional[CppDataType] = None, ) -> int: """Execute the collective algorithm. @@ -174,11 +193,16 @@ class Algorithm: executor: The executor for DSL algorithms (required for DSL, optional for native). nblocks: Number of CUDA blocks (0 for auto-selection). nthreads_per_block: Number of threads per block (0 for auto-selection). + symmetric_memory: Whether to use symmetric memory optimization (default: False). extras: Additional algorithm-specific parameters. + accum_dtype: Data type for accumulation during reduction. If None, defaults to + the same as dtype. Use DataType.float32 for high-precision FP8 accumulation. Returns: The result code (0 for success). 
""" + merged_extras = dict(extras) if extras is not None else {} + accum_dtype = accum_dtype if accum_dtype is not None else dtype return self._algorithm.execute( comm, int(input_buffer), @@ -191,12 +215,18 @@ class Algorithm: executor, nblocks, nthreads_per_block, - extras if extras is not None else {}, + symmetric_memory, + merged_extras, + int(accum_dtype), ) + def reset(self): + """Reset the internal state of the algorithm, if applicable.""" + self._algorithm.reset() + class AlgorithmBuilder: - def __init__(self, algorithm_builder: _AlgorithmBuilder): + def __init__(self, algorithm_builder: CppAlgorithmBuilder): self._algorithm_builder = algorithm_builder def build(self) -> Algorithm: @@ -204,7 +234,7 @@ class AlgorithmBuilder: class AlgorithmCollection: - def __init__(self, native_collection: _AlgorithmCollection): + def __init__(self, native_collection: CppAlgorithmCollection): self._native_collection = native_collection self._algorithms = [Algorithm.create_from_native_handle(algo) for algo in self._native_collection.to_list()] @@ -228,3 +258,24 @@ class AlgorithmCollection: """Register an algorithm for a collective operation.""" self._native_collection.register_algorithm(collective, algo_name, algorithm._algorithm) self._algorithms.append(algorithm) + + +_flag_buffer_cache = None + + +def get_flag_buffer() -> cp.ndarray: + """Get the default flag buffer for algorithm selection. + + This buffer is used internally by default algorithms to store selection flags. + It is allocated as a shared GPU buffer and can be accessed from Python. + The result is cached so all callers share the same buffer. + + Returns: + A CuPy array representing the flag buffer on the GPU. + """ + global _flag_buffer_cache + if _flag_buffer_cache is None: + buffer_ptr, buffer_size, owner = cpp_get_flag_buffer() + memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer_ptr, buffer_size, owner), 0) + _flag_buffer_cache = cp.ndarray((buffer_size // 4,), dtype=cp.uint32, memptr=memptr) + return _flag_buffer_cache diff --git a/python/mscclpp/_core/buffer.py b/python/mscclpp/_core/buffer.py index b54342ea..0575ca68 100644 --- a/python/mscclpp/_core/buffer.py +++ b/python/mscclpp/_core/buffer.py @@ -6,7 +6,7 @@ from typing import Union, Tuple import cupy as cp import numpy as np -from mscclpp._mscclpp import RawGpuBuffer +from mscclpp._mscclpp import CppRawGpuBuffer __all__ = ["GpuBuffer"] @@ -25,6 +25,6 @@ class GpuBuffer(cp.ndarray): if any(s <= 0 for s in shape): raise ValueError("Shape must be positive.") # Create the buffer - buffer = RawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize) + buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize) memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0) return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr) diff --git a/python/mscclpp/_core/comm.py b/python/mscclpp/_core/comm.py index 2b5a5f25..d42349dd 100644 --- a/python/mscclpp/_core/comm.py +++ b/python/mscclpp/_core/comm.py @@ -6,21 +6,21 @@ from typing import Type import cupy as cp from mscclpp._mscclpp import ( - Communicator, - Connection, + CppCommunicator, + CppConnection, connect_nvls_collective, - EndpointConfig, - Semaphore, - ProxyService, - RegisteredMemory, - PortChannel, - MemoryChannel, - TcpBootstrap, - Transport, - TransportFlags, + CppEndpointConfig, + CppSemaphore, + CppProxyService, + CppRegisteredMemory, + CppPortChannel, + CppMemoryChannel, + CppTcpBootstrap, + CppTransport, + CppTransportFlags, ) -import 
mpi4py
 import numpy as np
+import pickle
 
 from mscclpp.utils import is_torch_tensor
 
@@ -29,27 +29,47 @@ __all__ = ["CommGroup"]
 
 
 class CommGroup:
     def __init__(
-        self, mpi_comm: mpi4py.MPI.Comm = None, interfaceIpPortTrio: str = "", rank: int = None, size: int = None
+        self,
+        mpi_comm: "mpi4py.MPI.Comm" = None,
+        torch_group: "dist.ProcessGroup" = None,
+        interfaceIpPortTrio: str = "",
+        rank: int = None,
+        size: int = None,
     ):
-        if interfaceIpPortTrio == "":
-            self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
+        if interfaceIpPortTrio == "" and (mpi_comm is not None or torch_group is not None):
             uniq_id = None
-            if mpi_comm.rank == 0:
-                # similar to NCCL's unique id
+            rank, size = (
+                (mpi_comm.Get_rank(), mpi_comm.Get_size())
+                if mpi_comm is not None
+                else (torch_group.rank(), torch_group.size())
+            )
+            self.bootstrap = CppTcpBootstrap.create(rank, size)
+            if rank == 0:
                 uniq_id = self.bootstrap.create_unique_id()
-            uniq_id_global = mpi_comm.bcast(uniq_id, 0)
+            if mpi_comm is not None:
+                import mpi4py
+
+                uniq_id_global = mpi_comm.bcast(uniq_id, 0)
+            else:
+                import torch
+                import torch.distributed as dist
+
+                # All ranks must pass same-sized tensors to dist.broadcast, so rank 0
+                # pads the pickled unique id into the same fixed 256-byte buffer that
+                # the receivers allocate.
+                data_tensor = torch.zeros(256, dtype=torch.uint8)
+                if rank == 0:
+                    pickled_data = pickle.dumps(uniq_id)
+                    assert len(pickled_data) <= 256, "pickled unique id exceeds the 256-byte broadcast buffer"
+                    data_tensor[: len(pickled_data)] = torch.frombuffer(bytearray(pickled_data), dtype=torch.uint8)
+                dist.broadcast(data_tensor, src=0, group=torch_group)
+                uniq_id_global = pickle.loads(data_tensor.numpy().tobytes())
             self.bootstrap.initialize(uniq_id_global)
-        elif mpi_comm:
-            # use this instead
-            self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
-            self.bootstrap.initialize(interfaceIpPortTrio)
         elif not interfaceIpPortTrio == "":
             assert rank >= 0 and size >= 1
-            self.bootstrap = TcpBootstrap.create(rank, size)
+            self.bootstrap = CppTcpBootstrap.create(rank, size)
             self.bootstrap.initialize(interfaceIpPortTrio)
         else:
             raise RuntimeError("Either the interface or mpi_group need to be specified")
-        self.communicator = Communicator(self.bootstrap)
+        self.communicator = CppCommunicator(self.bootstrap)
         self.my_rank = self.bootstrap.get_rank()
         self.nranks = self.bootstrap.get_n_ranks()
         self.nranks_per_node = self.bootstrap.get_n_ranks_per_node()
@@ -63,43 +83,43 @@ class CommGroup:
     def recv(self, tensor: np.ndarray, peer: int, tag: int):
         self.bootstrap.recv(tensor.ctypes.data, tensor.size * tensor.itemsize, peer, tag)
 
-    def my_ib_device(self, local_rank: int) -> Transport:
+    def my_ib_device(self, local_rank: int) -> CppTransport:
         if local_rank == 0:
-            return Transport.IB0
+            return CppTransport.IB0
         if local_rank == 1:
-            return Transport.IB1
+            return CppTransport.IB1
         if local_rank == 2:
-            return Transport.IB2
+            return CppTransport.IB2
         if local_rank == 3:
-            return Transport.IB3
+            return CppTransport.IB3
         if local_rank == 4:
-            return Transport.IB4
+            return CppTransport.IB4
         if local_rank == 5:
-            return Transport.IB5
+            return CppTransport.IB5
         if local_rank == 6:
-            return Transport.IB6
+            return CppTransport.IB6
         if local_rank == 7:
-            return Transport.IB7
+            return CppTransport.IB7
         else:
             assert False  # only 8 IBs are supported
 
     def make_connection(
         self,
         all_ranks: list[int],
-        endpoints: EndpointConfig | Transport | dict[int, EndpointConfig] | dict[int, Transport],
+        endpoints: CppEndpointConfig | CppTransport | dict[int, CppEndpointConfig] | dict[int, CppTransport],
         use_switch: bool = False,
-    ) -> dict[int, Connection]:
-        if type(endpoints) is Transport:
-            endpoints = EndpointConfig(endpoints)
+    ) -> dict[int, CppConnection]:
+        if 
type(endpoints) is CppTransport: + endpoints = CppEndpointConfig(endpoints) elif type(endpoints) is dict: - endpoints = {k: EndpointConfig(v) if type(v) is Transport else v for k, v in endpoints.items()} + endpoints = {k: CppEndpointConfig(v) if type(v) is CppTransport else v for k, v in endpoints.items()} connections = {} for rank in all_ranks: if type(endpoints) is dict: endpoint = endpoints[rank] else: endpoint = endpoints - if endpoint.transport == Transport.CudaIpc and use_switch: + if endpoint.transport == CppTransport.CudaIpc and use_switch: return connect_nvls_collective(self.communicator, all_ranks, 2**30) else: connections[rank] = self.communicator.connect(endpoint, rank) @@ -107,8 +127,8 @@ class CommGroup: return connections def register_tensor_with_connections( - self, tensor: Type[cp.ndarray] | Type[np.ndarray], connections: dict[int, Connection] - ) -> dict[int, RegisteredMemory]: + self, tensor: Type[cp.ndarray] | Type[np.ndarray], connections: dict[int, CppConnection] + ) -> dict[int, CppRegisteredMemory]: local_reg_memory = self.register_local_memory(tensor, connections) all_registered_memories = {} all_registered_memories[self.my_rank] = local_reg_memory @@ -121,8 +141,8 @@ class CommGroup: return all_registered_memories def _register_memory_with_connections( - self, memory: RegisteredMemory, connections: dict[int, Connection] - ) -> dict[int, RegisteredMemory]: + self, memory: CppRegisteredMemory, connections: dict[int, CppConnection] + ) -> dict[int, CppRegisteredMemory]: all_registered_memories = {} all_registered_memories[self.my_rank] = memory future_memories = {} @@ -133,18 +153,20 @@ class CommGroup: all_registered_memories[rank] = future_memories[rank].get() return all_registered_memories - def make_semaphores(self, connections: dict[int, Connection]) -> dict[int, Semaphore]: + def make_semaphores(self, connections: dict[int, CppConnection]) -> dict[int, CppSemaphore]: future_semaphores = {} for rank in connections: future_semaphores[rank] = self.communicator.build_semaphore(connections[rank], rank) return {rank: future.get() for rank, future in future_semaphores.items()} - def make_memory_channels(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> dict[int, MemoryChannel]: + def make_memory_channels( + self, tensor: cp.ndarray, connections: dict[int, CppConnection] + ) -> dict[int, CppMemoryChannel]: semaphores = self.make_semaphores(connections) registered_memories = self.register_tensor_with_connections(tensor, connections) channels = {} for rank in connections: - channels[rank] = MemoryChannel( + channels[rank] = CppMemoryChannel( semaphores[rank], registered_memories[rank], registered_memories[self.my_rank] ) return channels @@ -152,9 +174,9 @@ class CommGroup: def make_memory_channels_with_scratch( self, tensor: cp.ndarray, - registeredScratchBuffer: RegisteredMemory, - connections: dict[int, Connection], - ) -> dict[int, MemoryChannel]: + registeredScratchBuffer: CppRegisteredMemory, + connections: dict[int, CppConnection], + ) -> dict[int, CppMemoryChannel]: semaphores = self.make_semaphores(connections) registered_memories = self._register_memory_with_connections(registeredScratchBuffer, connections) channels = {} @@ -162,17 +184,17 @@ class CommGroup: tensor_size = ( tensor.numel() * tensor.element_size() if is_torch_tensor(tensor) else tensor.size * tensor.itemsize ) - local_registered_memory = self.communicator.register_memory(tensor_data_ptr, tensor_size, TransportFlags()) + local_registered_memory = 
self.communicator.register_memory(tensor_data_ptr, tensor_size, CppTransportFlags()) scratch_data_ptr = registeredScratchBuffer.data() for rank in connections: - channels[rank] = MemoryChannel( + channels[rank] = CppMemoryChannel( semaphores[rank], registered_memories[rank], local_registered_memory, scratch_data_ptr ) return channels def make_port_channels( - self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection] - ) -> dict[int, PortChannel]: + self, proxy_service: CppProxyService, tensor: cp.ndarray, connections: dict[int, CppConnection] + ) -> dict[int, CppPortChannel]: semaphores = self.make_semaphores(connections) registered_memories = self.register_tensor_with_connections(tensor, connections) memory_ids = {} @@ -188,12 +210,12 @@ class CommGroup: def make_port_channels_with_scratch( self, - proxy_service: ProxyService, + proxy_service: CppProxyService, tensor: cp.ndarray, - registeredScratchBuffer: RegisteredMemory, - connections: dict[int, Connection], - ) -> dict[int, PortChannel]: - transport_flags = TransportFlags() + registeredScratchBuffer: CppRegisteredMemory, + connections: dict[int, CppConnection], + ) -> dict[int, CppPortChannel]: + transport_flags = CppTransportFlags() for rank in connections: transport_flags |= connections[rank].transport() data_ptr = ( @@ -223,8 +245,8 @@ class CommGroup: return channels def register_semaphore_with_proxy( - self, proxy_service: ProxyService, connections: dict[int, Connection] - ) -> dict[int, PortChannel]: + self, proxy_service: CppProxyService, connections: dict[int, CppConnection] + ) -> dict[int, CppPortChannel]: semaphores = self.make_semaphores(connections) semaphore_ids = {} for rank in semaphores: @@ -235,7 +257,7 @@ class CommGroup: return channels def register_memory_with_proxy( - self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection] + self, proxy_service: CppProxyService, tensor: cp.ndarray, connections: dict[int, CppConnection] ) -> dict[int, int]: registered_memories = self.register_tensor_with_connections(tensor, connections) memory_ids = {} @@ -243,8 +265,8 @@ class CommGroup: memory_ids[rank] = proxy_service.add_memory(registered_memories[rank]) return memory_ids - def register_local_memory(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> RegisteredMemory: - transport_flags = TransportFlags() + def register_local_memory(self, tensor: cp.ndarray, connections: dict[int, CppConnection]) -> CppRegisteredMemory: + transport_flags = CppTransportFlags() for rank in connections: transport_flags |= connections[rank].transport() data_ptr = ( diff --git a/python/mscclpp/_core/compiler.py b/python/mscclpp/_core/compiler.py index 82ae93a9..b2da976d 100644 --- a/python/mscclpp/_core/compiler.py +++ b/python/mscclpp/_core/compiler.py @@ -26,9 +26,7 @@ from mscclpp.language.program import CollectiveProgram from mscclpp.language.utils import AlgoSpec from mscclpp.utils import get_device_arch -from mscclpp._mscclpp import ( - ExecutionPlan, -) +from mscclpp._mscclpp import CppExecutionPlan, env logging.basicConfig(level=logging.INFO) @@ -51,7 +49,7 @@ class DslCompiler: into execution plans that can be run on GPUs. The compiled plans are cached to disk for reuse. - The cache location can be configured via the `MSCCLPP_EXECUTION_PLAN_DIR` + The cache location can be configured via the `MSCCLPP_CACHE_DIR` environment variable (defaults to `~/.cache/mscclpp`). 
Example: @@ -138,7 +136,7 @@ class DslCompiler: ) ).hexdigest() - plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp") + plan_dir = Path(env().cache_dir) os.makedirs(plan_dir, exist_ok=True) filename = f"{plan_id}.json" plan_path = os.path.join(plan_dir, filename) @@ -157,7 +155,7 @@ class DslCompiler: os.remove(tmp_path) except Exception: Path(plan_path).unlink(missing_ok=True) - execution_plan = ExecutionPlan(plan_path, rank) + execution_plan = CppExecutionPlan(plan_path, rank) return Algorithm( id=plan_id, execution_plan=execution_plan, @@ -179,8 +177,8 @@ class NativeCodeCompiler: based on the runtime environment. Compiled modules are cached to avoid recompilation. - The cache location can be configured via the `MSCCLPP_NATIVE_CACHE_DIR` - environment variable (defaults to `~/.cache/mscclpp/native`). + The cache location can be configured via the `MSCCLPP_CACHE_DIR` + environment variable (defaults to `~/.cache/mscclpp`). Attributes: _is_hip: True if running on AMD/ROCm, False for NVIDIA/CUDA. @@ -226,8 +224,7 @@ class NativeCodeCompiler: "-L" + os.path.join(self._lib_home, "lib"), "-lmscclpp", ] - cache_root = os.environ.get("MSCCLPP_NATIVE_CACHE_DIR", Path.home() / ".cache/mscclpp/native") - self._cache_dir = Path(cache_root) + self._cache_dir = Path(env().cache_dir) / "native" self._cache_dir.mkdir(parents=True, exist_ok=True) def _get_compiler(self) -> str: @@ -283,7 +280,7 @@ class NativeCodeCompiler: Note: - The source file should include pybind11 bindings to expose functions. - MSCCLPP headers are automatically included in the compilation. - - The module is cached in `MSCCLPP_NATIVE_CACHE_DIR` (default: ~/.cache/mscclpp/native). + - The module is cached in `MSCCLPP_CACHE_DIR` (default: ~/.cache/mscclpp). - File locking is used to prevent race conditions during parallel compilation. 
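Both compilers above now resolve their caches under a single `MSCCLPP_CACHE_DIR` root. A minimal sketch of the resulting on-disk layout, assuming only what the hunks above state (the `~/.cache/mscclpp` default, a `default` subdirectory written by `python -m mscclpp`, and a `native` subdirectory used by `NativeCodeCompiler`); the helper name is hypothetical, not part of the patch:

    import os
    from pathlib import Path

    def mscclpp_cache_layout() -> dict:
        # Hypothetical helper mirroring the consolidated cache layout.
        root = Path(os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp"))
        return {
            "plans": root,                      # DslCompiler writes <plan_id>.json here
            "default_plans": root / "default",  # prebuilt plans from `python -m mscclpp`
            "native_modules": root / "native",  # NativeCodeCompiler build cache
        }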
Example: diff --git a/python/mscclpp/ext/algorithm_collection_builder.py b/python/mscclpp/ext/algorithm_collection_builder.py index 51a178fb..ddfb929f 100644 --- a/python/mscclpp/ext/algorithm_collection_builder.py +++ b/python/mscclpp/ext/algorithm_collection_builder.py @@ -3,12 +3,10 @@ from __future__ import annotations from typing import Union -from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection +from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_flag_buffer import atexit -from mscclpp._mscclpp import ( - AlgorithmCollectionBuilder as _AlgorithmCollectionBuilder, -) +from mscclpp._mscclpp import CppAlgorithmCollectionBuilder __all__ = ["AlgorithmCollectionBuilder"] @@ -24,13 +22,14 @@ class AlgorithmCollectionBuilder: @classmethod def reset(cls): if cls._instance is not None: - _AlgorithmCollectionBuilder.reset() + CppAlgorithmCollectionBuilder.reset() cls._instance = None def __init__(self): if not hasattr(self, "_initialized"): - self._builder = _AlgorithmCollectionBuilder.get_instance() + self._builder = CppAlgorithmCollectionBuilder.get_instance() self._initialized = True + self._flag_buffer = None def add_algorithm_builder(self, algorithm_builder: Union[AlgorithmBuilder, Algorithm]): if isinstance(algorithm_builder, AlgorithmBuilder): @@ -52,8 +51,17 @@ class AlgorithmCollectionBuilder: collection = self._builder.build() return AlgorithmCollection(collection) - def build_default_algorithms(self, scratch_buffer: int, scratch_buffer_size: int, rank: int) -> AlgorithmCollection: - native_collection = self._builder.build_default_algorithms(int(scratch_buffer), scratch_buffer_size, rank) + def build_default_algorithms( + self, + scratch_buffer: int, + scratch_buffer_size: int, + rank: int, + ) -> AlgorithmCollection: + if self._flag_buffer is None: + self._flag_buffer = get_flag_buffer() + native_collection = self._builder.build_default_algorithms( + int(scratch_buffer), scratch_buffer_size, self._flag_buffer.data.ptr, self._flag_buffer.nbytes, rank + ) return AlgorithmCollection(native_collection) diff --git a/python/mscclpp/ext/alltoallv_single.py b/python/mscclpp/ext/alltoallv_single.py index 2a29b3f5..e45ef950 100644 --- a/python/mscclpp/ext/alltoallv_single.py +++ b/python/mscclpp/ext/alltoallv_single.py @@ -24,11 +24,11 @@ def _a2av_dbg(msg: str): if _DEBUG_A2AV: print(msg, file=sys.stderr, flush=True) from mscclpp._mscclpp import ( - Communicator, - TcpBootstrap, - DataType, - ReduceOp, - CommResult, + CppCommunicator as Communicator, + CppTcpBootstrap as TcpBootstrap, + CppDataType as DataType, + CppReduceOp as ReduceOp, + CppCommResult as CommResult, ) from mscclpp.ext.algorithm_collection_builder import AlgorithmCollectionBuilder @@ -375,6 +375,7 @@ class MscclppAlltoAllV: None, # executor (not needed for native algos) 0, # nblocks (auto) 0, # nthreads_per_block (auto) + False, # symmetric_memory self._extras, ) diff --git a/python/mscclpp/language/channel.py b/python/mscclpp/language/channel.py index 1b22e4e2..23d76eda 100644 --- a/python/mscclpp/language/channel.py +++ b/python/mscclpp/language/channel.py @@ -140,7 +140,7 @@ class MemoryChannel: for tb_id in tb_list: tb_chunk_id = get_program().setup_remote_chunk(self.src_rank, tb_id, remote_chunk, self.channel_type) - tb_channel_ids = get_program().setup_channel(tb, self) + tb_channel_ids = get_program().setup_channel(tb_id, self) op = GetOperation( src_buff=[RemoteChunk(src_chunk.buffer, src_chunk.index, src_chunk.size, tb_chunk_id)], 
dst_buff=[LocalChunk(dst_chunk.buffer, dst_chunk.index, dst_chunk.size)], diff --git a/python/mscclpp/language/internal/operations.py b/python/mscclpp/language/internal/operations.py index 127f4a03..5fb392e3 100644 --- a/python/mscclpp/language/internal/operations.py +++ b/python/mscclpp/language/internal/operations.py @@ -534,6 +534,7 @@ class PutOperation(BaseOperation): self.dst_buff = dst_buff self.channel_ids = channel_ids self.channel_type = channel_type + self.from_packet = from_packet self.to_packet = to_packet self.with_signal = with_signal self.with_signal_and_flush = with_signal_and_flush @@ -579,6 +580,25 @@ class PutOperation(BaseOperation): with_signal=self.with_signal, with_signal_and_flush=self.with_signal_and_flush, ) + elif ( + isinstance(other, PutOperation) + and self.name == Instruction.read_put_packet + and self.name == other.name + and self.src_buff == other.src_buff + and self.channel_type == other.channel_type + and self.tbg_info == other.tbg_info + ): + fused_operation = PutOperation( + src_buff=self.src_buff, + dst_buff=self.dst_buff + other.dst_buff, + channel_ids=self.channel_ids + other.channel_ids, + channel_type=self.channel_type, + tbg_info=self.tbg_info, + from_packet=self.from_packet, + to_packet=self.to_packet, + with_signal=self.with_signal, + with_signal_and_flush=self.with_signal_and_flush, + ) return fused_operation @@ -725,7 +745,7 @@ class ReduceOperation(BaseOperation): remote_dst_buff=self.remote_dst_buff + other.dst_buff, channel_ids=self.channel_ids, put_channel_ids=self.put_channel_ids + other.channel_ids, - channel_type=self.channel_type, + channel_type=other.channel_type, reduce_operation=self.reduce_operation, tbg_info=self.tbg_info, packet=self.packet, diff --git a/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py b/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py new file mode 100644 index 00000000..bda9e36c --- /dev/null +++ b/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py @@ -0,0 +1,78 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
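The new `read_put_packet` branch in `PutOperation` above merges compatible puts instead of emitting them one by one. A simplified stand-in for the rule, using toy types (`MiniPut` and `try_fuse` are hypothetical, not the DSL classes): two ops fuse only when they read the same source over the same channel type, and the fusion concatenates destinations and channel ids:

    from __future__ import annotations
    from dataclasses import dataclass

    @dataclass
    class MiniPut:
        src: tuple          # stands in for src_buff
        dsts: list          # stands in for dst_buff
        channel_ids: list
        channel_type: str

    def try_fuse(a: MiniPut, b: MiniPut) -> MiniPut | None:
        # Mirrors the predicate added above: same source, same channel type.
        if a.src == b.src and a.channel_type == b.channel_type:
            return MiniPut(a.src, a.dsts + b.dsts, a.channel_ids + b.channel_ids, a.channel_type)
        return None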
+
+import argparse
+from mscclpp.language.channel import *
+from mscclpp.language.rank import *
+from mscclpp.language.general import *
+from mscclpp.language.program import *
+from mscclpp.language.collectives import *
+
+
+def allgather_example(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
+    chunksperloop = 1
+    collective = AllGather(gpu_size, chunksperloop, True)
+    with CollectiveProgram(
+        name,
+        collective,
+        gpu_size,
+        protocol="LL",
+        num_threads_per_block=num_threads_per_block,
+        use_double_scratch_buffer=True,
+        min_message_size=min_message_size,
+        max_message_size=max_message_size,
+    ):
+        # Creating scratch buffers; the second half (starting at scratch_offset)
+        # holds the locally packed packets.
+        scratch_buffer = []
+        for gpu in range(gpu_size):
+            scratch_buffer.append(Buffer(gpu, 2 * gpu_size))
+        scratch_offset = gpu_size
+
+        # Copying the local input into the scratch buffer as packets
+        for gpu in range(gpu_size):
+            rank = Rank(gpu)
+            input_buffer = rank.get_input_buffer()
+            rank.copy_packets(
+                scratch_buffer[gpu][scratch_offset + gpu : scratch_offset + gpu + 1], input_buffer[0:1], tb=0
+            )
+
+        # Putting packets in the remote scratch buffers
+        for gpu in range(gpu_size):
+            for peer in range(1, gpu_size):
+                dst_rank = (gpu + peer) % gpu_size
+                ch = MemoryChannel(dst_rank, gpu)
+                tb = 0
+                ch.read_put_packets(
+                    scratch_buffer[dst_rank][gpu : gpu + 1],
+                    scratch_buffer[gpu][scratch_offset + gpu : scratch_offset + gpu + 1],
+                    tb,
+                )
+
+        # Copying packets from the local scratch buffer to the output buffer
+        for gpu in range(gpu_size):
+            rank = Rank(gpu)
+            output_buffer = rank.get_output_buffer()
+            for peer in range(1, gpu_size):
+                dst_rank = (gpu + peer) % gpu_size
+                rank.unpack_packets(
+                    output_buffer[dst_rank : dst_rank + 1],
+                    scratch_buffer[gpu][dst_rank : dst_rank + 1],
+                    tb=0,
+                )
+
+        print(JSON())
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--name", type=str, help="name of the program")
+parser.add_argument("--num_gpus", type=int, help="number of gpus")
+parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
+parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
+parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
+
+args = parser.parse_args()
+
+allgather_example(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py
index 783b0ca9..93cd786b 100644
--- a/python/mscclpp/utils.py
+++ b/python/mscclpp/utils.py
@@ -11,7 +11,7 @@ from typing import Any, Type, Union
 import cupy as cp
 import numpy as np
 
-from mscclpp._mscclpp import DataType
+from mscclpp._mscclpp import CppDataType as DataType
 
 try:
     import torch
 
@@ -192,5 +192,13 @@ def torch_dtype_to_mscclpp_dtype(dtype: "torch.dtype") -> DataType:
         return DataType.int32
     elif dtype == torch.bfloat16:
         return DataType.bfloat16
+    # Hardware supports either the OCP format or the FNUZ format for float8.
+    # Mapping both to the same MSCCLPP data type.
+ elif dtype == torch.float8_e5m2 or dtype == torch.float8_e5m2fnuz: + return DataType.float8_e5m2 + elif dtype == torch.float8_e4m3fn or dtype == torch.float8_e4m3fnuz: + return DataType.float8_e4m3 + elif dtype == torch.uint8: + return DataType.uint8 else: raise ValueError(f"Unknown data type: {dtype}") diff --git a/python/requirements_cuda13.txt b/python/requirements_cuda13.txt index b49a404c..49cf13bc 100644 --- a/python/requirements_cuda13.txt +++ b/python/requirements_cuda13.txt @@ -6,4 +6,5 @@ pytest numpy matplotlib sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed -blake3 \ No newline at end of file +blake3 +pybind11 \ No newline at end of file diff --git a/python/requirements_rocm6.txt b/python/requirements_rocm6.txt index e69de29b..7ed4fef3 100644 --- a/python/requirements_rocm6.txt +++ b/python/requirements_rocm6.txt @@ -0,0 +1,10 @@ +mpi4py +cupy +prettytable +netifaces +pytest +numpy +matplotlib +sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +blake3 +pybind11 \ No newline at end of file diff --git a/python/test/_cpp/proxy_test.cpp b/python/test/_cpp/proxy_test.cpp index 5bc18e23..697a5c38 100644 --- a/python/test/_cpp/proxy_test.cpp +++ b/python/test/_cpp/proxy_test.cpp @@ -63,10 +63,13 @@ class MyProxyService { }; NB_MODULE(_ext, m) { +#ifdef MSCCLPP_DISABLE_NB_LEAK_WARNINGS + nb::set_leak_warnings(false); +#endif nb::class_(m, "MyProxyService") .def(nb::init(), nb::arg("rank"), nb::arg("nranks"), nb::arg("data_size"), nb::arg("reg_mem_list"), nb::arg("sem_list")) .def("fifo_device_handle", &MyProxyService::fifoDeviceHandle) .def("start", &MyProxyService::start) .def("stop", &MyProxyService::stop); -} +} \ No newline at end of file diff --git a/python/test/executor_test.py b/python/test/executor_test.py index 49e5166f..59bc1661 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -11,7 +11,7 @@ from mscclpp import ( env, ) from mscclpp import CommGroup, GpuBuffer -from mscclpp.utils import KernelBuilder, GpuBuffer, pack +from mscclpp.utils import KernelBuilder, pack import os import struct diff --git a/python/test/test_alltoallv_mscclpp.py b/python/test/test_alltoallv_mscclpp.py index e8797e43..d45fb6f4 100644 --- a/python/test/test_alltoallv_mscclpp.py +++ b/python/test/test_alltoallv_mscclpp.py @@ -130,11 +130,11 @@ def main(): print("=" * 60) # Import after torch.distributed init - from mscclpp._mscclpp import ( + from mscclpp import ( Communicator, TcpBootstrap, - UniqueId, ) + from mscclpp._mscclpp import CppUniqueId as UniqueId from mscclpp.ext.alltoallv_single import MscclppAlltoAllV # Create mscclpp communicator with TcpBootstrap diff --git a/python/test/test_fp8_accum.py b/python/test/test_fp8_accum.py new file mode 100644 index 00000000..82981ce1 --- /dev/null +++ b/python/test/test_fp8_accum.py @@ -0,0 +1,397 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Correctness test for FP8 allreduce with different accumulation types. +# +# Verifies that FP8 allreduce with higher-precision accumulation produces +# results at least as accurate as native FP8 accumulation, by comparing +# against a float32 reference. 
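The `accum_dtype` knob added to `Algorithm.execute` earlier in this patch is what this new test exercises. A minimal usage sketch, assuming a communicator and a symmetric `GpuBuffer` named `buf` are already set up as in the fixtures below:

    ret = algo.execute(
        comm=comm_group.communicator,
        input_buffer=buf.data.ptr,
        output_buffer=buf.data.ptr,
        input_size=buf.nbytes,
        output_size=buf.nbytes,
        dtype=DataType.float8_e4m3,
        op=ReduceOp.SUM,
        stream=cp.cuda.get_current_stream().ptr,
        accum_dtype=DataType.float32,  # accumulate partial sums in fp32, round to fp8 once
    )
    assert ret == 0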
+# +# Usage: +# mpirun -np 8 pytest python/test/test_fp8_accum.py -v + +import cupy as cp +import numpy as np +import pytest + +from mscclpp import CommGroup, GpuBuffer, DataType, ReduceOp, is_nvls_supported +from mscclpp.ext import AlgorithmCollectionBuilder +from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group + +# FP8 E4M3 (hardware) requires SM >= 89 (Ada / Hopper) on NVIDIA GPUs. +# On AMD/ROCm (e.g. MI300X), FP8 is supported natively — no skip needed. +_is_hip = hasattr(cp.cuda.runtime, "is_hip") and cp.cuda.runtime.is_hip +_skip_fp8 = not _is_hip and int(cp.cuda.Device().compute_capability) < 89 +pytestmark = pytest.mark.skipif(_skip_fp8, reason="FP8 accum tests require SM >= 89 on CUDA") + +# --------------------------------------------------------------------------- +# FP8 E4M3FN helpers (bias=7, no infinity, NaN = exp=15 & mant=7) +# --------------------------------------------------------------------------- + + +def e4m3fn_to_float(uint8_array): + """Decode a cupy uint8 array of E4M3FN bit patterns to float32.""" + bits = uint8_array.astype(cp.int32) + sign = (bits >> 7) & 1 + exp = (bits >> 3) & 0xF + mant = bits & 0x7 + + # Normal: (-1)^s * 2^(exp-7) * (1 + mant/8) + normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 7).astype(cp.int32)) + # Subnormal (exp==0): (-1)^s * 2^(-6) * (mant/8) + subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-6)) + + result = cp.where(exp == 0, subnormal_val, normal_val) + result = cp.where(sign == 1, -result, result) + # Zero + result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result) + # NaN: exp==15 & mant==7 + nan_mask = (exp == 15) & (mant == 7) + result = cp.where(nan_mask, cp.float32(float("nan")), result) + return result + + +def float_to_e4m3fn(f32_array, chunk_size=65536): + """Encode a cupy float32 array to uint8 E4M3FN bit patterns. + + Uses a lookup-table approach: precompute all 128 positive E4M3FN values, + then find nearest match per element via chunked broadcast comparison. 
+ """ + # Build lookup table of all 128 positive E4M3FN values (0x00..0x7F) + all_bytes = cp.arange(128, dtype=cp.uint8) + all_floats = e4m3fn_to_float(all_bytes) # (128,) float32 + # Mark NaN entries as inf so they're never selected as nearest + all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats) + + # Clamp input and extract sign + clamped = f32_array.astype(cp.float32) + clamped = cp.clip(clamped, -448.0, 448.0) + signs = (clamped < 0).astype(cp.uint8) + absval = cp.abs(clamped) + + result = cp.zeros(absval.shape, dtype=cp.uint8) + n = absval.size + absval_flat = absval.ravel() + result_flat = result.ravel() + + for start in range(0, n, chunk_size): + end = min(start + chunk_size, n) + chunk = absval_flat[start:end] + # (chunk_size, 128) difference matrix + diffs = cp.abs(chunk[:, None] - all_floats[None, :]) + result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8) + + # Combine with sign bit + result = result_flat.reshape(absval.shape) + result = result | (signs << 7) + # Handle exact zero + result = cp.where(absval == 0, cp.uint8(0), result) + return result + + +# --------------------------------------------------------------------------- +# FP8 E4M3B15 helpers (bias=15, max=0.9375, NaN = exp==15 or bits==0x80) +# --------------------------------------------------------------------------- + + +def e4m3b15_to_float(uint8_array): + """Decode a cupy uint8 array of E4M3B15 bit patterns to float32.""" + bits = uint8_array.astype(cp.int32) + sign = (bits >> 7) & 1 + exp = (bits >> 3) & 0xF + mant = bits & 0x7 + + # Normal: (-1)^s * 2^(exp-15) * (1 + mant/8) + normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 15).astype(cp.int32)) + # Subnormal (exp==0): (-1)^s * 2^(-14) * (mant/8) + subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-14)) + + result = cp.where(exp == 0, subnormal_val, normal_val) + result = cp.where(sign == 1, -result, result) + # Zero + result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result) + # NaN: exp==15 or negative zero (0x80) + nan_mask = (exp == 15) | (uint8_array.astype(cp.int32) == 0x80) + result = cp.where(nan_mask, cp.float32(float("nan")), result) + return result + + +def float_to_e4m3b15(f32_array, chunk_size=65536): + """Encode a cupy float32 array to uint8 E4M3B15 bit patterns. + + Same lookup-table approach as float_to_e4m3fn. 
+ """ + # Build lookup table of all 128 positive E4M3B15 values (0x00..0x7F) + all_bytes = cp.arange(128, dtype=cp.uint8) + all_floats = e4m3b15_to_float(all_bytes) # (128,) float32 + # Mark NaN entries as inf so they're never selected as nearest + all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats) + + # Clamp input and extract sign + clamped = f32_array.astype(cp.float32) + clamped = cp.clip(clamped, -0.9375, 0.9375) + signs = (clamped < 0).astype(cp.uint8) + absval = cp.abs(clamped) + + result = cp.zeros(absval.shape, dtype=cp.uint8) + n = absval.size + absval_flat = absval.ravel() + result_flat = result.ravel() + + for start in range(0, n, chunk_size): + end = min(start + chunk_size, n) + chunk = absval_flat[start:end] + # (chunk_size, 128) difference matrix + diffs = cp.abs(chunk[:, None] - all_floats[None, :]) + result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8) + + # Combine with sign bit + result = result_flat.reshape(absval.shape) + result = result | (signs << 7) + # Handle exact zero + result = cp.where(absval == 0, cp.uint8(0), result) + return result + + +# --------------------------------------------------------------------------- +# Shared test helpers +# --------------------------------------------------------------------------- + + +def setup_algorithms(mpi_group): + """Build default algorithms and return (comm_group, algo_map, scratch_buf).""" + comm_group = CommGroup(mpi_group.comm) + scratch = GpuBuffer(1 << 27, dtype=cp.uint8) # 128 MB + AlgorithmCollectionBuilder.reset() + builder = AlgorithmCollectionBuilder() + algorithms = builder.build_default_algorithms( + scratch_buffer=scratch.data.ptr, + scratch_buffer_size=scratch.nbytes, + rank=comm_group.my_rank, + ) + algo_map = {a.name: a for a in algorithms} + return comm_group, algo_map, scratch + + +def run_allreduce(algo, comm_group, buffer, dtype, accum_dtype=None, nblocks=0, nthreads_per_block=0): + """Run allreduce in-place on buffer and return a copy of the result.""" + ret = algo.execute( + comm=comm_group.communicator, + input_buffer=buffer.data.ptr, + output_buffer=buffer.data.ptr, + input_size=buffer.nbytes, + output_size=buffer.nbytes, + dtype=dtype, + op=ReduceOp.SUM, + stream=cp.cuda.get_current_stream().ptr, + nblocks=nblocks, + nthreads_per_block=nthreads_per_block, + symmetric_memory=True, + accum_dtype=accum_dtype, + ) + cp.cuda.Device().synchronize() + assert ret == 0, f"Allreduce failed with error code {ret}" + return buffer.copy() + + +# --------------------------------------------------------------------------- +# Test: FP8 E4M3 accumulation correctness +# --------------------------------------------------------------------------- + + +@parametrize_mpi_groups(8) +@pytest.mark.parametrize( + "algo_name", + [ + "default_allreduce_packet", + "default_allreduce_nvls_packet", + "default_allreduce_fullmesh", + "default_allreduce_rsag_zero_copy", + "default_allreduce_allpair_packet", + ], +) +@pytest.mark.parametrize("size", [1024, 4096, 16384, 65536, 262144, 1048576]) +def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int): + """Verify that FP8 E4M3 allreduce with higher-precision accumulation is at + least as accurate as native FP8 accumulation, across all algorithm variants.""" + rank = mpi_group.comm.rank + world_size = mpi_group.comm.size + + comm_group, algo_map, scratch = setup_algorithms(mpi_group) + if algo_name not in algo_map: + pytest.skip(f"{algo_name} not available") + if "nvls" in algo_name and not is_nvls_supported(): + 
pytest.skip(f"{algo_name} requires NVLS which is not supported on this platform") + algo = algo_map[algo_name] + + buf = GpuBuffer(size, dtype=cp.uint8) + + accum_configs = [ + ("fp8_native", DataType.float8_e4m3), + ("float16", DataType.float16), + ("float32", DataType.float32), + ] + + # rsag_zero_copy and fullmesh need explicit block/thread counts + if "rsag" in algo_name: + nb = max(1, min(32, size // (world_size * 32))) + nt = 1024 + elif "fullmesh" in algo_name: + nb = 35 + nt = 512 + else: + nb = 0 + nt = 0 + + errors = {} + for accum_label, accum_dtype in accum_configs: + # Generate deterministic per-rank data (use numpy to avoid hipRAND issues on ROCm) + rng = np.random.RandomState(42 + rank) + src_f32 = cp.asarray(rng.randn(size).astype(np.float32)) + src_f32 = cp.clip(src_f32, -240.0, 240.0) + src_fp8 = float_to_e4m3fn(src_f32) + + # Copy into symmetric buffer + buf[:] = src_fp8 + cp.cuda.Device().synchronize() + + # Run allreduce + result = run_allreduce( + algo, + comm_group, + buf, + dtype=DataType.float8_e4m3, + accum_dtype=accum_dtype, + nblocks=nb, + nthreads_per_block=nt, + ) + result_f32 = e4m3fn_to_float(result) + + # Compute float32 reference: sum all ranks' quantized FP8 inputs in float32 + ref_f32 = cp.zeros(size, dtype=cp.float32) + for r in range(world_size): + rng_r = np.random.RandomState(42 + r) + rank_data = cp.asarray(rng_r.randn(size).astype(np.float32)) + rank_data = cp.clip(rank_data, -240.0, 240.0) + rank_data_fp8 = float_to_e4m3fn(rank_data) + ref_f32 += e4m3fn_to_float(rank_data_fp8) + + # Compute errors + abs_err = cp.abs(result_f32 - ref_f32) + mean_abs_err = float(cp.mean(abs_err)) + errors[accum_label] = mean_abs_err + + # Reset between runs + algo.reset() + + # Higher-precision accumulation should be at least as accurate as native fp8 + assert ( + errors["float16"] <= errors["fp8_native"] + 1e-6 + ), f"float16 accum ({errors['float16']:.6f}) worse than native ({errors['fp8_native']:.6f})" + assert ( + errors["float32"] <= errors["fp8_native"] + 1e-6 + ), f"float32 accum ({errors['float32']:.6f}) worse than native ({errors['fp8_native']:.6f})" + + +# --------------------------------------------------------------------------- +# Test: FP8 E4M3B15 accumulation correctness +# --------------------------------------------------------------------------- + + +@parametrize_mpi_groups(8) +@pytest.mark.parametrize( + "algo_name", + [ + "default_allreduce_packet", + "default_allreduce_nvls_packet", + "default_allreduce_rsag_zero_copy", + "default_allreduce_fullmesh", + "default_allreduce_allpair_packet", + ], +) +@pytest.mark.parametrize("size", [1024, 4096, 65536]) +def test_fp8_e4m3b15_accum(mpi_group: MpiGroup, algo_name: str, size: int): + """Verify that FP8 E4M3B15 allreduce with higher-precision accumulation is at + least as accurate as native E4M3B15 accumulation.""" + rank = mpi_group.comm.rank + world_size = mpi_group.comm.size + + comm_group, algo_map, scratch = setup_algorithms(mpi_group) + if algo_name not in algo_map: + pytest.skip(f"{algo_name} not available") + if "nvls" in algo_name and not is_nvls_supported(): + pytest.skip(f"{algo_name} requires NVLS which is not supported on this platform") + + algo = algo_map[algo_name] + buf = GpuBuffer(size, dtype=cp.uint8) + + accum_configs = [ + ("e4m3b15_native", DataType.float8_e4m3b15), + ("float16", DataType.float16), + ("float32", DataType.float32), + ] + + # rsag_zero_copy needs explicit block/thread counts, scaled to data size + if "rsag" in algo_name: + nb = max(1, min(32, size // (world_size * 
32))) + nt = 1024 + else: + nb = 0 + nt = 0 + + errors = {} + for accum_label, accum_dtype in accum_configs: + # Generate deterministic per-rank random uint8 values in valid e4m3b15 range + rng = np.random.RandomState(42 + rank) + raw = cp.asarray(rng.randint(0, 0x78, (size,)).astype(np.uint8)) + signs = cp.asarray(rng.randint(0, 2, (size,)).astype(np.uint8)) << 7 + src_uint8 = raw | signs + # Fix negative zero -> positive zero + src_uint8 = cp.where(src_uint8 == 0x80, cp.uint8(0), src_uint8) + + # Copy into symmetric buffer + buf[:] = src_uint8 + cp.cuda.Device().synchronize() + + # Run allreduce + result = run_allreduce( + algo, + comm_group, + buf, + dtype=DataType.float8_e4m3b15, + accum_dtype=accum_dtype, + nblocks=nb, + nthreads_per_block=nt, + ) + + # Decode result + result_f32 = e4m3b15_to_float(result) + + # Compute float32 reference + ref_f32 = cp.zeros(size, dtype=cp.float32) + for r in range(world_size): + rng_r = np.random.RandomState(42 + r) + raw_r = cp.asarray(rng_r.randint(0, 0x78, (size,)).astype(np.uint8)) + signs_r = cp.asarray(rng_r.randint(0, 2, (size,)).astype(np.uint8)) << 7 + bits_r = raw_r | signs_r + bits_r = cp.where(bits_r == 0x80, cp.uint8(0), bits_r) + ref_f32 += e4m3b15_to_float(bits_r) + + # Clamp reference to e4m3b15 representable range + ref_f32 = cp.clip(ref_f32, -0.9375, 0.9375) + + # Compute errors (only on valid entries) + valid = ~cp.isnan(result_f32) & ~cp.isnan(ref_f32) + abs_err = cp.abs(result_f32[valid] - ref_f32[valid]) + mean_abs_err = float(cp.mean(abs_err)) if abs_err.size > 0 else 0.0 + errors[accum_label] = mean_abs_err + + algo.reset() + + # Higher-precision accumulation should be at least as accurate as native + assert ( + errors["float16"] <= errors["e4m3b15_native"] + 1e-8 + ), f"float16 accum ({errors['float16']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})" + assert ( + errors["float32"] <= errors["e4m3b15_native"] + 1e-8 + ), f"float32 accum ({errors['float32']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})" diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index a6899642..6b3119cb 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -162,13 +162,10 @@ def create_connection(group: CommGroup, connection_type: str): def create_group_and_connection(mpi_group: MpiGroup, connection_type: str): if (connection_type == "NVLink" or connection_type == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: pytest.skip("cannot use nvlink/nvls for cross node") + if connection_type == "IB" and os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0": + pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1") group = CommGroup(mpi_group.comm) - try: - connection = create_connection(group, connection_type) - except Error as e: - if connection_type == "IB" and e.args[0] == ErrorCode.InvalidUsage: - pytest.skip("IB not supported on this node") - raise + connection = create_connection(group, connection_type) return group, connection @@ -281,6 +278,8 @@ def test_connection_write_and_signal(mpi_group: MpiGroup, connection_type: str, @parametrize_mpi_groups(2, 4, 8, 16) def test_h2h_semaphores(mpi_group: MpiGroup): + if os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0": + pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1") group = CommGroup(mpi_group.comm) tran = group.my_ib_device(group.my_rank % 8) endpoint = EndpointConfig(tran, Device(DeviceType.CPU)) @@ -301,6 +300,8 @@ def test_h2h_semaphores(mpi_group: MpiGroup): @parametrize_mpi_groups(2, 4, 8, 
16)
 def test_h2h_semaphores_gil_release(mpi_group: MpiGroup):
+    if os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0":
+        pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1")
     group = CommGroup(mpi_group.comm)
     tran = group.my_ib_device(group.my_rank % 8)
     endpoint = EndpointConfig(tran, Device(DeviceType.CPU))
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index c1aa25bb..9ca5fed3 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -28,6 +28,16 @@ if(MSCCLPP_USE_IB)
   target_include_directories(mscclpp_obj SYSTEM PRIVATE ${IBVERBS_INCLUDE_DIRS})
   target_link_libraries(mscclpp_obj PRIVATE ${IBVERBS_LIBRARIES})
   target_compile_definitions(mscclpp_obj PUBLIC USE_IBVERBS)
+  if(MLX5_FOUND)
+    target_include_directories(mscclpp_obj SYSTEM PRIVATE ${MLX5_INCLUDE_DIRS})
+    target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_MLX5DV)
+  endif()
+endif()
+
+if(MSCCLPP_USE_GDRCOPY)
+  target_include_directories(mscclpp_obj SYSTEM PRIVATE ${GDRCOPY_INCLUDE_DIRS})
+  target_link_libraries(mscclpp_obj PRIVATE ${GDRCOPY_LIBRARIES})
+  target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_GDRCOPY)
 endif()
 
 set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
diff --git a/src/core/algorithm.cc b/src/core/algorithm.cc
index 07da9045..a492ee6a 100644
--- a/src/core/algorithm.cc
+++ b/src/core/algorithm.cc
@@ -3,6 +3,7 @@
 
 #include
 #include
+#include
 
 #include "logger.hpp"
 
@@ -40,19 +41,21 @@ NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitF
 CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output,
                                     size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op,
                                     cudaStream_t stream, std::shared_ptr<Executor>, int nBlocks, int nThreadsPerBlock,
-                                    const std::unordered_map<std::string, uintptr_t>& extras) {
+                                    bool symmetricMemory, const std::unordered_map<std::string, uintptr_t>& extras,
+                                    DataType accumDtype) {
+  if (accumDtype == DataType::AUTO) accumDtype = dtype;
   if (!initialized_) {
     initFunc_(comm);
     initialized_ = true;
   }
-  AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype);
+  AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype, symmetricMemory);
   auto it = contexts_.find(ctxKey);
   if (it == contexts_.end()) {
     auto ctx = contextInitFunc_(comm, input, output, inputSize, outputSize, dtype);
     contexts_[ctxKey] = ctx;
   }
   return kernelLaunchFunc_(contexts_[ctxKey], input, output, inputSize, outputSize, dtype, op, stream, nBlocks,
-                           nThreadsPerBlock, extras);
+                           nThreadsPerBlock, extras, accumDtype);
 }
 
 const std::string& NativeAlgorithm::name() const { return name_; }
@@ -65,6 +68,11 @@ const std::pair<size_t, size_t>& NativeAlgorithm::messageRange() const {
   return range;
 }
 
+void NativeAlgorithm::setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) {
+  minMessageSize_ = minMessageSize;
+  maxMessageSize_ = maxMessageSize;
+}
+
 const std::unordered_map<std::string, int>& NativeAlgorithm::tags() const { return tags_; }
 
 const CollectiveBufferMode& NativeAlgorithm::bufferMode() const { return bufferMode_; }
@@ -142,6 +150,10 @@ const std::pair<size_t, size_t>& DslAlgorithm::messageRange() const {
   return range;
 }
 
+void DslAlgorithm::setMessageSizeRange(size_t, size_t) {
+  THROW(EXEC, Error, ErrorCode::InvalidUsage, "setMessageSizeRange is only supported for native algorithms");
+}
+
 const std::unordered_map<std::string, int>& DslAlgorithm::tags() const { return tags_; }
 
 const CollectiveBufferMode& DslAlgorithm::bufferMode() const {
@@ -155,8 +167,8 @@
 Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; }
 
 CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output,
                                  size_t inputSize, size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream,
-                                 std::shared_ptr<Executor> executor, int, int,
-                                 const std::unordered_map<std::string, uintptr_t>&) {
+                                 std::shared_ptr<Executor> executor, int, int, bool,
+                                 const std::unordered_map<std::string, uintptr_t>&, DataType) {
   if (!executor) {
     THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute");
   }
@@ -173,15 +185,19 @@ CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void*
                         stream);
       break;
 #if defined(__FP8_TYPES_EXIST__)
-    case DataType::FP8_E4M3:
-      executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FP8_E4M3, plan_,
-                        stream);
+    case DataType::FLOAT8_E4M3:
+      executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FLOAT8_E4M3,
+                        plan_, stream);
       break;
-    case DataType::FP8_E5M2:
-      executor->execute(rank, (__fp8_e5m2*)input, (__fp8_e5m2*)output, inputSize, outputSize, DataType::FP8_E5M2, plan_,
-                        stream);
+    case DataType::FLOAT8_E5M2:
+      executor->execute(rank, (__fp8_e5m2*)input, (__fp8_e5m2*)output, inputSize, outputSize, DataType::FLOAT8_E5M2,
+                        plan_, stream);
       break;
 #endif
+    case DataType::FLOAT8_E4M3B15:
+      executor->execute(rank, (__fp8_e4m3b15*)input, (__fp8_e4m3b15*)output, inputSize, outputSize,
+                        DataType::FLOAT8_E4M3B15, plan_, stream);
+      break;
     case DataType::INT32:
     case DataType::UINT32:
       executor->execute(rank, (int*)input, (int*)output, inputSize, outputSize, DataType::UINT32, plan_, stream);
@@ -198,4 +214,23 @@ std::shared_ptr<Algorithm> DslAlgorithm::build() { return shared_from_this(); }
 
 // TODO: implement this
 void DslAlgorithm::reset() {}
 
+static uint32_t* gDefaultFlagBuffer = nullptr;
+static std::weak_ptr<uint32_t> gDefaultFlagBufferWeak;
+static size_t gDefaultFlagCount = 128;
+
+std::pair<std::shared_ptr<uint32_t>, size_t> getFlagBuffer() {
+  auto ptr = gDefaultFlagBufferWeak.lock();
+  if (!ptr) {
+    if (!gDefaultFlagBuffer) {
+      // Intentionally never freed — CUDA driver reclaims GPU memory at process exit.
+      gDefaultFlagBuffer = static_cast<uint32_t*>(mscclpp::detail::gpuCalloc(gDefaultFlagCount * sizeof(uint32_t)));
+      std::vector<uint32_t> initFlags(gDefaultFlagCount, 1);
+      mscclpp::gpuMemcpy(gDefaultFlagBuffer, initFlags.data(), gDefaultFlagCount, cudaMemcpyHostToDevice);
+    }
+    ptr = std::shared_ptr<uint32_t>(gDefaultFlagBuffer, [](void*) {});
+    gDefaultFlagBufferWeak = ptr;
+  }
+  return {ptr, gDefaultFlagCount * sizeof(uint32_t)};
+}
+
 }  // namespace mscclpp
diff --git a/src/core/communicator.cc b/src/core/communicator.cc
index a146f0de..c95ca421 100644
--- a/src/core/communicator.cc
+++ b/src/core/communicator.cc
@@ -4,7 +4,6 @@
 #include "communicator.hpp"
 
 #include "api.h"
-#include "debug.h"
 
 namespace mscclpp {
 
diff --git a/src/core/connection.cc b/src/core/connection.cc
index 10a43e88..8b6c0afb 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -7,7 +7,8 @@
 #include
 #endif
 
-#include
+#include
+#include
 #include
 #include
 #include
@@ -15,6 +16,7 @@
 #include "api.h"
 #include "context.hpp"
 #include "endpoint.hpp"
+#include "gpu_utils_internal.hpp"
 #include "logger.hpp"
 
 namespace mscclpp {
@@ -180,25 +182,185 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) {
 
 // IBConnection
 
+void IBConnection::recvThreadFunc() {
+  // Set the CUDA device context for this thread
+  if (localGpuDeviceId_ >= 0) {
+    cudaError_t err = cudaSetDevice(localGpuDeviceId_);
+    if (err != cudaSuccess) {
+      WARN(NET, "IBConnection recvThreadFunc: cudaSetDevice(", localGpuDeviceId_,
+           ") failed: ", cudaGetErrorString(err));
+      return;
+    }
+    // Bind this thread to the NUMA node of the local GPU for optimal memory access
+    int deviceNumaNode = getDeviceNumaNode(localGpuDeviceId_);
+    if (deviceNumaNode >= 0) {
+      numaBind(deviceNumaNode);
+    }
+  }
+
+  uint32_t lastImmData = 0;
+  uint64_t immHighBits = 0;
+  uint64_t newValueHost = 0;
+
+  auto qp = qp_.lock();
+  if (!qp) return;
+
+  while (!stopRecvThread_.load(std::memory_order_relaxed)) {
+    int wcNum = qp->pollRecvCq();
+    if (wcNum < 0) {
+      recvThreadErrorMsg_ = "pollRecvCq failed";
+      recvThreadError_.store(true, std::memory_order_release);
+      WARN(NET, "IBConnection recvThreadFunc: ", recvThreadErrorMsg_);
+      break;
+    }
+
+    for (int i = 0; i < wcNum; ++i) {
+      int status = qp->getRecvWcStatus(i);
+      if (status != static_cast<int>(WsStatus::Success)) {
+        // A failed recv WC typically means the QP entered error state (e.g., WR Flushed Error).
+        // All remaining WRs will also fail — no recovery without QP recreation. Exit the thread
+        // and set the error flag so the main thread can detect it.
+        recvThreadErrorMsg_ = std::string("recv work completion failed: ") + qp->getRecvWcStatusString(i);
+        recvThreadError_.store(true, std::memory_order_release);
+        WARN(NET, "IBConnection recvThreadFunc: ", recvThreadErrorMsg_);
+        return;
+      }
+
+      // Read the lower 32 bits of the token from imm_data. Reconstruct the full 64-bit value
+      // using wrap-around detection: tokens increase monotonically, so if the new lower 32 bits
+      // are less than the previous value, the upper 32 bits must have incremented by 1.
+      uint32_t immData = qp->getRecvWcImmData(i);
+      if (immData < lastImmData) {
+        immHighBits += (1ULL << 32);
+      }
+      lastImmData = immData;
+      newValueHost = immHighBits | static_cast<uint64_t>(immData);
+
+      // Forward the token to the semaphore's inbound token address via atomicStore
+      // through the GDRCopy BAR1 mapping. The GPU reads with system-scope acquire.
+ if (signalAddr_ != 0) { + if (signalGdrMap_ && signalGdrMap_->valid()) { + atomicStore(signalGdrMap_->hostPtr(), newValueHost, memoryOrderRelaxed); + } else { + // For HIP/ROCm. + // NOTE: may need a fix in the future to ensure BAR1 mapping. + *reinterpret_cast(signalAddr_) = newValueHost; + } + } + + // Post another recv for future messages + qp->stageRecv(/*wrId=*/0); + qp->postRecv(); + } + } +} + IBConnection::IBConnection(std::shared_ptr context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) : BaseConnection(context, localEndpoint), transport_(localEndpoint.transport()), remoteTransport_(remoteEndpoint.transport()), - dummyAtomicSource_(std::make_unique(0)) { + atomicSrc_(std::make_unique(0)), + ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_), + gdrSignalForwarding_(false), + stopRecvThread_(false), + recvThreadError_(false), + localGpuDeviceId_(localEndpoint.device().id), + signalAddr_(0) { qp_ = getImpl(localEndpoint).ibQp_; qp_.lock()->rtr(getImpl(remoteEndpoint).ibQpInfo_); qp_.lock()->rts(); - dummyAtomicSourceMem_ = context->registerMemory(dummyAtomicSource_.get(), sizeof(uint64_t), transport_); - validateTransport(dummyAtomicSourceMem_, transport_); - dstTransportInfo_ = getImpl(dummyAtomicSourceMem_).getTransportInfo(transport_); - INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created"); + atomicSrcMem_ = context->registerMemory(atomicSrc_.get(), sizeof(uint64_t), transport_); + validateTransport(atomicSrcMem_, transport_); + atomicSrcTransportInfo_ = getImpl(atomicSrcMem_).getTransportInfo(transport_); + + if (ibNoAtomic_) { +#if defined(MSCCLPP_USE_CUDA) + // On CUDA, HostNoAtomic requires GDRCopy for CPU→GPU signal forwarding through BAR1. + if (!gdrEnabled()) { + THROW(CONN, Error, ErrorCode::InvalidUsage, + "IB host-no-atomic mode on CUDA requires GDRCopy: ", gdrStatusMessage()); + } + gdrSignalForwarding_ = true; +#endif // defined(MSCCLPP_USE_CUDA) + + // On platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200 + // NVLink-C2C), HostNoAtomic requires Data Direct for correct memory ordering. Data Direct + // routes NIC DMA through the PCIe Data Direct engine, bypassing the bridge. It is available + // on Virtual Function (VF) devices. On platforms without such a bridge (x86, non-Grace + // aarch64), HostNoAtomic works without Data Direct. + // + // We cannot reliably detect the bridge at compile time or runtime, so we emit a warning + // when the device is not a VF. If data corruption occurs, switching to VF devices with + // Data Direct or using IbMode::Host with RDMA atomics will resolve it. + { + IbCtx* ibCtx = getImpl(*context).getIbContext(transport_); + if (!ibCtx->isVirtualFunction()) { + WARN(CONN, + "IB HostNoAtomic mode without a Virtual Function (VF) device may cause data corruption " + "on platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200). " + "Device ", + ibCtx->getDevName(), + " is not a VF. " + "If you experience data corruption, use VF devices with Data Direct or IbMode::Host."); + } + } + + // Pre-post receive requests for incoming WRITE_WITH_IMM notifications. + // The recv CQE guarantees the preceding data WRITE has been committed to GPU memory. + auto qp = qp_.lock(); + int maxRecvWr = localEndpoint.config().ib.maxRecvWr; + for (int i = 0; i < maxRecvWr; ++i) { + qp->stageRecv(/*wrId=*/0); + } + qp->postRecv(); + // The recv thread is started later in startSignalForwarding() when the semaphore + // provides the signal forwarding destination. 
This ensures the thread lifetime is + // bounded by the GdrMap lifetime (created before start, destroyed after stop). + INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with signal forwarding (HostNoAtomic) mode"); + } else { + INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with atomic mode"); + } } +IBConnection::~IBConnection() { stopSignalForwarding(); } + Transport IBConnection::transport() const { return transport_; } Transport IBConnection::remoteTransport() const { return remoteTransport_; } +bool IBConnection::isSignalForwarding() const { return ibNoAtomic_; } + +void IBConnection::startSignalForwarding(std::shared_ptr mem) { + // Set up the forwarding destination and GdrMap, then start the recv thread. + // Order: set address → create GdrMap → start thread. + signalAddr_ = reinterpret_cast(mem.get()); + if (gdrSignalForwarding_) { + signalGdrMap_ = std::make_unique(std::move(mem), localGpuDeviceId_); + } + if (ibNoAtomic_) { + stopRecvThread_.store(false, std::memory_order_relaxed); + recvThread_ = std::thread([this]() { this->recvThreadFunc(); }); + } + INFO(CONN, "IBConnection startSignalForwarding: ", (void*)signalAddr_); +} + +void IBConnection::stopSignalForwarding() { + // Stop the recv thread, then tear down GdrMap and address. + // Order: stop thread → destroy GdrMap → clear address. + if (ibNoAtomic_) { + stopRecvThread_.store(true, std::memory_order_relaxed); + if (recvThread_.joinable()) { + recvThread_.join(); + } + } + if (gdrSignalForwarding_) { + signalGdrMap_.reset(); + } + signalAddr_ = 0; + INFO(CONN, "IBConnection stopSignalForwarding"); +} + void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_CONN_IB_WRITE_ENTRY) @@ -220,8 +382,8 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem auto dstMrInfo = dstTransportInfo.ibMrInfo; auto srcMr = srcTransportInfo.ibMr; - qp_.lock()->stageSend(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, - /*signaled=*/true); + qp_.lock()->stageSendWrite(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, + /*dstOffset=*/dstOffset, /*signaled=*/true); qp_.lock()->postSend(); INFO(CONN, "IBConnection write: from ", (uint8_t*)srcMr->getBuff() + srcOffset, " to ", @@ -248,12 +410,32 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6 uint64_t oldValue = *src; *src = newValue; - qp_.lock()->stageAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue, - /*signaled=*/true); - - qp_.lock()->postSend(); - INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue, - " -> ", newValue); + if (ibNoAtomic_) { + // Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the lower 32 bits of the + // token in imm_data. The receiver reconstructs the full 64-bit value using wrap-around + // detection (tokens are monotonically increasing, so a decrease in the lower 32 bits + // indicates the upper 32 bits incremented by 1). 
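Stepping back to startSignalForwarding/stopSignalForwarding above: their mirrored ordering (address, then mapping, then thread on start; thread, then mapping, then address on stop) is the standard discipline for a polling thread that dereferences an owned mapping. A minimal sketch, with a heap allocation standing in for the GdrMap BAR1 mapping:

    #include <atomic>
    #include <memory>
    #include <thread>

    struct Forwarder {
      std::atomic<bool> stop{false};
      std::unique_ptr<uint64_t> mapping;  // stand-in for the BAR1 mapping
      std::thread worker;

      void start() {
        mapping = std::make_unique<uint64_t>(0);  // create the mapping first
        worker = std::thread([this] {
          while (!stop.load(std::memory_order_relaxed)) ++*mapping;
        });
      }
      void shutdown() {
        stop.store(true, std::memory_order_relaxed);
        if (worker.joinable()) worker.join();  // join before tearing down...
        mapping.reset();                       // ...what the thread dereferences
      }
    };

    int main() {
      Forwarder f;
      f.start();
      f.shutdown();
      return 0;
    }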
+ if (newValue <= oldValue) { + WARN(CONN, "IBConnection signal forwarding: token is not monotonically increasing: ", oldValue, " -> ", newValue); + } else if (newValue - oldValue >= (1ULL << 32)) { + WARN(CONN, + "IBConnection signal forwarding: token increment too large for 32-bit wrap-around detection: ", oldValue, + " -> ", newValue, " (delta ", newValue - oldValue, " >= 2^32)"); + } + unsigned int immData = static_cast(newValue); + qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo, + /*size=*/0, /*wrId=*/0, + /*srcOffset=*/0, /*dstOffset=*/0, + /*signaled=*/true, /*immData=*/immData); + qp_.lock()->postSend(); + INFO(CONN, "IBConnection signal forwarding: value ", oldValue, " -> ", newValue); + } else { + qp_.lock()->stageSendAtomicAdd(atomicSrcTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue, + /*signaled=*/true); + qp_.lock()->postSend(); + INFO(CONN, "IBConnection atomic write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue, + " -> ", newValue); + } #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_CONN_IB_UPDATE_AND_SYNC_EXIT) NpKit::CollectCpuEvent(NPKIT_EVENT_CONN_IB_UPDATE_AND_SYNC_EXIT, 0, 0, *NpKit::GetCpuTimestamp(), 0); @@ -265,22 +447,27 @@ void IBConnection::flush(int64_t timeoutUsec) { NpKit::CollectCpuEvent(NPKIT_EVENT_CONN_IB_FLUSH_ENTRY, 0, 0, *NpKit::GetCpuTimestamp(), 0); #endif + // Check if the recv thread has already reported an error (e.g., QP entered error state). + if (recvThreadError_.load(std::memory_order_acquire)) { + THROW(CONN, Error, ErrorCode::SystemError, "IBConnection recv thread failed: ", recvThreadErrorMsg_); + } + Timer timer; - while (qp_.lock()->getNumCqItems()) { - int wcNum = qp_.lock()->pollCq(); + while (qp_.lock()->getNumSendCqItems()) { + int wcNum = qp_.lock()->pollSendCq(); if (wcNum < 0) { - THROW(NET, IbError, errno, "pollCq failed"); + THROW(NET, IbError, errno, "pollSendCq failed"); } else if (timeoutUsec >= 0) { auto elapsed = timer.elapsed(); if (elapsed > timeoutUsec) { - THROW(CONN, Error, ErrorCode::Timeout, "pollCq timed out: waited for ", elapsed / 1e6, " seconds. Expected ", - qp_.lock()->getNumCqItems(), " signals"); + THROW(CONN, Error, ErrorCode::Timeout, "pollSendCq timed out: waited for ", elapsed / 1e6, + " seconds. 
Expected ", qp_.lock()->getNumSendCqItems(), " signals"); } } for (int i = 0; i < wcNum; ++i) { - int status = qp_.lock()->getWcStatus(i); + int status = qp_.lock()->getSendWcStatus(i); if (status != static_cast(WsStatus::Success)) { - THROW(NET, Error, ErrorCode::SystemError, "an IB work item failed: ", qp_.lock()->getWcStatusString(i)); + THROW(NET, Error, ErrorCode::SystemError, "an IB work item failed: ", qp_.lock()->getSendWcStatusString(i)); } } } diff --git a/src/core/context.cc b/src/core/context.cc index 9bf299d3..aabe71df 100644 --- a/src/core/context.cc +++ b/src/core/context.cc @@ -23,14 +23,14 @@ void CudaIpcStream::setStreamIfNeeded() { } } -void CudaIpcStream::memcpyD2D(void *dst, const void *src, size_t nbytes) { +void CudaIpcStream::memcpyD2D(void* dst, const void* src, size_t nbytes) { CudaDeviceGuard deviceGuard(deviceId_); setStreamIfNeeded(); MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, *stream_)); dirty_ = true; } -void CudaIpcStream::memcpyH2D(void *dst, const void *src, size_t nbytes) { +void CudaIpcStream::memcpyH2D(void* dst, const void* src, size_t nbytes) { CudaDeviceGuard deviceGuard(deviceId_); setStreamIfNeeded(); MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, *stream_)); @@ -46,9 +46,7 @@ void CudaIpcStream::sync() { } } -Context::Impl::Impl() {} - -IbCtx *Context::Impl::getIbContext(Transport ibTransport) { +IbCtx* Context::Impl::getIbContext(Transport ibTransport) { // Find IB context or create it auto it = ibContexts_.find(ibTransport); if (it == ibContexts_.end()) { @@ -70,7 +68,7 @@ MSCCLPP_API_CPP Context::Context() : pimpl_(std::make_unique()) {} MSCCLPP_API_CPP Context::~Context() = default; -MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void *ptr, size_t size, TransportFlags transports) { +MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void* ptr, size_t size, TransportFlags transports) { return RegisteredMemory(std::make_shared(ptr, size, transports, *pimpl_)); } @@ -78,7 +76,7 @@ MSCCLPP_API_CPP Endpoint Context::createEndpoint(EndpointConfig config) { return Endpoint(std::make_shared(config, *pimpl_)); } -MSCCLPP_API_CPP Connection Context::connect(const Endpoint &localEndpoint, const Endpoint &remoteEndpoint) { +MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) { if (localEndpoint.device().type == DeviceType::GPU && localEndpoint.device().id < 0) { throw Error("No GPU device ID provided for local endpoint", ErrorCode::InvalidUsage); } diff --git a/src/core/endpoint.cc b/src/core/endpoint.cc index 3833fdc4..fe51e348 100644 --- a/src/core/endpoint.cc +++ b/src/core/endpoint.cc @@ -4,9 +4,13 @@ #include "endpoint.hpp" #include +#include #include "api.h" #include "context.hpp" +#include "ib.hpp" +#include "logger.hpp" +#include "registered_memory.hpp" #include "serialization.hpp" #include "socket.h" #include "utils_internal.hpp" @@ -23,9 +27,36 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl) if (config_.maxWriteQueueSize <= 0) { config_.maxWriteQueueSize = config_.ib.maxCqSize; } + + // Determine if we should use no-atomics mode + ibNoAtomic_ = false; + if (config_.ib.mode == EndpointConfig::Ib::Mode::HostNoAtomic) { + ibNoAtomic_ = true; + } else if (config_.ib.mode == EndpointConfig::Ib::Mode::Default) { + // Use environment variable when mode is Default + ibNoAtomic_ = (env()->ibvMode == "host-no-atomic"); + } + + // If mode is Host (or Default resolved to host), check if 
atomics are supported.
+  if (!ibNoAtomic_) {
+    IbCtx* ibCtx = contextImpl.getIbContext(config_.transport);
+    if (!ibCtx->supportsRdmaAtomics()) {
+      WARN(NET, "IB device ", ibCtx->getDevName(),
+           " does not support RDMA atomics. Falling back to write-with-immediate mode (HostNoAtomic).");
+      ibNoAtomic_ = true;
+    }
+  }
+
+  // Resolve GID index: an explicit value (>= 0) takes priority, otherwise use the env.
+  if (config_.ib.gidIndex < 0) {
+    config_.ib.gidIndex = env()->ibGidIndex;
+  }
+
+  int maxRecvWr = ibNoAtomic_ ? config_.ib.maxRecvWr : 0;
+
   ibQp_ = contextImpl.getIbContext(config_.transport)
               ->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
-                         config_.ib.maxSendWr, 0, config_.ib.maxWrPerSend);
+                         config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
   ibQpInfo_ = ibQp_->getInfo();
 } else if (config_.transport == Transport::Ethernet) {
   // Configuring Ethernet Interfaces
@@ -48,6 +79,7 @@ Endpoint::Impl::Impl(const std::vector<char>& serialization) {
   if (AllIBTransports.has(config_.transport)) {
     ibLocal_ = false;
     it = detail::deserialize(it, ibQpInfo_);
+    it = detail::deserialize(it, ibNoAtomic_);
   } else if (config_.transport == Transport::Ethernet) {
     it = detail::deserialize(it, socketAddress_);
   }
@@ -77,6 +109,7 @@ MSCCLPP_API_CPP std::vector<char> Endpoint::serialize() const {
   detail::serialize(data, pimpl_->pidHash_);
   if (AllIBTransports.has(pimpl_->config_.transport)) {
     detail::serialize(data, pimpl_->ibQpInfo_);
+    detail::serialize(data, pimpl_->ibNoAtomic_);
   } else if (pimpl_->config_.transport == Transport::Ethernet) {
     detail::serialize(data, pimpl_->socketAddress_);
   }
diff --git a/src/core/env.cpp b/src/core/env.cpp
index 35a31f4c..7a42471b 100644
--- a/src/core/env.cpp
+++ b/src/core/env.cpp
@@ -54,18 +54,20 @@ Env::Env()
       logFile(readEnv("MSCCLPP_LOG_FILE", "")),
       hcaDevices(readEnv("MSCCLPP_HCA_DEVICES", "")),
       ibvSo(readEnv("MSCCLPP_IBV_SO", "")),
+      ibvMode(readEnv("MSCCLPP_IBV_MODE", "host")),
       hostid(readEnv("MSCCLPP_HOSTID", "")),
       socketFamily(readEnv("MSCCLPP_SOCKET_FAMILY", "")),
       socketIfname(readEnv("MSCCLPP_SOCKET_IFNAME", "")),
       commId(readEnv("MSCCLPP_COMM_ID", "")),
-      executionPlanDir(readEnv("MSCCLPP_EXECUTION_PLAN_DIR",
-                               readEnv("HOME", "~") + "/.cache/mscclpp_default")),
+      cacheDir(readEnv("MSCCLPP_CACHE_DIR", readEnv("HOME", "~") + "/.cache/mscclpp")),
       npkitDumpDir(readEnv("MSCCLPP_NPKIT_DUMP_DIR", "")),
       cudaIpcUseDefaultStream(readEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", false)),
       ncclSharedLibPath(readEnv("MSCCLPP_NCCL_LIB_PATH", "")),
       forceNcclFallbackOperation(readEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
-      disableChannelCache(readEnv("MSCCLPP_DISABLE_CHANNEL_CACHE", false)),
-      forceDisableNvls(readEnv("MSCCLPP_FORCE_DISABLE_NVLS", false)) {}
+      ncclSymmetricMemory(readEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
+      forceDisableNvls(readEnv("MSCCLPP_FORCE_DISABLE_NVLS", false)),
+      forceDisableGdr(readEnv("MSCCLPP_FORCE_DISABLE_GDR", false)),
+      ibGidIndex(readEnv("MSCCLPP_IB_GID_INDEX", 0)) {}
 
 std::shared_ptr<Env> env() {
   static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
@@ -81,17 +83,20 @@ std::shared_ptr<Env> env() {
     logEnv("MSCCLPP_LOG_FILE", globalEnv->logFile);
     logEnv("MSCCLPP_HCA_DEVICES", globalEnv->hcaDevices);
     logEnv("MSCCLPP_IBV_SO", globalEnv->ibvSo);
+    logEnv("MSCCLPP_IBV_MODE", globalEnv->ibvMode);
     logEnv("MSCCLPP_HOSTID", globalEnv->hostid);
     logEnv("MSCCLPP_SOCKET_FAMILY", globalEnv->socketFamily);
     logEnv("MSCCLPP_SOCKET_IFNAME", globalEnv->socketIfname);
     logEnv("MSCCLPP_COMM_ID",
globalEnv->commId); - logEnv("MSCCLPP_EXECUTION_PLAN_DIR", globalEnv->executionPlanDir); + logEnv("MSCCLPP_CACHE_DIR", globalEnv->cacheDir); logEnv("MSCCLPP_NPKIT_DUMP_DIR", globalEnv->npkitDumpDir); logEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", globalEnv->cudaIpcUseDefaultStream); logEnv("MSCCLPP_NCCL_LIB_PATH", globalEnv->ncclSharedLibPath); logEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", globalEnv->forceNcclFallbackOperation); - logEnv("MSCCLPP_DISABLE_CHANNEL_CACHE", globalEnv->disableChannelCache); + logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory); logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls); + logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr); + logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex); } return globalEnv; } diff --git a/src/core/executor/execution_kernel.cu b/src/core/executor/execution_kernel.cu index 4b1b06bc..28ced77f 100644 --- a/src/core/executor/execution_kernel.cu +++ b/src/core/executor/execution_kernel.cu @@ -32,6 +32,17 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else ); +#endif + break; + case DataType::UINT8: + executionKernel<<>>( + rank, (uint8_t*)src, (uint8_t*)dst, (uint8_t*)scratch, scratchOffset, scratchChunkSize, plan, semaphores, + localMemoryIdBegin, flag +#if defined(ENABLE_NPKIT) + , + NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); +#else + ); #endif break; case DataType::FLOAT16: @@ -67,10 +78,16 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo ); #endif break; - case DataType::FP8_E4M3: - case DataType::FP8_E5M2: + case DataType::FLOAT8_E4M3: + case DataType::FLOAT8_E5M2: // FP8 is not supported in CUDA execution kernel. break; + case DataType::FLOAT8_E4M3B15: + // fp8_e4m3b15 is a software type not supported in the CUDA execution kernel. + break; + case DataType::AUTO: + // AUTO is a sentinel resolved before reaching this point; nothing to do. + break; } } diff --git a/src/core/gdr.cc b/src/core/gdr.cc new file mode 100644 index 00000000..22ac15c9 --- /dev/null +++ b/src/core/gdr.cc @@ -0,0 +1,204 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+
+#include "gdr.hpp"
+
+#if defined(MSCCLPP_USE_GDRCOPY)
+
+#include <gdrapi.h>
+#include <unistd.h>
+
+#include <mscclpp/env.hpp>
+#include <mscclpp/gpu_utils.hpp>
+
+#include "logger.hpp"
+
+#ifndef GPU_PAGE_SHIFT
+#define GPU_PAGE_SHIFT 16
+#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
+#define GPU_PAGE_MASK (~(GPU_PAGE_SIZE - 1))
+#endif
+
+namespace mscclpp {
+
+// GdrContext
+
+class GdrContext {
+ public:
+  GdrContext();
+  ~GdrContext();
+
+  GdrContext(const GdrContext&) = delete;
+  GdrContext& operator=(const GdrContext&) = delete;
+
+  GdrStatus status() const { return status_; }
+  gdr_t handle() const { return handle_; }
+
+ private:
+  GdrStatus status_;
+  gdr_t handle_;
+};
+
+static std::shared_ptr<GdrContext> gdrContext() {
+  static auto instance = std::make_shared<GdrContext>();
+  return instance;
+}
+
+GdrStatus gdrStatus() { return gdrContext()->status(); }
+
+bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; }
+
+const char* gdrStatusMessage() {
+  switch (gdrStatus()) {
+    case GdrStatus::Ok:
+      return "GDRCopy initialized successfully";
+    case GdrStatus::NotBuilt:
+      return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
+    case GdrStatus::Disabled:
+      return "GDRCopy is disabled via MSCCLPP_FORCE_DISABLE_GDR environment variable";
+    case GdrStatus::DriverMissing:
+      return "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
+    case GdrStatus::OpenFailed:
+      return "gdr_open() failed; GDRCopy driver may be misconfigured";
+    default:
+      return "unknown GDRCopy status";
+  }
+}
+
+GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr) {
+  if (env()->forceDisableGdr) {
+    INFO(GPU, "GDRCopy disabled via MSCCLPP_FORCE_DISABLE_GDR");
+    status_ = GdrStatus::Disabled;
+    return;
+  }
+
+  // Auto-detect: check if the driver is available.
+  if (access("/dev/gdrdrv", F_OK) != 0) {
+    INFO(GPU, "GDRCopy driver not detected, disabling GDRCopy");
+    status_ = GdrStatus::DriverMissing;
+    return;
+  }
+
+  handle_ = gdr_open();
+  if (handle_ == nullptr) {
+    INFO(GPU, "gdr_open() failed, disabling GDRCopy");
+    status_ = GdrStatus::OpenFailed;
+    return;
+  }
+
+  status_ = GdrStatus::Ok;
+  INFO(GPU, "GDRCopy initialized successfully");
+}
+
+GdrContext::~GdrContext() {
+  if (handle_ != nullptr) {
+    gdr_close(handle_);
+    handle_ = nullptr;
+  }
+}
+
+// GdrMap::Impl — real implementation with GDRCopy
+
+struct GdrMap::Impl {
+  std::shared_ptr<GdrContext> ctx;
+  std::shared_ptr<uint64_t> gpuMem;
+  gdr_mh_t mh;
+  void* barPtr;
+  uint64_t* hostDstPtr;
+  size_t mappedSize;
+};
+
+GdrMap::GdrMap(std::shared_ptr<uint64_t> gpuMem, int deviceId) : pimpl_(std::make_unique<Impl>()) {
+  pimpl_->ctx = gdrContext();
+  pimpl_->gpuMem = std::move(gpuMem);
+  pimpl_->mh = {};
+  pimpl_->barPtr = nullptr;
+  pimpl_->hostDstPtr = nullptr;
+  pimpl_->mappedSize = 0;
+
+  // Ensure the CUDA device context is active for gdr_pin_buffer.
+  CudaDeviceGuard deviceGuard(deviceId);
+
+  uint64_t gpuAddr = reinterpret_cast<uint64_t>(pimpl_->gpuMem.get());
+  // Align to a GPU page boundary and pin one page around the target address.
+  unsigned long alignedAddr = gpuAddr & GPU_PAGE_MASK;
+  unsigned long pageOffset = gpuAddr - alignedAddr;
+  pimpl_->mappedSize = GPU_PAGE_SIZE;
+
+  // Pin the GPU memory for GDRCopy BAR1 mapping. Try GDR_PIN_FLAG_FORCE_PCIE first for optimal
+  // ordering on platforms that support it (e.g., GB200). Fall back to flags=0 if FORCE_PCIE is
+  // not supported. Both paths work correctly: CPU writes via atomicStore, GPU reads via
+  // system-scope acquire.
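The page arithmetic above (GPU_PAGE_SHIFT = 16, i.e. 64 KiB GPU pages) is worth spelling out. A self-contained sketch of the same alignment math, separate from the patch:

    #include <cassert>
    #include <cstdint>

    constexpr uint64_t kGpuPageSize = 1ULL << 16;  // 64 KiB, matching GPU_PAGE_SHIFT above
    constexpr uint64_t kGpuPageMask = ~(kGpuPageSize - 1);

    struct PinRange {
      uint64_t alignedAddr;  // page-aligned base to pin
      uint64_t pageOffset;   // where the target lives inside the mapped page
    };

    PinRange pinRangeFor(uint64_t gpuAddr) {
      uint64_t base = gpuAddr & kGpuPageMask;
      return {base, gpuAddr - base};
    }

    int main() {
      PinRange r = pinRangeFor(0x7f0000012340ULL);
      assert(r.alignedAddr == 0x7f0000010000ULL && r.pageOffset == 0x2340ULL);
      return 0;
    }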
+ int ret = + gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, GDR_PIN_FLAG_FORCE_PCIE, &pimpl_->mh); + if (ret != 0) { + ret = gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, 0, &pimpl_->mh); + if (ret != 0) { + THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr, + ". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap)."); + } + } + + ret = gdr_map(pimpl_->ctx->handle(), pimpl_->mh, &pimpl_->barPtr, pimpl_->mappedSize); + if (ret != 0) { + (void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh); + THROW(GPU, Error, ErrorCode::InternalError, "gdr_map failed (ret=", ret, ") for addr ", (void*)gpuAddr); + } + + pimpl_->hostDstPtr = reinterpret_cast(reinterpret_cast(pimpl_->barPtr) + pageOffset); + + INFO(GPU, "GDRCopy mapping established: GPU addr ", (void*)gpuAddr, " -> host ptr ", (const void*)pimpl_->hostDstPtr); +} + +GdrMap::~GdrMap() { + if (pimpl_) { + if (pimpl_->barPtr != nullptr) { + (void)gdr_unmap(pimpl_->ctx->handle(), pimpl_->mh, pimpl_->barPtr, pimpl_->mappedSize); + } + if (pimpl_->hostDstPtr != nullptr) { + (void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh); + } + } +} + +bool GdrMap::valid() const { return pimpl_ && pimpl_->hostDstPtr != nullptr; } + +uint64_t* GdrMap::hostPtr() const { return pimpl_ ? pimpl_->hostDstPtr : nullptr; } + +void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(pimpl_->mh, pimpl_->hostDstPtr, src, size); } + +void GdrMap::copyFrom(void* dst, size_t size) const { + gdr_copy_from_mapping(pimpl_->mh, dst, pimpl_->hostDstPtr, size); +} + +} // namespace mscclpp + +#else // !defined(MSCCLPP_USE_GDRCOPY) + +namespace mscclpp { + +GdrStatus gdrStatus() { return GdrStatus::NotBuilt; } + +bool gdrEnabled() { return false; } + +const char* gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; } + +// GdrMap::Impl — stub (no GDRCopy) + +struct GdrMap::Impl {}; + +GdrMap::GdrMap(std::shared_ptr /*gpuMem*/, int /*deviceId*/) {} + +GdrMap::~GdrMap() = default; + +bool GdrMap::valid() const { return false; } + +uint64_t* GdrMap::hostPtr() const { return nullptr; } + +void GdrMap::copyTo(const void* /*src*/, size_t /*size*/) {} + +void GdrMap::copyFrom(void* /*dst*/, size_t /*size*/) const {} + +} // namespace mscclpp + +#endif // !defined(MSCCLPP_USE_GDRCOPY) diff --git a/src/core/gpu_ipc_mem.cc b/src/core/gpu_ipc_mem.cc index 3c9b41c4..c863ecdd 100644 --- a/src/core/gpu_ipc_mem.cc +++ b/src/core/gpu_ipc_mem.cc @@ -140,6 +140,11 @@ void GpuIpcMemHandle::deleter(GpuIpcMemHandle* handle) { UnixSocketServer::instance().unregisterFd(handle->posixFd.fd); ::close(handle->posixFd.fd); } + if (handle->typeFlags & GpuIpcMemHandle::Type::Fabric) { + if (handle->fabric.allocHandle != 0) { + cuMemRelease(handle->fabric.allocHandle); + } + } delete handle; } } @@ -148,6 +153,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) { auto handle = UniqueGpuIpcMemHandle(new GpuIpcMemHandle(), &GpuIpcMemHandle::deleter); handle->typeFlags = GpuIpcMemHandle::Type::None; handle->posixFd.fd = -1; + handle->fabric.allocHandle = {}; CUdeviceptr basePtr; size_t sz; @@ -189,6 +195,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) { // FABRIC handle if (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle, CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS) { + 
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&(handle->fabric.allocHandle), (void*)basePtr)); handle->typeFlags |= GpuIpcMemHandle::Type::Fabric; } @@ -232,6 +239,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b handle->offsetFromBase = 0; handle->typeFlags = GpuIpcMemHandle::Type::None; handle->posixFd.fd = -1; + handle->fabric.allocHandle = {}; // POSIX FD handle int fileDesc; @@ -246,11 +254,18 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b if (isFabricAvailable && (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle, CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS)) { handle->typeFlags |= GpuIpcMemHandle::Type::Fabric; + handle->fabric.allocHandle = allocHandle; } if (handle->typeFlags == GpuIpcMemHandle::Type::None) { + cuMemRelease(allocHandle); THROW(GPU, Error, ErrorCode::SystemError, "createMulticast failed: neither POSIX FD nor FABRIC handle was created"); } + + // Only release allocHandle if it is not stored in fabric.allocHandle. + if (!(handle->typeFlags & GpuIpcMemHandle::Type::Fabric)) { + MSCCLPP_CUTHROW(cuMemRelease(allocHandle)); + } return handle; #else // !(CUDA_NVLS_API_AVAILABLE) THROW(GPU, Error, ErrorCode::InvalidUsage, @@ -270,6 +285,8 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle) if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::Fabric)) { if (cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC) == CUDA_SUCCESS) { + // Ignore allocHandle in the handle struct since it is process-local and not transferable across processes. + handle_.fabric.allocHandle = {}; type_ = GpuIpcMemHandle::Type::Fabric; } } @@ -418,41 +435,45 @@ std::shared_ptr GpuIpcMem::mapMulticast([[maybe_unused]] int numDevices, [ // This will block until all devices call cuMulticastAddDevice() MSCCLPP_CUTHROW(cuMulticastBindAddr(allocHandle_, mcOffset, bufferAddr, bufferSize, 0)); + // cuMemMap requires offset to be 0 for multicast handles, so we map the entire range + // [0, mcOffset + bufferSize) and return a pointer at mcPtr + mcOffset. This only consumes + // extra virtual address space for the mcOffset region; no additional physical memory is used. 
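The deleter bookkeeping above matters because the pointer handed out is not the pointer that was mapped. A minimal sketch of the same ownership scheme, with malloc/free standing in for the cuMemAddressReserve/cuMemMap calls and their teardown:

    #include <cassert>
    #include <cstdlib>
    #include <memory>

    std::shared_ptr<char> mapWithOffset(size_t mcOffset, size_t bufferSize) {
      size_t mapSize = mcOffset + bufferSize;
      char* base = static_cast<char*>(std::malloc(mapSize));  // "map" the full range
      // Hand out base + mcOffset; the deleter frees from the captured base,
      // never from the aliased pointer it receives.
      return std::shared_ptr<char>(base + mcOffset, [base](char*) { std::free(base); });
    }

    int main() {
      auto p = mapWithOffset(4096, 65536);
      assert(p != nullptr);
      return 0;  // deleter releases the whole range, not just p.get()
    }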
+ size_t mapSize = mcOffset + bufferSize; CUdeviceptr mcPtr; - MSCCLPP_CUTHROW(cuMemAddressReserve(&mcPtr, bufferSize, minMcGran, 0U, 0)); - MSCCLPP_CUTHROW(cuMemMap(mcPtr, bufferSize, 0, allocHandle_, 0)); + MSCCLPP_CUTHROW(cuMemAddressReserve(&mcPtr, mapSize, minMcGran, 0U, 0)); + MSCCLPP_CUTHROW(cuMemMap(mcPtr, mapSize, 0, allocHandle_, 0)); CUmemAccessDesc accessDesc = {}; accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = deviceId; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - MSCCLPP_CUTHROW(cuMemSetAccess(mcPtr, bufferSize, &accessDesc, 1)); + MSCCLPP_CUTHROW(cuMemSetAccess(mcPtr, mapSize, &accessDesc, 1)); // Return shared_ptr with custom deleter that unmaps and unbinds CUmemGenericAllocationHandle allocHandle = allocHandle_; - return std::shared_ptr( - reinterpret_cast(mcPtr), [self = shared_from_this(), mcOffset, bufferSize, allocHandle](void* ptr) { - CUresult res; - const char* errStr; + return std::shared_ptr(reinterpret_cast(mcPtr + mcOffset), [self = shared_from_this(), mcPtr, mapSize, + mcOffset, bufferSize, allocHandle](void*) { + CUresult res; + const char* errStr; - res = cuMemUnmap((CUdeviceptr)ptr, bufferSize); - if (res != CUDA_SUCCESS) { - (void)cuGetErrorString(res, &errStr); - WARN(GPU, "Failed to unmap CUDA memory at pointer ", (void*)ptr, ": ", errStr); - } + res = cuMemUnmap(mcPtr, mapSize); + if (res != CUDA_SUCCESS) { + (void)cuGetErrorString(res, &errStr); + WARN(GPU, "Failed to unmap CUDA memory at pointer ", (void*)mcPtr, ": ", errStr); + } - res = cuMemAddressFree((CUdeviceptr)ptr, bufferSize); - if (res != CUDA_SUCCESS) { - (void)cuGetErrorString(res, &errStr); - WARN(GPU, "Failed to free CUDA memory at pointer ", (void*)ptr, ": ", errStr); - } + res = cuMemAddressFree(mcPtr, mapSize); + if (res != CUDA_SUCCESS) { + (void)cuGetErrorString(res, &errStr); + WARN(GPU, "Failed to free CUDA memory at pointer ", (void*)mcPtr, ": ", errStr); + } - int deviceId; - CUdevice device; - if (cudaGetDevice(&deviceId) == cudaSuccess && cuDeviceGet(&device, deviceId) == CUDA_SUCCESS) { - (void)cuMulticastUnbind(allocHandle, device, mcOffset, bufferSize); - } - }); + int deviceId; + CUdevice device; + if (cudaGetDevice(&deviceId) == cudaSuccess && cuDeviceGet(&device, deviceId) == CUDA_SUCCESS) { + (void)cuMulticastUnbind(allocHandle, device, mcOffset, bufferSize); + } + }); #else // !(CUDA_NVLS_API_AVAILABLE) THROW(GPU, Error, ErrorCode::InvalidUsage, "NVLS is not supported on this device (requires CUDA version >= 12.3 and Linux kernel version >= 5.6.0)"); diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc index 3aa6aa1c..628d2dcb 100644 --- a/src/core/gpu_utils.cc +++ b/src/core/gpu_utils.cc @@ -5,48 +5,7 @@ #include #include -#include "debug.h" - -static inline bool isCudaTeardownError(cudaError_t err) { -#if defined(MSCCLPP_USE_ROCM) - return err == cudaErrorContextIsDestroyed || err == cudaErrorInvalidDevice; -#else // !defined(MSCCLPP_USE_ROCM) - return err == cudaErrorCudartUnloading || err == cudaErrorContextIsDestroyed || err == cudaErrorInitializationError || - err == cudaErrorInvalidDevice || err == cudaErrorLaunchFailure || err == cudaErrorDeviceUninitialized; -#endif // !defined(MSCCLPP_USE_ROCM) -} - -[[maybe_unused]] static inline bool isCuTeardownError(CUresult r) { - return r == CUDA_ERROR_DEINITIALIZED || r == CUDA_ERROR_CONTEXT_IS_DESTROYED || r == CUDA_ERROR_LAUNCH_FAILED; -} - -#define MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cmd) \ - do { \ - cudaError_t __e = cmd; \ - if (isCudaTeardownError(__e)) { \ - 
(void)cudaGetLastError(); \ - } else { \ - MSCCLPP_CUDATHROW(__e); \ - } \ - } while (false) - -#define MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cmd) \ - do { \ - CUresult __e = cmd; \ - if (!isCuTeardownError(__e)) { \ - MSCCLPP_CUTHROW(__e); \ - } \ - } while (false) - -#define MSCCLPP_CUTHROW_IGNORE(cmd) \ - do { \ - CUresult __e = cmd; \ - if (__e != CUDA_SUCCESS) { \ - const char* errStr; \ - cuGetErrorString(__e, &errStr); \ - WARN("%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, __e, errStr); \ - } \ - } while (false) +#include "gpu_utils_internal.hpp" namespace mscclpp { diff --git a/src/core/ib.cc b/src/core/ib.cc index 9b86cdf1..557f0426 100644 --- a/src/core/ib.cc +++ b/src/core/ib.cc @@ -3,6 +3,7 @@ #include "ib.hpp" +#include #include #include @@ -20,6 +21,9 @@ #include "context.hpp" #if defined(USE_IBVERBS) #include "ibverbs_wrapper.hpp" +#if defined(MSCCLPP_USE_MLX5DV) +#include "mlx5dv_wrapper.hpp" +#endif // defined(MSCCLPP_USE_MLX5DV) #endif // defined(USE_IBVERBS) #include "logger.hpp" @@ -63,7 +67,7 @@ static inline bool isDmabufSupportedByGpu(int gpuId) { return ret; } -IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff), size_(0) { +IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nullptr), buff_(buff), size_(0) { if (size == 0) { THROW(NET, Error, ErrorCode::InvalidUsage, "invalid MR size: 0"); } @@ -79,13 +83,50 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff) bool isGpuBuff = (gpuId != -1); if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) { #if !defined(MSCCLPP_USE_ROCM) - int fd; - MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + int fd = -1; + size_t rangeSize = pages * pageSize; + // Obtain a DMA-BUF file descriptor for the GPU memory range. On platforms with a CPU-GPU + // bridge that reorders posted writes (e.g., Grace/GB200 NVLink-C2C), the PCIe mapping flag + // routes DMA through the Data Direct engine for correct ordering and higher throughput. + // Fall back to the default (non-PCIe) mapping if the flag is unsupported. +#if (CUDA_VERSION >= 12030) + CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, + CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE); + if (cuRes != CUDA_SUCCESS || fd < 0) { + if (fd >= 0) ::close(fd); + fd = -1; + } + bool usedPcieFlag = (fd >= 0); +#endif // CUDA_VERSION >= 12030 + if (fd < 0) { + MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + } + + // Register the DMA-BUF memory region. When Data Direct is available, use the mlx5dv API + // which enables hardware-level Data Direct routing for the MR. Otherwise use standard verbs. 
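The registration ladder above (mlx5dv Data Direct MR when available, then standard verbs; a PCIe-mapped fd first, then the default mapping) reduces to a simple try-then-fall-back pattern. A simplified sketch with stand-in stubs, not the real verbs/mlx5dv API:

    #include <cstdio>

    // Illustrative stand-ins; nullptr means registration failed.
    static int dummyMr = 0;
    static void* regDataDirectMr(int /*fd*/) { return nullptr; }  // mlx5dv path unavailable here
    static void* regStandardMr(int fd) { return fd >= 0 ? &dummyMr : nullptr; }

    // Mirrors the constructor above: preferred path with the PCIe-mapped fd,
    // then standard verbs, then standard verbs with the default-mapped fd.
    static void* registerDmabufMr(int pcieFd, int defaultFd) {
      void* mr = regDataDirectMr(pcieFd);
      if (mr == nullptr) mr = regStandardMr(pcieFd);
      if (mr == nullptr) mr = regStandardMr(defaultFd);
      return mr;
    }

    int main() {
      std::printf("mr=%p\n", registerDmabufMr(3, 4));
      return 0;
    }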
size_t offsetInDmaBuf = buffIntPtr % pageSize; - mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | - IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC); + int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC; + +#if defined(MSCCLPP_USE_MLX5DV) + if (isDataDirect) { + mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags); + } +#endif + if (mr_ == nullptr) { + mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags); + } + + // If MR registration failed with a PCIe-mapped fd, retry with the default mapping. +#if (CUDA_VERSION >= 12030) + if (mr_ == nullptr && usedPcieFlag) { + ::close(fd); + MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags); + } +#endif // CUDA_VERSION >= 12030 + ::close(fd); if (mr_ == nullptr) { THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")"); @@ -129,30 +170,47 @@ const void* IbMr::getBuff() const { return buff_; } uint32_t IbMr::getLkey() const { return mr_->lkey; } -IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, - int maxRecvWr, int maxWrPerSend) +IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, + int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic) : portNum_(portNum), gidIndex_(gidIndex), info_(), qp_(nullptr), - cq_(nullptr), - wcs_(), - wrs_(), - sges_(), - wrn_(0), - numSignaledPostedItems_(0), - numSignaledStagedItems_(0), - maxCqPollNum_(maxCqPollNum), - maxWrPerSend_(maxWrPerSend) { - cq_ = IBVerbs::ibv_create_cq(ctx, maxCqSize, nullptr, nullptr, 0); - if (cq_ == nullptr) { + sendCq_(nullptr), + recvCq_(nullptr), + sendWcs_(), + recvWcs_(), + sendWrs_(), + sendSges_(), + recvWrs_(), + recvSges_(), + numStagedSend_(0), + numStagedRecv_(0), + numPostedSignaledSend_(0), + numStagedSignaledSend_(0), + maxSendCqPollNum_(maxSendCqPollNum), + maxSendWr_(maxSendWr), + maxWrPerSend_(maxWrPerSend), + maxRecvWr_(maxRecvWr), + noAtomic_(noAtomic) { + sendCq_ = IBVerbs::ibv_create_cq(ctx, maxSendCqSize, nullptr, nullptr, 0); + if (sendCq_ == nullptr) { THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")"); } + // Only create recv CQ if maxRecvWr > 0 + if (maxRecvWr > 0) { + recvCq_ = IBVerbs::ibv_create_cq(ctx, maxRecvWr, nullptr, nullptr, 0); + if (recvCq_ == nullptr) { + THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")"); + } + } + struct ibv_qp_init_attr qpInitAttr = {}; qpInitAttr.sq_sig_all = 0; - qpInitAttr.send_cq = cq_; - qpInitAttr.recv_cq = cq_; + qpInitAttr.send_cq = sendCq_; + // Use separate recv CQ if created, otherwise use the send CQ + qpInitAttr.recv_cq = (recvCq_ != nullptr) ? 
recvCq_ : sendCq_; qpInitAttr.qp_type = IBV_QPT_RC; qpInitAttr.cap.max_send_wr = maxSendWr; qpInitAttr.cap.max_recv_wr = maxRecvWr; @@ -173,9 +231,9 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSiz info_.linkLayer = portAttr.link_layer; info_.qpn = qp->qp_num; info_.mtu = portAttr.active_mtu; - info_.is_grh = (portAttr.flags & IBV_QPF_GRH_REQUIRED); + info_.isGrh = (portAttr.flags & IBV_QPF_GRH_REQUIRED); - if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND || info_.is_grh) { + if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND || info_.isGrh) { if (gidIndex_ >= portAttr.gid_tbl_len) { THROW(NET, Error, ErrorCode::InvalidUsage, "invalid GID index ", gidIndex_, " for port ", portNum_, " (max index is ", portAttr.gid_tbl_len - 1, ")"); @@ -194,19 +252,28 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSiz qpAttr.qp_state = IBV_QPS_INIT; qpAttr.pkey_index = 0; qpAttr.port_num = portNum_; - qpAttr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; + qpAttr.qp_access_flags = noAtomic_ ? IBV_ACCESS_REMOTE_WRITE + : (IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC); if (IBVerbs::ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS) != 0) { THROW(NET, IbError, errno, "ibv_modify_qp failed (errno ", errno, ")"); } qp_ = qp; - wrs_ = std::make_shared>(maxWrPerSend_); - sges_ = std::make_shared>(maxWrPerSend_); - wcs_ = std::make_shared>(maxCqPollNum_); + sendWrs_ = std::make_shared>(maxWrPerSend_); + sendSges_ = std::make_shared>(maxWrPerSend_); + sendWcs_ = std::make_shared>(maxSendCqPollNum_); + recvWcs_ = std::make_shared>(maxRecvWr_); + if (maxRecvWr_ > 0) { + recvWrs_ = std::make_shared>(maxRecvWr_); + recvSges_ = std::make_shared>(maxRecvWr_); + } } IbQp::~IbQp() { IBVerbs::ibv_destroy_qp(qp_); - IBVerbs::ibv_destroy_cq(cq_); + IBVerbs::ibv_destroy_cq(sendCq_); + if (recvCq_ != nullptr) { + IBVerbs::ibv_destroy_cq(recvCq_); + } } void IbQp::rtr(const IbQpInfo& info) { @@ -215,9 +282,9 @@ void IbQp::rtr(const IbQpInfo& info) { qp_attr.path_mtu = static_cast(info.mtu); qp_attr.dest_qp_num = info.qpn; qp_attr.rq_psn = 0; - qp_attr.max_dest_rd_atomic = 1; + qp_attr.max_dest_rd_atomic = noAtomic_ ? 0 : 1; qp_attr.min_rnr_timer = 0x12; - if (info.linkLayer == IBV_LINK_LAYER_ETHERNET || info.is_grh) { + if (info.linkLayer == IBV_LINK_LAYER_ETHERNET || info.isGrh) { qp_attr.ah_attr.is_global = 1; qp_attr.ah_attr.grh.dgid.global.subnet_prefix = info.spn; qp_attr.ah_attr.grh.dgid.global.interface_id = info.iid; @@ -247,7 +314,7 @@ void IbQp::rts() { qp_attr.retry_cnt = 7; qp_attr.rnr_retry = 7; qp_attr.sq_psn = 0; - qp_attr.max_rd_atomic = 1; + qp_attr.max_rd_atomic = noAtomic_ ? 0 : 1; int ret = IBVerbs::ibv_modify_qp( qp_, &qp_attr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC); @@ -256,25 +323,25 @@ void IbQp::rts() { } } -IbQp::WrInfo IbQp::getNewWrInfo() { - if (wrn_ >= maxWrPerSend_) { - THROW(NET, Error, ErrorCode::InvalidUsage, "too many outstanding work requests. limit is ", maxWrPerSend_); +IbQp::SendWrInfo IbQp::getNewSendWrInfo() { + if (numStagedSend_ >= maxWrPerSend_) { + THROW(NET, Error, ErrorCode::InvalidUsage, "too many staged work requests. 
limit is ", maxWrPerSend_); } - ibv_send_wr* wr_ = &wrs_->data()[wrn_]; - ibv_sge* sge_ = &sges_->data()[wrn_]; + ibv_send_wr* wr_ = &sendWrs_->data()[numStagedSend_]; + ibv_sge* sge_ = &sendSges_->data()[numStagedSend_]; wr_->sg_list = sge_; wr_->num_sge = 1; wr_->next = nullptr; - if (wrn_ > 0) { - (*wrs_)[wrn_ - 1].next = wr_; + if (numStagedSend_ > 0) { + (*sendWrs_)[numStagedSend_ - 1].next = wr_; } - wrn_++; - return IbQp::WrInfo{wr_, sge_}; + numStagedSend_++; + return IbQp::SendWrInfo{wr_, sge_}; } -void IbQp::stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, - uint64_t dstOffset, bool signaled) { - auto wrInfo = this->getNewWrInfo(); +void IbQp::stageSendWrite(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled) { + auto wrInfo = this->getNewSendWrInfo(); wrInfo.wr->wr_id = wrId; wrInfo.wr->opcode = IBV_WR_RDMA_WRITE; wrInfo.wr->send_flags = signaled ? IBV_SEND_SIGNALED : 0; @@ -283,12 +350,12 @@ void IbQp::stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64 wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; wrInfo.sge->length = size; wrInfo.sge->lkey = mr->getLkey(); - if (signaled) numSignaledStagedItems_++; + if (signaled) numStagedSignaledSend_++; } -void IbQp::stageAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, uint64_t dstOffset, uint64_t addVal, - bool signaled) { - auto wrInfo = this->getNewWrInfo(); +void IbQp::stageSendAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, uint64_t dstOffset, uint64_t addVal, + bool signaled) { + auto wrInfo = this->getNewSendWrInfo(); wrInfo.wr->wr_id = wrId; wrInfo.wr->opcode = IBV_WR_ATOMIC_FETCH_AND_ADD; wrInfo.wr->send_flags = signaled ? IBV_SEND_SIGNALED : 0; @@ -298,62 +365,149 @@ void IbQp::stageAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, u wrInfo.sge->addr = (uint64_t)(mr->getBuff()); wrInfo.sge->length = sizeof(uint64_t); // atomic op is always on uint64_t wrInfo.sge->lkey = mr->getLkey(); - if (signaled) numSignaledStagedItems_++; + if (signaled) numStagedSignaledSend_++; } -void IbQp::stageSendWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, - uint64_t dstOffset, bool signaled, unsigned int immData) { - auto wrInfo = this->getNewWrInfo(); +void IbQp::stageSendWriteWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled, unsigned int immData) { + auto wrInfo = this->getNewSendWrInfo(); wrInfo.wr->wr_id = wrId; wrInfo.wr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM; wrInfo.wr->send_flags = signaled ? 
IBV_SEND_SIGNALED : 0; wrInfo.wr->wr.rdma.remote_addr = (uint64_t)(info.addr) + dstOffset; wrInfo.wr->wr.rdma.rkey = info.rkey; - wrInfo.wr->imm_data = immData; - wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; - wrInfo.sge->length = size; - wrInfo.sge->lkey = mr->getLkey(); - if (signaled) numSignaledStagedItems_++; + wrInfo.wr->imm_data = htonl(immData); + if (mr != nullptr) { + wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; + wrInfo.sge->length = size; + wrInfo.sge->lkey = mr->getLkey(); + } else { + // 0-byte write-with-imm: no source buffer needed + wrInfo.sge->addr = 0; + wrInfo.sge->length = 0; + wrInfo.sge->lkey = 0; + } + if (signaled) numStagedSignaledSend_++; } void IbQp::postSend() { - if (wrn_ == 0) { + if (numStagedSend_ == 0) { return; } struct ibv_send_wr* bad_wr; - int err = IBVerbs::ibv_post_send(qp_, wrs_->data(), &bad_wr); + int err = IBVerbs::ibv_post_send(qp_, sendWrs_->data(), &bad_wr); if (err != 0) { THROW(NET, IbError, err, "ibv_post_send failed (errno ", err, ")"); } - wrn_ = 0; - numSignaledPostedItems_ += numSignaledStagedItems_; - numSignaledStagedItems_ = 0; - if (numSignaledPostedItems_ + 4 > cq_->cqe) { - WARN(NET, "IB: CQ is almost full ( ", numSignaledPostedItems_, " / ", cq_->cqe, + numStagedSend_ = 0; + numPostedSignaledSend_ += numStagedSignaledSend_; + numStagedSignaledSend_ = 0; + if (numPostedSignaledSend_ + 4 > sendCq_->cqe) { + WARN(NET, "IB: CQ is almost full ( ", numPostedSignaledSend_, " / ", sendCq_->cqe, " ). The connection needs to be flushed to prevent timeout errors."); } } -int IbQp::pollCq() { - int wcNum = IBVerbs::ibv_poll_cq(cq_, maxCqPollNum_, wcs_->data()); +IbQp::RecvWrInfo IbQp::getNewRecvWrInfo() { + if (numStagedRecv_ >= maxRecvWr_) { + THROW(NET, Error, ErrorCode::InvalidUsage, "too many outstanding recv work requests. limit is ", maxRecvWr_); + } + ibv_recv_wr* wr = &recvWrs_->data()[numStagedRecv_]; + ibv_sge* sge = &recvSges_->data()[numStagedRecv_]; + wr->next = nullptr; + if (numStagedRecv_ > 0) { + (*recvWrs_)[numStagedRecv_ - 1].next = wr; + } + numStagedRecv_++; + return IbQp::RecvWrInfo{wr, sge}; +} + +void IbQp::stageRecv(uint64_t wrId) { + auto wrInfo = this->getNewRecvWrInfo(); + // For RDMA write-with-imm, data goes to remote_addr specified by sender. + // We only need the recv WR to get the completion notification with imm_data. 
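Note the byte-order handling around imm_data: stageSendWriteWithImm above applies htonl before posting, and getRecvWcImmData below undoes it with ntohl, since imm_data travels big-endian on the wire. A tiny runnable check of that round trip:

    #include <arpa/inet.h>
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t token = 0x12345678u;
      uint32_t onWire = htonl(token);   // what the sender stores in imm_data
      assert(ntohl(onWire) == token);   // what the receiver recovers
      return 0;
    }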
+ wrInfo.wr->wr_id = wrId; + wrInfo.wr->sg_list = nullptr; + wrInfo.wr->num_sge = 0; +} + +void IbQp::stageRecv(const IbMr* mr, uint64_t wrId, uint32_t size, uint64_t offset) { + auto wrInfo = this->getNewRecvWrInfo(); + wrInfo.wr->wr_id = wrId; + wrInfo.sge->addr = reinterpret_cast(mr->getBuff()) + offset; + wrInfo.sge->length = size; + wrInfo.sge->lkey = mr->getLkey(); + wrInfo.wr->sg_list = wrInfo.sge; + wrInfo.wr->num_sge = 1; +} + +void IbQp::postRecv() { + if (numStagedRecv_ == 0) return; + struct ibv_recv_wr* bad_wr; + int err = IBVerbs::ibv_post_recv(qp_, recvWrs_->data(), &bad_wr); + if (err != 0) { + THROW(NET, IbError, err, "ibv_post_recv failed (errno ", err, ")"); + } + numStagedRecv_ = 0; +} + +int IbQp::pollSendCq() { + int wcNum = IBVerbs::ibv_poll_cq(sendCq_, maxSendCqPollNum_, sendWcs_->data()); if (wcNum > 0) { - numSignaledPostedItems_ -= wcNum; + numPostedSignaledSend_ -= wcNum; } return wcNum; } -int IbQp::getWcStatus(int idx) const { return (*wcs_)[idx].status; } +int IbQp::pollRecvCq() { + int wcNum = IBVerbs::ibv_poll_cq(recvCq_, maxRecvWr_, recvWcs_->data()); + return wcNum; +} -std::string IbQp::getWcStatusString(int idx) const { return IBVerbs::ibv_wc_status_str((*wcs_)[idx].status); } +int IbQp::getSendWcStatus(int idx) const { return (*sendWcs_)[idx].status; } -int IbQp::getNumCqItems() const { return numSignaledPostedItems_; } +std::string IbQp::getSendWcStatusString(int idx) const { return IBVerbs::ibv_wc_status_str((*sendWcs_)[idx].status); } -IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_(nullptr) { +int IbQp::getNumSendCqItems() const { return numPostedSignaledSend_; } + +int IbQp::getRecvWcStatus(int idx) const { return (*recvWcs_)[idx].status; } + +std::string IbQp::getRecvWcStatusString(int idx) const { return IBVerbs::ibv_wc_status_str((*recvWcs_)[idx].status); } + +unsigned int IbQp::getRecvWcImmData(int idx) const { return ntohl((*recvWcs_)[idx].imm_data); } + +IbCtx::IbCtx(const std::string& devName) + : devName_(devName), + ctx_(nullptr), + pd_(nullptr), + supportsRdmaAtomics_(false), + isMlx5_(false), + isDataDirect_(false), + isVF_(false) { int num; struct ibv_device** devices = IBVerbs::ibv_get_device_list(&num); for (int i = 0; i < num; ++i) { if (std::string(devices[i]->name) == devName_) { ctx_ = IBVerbs::ibv_open_device(devices[i]); + + // Detect if this IB device is a Virtual Function (VF). + // VFs have a 'physfn' sysfs symlink pointing to their parent PF; PFs do not. 
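The sysfs probe described above is small enough to show standalone; a self-contained sketch of the same VF check performed in the block below:

    #include <string>
    #include <unistd.h>

    // A VF exposes .../device/physfn (a symlink to its parent PF); a PF does not.
    // access(2) with F_OK is enough to test for it.
    bool isVirtualFunction(const std::string& devName) {
      std::string physfn = "/sys/class/infiniband/" + devName + "/device/physfn";
      return access(physfn.c_str(), F_OK) == 0;
    }

    int main() { return isVirtualFunction("mlx5_0") ? 0 : 1; }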
+ { + std::string physfnPath = "/sys/class/infiniband/" + devName_ + "/device/physfn"; + isVF_ = (access(physfnPath.c_str(), F_OK) == 0); + if (isVF_) { + INFO(NET, "IB device ", devName_, " is a Virtual Function (Data Direct ordering available)"); + } + } + +#if defined(MSCCLPP_USE_MLX5DV) + if (MLX5DV::isAvailable()) { + isMlx5_ = MLX5DV::mlx5dv_is_supported(devices[i]); + if (isMlx5_) { + INFO(NET, "IB device ", devName_, " supports mlx5 Direct Verbs"); + } + } +#endif // defined(MSCCLPP_USE_MLX5DV) break; } } @@ -365,6 +519,26 @@ IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_ if (pd_ == nullptr) { THROW(NET, IbError, errno, "ibv_alloc_pd failed (errno ", errno, ")"); } + + // Detect Data Direct support via mlx5dv_get_data_direct_sysfs_path +#if defined(MSCCLPP_USE_MLX5DV) + if (isMlx5_ && MLX5DV::isAvailable()) { + char sysfsPath[256]; + int ret = MLX5DV::mlx5dv_get_data_direct_sysfs_path(ctx_, sysfsPath, sizeof(sysfsPath)); + if (ret == 0) { + isDataDirect_ = true; + INFO(NET, "IB device ", devName_, " supports Data Direct (sysfs: ", sysfsPath, ")"); + } else { + INFO(NET, "IB device ", devName_, " does not support Data Direct"); + } + } +#endif // defined(MSCCLPP_USE_MLX5DV) + + // Query and cache RDMA atomics capability + struct ibv_device_attr attr = {}; + if (IBVerbs::ibv_query_device(ctx_, &attr) == 0) { + supportsRdmaAtomics_ = (attr.atomic_cap == IBV_ATOMIC_HCA || attr.atomic_cap == IBV_ATOMIC_GLOB); + } } IbCtx::~IbCtx() { @@ -419,8 +593,8 @@ int IbCtx::getAnyUsablePort(int gidIndex) const { return -1; } -std::shared_ptr IbCtx::createQp(int port, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, - int maxRecvWr, int maxWrPerSend) { +std::shared_ptr IbCtx::createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr, + int maxRecvWr, int maxWrPerSend, bool noAtomic) { if (port == -1) { port = this->getAnyUsablePort(gidIndex); if (port == -1) { @@ -429,14 +603,22 @@ std::shared_ptr IbCtx::createQp(int port, int gidIndex, int maxCqSize, int } else if (!this->isPortUsable(port, gidIndex)) { THROW(NET, Error, ErrorCode::InvalidUsage, "invalid IB port: ", port); } - return std::shared_ptr( - new IbQp(ctx_, pd_, port, gidIndex, maxCqSize, maxCqPollNum, maxSendWr, maxRecvWr, maxWrPerSend)); + return std::shared_ptr(new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr, + maxRecvWr, maxWrPerSend, noAtomic)); } std::unique_ptr IbCtx::registerMr(void* buff, std::size_t size) { - return std::unique_ptr(new IbMr(pd_, buff, size)); + return std::unique_ptr(new IbMr(pd_, buff, size, isDataDirect_)); } +bool IbCtx::supportsRdmaAtomics() const { return supportsRdmaAtomics_; } + +bool IbCtx::isMlx5() const { return isMlx5_; } + +bool IbCtx::isDataDirect() const { return isDataDirect_; } + +bool IbCtx::isVirtualFunction() const { return isVF_; } + MSCCLPP_API_CPP int getIBDeviceCount() { int num; IBVerbs::ibv_get_device_list(&num); @@ -542,6 +724,34 @@ MSCCLPP_API_CPP std::string getIBDeviceName(Transport) { return ""; } MSCCLPP_API_CPP Transport getIBTransportByDeviceName(const std::string&) { return Transport::Unknown; } +IbMr::~IbMr() {} +IbMrInfo IbMr::getInfo() const { return IbMrInfo(); } +const void* IbMr::getBuff() const { return nullptr; } +uint32_t IbMr::getLkey() const { return 0; } + +IbQp::~IbQp() {} +void IbQp::rtr(const IbQpInfo& /*info*/) {} +void IbQp::rts() {} +void IbQp::stageSendWrite(const IbMr* /*mr*/, const IbMrInfo& /*info*/, uint32_t /*size*/, uint64_t /*wrId*/, + uint64_t 
/*srcOffset*/, uint64_t /*dstOffset*/, bool /*signaled*/) {} +void IbQp::stageSendAtomicAdd(const IbMr* /*mr*/, const IbMrInfo& /*info*/, uint64_t /*wrId*/, uint64_t /*dstOffset*/, + uint64_t /*addVal*/, bool /*signaled*/) {} +void IbQp::stageSendWriteWithImm(const IbMr* /*mr*/, const IbMrInfo& /*info*/, uint32_t /*size*/, uint64_t /*wrId*/, + uint64_t /*srcOffset*/, uint64_t /*dstOffset*/, bool /*signaled*/, + unsigned int /*immData*/) {} +void IbQp::postSend() {} +void IbQp::stageRecv(uint64_t /*wrId*/) {} +void IbQp::stageRecv(const IbMr* /*mr*/, uint64_t /*wrId*/, uint32_t /*size*/, uint64_t /*offset*/) {} +void IbQp::postRecv() {} +int IbQp::pollSendCq() { return 0; } +int IbQp::pollRecvCq() { return 0; } +int IbQp::getSendWcStatus(int /*idx*/) const { return 0; } +std::string IbQp::getSendWcStatusString(int /*idx*/) const { return ""; } +int IbQp::getNumSendCqItems() const { return 0; } +int IbQp::getRecvWcStatus(int /*idx*/) const { return 0; } +std::string IbQp::getRecvWcStatusString(int /*idx*/) const { return ""; } +unsigned int IbQp::getRecvWcImmData(int /*idx*/) const { return 0; } + #endif // !defined(USE_IBVERBS) } // namespace mscclpp diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp index c9d81d41..22a9930f 100644 --- a/src/core/include/connection.hpp +++ b/src/core/include/connection.hpp @@ -4,11 +4,19 @@ #ifndef MSCCLPP_CONNECTION_HPP_ #define MSCCLPP_CONNECTION_HPP_ +#include +#include #include #include +#include +#include +#include +#include #include "communicator.hpp" #include "context.hpp" +#include "endpoint.hpp" +#include "gdr.hpp" #include "ib.hpp" #include "registered_memory.hpp" #include "socket.h" @@ -29,6 +37,19 @@ class BaseConnection { virtual void flush(int64_t timeoutUsec = -1) = 0; + /// Start signal forwarding to the given memory address. + /// Called by the semaphore to specify where incoming signals should be written. + /// @param mem Shared pointer to the GPU memory for the signal token. + virtual void startSignalForwarding(std::shared_ptr /*mem*/) {} + + /// Stop signal forwarding and release associated resources. + virtual void stopSignalForwarding() {} + + /// Whether this connection uses signal forwarding (e.g., IB host-no-atomic mode). + /// When true, the semaphore must allocate a separate inboundToken_ for the recv thread to write to. + /// When false, the NIC writes directly to the semaphore's registered memory (e.g., via atomics). + virtual bool isSignalForwarding() const { return false; } + virtual Transport transport() const = 0; virtual Transport remoteTransport() const = 0; @@ -39,6 +60,8 @@ class BaseConnection { int getMaxWriteQueueSize() const; + static std::shared_ptr& getImpl(Connection& conn) { return conn.impl_; } + protected: friend class Context; friend class CudaIpcConnection; @@ -77,12 +100,45 @@ class IBConnection : public BaseConnection { Transport transport_; Transport remoteTransport_; std::weak_ptr qp_; - std::unique_ptr dummyAtomicSource_; // not used anywhere but IB needs a source - RegisteredMemory dummyAtomicSourceMem_; - mscclpp::TransportInfo dstTransportInfo_; + std::unique_ptr atomicSrc_; + RegisteredMemory atomicSrcMem_; + mscclpp::TransportInfo atomicSrcTransportInfo_; + + // For write-with-imm mode (HostNoAtomic): uses RDMA write-with-imm to signal + // instead of atomic operations, with a host thread forwarding to GPU for memory consistency. 
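The BaseConnection virtuals above ship no-op defaults so that only transports that actually forward signals (IB HostNoAtomic) override them. A minimal sketch of that interface pattern, with hypothetical class names:

    #include <cstdint>
    #include <memory>
    #include <utility>

    struct ConnBase {
      virtual ~ConnBase() = default;
      virtual bool isSignalForwarding() const { return false; }         // default: NIC writes directly
      virtual void startSignalForwarding(std::shared_ptr<uint64_t>) {}  // default: nothing to start
    };

    struct IbNoAtomicConn : ConnBase {
      bool isSignalForwarding() const override { return true; }
      void startSignalForwarding(std::shared_ptr<uint64_t> mem) override { token_ = std::move(mem); }
      std::shared_ptr<uint64_t> token_;
    };

    int main() {
      IbNoAtomicConn c;
      ConnBase& b = c;
      if (b.isSignalForwarding()) b.startSignalForwarding(std::make_shared<uint64_t>(0));
      return 0;
    }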
+ bool ibNoAtomic_; + bool gdrSignalForwarding_; // ibNoAtomic_ && gdrEnabled() — decided once at construction + std::thread recvThread_; + std::atomic stopRecvThread_; + std::atomic recvThreadError_; // Set by recv thread on fatal error + std::string recvThreadErrorMsg_; // Error message from recv thread (written before recvThreadError_ is set) + int localGpuDeviceId_; // Local GPU device ID for CUDA context and GDR mapping + + // Signal forwarding design (HostNoAtomic mode): + // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the lower 32 bits of the token in imm_data. + // - Receiver: CPU recv thread polls recv CQ for WRITE_WITH_IMM completions (CQE), reads + // the lower 32 bits from imm_data, reconstructs the full 64-bit token using wrap-around + // detection (monotonically increasing tokens: if lower 32 bits decrease, the upper half + // incremented), then writes it to signalAddr_ via atomicStore through GDRCopy BAR1. + uint64_t signalAddr_; + + std::unique_ptr signalGdrMap_; + + void recvThreadFunc(); public: IBConnection(std::shared_ptr context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint); + ~IBConnection(); + + /// Start signal forwarding to the given memory address. + /// Must be called before the remote sends any updateAndSync in HostNoAtomic mode. + /// @param mem Shared pointer to the GPU memory for the signal token. + void startSignalForwarding(std::shared_ptr mem) override; + + /// Stop signal forwarding and release associated resources. + void stopSignalForwarding() override; + + bool isSignalForwarding() const override; Transport transport() const override; diff --git a/src/core/include/context.hpp b/src/core/include/context.hpp index b53a2662..42d03db1 100644 --- a/src/core/include/context.hpp +++ b/src/core/include/context.hpp @@ -24,9 +24,9 @@ class CudaIpcStream { public: CudaIpcStream(int deviceId); - void memcpyD2D(void *dst, const void *src, size_t nbytes); + void memcpyD2D(void* dst, const void* src, size_t nbytes); - void memcpyH2D(void *dst, const void *src, size_t nbytes); + void memcpyH2D(void* dst, const void* src, size_t nbytes); void sync(); @@ -42,9 +42,7 @@ struct Context::Impl { std::shared_ptr tokenPool_; const size_t maxNumTokens_ = 1 << 15; // 32K tokens - Impl(); - - IbCtx *getIbContext(Transport ibTransport); + IbCtx* getIbContext(Transport ibTransport); std::shared_ptr getToken(); }; diff --git a/src/core/include/endpoint.hpp b/src/core/include/endpoint.hpp index a3a5ad41..363faab1 100644 --- a/src/core/include/endpoint.hpp +++ b/src/core/include/endpoint.hpp @@ -4,6 +4,7 @@ #ifndef MSCCLPP_ENDPOINT_HPP_ #define MSCCLPP_ENDPOINT_HPP_ +#include #include #include @@ -24,6 +25,7 @@ struct Endpoint::Impl { // The following are only used for IB and are undefined for other transports. 
bool ibLocal_; + bool ibNoAtomic_; std::shared_ptr ibQp_; IbQpInfo ibQpInfo_; diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp index fb6c436f..87b88888 100644 --- a/src/core/include/execution_kernel.hpp +++ b/src/core/include/execution_kernel.hpp @@ -17,356 +17,7 @@ #include #include "execution_common.hpp" - -namespace { -#if defined(MSCCLPP_DEVICE_COMPILE) -template -MSCCLPP_DEVICE_INLINE To bit_cast(const From& src) { - static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); - - union { - From f; - To t; - } u; - u.f = src; - return u.t; -} - -template -MSCCLPP_DEVICE_INLINE T add_elements(T a, T b) { - return a + b; -} - -template <> -MSCCLPP_DEVICE_INLINE __half2 add_elements(__half2 a, __half2 b) { - return __hadd2(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __bfloat16 add_elements(__bfloat16 a, __bfloat16 b) { - return __hadd(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __bfloat162 add_elements(__bfloat162 a, __bfloat162 b) { - return __hadd2(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -// FP8 E4M3 addition using __hadd (single element) -template <> -MSCCLPP_DEVICE_INLINE __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // Optimized assembly for gfx942 - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false); -#else - return __fp8_e4m3(__hadd(__half(a), __half(b))); -#endif -} - -// FP8 E5M2 addition using __hadd (single element) - must come before helper functions -template <> -MSCCLPP_DEVICE_INLINE __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // Optimized assembly for gfx942 (bfloat8) - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false); -#else - return __fp8_e5m2(__hadd(__half(a), __half(b))); -#endif -} - -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) -// HIP gfx942 platform: Helper functions for vectorized FP8 operations -// We use separate function names because __fp8x2_e4m3 and __fp8x2_e5m2 are both uint16_t - -// E4M3 vectorized addition for 2 elements -MSCCLPP_DEVICE_INLINE uint16_t add_fp8x2_e4m3(uint16_t a, uint16_t b) { - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false); -} - -// E4M3 vectorized addition for 4 elements -MSCCLPP_DEVICE_INLINE uint32_t add_fp8x4_e4m3(uint32_t a, uint32_t b) { - uint16_t a_low = a & 0xFFFF; - uint16_t a_high = (a >> 16) & 0xFFFF; - uint16_t b_low = b & 0xFFFF; - uint16_t b_high = (b >> 16) & 0xFFFF; - uint16_t result_low = add_fp8x2_e4m3(a_low, b_low); - uint16_t result_high = add_fp8x2_e4m3(a_high, b_high); - return (static_cast(result_high) << 16) | result_low; -} - -// E5M2 vectorized addition for 2 elements -MSCCLPP_DEVICE_INLINE uint16_t add_fp8x2_e5m2(uint16_t a, uint16_t b) { - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b, 0))); - 
return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, ival, false); -} - -// E5M2 vectorized addition for 4 elements -MSCCLPP_DEVICE_INLINE uint32_t add_fp8x4_e5m2(uint32_t a, uint32_t b) { - uint16_t a_low = a & 0xFFFF; - uint16_t a_high = (a >> 16) & 0xFFFF; - uint16_t b_low = b & 0xFFFF; - uint16_t b_high = (b >> 16) & 0xFFFF; - uint16_t result_low = add_fp8x2_e5m2(a_low, b_low); - uint16_t result_high = add_fp8x2_e5m2(a_high, b_high); - return (static_cast(result_high) << 16) | result_low; -} -#endif - -#if !defined(MSCCLPP_DEVICE_HIP) -// CUDA platform: Template specializations for vectorized FP8 operations - -// FP8 E4M3 vectorized addition using __hadd2 for 2 elements (CUDA only) -template <> -MSCCLPP_DEVICE_INLINE __fp8x2_e4m3 add_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) { - return __fp8x2_e4m3(__hadd2(__half2(a), __half2(b))); -} - -// FP8 E4M3 vectorized addition for 4 elements (CUDA only - via 2x __fp8x2_e4m3) -template <> -MSCCLPP_DEVICE_INLINE __fp8x4_e4m3 add_elements(__fp8x4_e4m3 a, __fp8x4_e4m3 b) { - __fp8x2_e4m3* a_pair = reinterpret_cast<__fp8x2_e4m3*>(&a); - __fp8x2_e4m3* b_pair = reinterpret_cast<__fp8x2_e4m3*>(&b); - - __fp8x2_e4m3 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e4m3*>(result); -} - -// FP8 E5M2 vectorized addition for 2 elements (CUDA only) -template <> -MSCCLPP_DEVICE_INLINE __fp8x2_e5m2 add_elements(__fp8x2_e5m2 a, __fp8x2_e5m2 b) { - return __fp8x2_e5m2(__hadd2(__half2(a), __half2(b))); -} - -// FP8 E5M2 vectorized addition for 4 elements (CUDA only - via 2x __fp8x2_e5m2) -template <> -MSCCLPP_DEVICE_INLINE __fp8x4_e5m2 add_elements(__fp8x4_e5m2 a, __fp8x4_e5m2 b) { - __fp8x2_e5m2* a_pair = reinterpret_cast<__fp8x2_e5m2*>(&a); - __fp8x2_e5m2* b_pair = reinterpret_cast<__fp8x2_e5m2*>(&b); - - __fp8x2_e5m2 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e5m2*>(result); -} -#endif -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE int4 add_vectors_helper(int4 a, int4 b) { - int4 ret; - ret.w = bit_cast(add_elements(bit_cast(a.w), bit_cast(b.w))); - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - ret.z = bit_cast(add_elements(bit_cast(a.z), bit_cast(b.z))); - return ret; -} - -template -MSCCLPP_DEVICE_INLINE int4 add_vectors(int4 a, int4 b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__half>(int4 a, int4 b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__bfloat16>(int4 a, int4 b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__fp8_e4m3>(int4 a, int4 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - int4 ret; - ret.w = add_fp8x4_e4m3(a.w, b.w); - ret.x = add_fp8x4_e4m3(a.x, b.x); - ret.y = add_fp8x4_e4m3(a.y, b.y); - ret.z = add_fp8x4_e4m3(a.z, b.z); - return ret; -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__fp8_e5m2>(int4 a, int4 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - int4 ret; - ret.w = add_fp8x4_e5m2(a.w, b.w); - ret.x 
= add_fp8x4_e5m2(a.x, b.x); - ret.y = add_fp8x4_e5m2(a.y, b.y); - ret.z = add_fp8x4_e5m2(a.z, b.z); - return ret; -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE uint2 add_vectors_helper(uint2 a, uint2 b) { - uint2 ret; - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - return ret; -} - -template -MSCCLPP_DEVICE_INLINE uint2 add_vectors(uint2 a, uint2 b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__half>(uint2 a, uint2 b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__bfloat16>(uint2 a, uint2 b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__fp8_e4m3>(uint2 a, uint2 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - uint2 ret; - ret.x = add_fp8x4_e4m3(a.x, b.x); - ret.y = add_fp8x4_e4m3(a.y, b.y); - return ret; -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__fp8_e5m2>(uint2 a, uint2 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - uint2 ret; - ret.x = add_fp8x4_e5m2(a.x, b.x); - ret.y = add_fp8x4_e5m2(a.y, b.y); - return ret; -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE int add_vectors_helper(int a, int b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -MSCCLPP_DEVICE_INLINE int add_vectors(int a, int b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__half>(int a, int b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__bfloat16>(int a, int b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__fp8_e4m3>(int a, int b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e4m3(a, b); -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__fp8_e5m2>(int a, int b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e5m2(a, b); -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE uint32_t add_vectors_helper(uint32_t a, uint32_t b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__bfloat16>(uint32_t a, uint32_t b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__fp8_e4m3>(uint32_t a, 
uint32_t b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e4m3(a, b); -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__fp8_e5m2>(uint32_t a, uint32_t b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e5m2(a, b); -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -#endif // MSCCLPP_DEVICE_COMPILE - -} // namespace - +#include "reduce_kernel.hpp" namespace mscclpp { #if defined(MSCCLPP_DEVICE_COMPILE) @@ -534,7 +185,7 @@ MSCCLPP_DEVICE_INLINE void handlePut(const Operation& op, void* input, void* out } } -template +template MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input, void* output, void* scratch, uint32_t offset, uint32_t unitSize) { const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); @@ -559,7 +210,7 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input sizeof(int4); void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.inputBufferRefs[index + 1].id]); val = mscclpp::read(remoteMemory, srcOffset + idx); - tmp = add_vectors(tmp, val); + tmp = calVector(tmp, val); } output4[outputOffset4 + idx] = tmp; if constexpr (SendToRemote) { @@ -587,7 +238,7 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input getOffset(memoryChannelBufferTypes_[op.inputBufferRefs[index + 1].id], offset)) / sizeof(T); void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.inputBufferRefs[index + 1].id]); - tmp = add_elements(tmp, mscclpp::read(remoteMemory, srcOffset + idx)); + tmp = tmp + mscclpp::read(remoteMemory, srcOffset + idx); } static_cast(output)[idx] = tmp; if constexpr (SendToRemote) { @@ -647,11 +298,11 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat ChannelType chType = op.channelType; if (chType == ChannelType::MEMORY) { size_t nPackets = size / sizeof(PacketPayload); + PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[0] << 1)); for (size_t pktIdx = threadIdx.x; pktIdx < nPackets; pktIdx += blockDim.x) { + PacketPayload data = pkts[pktIdx].read(flag_); + PacketType pkt(data, flag_); for (uint32_t idx = 0; idx < nOutput; ++idx) { - PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[idx] << 1)); - PacketPayload data = pkts[pktIdx].read(flag_); - PacketType pkt(data, flag_); size_t offset = (scratchOffset_ + (dstOffsets[idx] << 1)) / sizeof(PacketType); void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.outputBufferRefs[idx].id]); mscclpp::write(remoteMemory, offset + pktIdx, pkt); @@ -661,10 +312,8 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat // Ensuring Data Is Ready size_t nPackets = size / sizeof(PacketPayload); for (size_t pktIdx = threadIdx.x; pktIdx < nPackets; pktIdx += blockDim.x) { - for (uint32_t idx = 0; idx < nOutput; ++idx) { - PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[idx] << 1)); - pkts[pktIdx].read(flag_); - } + PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[0] << 1)); + pkts[pktIdx].read(flag_); } __syncthreads(); @@ -674,14 +323,14 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat return; } uint32_t dstOffset = (dstOffsets[chIdx] << 1) + scratchOffset_; - uint32_t srcOffset = (srcOffsets[chIdx] << 1) + scratchOffset_; + uint32_t 
srcOffset = (srcOffsets[0] << 1) + scratchOffset_; MemoryId dstMemoryId = portChannelBufferIds_[op.outputBufferRefs[chIdx].id]; portChannels_[channelIndexes[chIdx]].put( dstMemoryId, dstOffset, static_cast(BufferType::SCRATCH) + localMemoryIdBegin_, srcOffset, size << 1); } } -template +template MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* input, void* output, void* scratch) { uint32_t size = op.inputBufferSizes[0]; const uint32_t nSrcs = op.nInputs - 1; @@ -704,9 +353,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* in for (uint32_t index = 0; index < nSrcs; ++index) { PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]); PacketPayload val = pkt[idx].read(flag_); - data = add_vectors(data, val); + data = calVector(data, val); } - data = add_vectors(data, srcPacketPayload[idx]); + data = calVector(data, srcPacketPayload[idx]); dstPacketPayload[idx] = data; if constexpr (SendToRemote) { @@ -720,7 +369,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* in } } -template +template MSCCLPP_DEVICE_INLINE void handleReduceCopySendPackets(const Operation& op, void* input, void* output, void* scratch) { uint32_t size = op.inputBufferSizes[0]; const uint32_t nSrcs = op.nInputs - 1; @@ -745,9 +394,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceCopySendPackets(const Operation& op, void for (uint32_t index = 0; index < nSrcs; ++index) { PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]); PacketPayload val = pkt[idx].read(flag_); - data = add_vectors(data, val); + data = calVector(data, val); } - data = add_vectors(data, srcPacketPayload[idx]); + data = calVector(data, srcPacketPayload[idx]); dstPacketPayload[idx] = data; PacketType* dst_val = &dstPkt[idx]; dst_val->write(data, flag_); @@ -790,7 +439,7 @@ MSCCLPP_DEVICE_INLINE void handleCopyPackets(const Operation& op, void* input, v mscclpp::copyToPackets(dst, src, size, threadIdx.x, blockDim.x, flag_); } -template +template MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, void* output, void* scratch, uint32_t offset, uint32_t unitSize) { const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); @@ -815,7 +464,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, vo size_t buffOffset = (inputOffsets[index] + getOffset(outputBufferRefs[index].type, offset)) / sizeof(int4); int4 val = buff4[buffOffset + idx]; - tmp = add_vectors(tmp, val); + tmp = calVector(tmp, val); } dst4[dstOffset4 + idx] = tmp; if constexpr (SendToRemote) { @@ -840,7 +489,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, vo T* buff = static_cast(getBuffer(input, output, scratch, inputBufferRefs[index].type)); uint32_t buffOffset = (inputOffsets[index] + getOffset(inputBufferRefs[index].type, offset)) / sizeof(T); - tmp = add_elements(tmp, buff[buffOffset + idx]); + tmp = tmp + buff[buffOffset + idx]; } dst[idx] = tmp; if constexpr (SendToRemote) { @@ -872,51 +521,56 @@ MSCCLPP_DEVICE_INLINE void handleCopy(const Operation& op, void* input, void* ou #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 template MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(const Operation& op, uint32_t offset, uint32_t unitSize) { - static_assert(sizeof(T) <= 8, "Only support type with size <= 8 bytes"); - const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); - if (size <= 0) { + if constexpr 
(std::is_same_v) { + assert(false && "MULTI_LOAD_REDUCE_STORE is not supported for uint8_t data type"); return; - } - const uint32_t srcOffset = op.inputOffsets[0] + getOffset(op.nvlsInputBufferType, offset); - const uint32_t dstOffset = op.outputOffsets[0] + getOffset(op.nvlsOutputBufferType, offset); - assert(size % sizeof(T) == 0); - assert(srcOffset % sizeof(T) == 0); - assert(dstOffset % sizeof(T) == 0); - - T* src = (T*)nvlsChannels_[op.nvlsInputIndex].mcPtr; - T* dst = (T*)nvlsChannels_[op.nvlsOutputIndex].mcPtr; - if constexpr (std::is_same_v || std::is_same_v) { - const size_t nElem = size / sizeof(T); - const size_t srcOffsetElem = srcOffset / sizeof(T); - const size_t dstOffsetElem = dstOffset / sizeof(T); - VectorType* srcElem = reinterpret_cast*>(src + srcOffsetElem); - VectorType* dstElem = reinterpret_cast*>(dst + dstOffsetElem); - for (size_t idx = threadIdx.x; idx < nElem; idx += blockDim.x) { - auto val = SwitchChannelDeviceHandle::multimemLoadReduce(srcElem + idx); - SwitchChannelDeviceHandle::multimemStore(val, dstElem + idx); - } } else { - // handle data in 16-byte unit - using Type16 = typename mscclpp::VectorType; - const size_t nType16 = size / sizeof(Type16); - const size_t srcOffset16 = srcOffset / sizeof(Type16); - const size_t dstOffset16 = dstOffset / sizeof(Type16); - Type16* src16 = reinterpret_cast(src) + srcOffset16; - Type16* dst16 = reinterpret_cast(dst) + dstOffset16; - for (size_t idx = threadIdx.x; idx < nType16; idx += blockDim.x) { - Type16 val = SwitchChannelDeviceHandle::multimemLoadReduce(src16 + idx); - SwitchChannelDeviceHandle::multimemStore(val, dst16 + idx); + static_assert(sizeof(T) <= 8, "Only support type with size <= 8 bytes"); + const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); + if (size <= 0) { + return; } - // handle rest of data - constexpr int RedBytes = (sizeof(T) == 8) ? 
8 : 4; - using TypeRest = typename mscclpp::VectorType; - const size_t processed = nType16 * sizeof(Type16); - const size_t nRest = (size - processed) / sizeof(TypeRest); - TypeRest* srcR = reinterpret_cast(src + srcOffset + processed); - TypeRest* dstR = reinterpret_cast(dst + dstOffset + processed); - for (size_t idx = threadIdx.x; idx < nRest; idx += blockDim.x) { - TypeRest val = SwitchChannelDeviceHandle::multimemLoadReduce(srcR + idx); - SwitchChannelDeviceHandle::multimemStore(val, dstR + idx); + const uint32_t srcOffset = op.inputOffsets[0] + getOffset(op.nvlsInputBufferType, offset); + const uint32_t dstOffset = op.outputOffsets[0] + getOffset(op.nvlsOutputBufferType, offset); + assert(size % sizeof(T) == 0); + assert(srcOffset % sizeof(T) == 0); + assert(dstOffset % sizeof(T) == 0); + + T* src = (T*)nvlsChannels_[op.nvlsInputIndex].mcPtr; + T* dst = (T*)nvlsChannels_[op.nvlsOutputIndex].mcPtr; + if constexpr (std::is_same_v || std::is_same_v) { + const size_t nElem = size / sizeof(T); + const size_t srcOffsetElem = srcOffset / sizeof(T); + const size_t dstOffsetElem = dstOffset / sizeof(T); + VectorType* srcElem = reinterpret_cast*>(src + srcOffsetElem); + VectorType* dstElem = reinterpret_cast*>(dst + dstOffsetElem); + for (size_t idx = threadIdx.x; idx < nElem; idx += blockDim.x) { + auto val = SwitchChannelDeviceHandle::multimemLoadReduce(srcElem + idx); + SwitchChannelDeviceHandle::multimemStore(val, dstElem + idx); + } + } else { + // handle data in 16-byte unit + using Type16 = mscclpp::VectorType; + const size_t nType16 = size / sizeof(Type16); + const size_t srcOffset16 = srcOffset / sizeof(Type16); + const size_t dstOffset16 = dstOffset / sizeof(Type16); + Type16* src16 = reinterpret_cast(src) + srcOffset16; + Type16* dst16 = reinterpret_cast(dst) + dstOffset16; + for (size_t idx = threadIdx.x; idx < nType16; idx += blockDim.x) { + Type16 val = SwitchChannelDeviceHandle::multimemLoadReduce(src16 + idx); + SwitchChannelDeviceHandle::multimemStore(val, dst16 + idx); + } + // handle rest of data + constexpr int RedBytes = (sizeof(T) == 8) ? 
8 : 4; + using TypeRest = mscclpp::VectorType; + const size_t processed = nType16 * sizeof(Type16); + const size_t nRest = (size - processed) / sizeof(TypeRest); + TypeRest* srcR = reinterpret_cast(src + srcOffset + processed); + TypeRest* dstR = reinterpret_cast(dst + dstOffset + processed); + for (size_t idx = threadIdx.x; idx < nRest; idx += blockDim.x) { + TypeRest val = SwitchChannelDeviceHandle::multimemLoadReduce(srcR + idx); + SwitchChannelDeviceHandle::multimemStore(val, dstR + idx); + } } } } @@ -1222,7 +876,7 @@ class ExecutionKernel { #endif break; #if defined(__FP8_TYPES_EXIST__) - case DataType::FP8_E4M3: + case DataType::FLOAT8_E4M3: executionKernel<__fp8_e4m3, PacketType, ReuseScratch><<>>( rank, (__fp8_e4m3*)src, (__fp8_e4m3*)dst, (__fp8_e4m3*)scratch, scratchOffset, scratchChunkSize, plan, semaphores, localMemoryIdBegin, flag @@ -1233,7 +887,7 @@ class ExecutionKernel { ); #endif break; - case DataType::FP8_E5M2: + case DataType::FLOAT8_E5M2: executionKernel<__fp8_e5m2, PacketType, ReuseScratch><<>>( rank, (__fp8_e5m2*)src, (__fp8_e5m2*)dst, (__fp8_e5m2*)scratch, scratchOffset, scratchChunkSize, plan, semaphores, localMemoryIdBegin, flag @@ -1245,6 +899,32 @@ class ExecutionKernel { #endif break; #endif // __FP8_TYPES_EXIST__ + case DataType::FLOAT8_E4M3B15: + executionKernel<__fp8_e4m3b15, PacketType, ReuseScratch><<>>( + rank, (__fp8_e4m3b15*)src, (__fp8_e4m3b15*)dst, (__fp8_e4m3b15*)scratch, scratchOffset, scratchChunkSize, + plan, semaphores, localMemoryIdBegin, flag +#if defined(ENABLE_NPKIT) + , + NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); +#else + ); +#endif + break; + case DataType::UINT8: + executionKernel<<>>( + rank, (uint8_t*)src, (uint8_t*)dst, (uint8_t*)scratch, scratchOffset, scratchChunkSize, plan, semaphores, + localMemoryIdBegin, flag +#if defined(ENABLE_NPKIT) + , + NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); +#else + ); +#endif + break; + case DataType::AUTO: + // AUTO is a sentinel that must be resolved before reaching this point. + assert(false && "DataType::AUTO must be resolved before kernel launch"); + break; } } #else // !defined(MSCCLPP_DEVICE_HIP) diff --git a/src/core/include/gdr.hpp b/src/core/include/gdr.hpp new file mode 100644 index 00000000..e0c7f006 --- /dev/null +++ b/src/core/include/gdr.hpp @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_GDR_HPP_ +#define MSCCLPP_GDR_HPP_ + +#include +#include +#include + +namespace mscclpp { + +enum class GdrStatus { + Ok, // GDRCopy initialized successfully + NotBuilt, // Built without MSCCLPP_USE_GDRCOPY + Disabled, // Disabled via MSCCLPP_FORCE_DISABLE_GDR + DriverMissing, // /dev/gdrdrv not found + OpenFailed, // gdr_open() failed +}; + +/// Return the detailed status of the global GDRCopy context. +GdrStatus gdrStatus(); + +/// Whether the global GDRCopy context is enabled (shorthand for gdrStatus() == GdrStatus::Ok). +bool gdrEnabled(); + +/// Return a human-readable error message for the current GDRCopy status. +const char* gdrStatusMessage(); + +/// RAII wrapper for a GDRCopy BAR1 mapping of a GPU address. +/// When GDRCopy is not available, all operations are no-ops and valid() returns false. +class GdrMap { + public: + /// Pin and map a GPU address for direct host-side access. + /// @param gpuMem Shared pointer to the GPU memory (e.g. from gpuCallocShared). + /// @param deviceId The CUDA device ID for setting context. 
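To make the 16-byte main loop and the RedBytes tail in handleMultiLoadReduceStore concrete, here is a small worked example. The sizes are made up for illustration; the kernel derives them from T and the unit size:

```cpp
#include <cstddef>
#include <cstdio>

// Worked example of the split in handleMultiLoadReduceStore: the bulk of the range
// is reduced in 16-byte vectors, the remainder in 4-byte (8-byte for 64-bit T) vectors.
int main() {
  const size_t tSize = 2;                               // e.g. __half
  const size_t size = 72;                               // bytes in this unit (multiple of tSize)
  const size_t nType16 = size / 16;                     // 4 full 16-byte vectors
  const size_t processed = nType16 * 16;                // 64 bytes handled by the main loop
  const size_t redBytes = (tSize == 8) ? 8 : 4;         // tail vector width (RedBytes)
  const size_t nRest = (size - processed) / redBytes;   // 2 tail vectors of 2 halves each
  std::printf("main: %zu x 16B, tail: %zu x %zuB\n", nType16, nRest, redBytes);
  return 0;
}
```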
+ GdrMap(std::shared_ptr gpuMem, int deviceId); + ~GdrMap(); + + GdrMap(const GdrMap&) = delete; + GdrMap& operator=(const GdrMap&) = delete; + + /// Whether the mapping was established successfully. + bool valid() const; + + /// Return the BAR1-mapped host pointer to the GPU location. + uint64_t* hostPtr() const; + + /// Copy data from host memory to the mapped GPU location. + void copyTo(const void* src, size_t size); + + /// Copy data from the mapped GPU location to host memory. + void copyFrom(void* dst, size_t size) const; + + private: + struct Impl; + std::unique_ptr pimpl_; +}; + +} // namespace mscclpp + +#endif // MSCCLPP_GDR_HPP_ diff --git a/src/core/include/gpu_ipc_mem.hpp b/src/core/include/gpu_ipc_mem.hpp index 98fa47f2..f66545c2 100644 --- a/src/core/include/gpu_ipc_mem.hpp +++ b/src/core/include/gpu_ipc_mem.hpp @@ -44,9 +44,10 @@ struct GpuIpcMemHandle { struct { char handle[64]; + CUmemGenericAllocationHandle allocHandle; } fabric; - static void deleter(GpuIpcMemHandle *handle); + static void deleter(GpuIpcMemHandle* handle); // We make GpuIpcMemHandle trivially copyable for easy serialization, // and thus it cannot have explicit destructors. @@ -61,7 +62,7 @@ struct GpuIpcMemHandle { using Base::Base; // Allow implicit conversion from Base - UniquePtr(Base &&other) : Base(std::move(other)) {} + UniquePtr(Base&& other) : Base(std::move(other)) {} }; static UniquePtr create(const CUdeviceptr ptr); @@ -70,7 +71,7 @@ struct GpuIpcMemHandle { using UniqueGpuIpcMemHandle = GpuIpcMemHandle::UniquePtr; -std::ostream &operator<<(std::ostream &os, const GpuIpcMemHandle::TypeFlags &typeFlags); +std::ostream& operator<<(std::ostream& os, const GpuIpcMemHandle::TypeFlags& typeFlags); static_assert(std::is_trivially_copyable_v); @@ -82,7 +83,7 @@ class GpuIpcMem : public std::enable_shared_from_this { /// Create a GpuIpcMem instance from a GpuIpcMemHandle. /// @param handle The handle to import. /// @return A shared_ptr to the created GpuIpcMem instance. - static std::shared_ptr create(const GpuIpcMemHandle &handle); + static std::shared_ptr create(const GpuIpcMemHandle& handle); ~GpuIpcMem(); @@ -102,7 +103,7 @@ class GpuIpcMem : public std::enable_shared_from_this { std::shared_ptr mapMulticast(int numDevices, size_t mcOffset, CUdeviceptr bufferAddr, size_t bufferSize); private: - GpuIpcMem(const GpuIpcMemHandle &handle); + GpuIpcMem(const GpuIpcMemHandle& handle); GpuIpcMemHandle handle_; CUmemGenericAllocationHandle allocHandle_; diff --git a/src/core/include/gpu_utils_internal.hpp b/src/core/include/gpu_utils_internal.hpp new file mode 100644 index 00000000..a7cea86b --- /dev/null +++ b/src/core/include/gpu_utils_internal.hpp @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_GPU_UTILS_INTERNAL_HPP_ +#define MSCCLPP_GPU_UTILS_INTERNAL_HPP_ + +#include +#include + +#include "logger.hpp" + +namespace mscclpp { + +/// Check if a CUDA error indicates runtime teardown (safe to ignore in destructors). +inline bool isCudaTeardownError(cudaError_t err) { +#if defined(MSCCLPP_USE_ROCM) + return err == cudaErrorContextIsDestroyed || err == cudaErrorInvalidDevice; +#else // !defined(MSCCLPP_USE_ROCM) + return err == cudaErrorCudartUnloading || err == cudaErrorContextIsDestroyed || err == cudaErrorInitializationError || + err == cudaErrorInvalidDevice || err == cudaErrorLaunchFailure || err == cudaErrorDeviceUninitialized; +#endif // !defined(MSCCLPP_USE_ROCM) +} + +/// Check if a CUDA driver error indicates runtime teardown. 
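A possible usage sketch for the GdrMap wrapper declared above, assuming the constructor takes a std::shared_ptr<uint64_t> (the template argument is stripped in the header text) and that the caller falls back to another path when GDRCopy is unavailable:

```cpp
#include <cstdint>
#include <memory>

#include "gdr.hpp"

// Hypothetical helper: publish a token to GPU memory from the host through BAR1.
// GdrMap degrades gracefully; valid() is false when GDRCopy cannot be used.
void publishToken(std::shared_ptr<uint64_t> token, int deviceId, uint64_t value) {
  mscclpp::GdrMap map(token, deviceId);
  if (!map.valid()) {
    // e.g. log mscclpp::gdrStatusMessage() and fall back to a cudaMemcpy-based path
    return;
  }
  map.copyTo(&value, sizeof(value));  // host-side write, visible to GPU pollers
}
```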
+inline bool isCuTeardownError(CUresult r) { + return r == CUDA_ERROR_DEINITIALIZED || r == CUDA_ERROR_CONTEXT_IS_DESTROYED || r == CUDA_ERROR_LAUNCH_FAILED; +} + +} // namespace mscclpp + +/// Execute a CUDA runtime call and ignore teardown errors (useful in destructors). +/// Non-teardown errors will throw. +#define MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cmd) \ + do { \ + cudaError_t __e = cmd; \ + if (mscclpp::isCudaTeardownError(__e)) { \ + (void)cudaGetLastError(); \ + } else { \ + MSCCLPP_CUDATHROW(__e); \ + } \ + } while (false) + +/// Execute a CUDA driver call and ignore teardown errors (useful in destructors). +/// Non-teardown errors will throw. +#define MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cmd) \ + do { \ + CUresult __e = cmd; \ + if (!mscclpp::isCuTeardownError(__e)) { \ + MSCCLPP_CUTHROW(__e); \ + } \ + } while (false) + +/// Execute a CUDA driver call and log (but don't throw) on error. +#define MSCCLPP_CUTHROW_IGNORE(cmd) \ + do { \ + CUresult __e = cmd; \ + if (__e != CUDA_SUCCESS) { \ + const char* errStr; \ + cuGetErrorString(__e, &errStr); \ + WARN(GPU, __FILE__, ":", __LINE__, " Cuda failure ", static_cast(__e), " '", errStr, "'"); \ + } \ + } while (false) + +#endif // MSCCLPP_GPU_UTILS_INTERNAL_HPP_ diff --git a/src/core/include/ib.hpp b/src/core/include/ib.hpp index c6436dbf..36c5a237 100644 --- a/src/core/include/ib.hpp +++ b/src/core/include/ib.hpp @@ -17,6 +17,7 @@ struct ibv_qp; struct ibv_cq; struct ibv_wc; struct ibv_send_wr; +struct ibv_recv_wr; struct ibv_sge; namespace mscclpp { @@ -28,14 +29,14 @@ struct IbMrInfo { class IbMr { public: - virtual ~IbMr(); + ~IbMr(); - virtual IbMrInfo getInfo() const; - virtual const void* getBuff() const; - virtual uint32_t getLkey() const; + IbMrInfo getInfo() const; + const void* getBuff() const; + uint32_t getLkey() const; private: - IbMr(ibv_pd* pd, void* buff, std::size_t size); + IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect); ibv_mr* mr_; void* buff_; @@ -52,7 +53,7 @@ struct IbQpInfo { uint64_t spn; int mtu; uint64_t iid; - bool is_grh; + bool isGrh; }; enum class WsStatus { @@ -61,38 +62,48 @@ enum class WsStatus { class IbQp { public: - virtual ~IbQp(); + ~IbQp(); - virtual void rtr([[maybe_unused]] const IbQpInfo& info); - virtual void rts(); - virtual void stageSend([[maybe_unused]] const IbMr* mr, [[maybe_unused]] const IbMrInfo& info, - [[maybe_unused]] uint32_t size, [[maybe_unused]] uint64_t wrId, - [[maybe_unused]] uint64_t srcOffset, [[maybe_unused]] uint64_t dstOffset, - [[maybe_unused]] bool signaled); - virtual void stageAtomicAdd([[maybe_unused]] const IbMr* mr, [[maybe_unused]] const IbMrInfo& info, - [[maybe_unused]] uint64_t wrId, [[maybe_unused]] uint64_t dstOffset, - [[maybe_unused]] uint64_t addVal, [[maybe_unused]] bool signaled); - virtual void stageSendWithImm([[maybe_unused]] const IbMr* mr, [[maybe_unused]] const IbMrInfo& info, - [[maybe_unused]] uint32_t size, [[maybe_unused]] uint64_t wrId, - [[maybe_unused]] uint64_t srcOffset, [[maybe_unused]] uint64_t dstOffset, - [[maybe_unused]] bool signaled, [[maybe_unused]] unsigned int immData); - virtual void postSend(); - virtual int pollCq(); + void rtr(const IbQpInfo& info); + void rts(); + void stageSendWrite(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled); + void stageSendAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, uint64_t dstOffset, uint64_t addVal, + bool signaled); + void stageSendWriteWithImm(const IbMr* mr, const IbMrInfo& info, 
uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled, unsigned int immData); + void postSend(); + + void stageRecv(uint64_t wrId); + void stageRecv(const IbMr* mr, uint64_t wrId, uint32_t size, uint64_t offset = 0); + void postRecv(); + + int pollSendCq(); + int pollRecvCq(); IbQpInfo& getInfo() { return info_; } - virtual int getWcStatus([[maybe_unused]] int idx) const; - virtual std::string getWcStatusString([[maybe_unused]] int idx) const; - virtual int getNumCqItems() const; + int getSendWcStatus(int idx) const; + std::string getSendWcStatusString(int idx) const; + int getNumSendCqItems() const; + int getRecvWcStatus(int idx) const; + std::string getRecvWcStatusString(int idx) const; + unsigned int getRecvWcImmData(int idx) const; private: - struct WrInfo { + struct SendWrInfo { ibv_send_wr* wr; ibv_sge* sge; }; - IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, - int maxRecvWr, int maxWrPerSend); - WrInfo getNewWrInfo(); + struct RecvWrInfo { + ibv_recv_wr* wr; + ibv_sge* sge; + }; + + IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr, + int maxRecvWr, int maxWrPerSend, bool noAtomic); + SendWrInfo getNewSendWrInfo(); + RecvWrInfo getNewRecvWrInfo(); int portNum_; int gidIndex_; @@ -100,16 +111,24 @@ class IbQp { IbQpInfo info_; ibv_qp* qp_; - ibv_cq* cq_; - std::shared_ptr> wcs_; - std::shared_ptr> wrs_; - std::shared_ptr> sges_; - int wrn_; - int numSignaledPostedItems_; - int numSignaledStagedItems_; + ibv_cq* sendCq_; + ibv_cq* recvCq_; + std::shared_ptr> sendWcs_; + std::shared_ptr> recvWcs_; + std::shared_ptr> sendWrs_; + std::shared_ptr> sendSges_; + std::shared_ptr> recvWrs_; + std::shared_ptr> recvSges_; + int numStagedSend_; + int numStagedRecv_; + int numPostedSignaledSend_; + int numStagedSignaledSend_; - const int maxCqPollNum_; + const int maxSendCqPollNum_; + const int maxSendWr_; const int maxWrPerSend_; + const int maxRecvWr_; + const bool noAtomic_; friend class IbCtx; }; @@ -120,17 +139,25 @@ class IbCtx { IbCtx(const std::string& devName); ~IbCtx(); - std::shared_ptr createQp(int port, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, int maxRecvWr, - int maxWrPerSend); + std::shared_ptr createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr, + int maxRecvWr, int maxWrPerSend, bool noAtomic); std::unique_ptr registerMr(void* buff, std::size_t size); + bool supportsRdmaAtomics() const; + bool isMlx5() const; + bool isDataDirect() const; + bool isVirtualFunction() const; #else IbCtx([[maybe_unused]] const std::string& devName) {} ~IbCtx() {} - std::shared_ptr createQp(int, int, int, int, int, int, int) { return nullptr; } + std::shared_ptr createQp(int, int, int, int, int, int, int, bool) { return nullptr; } std::unique_ptr registerMr([[maybe_unused]] void* buff, [[maybe_unused]] std::size_t size) { return nullptr; } + bool supportsRdmaAtomics() const { return false; } + bool isMlx5() const { return false; } + bool isDataDirect() const { return false; } + bool isVirtualFunction() const { return false; } #endif const std::string& getDevName() const { return devName_; }; @@ -142,6 +169,10 @@ class IbCtx { const std::string devName_; ibv_context* ctx_; ibv_pd* pd_; + bool supportsRdmaAtomics_; + bool isMlx5_; + bool isDataDirect_; + bool isVF_; }; } // namespace mscclpp diff --git a/src/core/include/ibverbs_wrapper.hpp b/src/core/include/ibverbs_wrapper.hpp index 
45054ff3..5b0da8ba 100644 --- a/src/core/include/ibverbs_wrapper.hpp +++ b/src/core/include/ibverbs_wrapper.hpp @@ -12,12 +12,12 @@ namespace mscclpp { struct IBVerbs { private: - static void *dlsym(const std::string &symbol, bool allowReturnNull = false); + static void* dlsym(const std::string& symbol, bool allowReturnNull = false); public: #define REGISTER_IBV_FUNC_WITH_NAME(name__, func__) \ template \ - static inline auto(name__)(Args && ...args) { \ + static inline auto(name__)(Args && ... args) { \ static_assert(sizeof(&::func__) > 0, #func__ " is expected be a function, not a macro"); \ static decltype(&::func__) impl = nullptr; \ if (!impl) impl = reinterpret_cast(IBVerbs::dlsym(#func__)); \ @@ -46,7 +46,7 @@ struct IBVerbs { REGISTER_IBV_FUNC(ibv_wc_status_str) static bool isDmabufSupported(); - static struct ibv_mr *ibv_reg_dmabuf_mr(struct ibv_pd *, uint64_t, size_t, uint64_t, int, int); + static struct ibv_mr* ibv_reg_dmabuf_mr(struct ibv_pd*, uint64_t, size_t, uint64_t, int, int); /// /// Below is for cases where the API (may be / is) a macro. Refer to `infiniband/verbs.h`. @@ -57,8 +57,8 @@ struct IBVerbs { #else // defined(ibv_get_device_list) #undef ibv_get_device_list REGISTER_IBV_FUNC(ibv_static_providers) - static inline struct ibv_device **ibv_get_device_list(int *num_devices) { - using FuncType = struct ibv_device **(*)(int *); + static inline struct ibv_device** ibv_get_device_list(int* num_devices) { + using FuncType = struct ibv_device** (*)(int*); static FuncType impl = nullptr; if (!impl) impl = reinterpret_cast(IBVerbs::dlsym("ibv_get_device_list")); IBVerbs::ibv_static_providers(NULL, _RDMA_STATIC_PREFIX(RDMA_STATIC_PROVIDERS), NULL); @@ -67,21 +67,21 @@ struct IBVerbs { #endif // defined(ibv_get_device_list) #undef ibv_query_port - static inline int ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { + static inline int ibv_query_port(struct ibv_context* context, uint8_t port_num, struct ibv_port_attr* port_attr) { static decltype(&::ibv_query_port) impl = nullptr; if (!impl) impl = reinterpret_cast(IBVerbs::dlsym("ibv_query_port")); - struct verbs_context *vctx = verbs_get_ctx_op(context, query_port); + struct verbs_context* vctx = verbs_get_ctx_op(context, query_port); if (!vctx) { int rc; ::memset(port_attr, 0, sizeof(*port_attr)); - rc = impl(context, port_num, (struct _compat_ibv_port_attr *)port_attr); + rc = impl(context, port_num, (struct _compat_ibv_port_attr*)port_attr); return rc; } return vctx->query_port(context, port_num, port_attr, sizeof(*port_attr)); } #undef ibv_reg_mr - static inline struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { + static inline struct ibv_mr* ibv_reg_mr(struct ibv_pd* pd, void* addr, size_t length, int access) { static decltype(&::ibv_reg_mr) impl = nullptr; static decltype(&::ibv_reg_mr_iova2) impl_iova2 = nullptr; int is_access_const = __builtin_constant_p(((int)(access)&IBV_ACCESS_OPTIONAL_RANGE) == 0); @@ -98,11 +98,15 @@ struct IBVerbs { /// Below is for cases where the API (may be / is) a static function. Refer to `infiniband/verbs.h`. 
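The REGISTER_IBV_FUNC_WITH_NAME machinery above memoizes each symbol lookup in a function-local static, so dlsym runs once per wrapper. A distilled, self-contained version of that pattern, using libm as a stand-in library (names here are illustrative, not from the diff):

```cpp
#include <dlfcn.h>

#include <cstdio>

// Open the library once and reuse the handle.
static void* mathLib() {
  static void* h = ::dlopen("libm.so.6", RTLD_NOW);  // stand-in for libibverbs
  return h;
}

// Each wrapper owns its own static cache of the resolved symbol.
static double callCos(double x) {
  using Fn = double (*)(double);
  static Fn impl = nullptr;  // resolved on first call, reused afterwards
  if (!impl) impl = reinterpret_cast<Fn>(::dlsym(mathLib(), "cos"));
  return impl ? impl(x) : 0.0;
}

int main() {
  std::printf("cos(0) = %f\n", callCos(0.0));
  return 0;
}
```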
/// - static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { + static inline int ibv_post_send(struct ibv_qp* qp, struct ibv_send_wr* wr, struct ibv_send_wr** bad_wr) { return qp->context->ops.post_send(qp, wr, bad_wr); } - static inline int ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc) { + static inline int ibv_post_recv(struct ibv_qp* qp, struct ibv_recv_wr* wr, struct ibv_recv_wr** bad_wr) { + return qp->context->ops.post_recv(qp, wr, bad_wr); + } + + static inline int ibv_poll_cq(struct ibv_cq* cq, int num_entries, struct ibv_wc* wc) { return cq->context->ops.poll_cq(cq, num_entries, wc); } }; diff --git a/src/core/include/mlx5dv_wrapper.hpp b/src/core/include/mlx5dv_wrapper.hpp new file mode 100644 index 00000000..79403a36 --- /dev/null +++ b/src/core/include/mlx5dv_wrapper.hpp @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_MLX5DV_WRAPPER_HPP_ +#define MSCCLPP_MLX5DV_WRAPPER_HPP_ + +#if defined(MSCCLPP_USE_MLX5DV) + +#include + +#include + +namespace mscclpp { + +struct MLX5DV { + /// Whether libmlx5.so was successfully loaded at runtime. + static bool isAvailable(); + + /// Check if the given IB device supports mlx5 Direct Verbs. + static bool mlx5dv_is_supported(struct ibv_device* device); + + /// Register a DMABUF memory region using mlx5dv extensions. + /// Returns nullptr if mlx5dv_reg_dmabuf_mr is not available in this rdma-core version. + static struct ibv_mr* mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd, + int access); + + /// Query the Data Direct sysfs path for the given IB context. + /// Returns 0 on success (device supports Data Direct), non-zero otherwise. + static int mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len); + + private: + static void* dlsym(const std::string& symbol, bool allowReturnNull = false); +}; + +} // namespace mscclpp + +#endif // defined(MSCCLPP_USE_MLX5DV) +#endif // MSCCLPP_MLX5DV_WRAPPER_HPP_ diff --git a/src/core/include/reduce_kernel.hpp b/src/core/include/reduce_kernel.hpp new file mode 100644 index 00000000..463f827d --- /dev/null +++ b/src/core/include/reduce_kernel.hpp @@ -0,0 +1,195 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_REDUCE_KERNEL_HPP_ +#define MSCCLPP_REDUCE_KERNEL_HPP_ + +#include +#include +#include + +namespace mscclpp { + +#if defined(MSCCLPP_DEVICE_COMPILE) + +// Generic element-wise calculation helper +template +MSCCLPP_DEVICE_INLINE T calElements(const T& a, const T& b) { + if constexpr (OpType == SUM) { + return a + b; + } else if constexpr (OpType == MIN) { + return mscclpp::min(a, b); + } + static_assert(OpType == SUM || OpType == MIN, "Unsupported ReduceOp"); +} + +// Generic vector reduction helpers + +template +MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) { + uint2 ret; + ret.x = bit_cast(calElements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(calElements(bit_cast(a.y), bit_cast(b.y))); + return ret; +} + +/// f32x2 specialization for uint2: uses packed f32x2 operator+ (Blackwell __fadd2_rn when available). 
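The calElements/calVectorHelper helpers above all follow the same shape: reinterpret a raw 32-bit lane as the element type, reduce, and reinterpret back. A host-side analogy with a float packed in uint32_t (illustrative only; the device code uses bit_cast and packed types such as f16x2):

```cpp
#include <cstdint>
#include <cstring>

// memcpy-based bit-cast stand-in (std::bit_cast requires C++20).
template <typename To, typename From>
To bitCast(const From& src) {
  static_assert(sizeof(To) == sizeof(From), "size mismatch for bitCast");
  To dst;
  std::memcpy(&dst, &src, sizeof(To));
  return dst;
}

// Same shape as a SUM reduction over a single 32-bit lane holding a float.
uint32_t sumLane(uint32_t a, uint32_t b) {
  return bitCast<uint32_t>(bitCast<float>(a) + bitCast<float>(b));
}
```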
+template <> +MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) { + f32x2 fa = bit_cast(a); + f32x2 fb = bit_cast(b); + f32x2 fr = fa + fb; + return bit_cast(fr); +} + +template <> +MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) { + f32x2 fa = bit_cast(a); + f32x2 fb = bit_cast(b); + f32x2 fr = mscclpp::min(fa, fb); + return bit_cast(fr); +} + +template +MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) { + int4 ret; + ret.w = bit_cast(calElements(bit_cast(a.w), bit_cast(b.w))); + ret.x = bit_cast(calElements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(calElements(bit_cast(a.y), bit_cast(b.y))); + ret.z = bit_cast(calElements(bit_cast(a.z), bit_cast(b.z))); + return ret; +} + +/// f32x2 specialization for int4: process as two uint2 pairs using packed f32x2 arithmetic. +template <> +MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) { + uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y}; + uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w}; + uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y}; + uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w}; + uint2 lo_r = calVectorHelper(lo_a, lo_b); + uint2 hi_r = calVectorHelper(hi_a, hi_b); + return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y}; +} + +template <> +MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) { + uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y}; + uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w}; + uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y}; + uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w}; + uint2 lo_r = calVectorHelper(lo_a, lo_b); + uint2 hi_r = calVectorHelper(hi_a, hi_b); + return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y}; +} + +template +MSCCLPP_DEVICE_INLINE int calVectorHelper(const int& a, const int& b) { + return bit_cast(calElements(bit_cast(a), bit_cast(b))); +} + +template +MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) { + return bit_cast(calElements(bit_cast(a), bit_cast(b))); +} + +/// f32x2 specialization for uint32_t: a single float packed in 32 bits (scalar fallback). +template <> +MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) { + float fa = bit_cast(a); + float fb = bit_cast(b); + return bit_cast(fa + fb); +} + +template <> +MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) { + float fa = bit_cast(a); + float fb = bit_cast(b); + return bit_cast(fminf(fa, fb)); +} + +// calVector wrapper – converts scalar types to vector types and calls calVectorHelper +template +MSCCLPP_DEVICE_INLINE DataType calVector(const DataType& a, const DataType& b) { + // Define the vectorized computation type based on the element type + static_assert(sizeof(DataType) % sizeof(T) == 0, "DataType size must be multiple of T size"); + static_assert(sizeof(DataType) >= 4, "DataType size must be at least 4 bytes"); + using CompType = typename std::conditional_t< + std::is_same_v, f32x2, + std::conditional_t< + std::is_same_v, f16x2, + std::conditional_t< + std::is_same_v, bf16x2, + std::conditional_t< + std::is_same_v, u8x4, + std::conditional_t, f8_e4m3b15x4, +#if defined(__FP8_TYPES_EXIST__) + std::conditional_t, f8_e4m3x4, + std::conditional_t, f8_e5m2x4, T>> +#else + T +#endif + >>>>>; + return calVectorHelper(a, b); +} + +/// Upcast a packed DataType (containing T elements) to a packed AccDataType (containing AccumT elements). +/// Uses the optimized to<>() specializations when available (e.g. 
FP8 -> float hardware intrinsics). +/// When AccumT == T, this is a no-op identity. +template +MSCCLPP_DEVICE_INLINE AccDataType upcastVector(const DataType& val) { + if constexpr (std::is_same_v) { + return val; + } else { + constexpr int nElems = sizeof(DataType) / sizeof(T); + using FromVec = VectorType; + using ToVec = VectorType; + ToVec result = mscclpp::to(reinterpret_cast(val)); + return reinterpret_cast(result); + } +} + +/// Downcast a packed AccDataType (containing AccumT elements) back to DataType (containing T elements). +/// Uses the optimized to<>() specializations when available. +/// When AccumT == T, this is a no-op identity. +template +MSCCLPP_DEVICE_INLINE DataType downcastVector(const AccDataType& val) { + if constexpr (std::is_same_v) { + return val; + } else { + constexpr int nElems = sizeof(DataType) / sizeof(T); + using FromVec = VectorType; + using ToVec = VectorType; + FromVec result = mscclpp::to(reinterpret_cast(val)); + return reinterpret_cast(result); + } +} + +/// Accumulate `val` (packed T elements in DataType) into `acc` (packed AccumT elements in AccDataType). +/// When AccumT == T, falls back to the standard calVector. +/// Otherwise, upcasts val to AccumT, reduces element-wise, and returns the AccumT accumulator. +template +MSCCLPP_DEVICE_INLINE AccDataType calVectorAccum(const AccDataType& acc, const DataType& val) { + if constexpr (std::is_same_v) { + return calVector(acc, val); + } else { + constexpr int nElems = sizeof(DataType) / sizeof(T); + using FromVec = VectorType; + using ToVec = VectorType; + + ToVec fv = mscclpp::to(reinterpret_cast(val)); + const ToVec& fa = reinterpret_cast(acc); + ToVec fr; +#pragma unroll + for (int i = 0; i < nElems; ++i) { + fr.data[i] = calElements(fa.data[i], fv.data[i]); + } + return reinterpret_cast(fr); + } +} + +#endif // defined(MSCCLPP_DEVICE_COMPILE) + +} // namespace mscclpp + +#endif // MSCCLPP_REDUCE_KERNEL_HPP_ diff --git a/src/core/mlx5dv_wrapper.cc b/src/core/mlx5dv_wrapper.cc new file mode 100644 index 00000000..a56fad96 --- /dev/null +++ b/src/core/mlx5dv_wrapper.cc @@ -0,0 +1,126 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
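The upcast/reduce/downcast split that upcastVector, downcastVector, and calVectorAccum implement can be summarized with a scalar analogy (illustrative only; the real code operates on packed vectors with FP8/half element types):

```cpp
// Scalar analogy of calVectorAccum: widen each low-precision input to the
// accumulator type, reduce there, and downcast once at the end. Accumulating in
// the wider type avoids compounding rounding error across the reduction.
float accumulate(const unsigned char* lowPrecision, int n) {
  float acc = 0.0f;  // AccumT
  for (int i = 0; i < n; ++i) {
    acc += static_cast<float>(lowPrecision[i]);  // upcast, then reduce in AccumT
  }
  return acc;  // a caller needing T again would downcast exactly once here
}
```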
+ +#if defined(MSCCLPP_USE_MLX5DV) + +// _GNU_SOURCE is required for dlvsym() +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "mlx5dv_wrapper.hpp" + +#include +#include + +#ifndef MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT +#define MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT (1 << 0) +#endif + +#include + +#include "logger.hpp" + +namespace mscclpp { + +static std::unique_ptr globalMLX5Handle(nullptr, &::dlclose); + +void* MLX5DV::dlsym(const std::string& symbol, bool allowReturnNull) { + if (!globalMLX5Handle) { + const char* possibleLibNames[] = {"libmlx5.so", "libmlx5.so.1", nullptr}; + for (int i = 0; possibleLibNames[i] != nullptr; i++) { + void* handle = ::dlopen(possibleLibNames[i], RTLD_NOW); + if (handle) { + globalMLX5Handle.reset(handle); + break; + } + } + if (!globalMLX5Handle) { + if (allowReturnNull) return nullptr; + THROW(NET, SysError, errno, "Failed to open libmlx5: ", std::string(::dlerror())); + } + } + void* ptr = ::dlsym(globalMLX5Handle.get(), symbol.c_str()); + if (!ptr && !allowReturnNull) { + THROW(NET, SysError, errno, "Failed to load libmlx5 symbol: ", symbol); + } + return ptr; +} + +bool MLX5DV::isAvailable() { + static int available = -1; + if (available == -1) { + // Try to load the library; if it fails, mlx5dv is not available + const char* possibleLibNames[] = {"libmlx5.so", "libmlx5.so.1", nullptr}; + for (int i = 0; possibleLibNames[i] != nullptr; i++) { + void* handle = ::dlopen(possibleLibNames[i], RTLD_NOW); + if (handle) { + if (!globalMLX5Handle) { + globalMLX5Handle.reset(handle); + } else { + ::dlclose(handle); + } + available = 1; + INFO(NET, "libmlx5 loaded successfully"); + return true; + } + } + available = 0; + DEBUG(NET, "libmlx5 not available"); + } + return available == 1; +} + +bool MLX5DV::mlx5dv_is_supported(struct ibv_device* device) { + using FuncType = bool (*)(struct ibv_device*); + static FuncType impl = nullptr; + if (!impl) { + void* ptr = MLX5DV::dlsym("mlx5dv_is_supported", /*allowReturnNull=*/true); + if (!ptr) return false; + impl = reinterpret_cast(ptr); + } + return impl(device); +} + +struct ibv_mr* MLX5DV::mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd, + int access) { + // mlx5dv_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access) — the last arg is mlx5-specific flags. + // Must use dlvsym with "MLX5_1.25" version to get the Data Direct-capable symbol. + using FuncType = struct ibv_mr* (*)(struct ibv_pd*, uint64_t, size_t, uint64_t, int, int, int); + static FuncType impl = nullptr; + static bool resolved = false; + if (!resolved) { + if (globalMLX5Handle) { + void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_reg_dmabuf_mr", "MLX5_1.25"); + if (!ptr) { + ptr = MLX5DV::dlsym("mlx5dv_reg_dmabuf_mr", /*allowReturnNull=*/true); + } + impl = ptr ? reinterpret_cast(ptr) : nullptr; + } + resolved = true; + } + if (!impl) return nullptr; + return impl(pd, offset, length, iova, fd, access, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT); +} + +int MLX5DV::mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len) { + using FuncType = int (*)(struct ibv_context*, char*, size_t); + static FuncType impl = nullptr; + static bool resolved = false; + if (!resolved) { + if (globalMLX5Handle) { + void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_get_data_direct_sysfs_path", "MLX5_1.25"); + if (!ptr) { + ptr = MLX5DV::dlsym("mlx5dv_get_data_direct_sysfs_path", /*allowReturnNull=*/true); + } + impl = ptr ? 
reinterpret_cast(ptr) : nullptr; + } + resolved = true; + } + if (!impl) return -1; + return impl(context, buf, buf_len); +} + +} // namespace mscclpp + +#endif // defined(MSCCLPP_USE_MLX5DV) diff --git a/src/core/npkit/npkit.cc b/src/core/npkit/npkit.cc index 30fc35c7..84457abf 100644 --- a/src/core/npkit/npkit.cc +++ b/src/core/npkit/npkit.cc @@ -103,10 +103,10 @@ static int GetGpuClockRateInKhz() { else return 25000; #else - cudaDeviceProp dev_prop; + int clockRate; MSCCLPP_CUDATHROW(cudaGetDevice(&dev_id)); - MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&dev_prop, dev_id)); - return dev_prop.clockRate; + MSCCLPP_CUDATHROW(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, dev_id)); + return clockRate; #endif } #endif diff --git a/src/core/proxy.cc b/src/core/proxy.cc index 2a980505..de5b90fc 100644 --- a/src/core/proxy.cc +++ b/src/core/proxy.cc @@ -59,11 +59,15 @@ MSCCLPP_API_CPP Proxy::~Proxy() { MSCCLPP_API_CPP void Proxy::start(bool blocking) { pimpl_->running.store(true, std::memory_order_release); pimpl_->service = std::thread([this] { + // threadInit() is responsible for setting up the runtime context for the thread. + // The default implementation sets the CUDA device and NUMA affinity to match the main thread (see Proxy ctor). + // It should be called before any CUDA API calls to avoid resource allocation on unwanted GPUs. + pimpl_->threadInit(); + // never capture in a proxy thread auto mode = cudaStreamCaptureModeRelaxed; MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode)); - pimpl_->threadInit(); pimpl_->threadStarted.store(true, std::memory_order_release); ProxyHandler handler = this->pimpl_->handler; diff --git a/src/core/semaphore.cc b/src/core/semaphore.cc index 57ac5979..49a3791b 100644 --- a/src/core/semaphore.cc +++ b/src/core/semaphore.cc @@ -8,7 +8,7 @@ #include "atomic.hpp" #include "connection.hpp" #include "context.hpp" -#include "debug.h" +#include "logger.hpp" #include "registered_memory.hpp" #include "serialization.hpp" @@ -49,12 +49,12 @@ SemaphoreStub::Impl::Impl(const Connection& connection) : connection_(connection token_ = std::make_shared(0); } else if (localDevice.type == DeviceType::GPU) { if (localDevice.id < 0) { - throw Error("Local GPU ID is not provided", ErrorCode::InvalidUsage); + THROW(CONN, Error, ErrorCode::InvalidUsage, "Local GPU ID is not provided"); } CudaDeviceGuard deviceGuard(localDevice.id); token_ = gpuCallocToken(connection_.context()); } else { - throw Error("Unsupported local device type", ErrorCode::InvalidUsage); + THROW(CONN, Error, ErrorCode::InvalidUsage, "Unsupported local device type"); } idMemory_ = std::move(connection_.context()->registerMemory(token_.get(), sizeof(uint64_t), connection_.transport())); } @@ -79,7 +79,7 @@ MSCCLPP_API_CPP SemaphoreStub SemaphoreStub::deserialize(const std::vector RegisteredMemory idMemory(std::make_shared(data.begin(), memEnd)); auto it = detail::deserialize(memEnd, device); if (it != data.end()) { - throw Error("SemaphoreStub deserialize failed", ErrorCode::InvalidUsage); + THROW(CONN, Error, ErrorCode::InvalidUsage, "SemaphoreStub deserialize failed"); } return SemaphoreStub(std::make_shared(std::move(idMemory), device)); } @@ -120,13 +120,35 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& sema expectedInboundToken_(detail::gpuCallocUnique()), outboundToken_(std::make_unique()) { if (connection().localDevice().type != DeviceType::GPU) { - throw Error("Local endpoint device type of Host2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage); 
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2DeviceSemaphore should be GPU");
   }
+  auto connImpl = BaseConnection::getImpl(connection());
+  if (connImpl->isSignalForwarding()) {
+    // Signal forwarding (HostNoAtomic): the receiver's recv thread polls the recv CQ for
+    // WRITE_WITH_IMM completions, then forwards the token to inboundToken_ via GDRCopy.
+    CudaDeviceGuard deviceGuard(connection().localDevice().id);
+#if defined(MSCCLPP_USE_ROCM)
+    inboundToken_ = detail::gpuCallocUncachedShared<uint64_t>();
+#else
+    inboundToken_ = detail::gpuCallocShared<uint64_t>();
+#endif
+    connImpl->startSignalForwarding(inboundToken_);
+  }
+  // When isSignalForwarding() is false (atomic mode), inboundToken_ stays null
+  // and the GPU polls the SemaphoreStub token directly (the NIC atomic target).
 }
 
 MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator, const Connection& connection)
     : Host2DeviceSemaphore(buildSemaphoreFromConnection(communicator, connection)) {}
 
+MSCCLPP_API_CPP Host2DeviceSemaphore::~Host2DeviceSemaphore() {
+  if (inboundToken_) {
+    // Clear the connection's signal forwarding destination (and GdrMap)
+    // before inboundToken_ is freed, to avoid use-after-free on the pinned GPU memory.
+    BaseConnection::getImpl(connection())->stopSignalForwarding();
+  }
+}
+
 MSCCLPP_API_CPP Connection& Host2DeviceSemaphore::connection() { return semaphore_.connection(); }
 
 MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {
@@ -135,7 +157,11 @@ MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {
 
 MSCCLPP_API_CPP Host2DeviceSemaphore::DeviceHandle Host2DeviceSemaphore::deviceHandle() const {
   Host2DeviceSemaphore::DeviceHandle device;
-  device.inboundToken = reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
+  // If inboundToken_ is allocated (signal forwarding mode), the GPU polls it.
+  // Otherwise (atomic mode), the GPU polls the SemaphoreStub token directly,
+  // which is the same address targeted by the NIC's atomic operation.
+  device.inboundToken =
+      inboundToken_ ? inboundToken_.get() : reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
   device.expectedInboundToken = expectedInboundToken_.get();
   return device;
 }
@@ -145,10 +171,18 @@ MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(const Semaphore& semaphor
       expectedInboundToken_(std::make_unique<uint64_t>()),
       outboundToken_(std::make_unique<uint64_t>()) {
   if (connection().transport() == Transport::CudaIpc) {
-    throw Error("Host2HostSemaphore cannot be used with CudaIpc transport", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Host2HostSemaphore cannot be used with CudaIpc transport");
   }
   if (connection().localDevice().type != DeviceType::CPU) {
-    throw Error("Local endpoint device type of Host2HostSemaphore should be CPU", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2HostSemaphore should be CPU");
+  }
+  auto connImpl = BaseConnection::getImpl(connection());
+  if (connImpl->isSignalForwarding()) {
+    // Signal forwarding mode: tell the recv thread where to write the incoming token.
+    // Non-owning shared_ptr: Host2HostSemaphore outlives the connection, so the memory stays valid.
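Both semaphore flavors converge on the same poll loop: deviceHandle() above resolves signal-forwarding and atomic modes to a single inboundToken pointer, and the waiter simply spins on it. A hedged device-side sketch of that loop, assuming mscclpp's atomicLoad/memoryOrderAcquire device helpers:

```cpp
#include <mscclpp/atomic_device.hpp>

// Sketch only: the poller does not care whether the token was stored by the CPU
// recv thread (signal forwarding) or by the NIC's RDMA atomic; it spins on
// whichever address deviceHandle() selected as inboundToken.
__device__ void waitToken(uint64_t* inboundToken, uint64_t* expectedInboundToken) {
  uint64_t expected = ++(*expectedInboundToken);
  while (mscclpp::atomicLoad(inboundToken, mscclpp::memoryOrderAcquire) < expected) {
    // spin until the host recv thread or the NIC publishes the token
  }
}
```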
+ auto token = + std::shared_ptr(reinterpret_cast(semaphore_.localMemory().data()), [](uint64_t*) {}); + connImpl->startSignalForwarding(std::move(token)); } } @@ -174,17 +208,16 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) { while (atomicLoad(reinterpret_cast(semaphore_.localMemory().data()), memoryOrderAcquire) < (*expectedInboundToken_)) { if (maxSpinCount >= 0 && spinCount++ == maxSpinCount) { - throw Error("Host2HostSemaphore::wait timed out", ErrorCode::Timeout); + THROW(CONN, Error, ErrorCode::Timeout, "Host2HostSemaphore::wait timed out"); } } } MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::MemoryDevice2DeviceSemaphore(const Semaphore& semaphore) - : semaphore_(semaphore), - expectedInboundToken_(detail::gpuCallocUnique()), - outboundToken_(detail::gpuCallocUnique()) { + : semaphore_(semaphore), expectedInboundToken_(detail::gpuCallocUnique()) { if (connection().localDevice().type != DeviceType::GPU) { - throw Error("Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage); + THROW(CONN, Error, ErrorCode::InvalidUsage, + "Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU"); } } @@ -199,7 +232,6 @@ MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::DeviceHandle MemoryDevice2DeviceSe device.remoteInboundToken = reinterpret_cast(semaphore_.remoteMemory().data()); device.inboundToken = reinterpret_cast(semaphore_.localMemory().data()); device.expectedInboundToken = expectedInboundToken_.get(); - device.outboundToken = outboundToken_.get(); return device; }; diff --git a/src/ext/collectives/algorithm_collection_builder.cc b/src/ext/collectives/algorithm_collection_builder.cc index 2e7b2920..7ba97a3c 100644 --- a/src/ext/collectives/algorithm_collection_builder.cc +++ b/src/ext/collectives/algorithm_collection_builder.cc @@ -8,12 +8,15 @@ #include "allgather/allgather_fullmesh_2.hpp" #include "allreduce/allreduce_allpair_packet.hpp" #include "allreduce/allreduce_fullmesh.hpp" -#include "allreduce/allreduce_nvls.hpp" +#include "allreduce/allreduce_nvls_block_pipeline.hpp" #include "allreduce/allreduce_nvls_packet.hpp" -#include "allreduce/allreduce_nvls_with_copy.hpp" -#include "allreduce/allreduce_nvls_with_copy_2.hpp" +#include "allreduce/allreduce_nvls_warp_pipeline.hpp" +#include "allreduce/allreduce_nvls_zero_copy.hpp" #include "allreduce/allreduce_packet.hpp" #include "alltoallv/alltoallv_fullmesh.hpp" +#include "allreduce/allreduce_rsag.hpp" +#include "allreduce/allreduce_rsag_pipeline.hpp" +#include "allreduce/allreduce_rsag_zero_copy.hpp" #include "logger.hpp" namespace mscclpp { @@ -50,8 +53,9 @@ AlgorithmCollection AlgorithmCollectionBuilder::build() { void AlgorithmCollectionBuilder::reset() { gAlgorithmCollectionBuilder_.reset(); } AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultAlgorithms(uintptr_t scratchBuffer, - size_t scratchBufferSize, int rank) { - auto nativeCollection = buildDefaultNativeAlgorithms(scratchBuffer, scratchBufferSize); + size_t scratchBufferSize, uintptr_t flagBuffer, + size_t flagBufferSize, int rank) { + auto nativeCollection = buildDefaultNativeAlgorithms(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize); auto dslCollection = buildDefaultDslAlgorithms(rank); nativeCollection.extend(dslCollection); nativeCollection.setSelectors(algoSelector_, fallbackAlgoSelector_); @@ -59,24 +63,39 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultAlgorithms(uintptr_t } AlgorithmCollection 
AlgorithmCollectionBuilder::buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, - size_t scratchBufferSize) { + size_t scratchBufferSize, + uintptr_t flagBuffer, + size_t flagBufferSize) { AlgorithmCollection collection; - auto allreduceAllpairPkt = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + auto allreduceAllpairPkt = + std::make_shared(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build(); collection.registerAlgorithm(allreduceAllpairPkt->collective(), allreduceAllpairPkt->name(), allreduceAllpairPkt); - auto allreduceNvlsPacket = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + auto allreduceNvlsPacket = + std::make_shared(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build(); collection.registerAlgorithm(allreduceNvlsPacket->collective(), allreduceNvlsPacket->name(), allreduceNvlsPacket); - auto allreduceNvlsWithCopy = std::make_shared(scratchBuffer, scratchBufferSize)->build(); - collection.registerAlgorithm(allreduceNvlsWithCopy->collective(), allreduceNvlsWithCopy->name(), - allreduceNvlsWithCopy); - auto allreduceNvlsWithCopy2 = std::make_shared(scratchBuffer, scratchBufferSize)->build(); - collection.registerAlgorithm(allreduceNvlsWithCopy2->collective(), allreduceNvlsWithCopy2->name(), - allreduceNvlsWithCopy2); - auto allreducePkt = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + auto allreduceNvlsWarpPipeline = + std::make_shared(scratchBuffer, scratchBufferSize)->build(); + collection.registerAlgorithm(allreduceNvlsWarpPipeline->collective(), allreduceNvlsWarpPipeline->name(), + allreduceNvlsWarpPipeline); + auto allreduceNvlsBlockPipeline = + std::make_shared(scratchBuffer, scratchBufferSize)->build(); + collection.registerAlgorithm(allreduceNvlsBlockPipeline->collective(), allreduceNvlsBlockPipeline->name(), + allreduceNvlsBlockPipeline); + auto allreducePkt = + std::make_shared(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build(); collection.registerAlgorithm(allreducePkt->collective(), allreducePkt->name(), allreducePkt); auto allreduceNvls = std::make_shared()->build(); collection.registerAlgorithm(allreduceNvls->collective(), allreduceNvls->name(), allreduceNvls); auto allreduceFullmesh = std::make_shared(scratchBuffer, scratchBufferSize)->build(); collection.registerAlgorithm(allreduceFullmesh->collective(), allreduceFullmesh->name(), allreduceFullmesh); + auto allreduceRsag = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + collection.registerAlgorithm(allreduceRsag->collective(), allreduceRsag->name(), allreduceRsag); + auto allreduceRsagPipeline = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + collection.registerAlgorithm(allreduceRsagPipeline->collective(), allreduceRsagPipeline->name(), + allreduceRsagPipeline); + auto allreduceRsagZeroCopy = std::make_shared()->build(); + collection.registerAlgorithm(allreduceRsagZeroCopy->collective(), allreduceRsagZeroCopy->name(), + allreduceRsagZeroCopy); auto allgatherFullmesh = std::make_shared(scratchBuffer, scratchBufferSize)->build(); collection.registerAlgorithm(allgatherFullmesh->collective(), allgatherFullmesh->name(), allgatherFullmesh); @@ -110,13 +129,13 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultDslAlgorithms(int ra return oss.str(); }; - std::string planDir = env()->executionPlanDir; + auto planDir = std::filesystem::path(env()->cacheDir) / "default"; if (!std::filesystem::exists(planDir)) { - INFO(ALGO, "Plan directory does not exist: ", planDir); + INFO(ALGO, 
"Default plan directory does not exist: ", planDir); return collection; } for (const auto& config : defaultAlgoConfigs) { - std::string planPath = planDir + "/" + config.filename; + auto planPath = planDir / config.filename; INFO(ALGO, "Loading plan: ", planPath); if (!std::filesystem::exists(planPath)) { INFO(ALGO, "Plan file does not exist: ", planPath); diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index 34f8d4e7..fb51a342 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -170,7 +170,7 @@ std::shared_ptr AllgatherFullmesh::initAllgatherContext(std::shared_ptr AllgatherFullmesh::build() { [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, [[maybe_unused]] DataType dtype, [[maybe_unused]] ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) -> CommResult { + const std::unordered_map& extras, + [[maybe_unused]] DataType accumDtype) -> CommResult { return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllgatherContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllgatherContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu index 84f14ca2..9d169d68 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu @@ -107,12 +107,6 @@ __global__ void __launch_bounds__(1024, 1) } } -AllgatherFullmesh2::AllgatherFullmesh2() : disableChannelCache_(false) { - if (mscclpp::env()->disableChannelCache) { - disableChannelCache_ = true; - } -} - void AllgatherFullmesh2::initialize(std::shared_ptr comm) { this->conns_ = setupConnections(comm); this->memorySemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_); @@ -174,7 +168,7 @@ std::shared_ptr AllgatherFullmesh2::initAllgatherContext(std::shared_ptrbootstrap()->getNranks(); recvBasePtr = (CUdeviceptr)output; @@ -197,10 +191,11 @@ std::shared_ptr AllgatherFullmesh2::initAllgatherContext(std::shared_ptr AllgatherFullmesh2::build() { [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, [[maybe_unused]] mscclpp::DataType dtype, [[maybe_unused]] ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) -> mscclpp::CommResult { + const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) -> mscclpp::CommResult { return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype) { 
return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, - mscclpp::DataType dtype) { return self->generateAllgatherContextKey(input, output, inputSize, dtype); }); + mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateAllgatherContextKey(input, output, inputSize, dtype, symmetricMemory); + }); } } // namespace collective diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index a4881093..17bcfc33 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -2,6 +2,7 @@ // Licensed under the MIT license. #include +#include #include "allreduce/allreduce_allpair_packet.hpp" #include "allreduce/common.hpp" @@ -11,29 +12,18 @@ namespace mscclpp { namespace collective { -__device__ uint32_t deviceFlag = 1; - -template +template __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, - int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags) { + int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags, + uint32_t flagSize) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int); const int nPeers = nRanksPerNode - 1; - uint32_t flag = 0; - if constexpr (flagPerBlock) { - flag = ((uint32_t*)flags)[blockIdx.x]; - } else { - flag = deviceFlag; - __syncthreads(); - if (threadIdx.x == 0) { - ((LL8Packet*)flags)[blockIdx.x].write(0, flag); - } - } - + uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t scratchBaseOffset = (flag % numScratchBuff) ? (scratchBufferSize / numScratchBuff) : 0; size_t channelScratchOffset = scratchBaseOffset; @@ -54,30 +44,23 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand // step 2: Reduce Data for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nelems; idx += blockDim.x * gridDim.x) { uint32_t data = src[idx]; + using AccRaw = std::conditional_t, uint32_t, + mscclpp::VectorType>; + AccRaw acc = mscclpp::upcastVector(data); for (int index = 0; index < nPeers; index++) { const int remoteRank = index < rank ? 
index : index + 1; LL8Packet* dstPkt = (LL8Packet*)scratchBuff + remoteRank * nelems; uint32_t val = dstPkt[idx].read(flag, -1); - data = cal_vectors(val, data); + acc = mscclpp::calVectorAccum(acc, val); } - dst[idx] = data; + dst[idx] = mscclpp::downcastVector(acc); } - if constexpr (flagPerBlock) { - __syncthreads(); - if (threadIdx.x == 0) { - ((uint32_t*)flags)[blockIdx.x] = flag + 1; - } - } else { - // Make sure all threadblocks have finished reading before incrementing the flag - if (blockIdx.x == 0 && threadIdx.x < gridDim.x) { - ((LL8Packet*)flags)[threadIdx.x].read(flag, -1); - } - if (blockIdx.x == 0) { - __syncthreads(); - } - if (threadIdx.x == 0 && blockIdx.x == 0) { - deviceFlag++; - } + __syncthreads(); + if (threadIdx.x == 0) { + ((uint32_t*)flags)[blockIdx.x] = flag + 1; + } + if (blockIdx.x == 0 && threadIdx.x >= gridDim.x && threadIdx.x < flagSize / sizeof(uint32_t)) { + ((uint32_t*)flags)[threadIdx.x] = flag + 1; } } @@ -88,24 +71,23 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int return {(worldSize - 1) * 4, 512}; } -template +template struct AllpairAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, - cudaStream_t stream, void* flags, uint32_t numScratchBuff, int nBlocks = 0, + cudaStream_t stream, void* flags, uint32_t flagSize, uint32_t numScratchBuff, int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; const size_t nelems = inputSize / sizeof(T); - if (nBlocks == 7 || nBlocks == 28) { - allreduceAllPairs<<>>( - (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, numScratchBuff, flags); - return cudaGetLastError(); + // Round nBlocks to multiple of nPeers so every block maps to a valid peer. 
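+    // e.g. with worldSize == 8 (7 peers), a request for 32 blocks rounds down to 28;
+    // requests below nPeers would round to 0, but allreduceKernelFunc rejects those first.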
+ const int nPeers = worldSize - 1; + if (nPeers > 0) { + nBlocks = (nBlocks / nPeers) * nPeers; } - allreduceAllPairs<<>>( + allreduceAllPairs<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, numScratchBuff, flags); + nRanksPerNode, worldSize, nelems, numScratchBuff, flags, flagSize); return cudaGetLastError(); } }; @@ -116,44 +98,38 @@ void AllreduceAllpairPacket::initialize(std::shared_ptr comm) { RegisteredMemory scratchMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc); registeredMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMemory); registeredMemories_.push_back(scratchMemory); - flags_ = detail::gpuCallocShared(maxBlockNum_); - std::vector flags(28, 1); - flags7_ = detail::gpuCallocShared(7); - flags28_ = detail::gpuCallocShared(28); - gpuMemcpy(flags7_.get(), flags.data(), 7, cudaMemcpyHostToDevice); - gpuMemcpy(flags28_.get(), flags.data(), 28, cudaMemcpyHostToDevice); } CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + DataType accumDtype) { auto algoCtx = std::static_pointer_cast(ctx); std::pair blockAndThreadNum{nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->workSize); } - void* flags = this->flags_.get(); - if (blockAndThreadNum.first == 7) { - flags = this->flags7_.get(); - } else if (blockAndThreadNum.first == 28) { - flags = this->flags28_.get(); + // nBlocks must be at least nPeers for allpair — each block maps to one peer. 
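+  // This guard also ensures the adapter's round-down of nBlocks to a multiple of
+  // nPeers can never produce a zero-block launch.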
+ const int nPeers = algoCtx->nRanksPerNode - 1; + if (nPeers > 0 && blockAndThreadNum.first < nPeers) { + return CommResult::CommInvalidArgument; } - size_t sendBytes; CUdeviceptr sendBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); size_t channelInOffset = (char*)input - (char*)sendBasePtr; - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast(dtype)); return CommResult::CommInvalidArgument; } - cudaError_t error = allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, - nullptr, nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, - algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, flags, - this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); + cudaError_t error = + allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, nullptr, + nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerNode, + algoCtx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, + this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -185,7 +161,7 @@ std::shared_ptr AllreduceAllpairPacket::initAllreduceContext(std::shared_p return ctx; } -AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType) { +AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType, bool) { size_t sendBytes; CUdeviceptr sendBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); @@ -193,21 +169,23 @@ AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* } std::shared_ptr AllreduceAllpairPacket::build() { - auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); + auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_, + flagBuffer_, flagBufferSize_); return std::make_shared( "default_allreduce_allpair_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, 
inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index e8cd93bb..24d2a31c 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -9,7 +9,7 @@ namespace mscclpp { namespace collective { -template +template __global__ void __launch_bounds__(512, 1) allreduceFullmesh(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* memoryOutChannels, size_t channelOutDataOffset, int rank, @@ -26,6 +26,10 @@ __global__ void __launch_bounds__(512, 1) int4* scratch4 = reinterpret_cast((char*)scratch); int4* resultBuff4 = reinterpret_cast(resultBuff); + // AccumVec: wider vector for mixed-precision accumulation. When AccumT==T, this is just int4 (no-op). + constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T); + using AccumVec = std::conditional_t, int4, mscclpp::VectorType>; + // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4` constexpr size_t unitNInt4 = 512; const size_t maxNInt4PerBlock = @@ -81,12 +85,14 @@ __global__ void __launch_bounds__(512, 1) __syncthreads(); for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) { - int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + int4 rawData = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + AccumVec acc = mscclpp::upcastVector(rawData); for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; - data = cal_vectors(val, data); + acc = mscclpp::calVectorAccum(acc, val); } + int4 data = mscclpp::downcastVector(acc); resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4), @@ -121,12 +127,14 @@ __global__ void __launch_bounds__(512, 1) __syncthreads(); for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) { - int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + int4 rawData = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + AccumVec acc = mscclpp::upcastVector(rawData); for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { const int remoteRank = (peerIdx < rank) ? 
peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; - data = cal_vectors(val, data); + acc = mscclpp::calVectorAccum(acc, val); } + int4 data = mscclpp::downcastVector(acc); resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4), @@ -144,17 +152,18 @@ __global__ void __launch_bounds__(512, 1) } } -template +template struct AllreduceAllconnectAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels, DeviceHandle*, DeviceHandle*, size_t, size_t channelOutDataOffset, size_t, int rank, int nRanksPerNode, int worldSize, - size_t inputSize, cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { + size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, + int nThreadsPerBlock) { using ChannelType = DeviceHandle; size_t nelems = inputSize / sizeof(T); if (nBlocks == 0) nBlocks = 35; if (nThreadsPerBlock == 0) nThreadsPerBlock = 512; - allreduceFullmesh<<>>( + allreduceFullmesh<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, (ChannelType*)memoryOutChannels, channelOutDataOffset, rank, nRanksPerNode, worldSize, nelems); return cudaGetLastError(); @@ -173,15 +182,18 @@ void AllreduceFullmesh::initialize(std::shared_ptr comm) { localScratchMemory_ = std::move(localMemory); } -CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, - size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, - int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { +CommResult AllreduceFullmesh::allreduceKernelFunc( + const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, DataType dtype, + ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); size_t recvBytes; CUdeviceptr recvBasePtr; - MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output)); - size_t channelOutOffset = (char*)output - (char*)recvBasePtr; + size_t channelOutOffset = 0; + if (symmetricMemory_) { + MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output)); + channelOutOffset = (char*)output - (char*)recvBasePtr; + } std::shared_ptr> inputChannelHandles; if (this->memoryChannelsMap_.find(input) != this->memoryChannelsMap_.end()) { inputChannelHandles = this->memoryChannelsMap_[input].second; @@ -194,17 +206,24 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ct } inputChannelHandles = this->memoryChannelsMap_[input].second; - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", static_cast(op), static_cast(dtype)); return CommResult::CommInvalidArgument; } std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; + if (numBlocksAndThreads.first > 64) { + WARN("AllreduceFullmesh: number of blocks exceeds maximum supported blocks, which is 64"); + return mscclpp::CommResult::CommInvalidArgument; + } + if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) { + numBlocksAndThreads = {35, 512}; + } cudaError_t error = 
allreduce(input, this->scratchBuffer_, output, inputChannelHandles.get(), ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, - stream, nullptr, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN("AllreduceAllconnect failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -212,19 +231,21 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ct return CommResult::CommSuccess; } -AlgorithmCtxKey AllreduceFullmesh::generateAllreduceContextKey(const void*, void* output, size_t, DataType) { +AlgorithmCtxKey AllreduceFullmesh::generateAllreduceContextKey(const void*, void* output, size_t, DataType, + bool symmetricMemory) { static int tag = 0; size_t recvBytes; CUdeviceptr recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output)); - if (env()->disableChannelCache) { + symmetricMemory_ = symmetricMemory; + if (!symmetricMemory_) { return AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, tag++}; } return AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, 0}; } std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptr comm, const void*, - void* output, size_t, DataType) { + void* output, size_t size, DataType) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); @@ -236,6 +257,10 @@ std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptrregisterMemory((void*)recvBasePtr, recvBytes, Transport::CudaIpc); ctx->registeredMemories = setupRemoteMemories(comm, ctx->rank, localMemory); ctx->memoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory, @@ -251,15 +276,17 @@ std::shared_ptr AllreduceFullmesh::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) -> CommResult { + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git a/src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu similarity index 72% rename from src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu rename to src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index 
2a109c6f..2d71cd63 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -3,7 +3,7 @@ #include -#include "allreduce/allreduce_nvls_with_copy_2.hpp" +#include "allreduce/allreduce_nvls_block_pipeline.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" #include "debug.h" @@ -15,11 +15,12 @@ __device__ DeviceSemaphore deviceSemaphore[NUM_SEMAPHORES]; template __global__ void __launch_bounds__(1024, 1) - allreduceNvlsWithCopy2([[maybe_unused]] const void* src, [[maybe_unused]] void* scratch, [[maybe_unused]] void* dst, - [[maybe_unused]] DeviceHandle* memoryChannels, - [[maybe_unused]] DeviceHandle* switchChannels, [[maybe_unused]] size_t size, - [[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank, - [[maybe_unused]] int nRanksPerNode) { + allreduceNvlsBlockPipeline([[maybe_unused]] const void* src, [[maybe_unused]] void* scratch, + [[maybe_unused]] void* dst, + [[maybe_unused]] DeviceHandle* memoryChannels, + [[maybe_unused]] DeviceHandle* switchChannels, + [[maybe_unused]] size_t size, [[maybe_unused]] size_t scratchBufferSize, + [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerNode) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 constexpr int alignment = 16; int nPeers = nRanksPerNode - 1; @@ -145,28 +146,35 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template -struct NvlsWithCopy2Adapter { +template +struct NvlsBlockPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize, - cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { -#if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS - if constexpr (std::is_same_v || std::is_same_v) { + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) + if constexpr (std::is_same_v) { + return cudaErrorNotSupported; + } else if constexpr (std::is_same_v) { + // fp8_e4m3b15 is a software-only type with no hardware NVLS support. 
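+      // cudaErrorNotSupported propagates out of the adapter; allreduceKernelFunc then
+      // reports it to the caller as CommUnhandledCudaError.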
return cudaErrorNotSupported; } else +#if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS + if constexpr (std::is_same_v || std::is_same_v) { + return cudaErrorNotSupported; + } else #endif - { - using ChannelType = DeviceHandle; - allreduceNvlsWithCopy2 - <<>>(input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, - inputSize, scratchBufferSize, rank, nRanksPerNode); - return cudaGetLastError(); - } + { + using ChannelType = DeviceHandle; + allreduceNvlsBlockPipeline + <<>>(input, scratch, output, (ChannelType*)memoryChannels, + nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode); + return cudaGetLastError(); + } } }; -void AllreduceNvlsWithCopy2::initialize(std::shared_ptr comm) { +void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = 8; int nBaseChannels = 64; this->conns_ = setupConnections(comm); @@ -176,14 +184,16 @@ void AllreduceNvlsWithCopy2::initialize(std::shared_ptr comm) { // setup base memory channels this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels); this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_); + this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); } -CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, - void* output, size_t inputSize, DataType dtype, ReduceOp op, - cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { +CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, + void* output, size_t inputSize, DataType dtype, ReduceOp op, + cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + const std::unordered_map& extras, + DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -194,52 +204,53 @@ CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptrscratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, + ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { - WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(error)); + WARN("AllreduceNvlsBlockPipeline failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } return CommResult::CommSuccess; } -AlgorithmCtxKey AllreduceNvlsWithCopy2::generateAllreduceContextKey(const void*, void*, size_t, DataType) { +AlgorithmCtxKey AllreduceNvlsBlockPipeline::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0}; } -std::shared_ptr AllreduceNvlsWithCopy2::initAllreduceContext(std::shared_ptr comm, const void*, - void*, size_t, DataType) { +std::shared_ptr AllreduceNvlsBlockPipeline::initAllreduceContext(std::shared_ptr comm, const void*, + void*, size_t, DataType) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = 
comm->bootstrap()->getNranks(); ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); // setup channels - ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); ctx->switchChannels = - setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); + setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels); return ctx; } -std::shared_ptr AllreduceNvlsWithCopy2::build() { - auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); +std::shared_ptr AllreduceNvlsBlockPipeline::build() { + auto self = + std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); return std::make_shared( - "default_allreduce_nvls_with_copy2", "allreduce", + "default_allreduce_nvls_block_pipeline", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index aafe7566..a616485e 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -1,33 +1,25 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
+#include + #include "allreduce/allreduce_nvls_packet.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" -#include "debug.h" +#include "logger.hpp" namespace mscclpp { namespace collective { -__device__ uint32_t deviceFlag = 1; -template +template __global__ void __launch_bounds__(1024, 1) allreduceNvlsPacket([[maybe_unused]] const T* input, [[maybe_unused]] T* scratch, [[maybe_unused]] T* output, [[maybe_unused]] mscclpp::DeviceHandle* multicast, [[maybe_unused]] size_t nelems, [[maybe_unused]] size_t scratchBufferSize, - [[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] void* flags) { + [[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] void* flags, + [[maybe_unused]] uint32_t flagBufferSize) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 - uint32_t flag = 0; - if constexpr (flagPerBlock) { - flag = ((uint32_t*)flags)[blockIdx.x]; - } else { - flag = deviceFlag; - __syncthreads(); - if (threadIdx.x == 0) { - ((LL8Packet*)flags)[blockIdx.x].write(0, flag); - } - } - + uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t scratchBaseOffset = (flag % 2) ? scratchBufferSize / 2 : 0; uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t nPktPerRank = nelems / worldSize / (sizeof(mscclpp::LL8Packet::Payload) / sizeof(T)); @@ -41,31 +33,24 @@ __global__ void __launch_bounds__(1024, 1) mscclpp::SwitchChannelDeviceHandle::multimemStore(*(mscclpp::f32x2*)(&pkt), multiPkt + i); } for (uint32_t i = tid; i < nPktPerRank * worldSize; i += blockDim.x * gridDim.x) { - uint data = src[i]; + // When T == AccumT, stay with raw uint to avoid type mismatch in identity path. + using AccRaw = + std::conditional_t, uint, mscclpp::VectorType>; + AccRaw acc = mscclpp::upcastVector(src[i]); for (int peer = 0; peer < worldSize; peer++) { - if (peer == rank) { - continue; - } + if (peer == rank) continue; uint val = scratchPkt[peer * worldSize * nPktPerRank + i].read(flag); - data = cal_vectors(data, val); + acc = mscclpp::calVectorAccum(acc, val); } - dst[i] = data; + dst[i] = mscclpp::downcastVector(acc); } - if constexpr (flagPerBlock) { - __syncthreads(); - if (threadIdx.x == 0) { - ((uint32_t*)flags)[blockIdx.x] = flag + 1; - } - } else { - if (blockIdx.x == 0 && threadIdx.x < gridDim.x) { - ((LL8Packet*)flags)[threadIdx.x].read(flag, -1); - } - if (blockIdx.x == 0) { - __syncthreads(); - } - if (threadIdx.x == 0 && blockIdx.x == 0) { - deviceFlag++; - } + __syncthreads(); + if (threadIdx.x == 0) { + ((uint32_t*)flags)[blockIdx.x] = flag + 1; + } + // update other flags in-case using different number of blocks in next launch + if (blockIdx.x == 0 && (threadIdx.x > gridDim.x - 1) && (threadIdx.x < flagBufferSize / sizeof(uint32_t))) { + ((uint32_t*)flags)[threadIdx.x] = flag + 1; } #endif } @@ -80,35 +65,27 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize) { return {blockNum, threadNum}; } -template +template struct AllreduceNvlsPacketAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void*, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int, int worldSize, size_t inputSize, cudaStream_t stream, - void* flags, uint32_t, int nBlocks, int nThreadsPerBlock) { - if (nBlocks == 4 || nBlocks == 8) { - allreduceNvlsPacket - <<>>((const T*)input, (T*)scratch, (T*)output, nvlsChannels, - inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags); - } else { - allreduceNvlsPacket - <<>>((const T*)input, (T*)scratch, (T*)output, 
nvlsChannels, - inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags); - } + void* flags, uint32_t flagBufferSize, uint32_t, int nBlocks, int nThreadsPerBlock) { + allreduceNvlsPacket<<>>( + (const T*)input, (T*)scratch, (T*)output, nvlsChannels, inputSize / sizeof(T), scratchBufferSize, rank, + worldSize, flags, flagBufferSize); return cudaGetLastError(); } }; -void AllreduceNvlsPacket::initialize(std::shared_ptr) { - std::vector flags(8, 1); - flags_ = detail::gpuCallocShared(16); - flags4_ = detail::gpuCallocShared(4); - flags8_ = detail::gpuCallocShared(8); - gpuMemcpy(flags4_.get(), flags.data(), 4, cudaMemcpyHostToDevice); - gpuMemcpy(flags8_.get(), flags.data(), 8, cudaMemcpyHostToDevice); +void AllreduceNvlsPacket::initialize(std::shared_ptr comm) { + int nSwitchChannels = 1; + this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels); + this->switchChannels_ = + setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels); } -AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType) { +AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0}; } @@ -120,10 +97,7 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); // setup channels - int nSwitchChannels = 1; - ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels); - ctx->switchChannels = - setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels); + ctx->switchChannels = this->switchChannels_; ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels); return ctx; } @@ -131,54 +105,53 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + mscclpp::DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize); } if (blockAndThreadNum.first > maxBlockNum_) { - WARN("Block number %d exceeds the maximum limit %d", blockAndThreadNum.first, maxBlockNum_); + WARN(ALGO, "Block number ", blockAndThreadNum.first, " exceeds the maximum limit ", maxBlockNum_); return CommResult::CommInvalidArgument; } - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { - WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); + WARN(ALGO, "Unsupported operation or data type for allreduce, dtype=", static_cast(dtype)); return CommResult::CommInvalidArgument; } - void* flags = this->flags_.get(); - if (blockAndThreadNum.first == 4) { - flags = this->flags4_.get(); - } else if (blockAndThreadNum.first == 8) { - flags = this->flags8_.get(); - } cudaError_t error = allreduce(input, this->scratchBuffer_, output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, - 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, 
ctx->workSize, inputSize, stream, flags, - 0, blockAndThreadNum.first, blockAndThreadNum.second); + 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, + (void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { - WARN("AllreduceNvlsPacket failed with error: %s", cudaGetErrorString(error)); + WARN(ALGO, "AllreduceNvlsPacket failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } return CommResult::CommSuccess; } std::shared_ptr AllreduceNvlsPacket::build() { - auto self = std::make_shared((uintptr_t)scratchBuffer_, scratchBufferSize_); + auto self = std::make_shared((uintptr_t)scratchBuffer_, scratchBufferSize_, flagBuffer_, + flagBufferSize_); return std::make_shared( "default_allreduce_nvls_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + mscclpp::DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, - mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); }); + mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); } } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu similarity index 71% rename from src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu rename to src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index 113fdb7c..3bb054da 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -3,7 +3,7 @@ #include -#include "allreduce/allreduce_nvls_with_copy.hpp" +#include "allreduce/allreduce_nvls_warp_pipeline.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" #include "debug.h" @@ -13,11 +13,12 @@ namespace collective { template __global__ void __launch_bounds__(1024, 1) - allreduce10([[maybe_unused]] const void* src, [[maybe_unused]] void* scratch, [[maybe_unused]] void* dst, - [[maybe_unused]] DeviceHandle* memoryChannels, - [[maybe_unused]] DeviceHandle* multicast, [[maybe_unused]] size_t size, - [[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank, - [[maybe_unused]] int nRanksPerNode) { + allreduceNvlsWarpPipeline([[maybe_unused]] const void* src, [[maybe_unused]] void* scratch, + [[maybe_unused]] void* dst, + [[maybe_unused]] DeviceHandle* memoryChannels, + [[maybe_unused]] DeviceHandle* multicast, [[maybe_unused]] size_t size, + [[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank, + [[maybe_unused]] int nRanksPerNode) { 
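+  // The body below compiles only for SM90 and newer; on older architectures the kernel
+  // is empty, which is why every parameter is marked [[maybe_unused]].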
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 constexpr int alignment = 16; int nPeers = nRanksPerNode - 1; @@ -108,28 +109,35 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template -struct NvlsWithCopyAdapter { +template +struct NvlsWarpPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize, - cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { -#if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS - if constexpr (std::is_same_v || std::is_same_v) { + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) + if constexpr (std::is_same_v) { + return cudaErrorNotSupported; + } else if constexpr (std::is_same_v) { + // fp8_e4m3b15 is a software-only type with no hardware NVLS support. return cudaErrorNotSupported; } else +#if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS + if constexpr (std::is_same_v || std::is_same_v) { + return cudaErrorNotSupported; + } else #endif - { - using ChannelType = DeviceHandle; - allreduce10<<>>(input, scratch, output, (ChannelType*)memoryChannels, - nvlsChannels, inputSize, scratchBufferSize, rank, - nRanksPerNode); - return cudaGetLastError(); - } + { + using ChannelType = DeviceHandle; + allreduceNvlsWarpPipeline + <<>>(input, scratch, output, (ChannelType*)memoryChannels, + nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode); + return cudaGetLastError(); + } } }; -void AllreduceNvlsWithCopy::initialize(std::shared_ptr comm) { +void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = 8; int nBaseChannels = 64; this->conns_ = setupConnections(comm); @@ -139,14 +147,15 @@ void AllreduceNvlsWithCopy::initialize(std::shared_ptr comm) { // setup base memory channels this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels); this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_); + this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); } -CommResult AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, - void* output, size_t inputSize, DataType dtype, ReduceOp op, - cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { +CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc( + const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, DataType dtype, + ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -157,51 +166,52 @@ CommResult AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptrscratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->nRanksPerNode, 
ctx->workSize, inputSize, stream, nullptr, 0, + ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { - WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(error)); + WARN("AllreduceNvlsWarpPipeline failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } return CommResult::CommSuccess; } -AlgorithmCtxKey AllreduceNvlsWithCopy::generateAllreduceContextKey(const void*, void*, size_t, DataType) { +AlgorithmCtxKey AllreduceNvlsWarpPipeline::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0}; } -std::shared_ptr AllreduceNvlsWithCopy::initAllreduceContext(std::shared_ptr comm, const void*, - void*, size_t, DataType) { +std::shared_ptr AllreduceNvlsWarpPipeline::initAllreduceContext(std::shared_ptr comm, const void*, + void*, size_t, DataType) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); // setup channels - ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); ctx->switchChannels = - setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); + setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels); return ctx; } -std::shared_ptr AllreduceNvlsWithCopy::build() { - auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); +std::shared_ptr AllreduceNvlsWarpPipeline::build() { + auto self = + std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); return std::make_shared( - "default_allreduce_nvls_with_copy", "allreduce", + "default_allreduce_nvls_warp_pipeline", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_nvls.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu similarity index 69% rename from src/ext/collectives/allreduce/allreduce_nvls.cu rename to 
src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 98f884f8..e7f2028f 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -3,7 +3,7 @@ #include -#include "allreduce/allreduce_nvls.hpp" +#include "allreduce/allreduce_nvls_zero_copy.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" #include "debug.h" @@ -11,6 +11,8 @@ namespace mscclpp { namespace collective { +constexpr int MAX_NBLOCKS = 32; + template __global__ void __launch_bounds__(1024, 1) allreduceNvls([[maybe_unused]] mscclpp::DeviceHandle* memoryChannels, @@ -23,9 +25,18 @@ __global__ void __launch_bounds__(1024, 1) int nBlocks = gridDim.x; int bid = blockIdx.x; size_t sizePerRank = size / nRanksPerNode; - size_t sizePerBlock = sizePerRank / nBlocks; + const size_t minAlign = 16; + // Align sizePerBlock to 16 bytes to ensure aligned vector access in handleMultiLoadReduceStore + size_t sizePerBlock = (sizePerRank + nBlocks - 1) / nBlocks; + sizePerBlock = (sizePerBlock + minAlign - 1) / minAlign * minAlign; + size_t rankOffset = sizePerRank * rank; size_t blockOffset = sizePerBlock * bid + rankOffset; + size_t curBlockSize = 0; + if (sizePerBlock * bid < sizePerRank) { + curBlockSize = min(sizePerBlock, sizePerRank - sizePerBlock * bid); + } + mscclpp::DeviceHandle* multicastPtr = multicast + bid; mscclpp::DeviceHandle* multicastOutPtr = multicastOut + bid; @@ -44,8 +55,10 @@ __global__ void __launch_bounds__(1024, 1) __syncthreads(); T* src = (T*)multicastPtr->mcPtr; T* dst = (T*)multicastOutPtr->mcPtr; - handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, sizePerBlock, - threadIdx.x, blockDim.x); + if (curBlockSize > 0) { + handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, curBlockSize, + threadIdx.x, blockDim.x); + } __syncthreads(); if (threadIdx.x < nPeers) { channels[threadIdx.x].relaxedSignal(); @@ -54,15 +67,22 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template +template struct NvlsAdapter { static cudaError_t call(const void*, void*, void*, void* memoryChannels, void*, mscclpp::DeviceHandle* nvlsChannels, mscclpp::DeviceHandle* nvlsOutChannels, size_t channelInOffset, size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t inputSize, - cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) + if constexpr (std::is_same_v) { + return cudaErrorNotSupported; + } else if constexpr (std::is_same_v) { + // fp8_e4m3b15 is a software-only type with no hardware NVLS support. 
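+      // The hardware FP8 types (e4m3/e5m2) are handled by the arch-gated branch below,
+      // which rejects them unless building arch-specifically for SM100-class targets.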
+ return cudaErrorNotSupported; + } else #if (!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000) - if constexpr (std::is_same_v || std::is_same_v) { + if constexpr (std::is_same_v || std::is_same_v) { return cudaErrorNotSupported; } else #endif @@ -77,7 +97,12 @@ struct NvlsAdapter { }; void AllreduceNvls::initialize(std::shared_ptr comm) { - nSwitchChannels_ = 8; + int device; + MSCCLPP_CUDATHROW(cudaGetDevice(&device)); + cudaDeviceProp deviceProp; + MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device)); + computeCapabilityMajor_ = deviceProp.major; + nSwitchChannels_ = 32; this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = @@ -85,14 +110,21 @@ void AllreduceNvls::initialize(std::shared_ptr comm) { // setup base memory channels this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nSwitchChannels_); this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_); + this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); + this->nvlsOutConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); } CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + [[maybe_unused]] const std::unordered_map& extras, + mscclpp::DataType accumDtype) { + if (!symmetricMemory_) { + WARN("AllreduceNvls requires symmetric memory for now."); + return CommResult::CommInvalidArgument; + } auto ctx = std::static_pointer_cast(ctx_void); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -110,12 +142,22 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) { - numBlocksAndThreads = {ctx->nRanksPerNode, 1024}; + numBlocksAndThreads = {::min(ctx->nRanksPerNode, MAX_NBLOCKS), 1024}; + // For GB200 devices with MNNVLS (Multi-Node NVLink Sharp), scale the number of blocks inversely with + // the number of GPUs. Empirically, 32 blocks works well for 4 GPUs and 16 for 8 GPUs, which + // follows the formula 128 / nGPUs, clamped to [1, MAX_NBLOCKS]. 
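+    // e.g. 4 GPUs -> 32 blocks, 8 GPUs -> 16, 16 GPUs -> 8; very large worldSize
+    // values clamp to 1 so at least one block is always launched.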
+ if (computeCapabilityMajor_ == 10) { + numBlocksAndThreads.first = ::max(1, ::min(128 / ctx->workSize, MAX_NBLOCKS)); + } + } + if (numBlocksAndThreads.first > MAX_NBLOCKS) { + WARN("Number of blocks exceeds maximum supported value of %d", MAX_NBLOCKS); + return CommResult::CommInvalidArgument; } cudaError_t error = allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr, nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, - inputSize, stream, nullptr, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -124,7 +166,8 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void* input, void* output, size_t, - mscclpp::DataType) { + mscclpp::DataType, bool symmetricMemory) { + symmetricMemory_ = symmetricMemory; size_t sendBytes, recvBytes; CUdeviceptr sendBasePtr, recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); @@ -145,13 +188,11 @@ std::shared_ptr AllreduceNvls::initAllreduceContext(std::shared_ptrnvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); - ctx->switchChannels = setupNvlsChannels(ctx->nvlsConnections, (void*)sendBasePtr, sendBytes, nSwitchChannels_); + ctx->switchChannels = setupNvlsChannels(this->nvlsConnections_, (void*)sendBasePtr, sendBytes, nSwitchChannels_); if (input != output) { - auto nvlsOutConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); + auto nvlsOutConnections = this->nvlsOutConnections_; std::vector outChannels = - setupNvlsChannels(nvlsOutConnections, (void*)recvBasePtr, recvBytes, nSwitchChannels_); - ctx->nvlsConnections.insert(ctx->nvlsConnections.end(), nvlsOutConnections.begin(), nvlsOutConnections.end()); + setupNvlsChannels(this->nvlsOutConnections_, (void*)recvBasePtr, recvBytes, nSwitchChannels_); ctx->switchChannels.insert(ctx->switchChannels.end(), outChannels.begin(), outChannels.end()); } @@ -162,19 +203,22 @@ std::shared_ptr AllreduceNvls::initAllreduceContext(std::shared_ptr AllreduceNvls::build() { auto self = std::make_shared(); return std::make_shared( - "default_allreduce_nvls", "allreduce", + "default_allreduce_nvls_zero_copy", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + mscclpp::DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, - mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); }); + 
mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); } } // namespace collective } // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index d150c717..e2d8ef73 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -2,22 +2,21 @@ // Licensed under the MIT License. #include +#include #include "allreduce/allreduce_packet.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" -#include "debug.h" +#include "logger.hpp" namespace mscclpp { namespace collective { -__device__ uint32_t deviceFlag = 1; - -template +template __global__ void __launch_bounds__(1024, 1) allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, - size_t nelems, void* flags, uint32_t numScratchBuff + size_t nelems, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff #if defined(ENABLE_NPKIT) , NpKitEventCollectContext* npKitEventCollectContexts, uint64_t* cpuTimestamp) { @@ -60,11 +59,7 @@ __global__ void __launch_bounds__(1024, 1) const int nPeers = nRanksPerNode - 1; const size_t nPkts = nelems / 2; - uint32_t flag = deviceFlag; - __syncthreads(); - if (threadIdx.x == 0) { - ((LL8Packet*)flags)[blockIdx.x].write(0, flag); - } + uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t channelScratchOffset = (flag % numScratchBuff) ? scratchBufferSize / numScratchBuff : 0; int nelemsPerRank = nelems / worldSize; @@ -98,12 +93,21 @@ __global__ void __launch_bounds__(1024, 1) // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint2 data = src[idx]; - for (int index = 0; index < nPeers; index++) { - const int remoteRank = index < rank ? index : index + 1; - mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; - uint2 val = dstPkt[idx].read(flag); - data.x = cal_vectors(val.x, data.x); - data.y = cal_vectors(val.y, data.y); + { + // When T == AccumT, stay with raw uint32_t to avoid type mismatch in identity path. + using AccRaw = std::conditional_t, uint32_t, + mscclpp::VectorType>; + AccRaw accX = mscclpp::upcastVector(data.x); + AccRaw accY = mscclpp::upcastVector(data.y); + for (int index = 0; index < nPeers; index++) { + const int remoteRank = index < rank ? 
index : index + 1; + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; + uint2 val = dstPkt[idx].read(flag); + accX = mscclpp::calVectorAccum(accX, val.x); + accY = mscclpp::calVectorAccum(accY, val.y); + } + data.x = mscclpp::downcastVector(accX); + data.y = mscclpp::downcastVector(accY); } dst[idx].x = data.x; @@ -129,15 +133,12 @@ __global__ void __launch_bounds__(1024, 1) result[idx].y = data.y; } - // Make sure all threadblocks have finished reading before incrementing the flag - if (blockIdx.x == 0 && threadIdx.x < gridDim.x) { - ((LL8Packet*)flags)[threadIdx.x].read(flag, -1); + __syncthreads(); + if (threadIdx.x == 0) { + ((uint32_t*)flags)[blockIdx.x] = flag + 1; } - if (blockIdx.x == 0) { - __syncthreads(); - } - if (threadIdx.x == 0 && blockIdx.x == 0) { - deviceFlag++; + if (blockIdx.x == 0 && (threadIdx.x > gridDim.x - 1) && (threadIdx.x < flagBufferSize / sizeof(uint32_t))) { + ((uint32_t*)flags)[threadIdx.x] = flag + 1; } #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && \ defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT) @@ -151,25 +152,27 @@ #endif } -template +template struct PacketAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, - cudaStream_t stream, void* flags, uint32_t numScratchBuff, int nBlocks = 0, - int nThreadsPerBlock = 0) { + cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff, + int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; const size_t nelems = inputSize / sizeof(T); + // Round the number of blocks down to a multiple of (worldSize - 1) + nBlocks = nBlocks / (worldSize - 1) * (worldSize - 1); #if defined(ENABLE_NPKIT) size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS; - allreducePacket<<>>( + allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, flags, numScratchBuff, NpKit::GetGpuEventCollectContexts(), + nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else - allreducePacket<<>>( + allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, flags, numScratchBuff); + nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff); #endif return cudaGetLastError(); } @@ -193,18 +196,22 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int } } -#if defined(__FP8_TYPES_EXIST__) // FP8-specific tuning for 32KB-256KB range - if (dtype == DataType::FP8_E4M3 || dtype == DataType::FP8_E5M2) { - if (inputSize < (64 << 10)) { - nThreadsPerBlock = 64; - } else if (inputSize >= (64 << 10) && inputSize <= (128 << 10)) { - nThreadsPerBlock = 128; - } else if (inputSize >= (128 << 10) && inputSize <= (256 << 10)) { - nThreadsPerBlock = 256; + { + bool isFp8 = dtype == DataType::FLOAT8_E4M3B15; +#if defined(__FP8_TYPES_EXIST__) + isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2; +#endif + if (isFp8) { + if (inputSize < (64 << 10)) { + nThreadsPerBlock = 64; + } else if (inputSize >= (64 << 10) && inputSize <= 
(128 << 10)) { + nThreadsPerBlock = 128; + } else if (inputSize >= (128 << 10) && inputSize <= (256 << 10)) { + nThreadsPerBlock = 256; + } } } -#endif #endif return {nBlocks, nThreadsPerBlock}; } @@ -215,13 +222,13 @@ void AllreducePacket::initialize(std::shared_ptr comm) { RegisteredMemory scratchMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc); registeredMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMemory); registeredMemories_.push_back(scratchMemory); - flags_ = detail::gpuCallocShared(maxBlockNum_); } CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, [[maybe_unused]] DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { @@ -233,18 +240,19 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); size_t channelInOffset = (char*)input - (char*)sendBasePtr; - void* flags = this->flags_.get(); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { - WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast(dtype)); + WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), + ", dtype=", static_cast(dtype)); return CommResult::CommInvalidArgument; } cudaError_t error = allreduce(input, this->scratchBuffer_, output, ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, nullptr, channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, - stream, flags, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); + stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, + blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { - WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error)); + WARN(ALGO, "AllreducePacket failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } return CommResult::CommSuccess; @@ -274,7 +282,7 @@ std::shared_ptr AllreducePacket::initAllreduceContext(std::shared_ptr AllreducePacket::build() { - auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); + auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_, + flagBuffer_, flagBufferSize_); return std::make_shared( "default_allreduce_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return 
self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu new file mode 100644 index 00000000..db471b93 --- /dev/null +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -0,0 +1,230 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "allreduce/allreduce_rsag.hpp" +#include "allreduce/common.hpp" +#include "collective_utils.hpp" +#include "logger.hpp" + +namespace mscclpp { +namespace collective { + +// Allreduce using the Reduce-Scatter + All-Gather (RSAG) pattern. +// +// This algorithm performs allreduce in three phases over intra-node peers +// connected via CudaIpc memory channels: +// +// 1. Scatter: Each rank copies its input data into a scratch buffer, then +// signals peers and waits for all peers to do the same. +// +// 2. Reduce-Scatter: Each rank reduces its assigned chunk by reading the +// corresponding chunks from all peers' scratch buffers (via remote memory +// handles) and applying the reduction op. The reduced result is written +// back to both the local result buffer and peers' scratch buffers. +// +// 3. All-Gather: After a second signal/wait barrier, each rank copies the +// reduced chunks produced by other ranks from the scratch buffer into its +// result buffer, completing the allreduce. +// +// Data is processed in int4-sized (16-byte) units for coalesced memory access, +// with special handling for any remainder elements at the tail. 
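Editor's aside: the int4 chunking described above can be sanity-checked on the host with the same arithmetic the allreduceRsAg kernel uses. This sketch is not part of the change and the sample sizes are hypothetical; it only mirrors the kernel's formulas.

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical example: 8 ranks, 1003 half-precision elements (sizeof(T) == 2).
  const uint32_t nRanksPerNode = 8, sizeofT = 2, nelems = 1003;
  const uint32_t nelemsPerInt4 = 16 / sizeofT;  // sizeof(int4) == 16 bytes -> 8 elements per int4
  // Round the per-rank share up to whole int4 units, then scale back to a total.
  const uint32_t alignedNelems =
      ((nelems + nRanksPerNode - 1) / nRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 *
      nelemsPerInt4 * nRanksPerNode;                               // 1024
  const uint32_t nelemsPerRank = alignedNelems / nRanksPerNode;    // 128
  const uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4;     // 16
  const uint32_t lastInt4Index = nelems / nelemsPerInt4;           // 125: the int4 holding the tail
  const uint32_t remainder = nelems % nelemsPerInt4;               // 3 tail elements copied one by one
  std::printf("perRank=%u int4PerRank=%u lastIdx=%u rem=%u\n",
              nelemsPerRank, nInt4PerRank, lastInt4Index, remainder);
  return 0;
}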
+template +__global__ void __launch_bounds__(1024, 1) + allreduceRsAg(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, + DeviceHandle* switchChannels, void* remoteMemories, int rank, int nRanksPerNode, + int worldSize, size_t nelems) { + int blockId = blockIdx.x; + uint32_t nPeers = nRanksPerNode - 1; + + assert((uintptr_t)buff % sizeof(int4) == 0); + assert((uintptr_t)resultBuff % sizeof(int4) == 0); + + constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T); + uint32_t alignedNelems = ((nelems + nRanksPerNode - 1) / nRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 * + nelemsPerInt4 * nRanksPerNode; + uint32_t nelemsPerRank = alignedNelems / nRanksPerNode; + uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4; + uint32_t lastInt4Index = nelems / nelemsPerInt4; + uint32_t remainder = nelems % nelemsPerInt4; + + int4* scratch4 = reinterpret_cast((char*)scratch); + int4* resultBuff4 = reinterpret_cast((char*)resultBuff); + int4* buff4 = reinterpret_cast((char*)buff); + DeviceHandle* memoryChannelsLocal = memoryChannels + blockId * nPeers; + + uint32_t nInt4PerBlock = nInt4PerRank / gridDim.x; + uint32_t remainderForBlock = nInt4PerRank % gridDim.x; + uint32_t offset4 = blockId * nInt4PerBlock; + if (blockId == (int)(gridDim.x - 1)) { + nInt4PerBlock += remainderForBlock; + } + if (nInt4PerBlock == 0) return; + uint32_t nInt4ForCopy = nInt4PerBlock * nRanksPerNode; + + for (uint32_t idx = threadIdx.x; idx < nInt4ForCopy; idx += blockDim.x) { + int rankIdx = idx / nInt4PerBlock; + uint32_t offsetIdx = rankIdx * nInt4PerRank + offset4 + (idx % nInt4PerBlock); + if (offsetIdx > lastInt4Index) continue; + if (offsetIdx == lastInt4Index && remainder != 0) { + for (uint32_t i = 0; i < remainder; i++) { + ((T*)&scratch4[offsetIdx])[i] = ((T*)&buff4[offsetIdx])[i]; + } + continue; + } + scratch4[offsetIdx] = buff4[offsetIdx]; + } + __syncthreads(); + if (threadIdx.x < nPeers) { + memoryChannelsLocal[threadIdx.x].signal(); + memoryChannelsLocal[threadIdx.x].wait(); + } + __syncthreads(); + for (uint32_t idx = threadIdx.x; idx < nInt4PerBlock; idx += blockDim.x) { + uint32_t offset = idx + offset4 + rank * nInt4PerRank; + if (offset > lastInt4Index) continue; + int4 tmp = scratch4[offset]; + for (uint32_t i = 0; i < nPeers; i++) { + int rankIdx = (rank + i + 1) % nRanksPerNode; + int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; + int4 data = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); + tmp = calVector(data, tmp); + } + for (uint32_t i = 0; i < nPeers; i++) { + int rankIdx = (rank + i + 1) % nRanksPerNode; + int peerIdx = rankIdx < rank ? 
rankIdx : rankIdx - 1; + mscclpp::write(((void**)remoteMemories)[peerIdx], offset, tmp); + } + if (offset == lastInt4Index && remainder != 0) { + for (uint32_t i = 0; i < remainder; i++) { + ((T*)&resultBuff4[offset])[i] = ((T*)&tmp)[i]; + } + continue; + } + resultBuff4[offset] = tmp; + } + __syncthreads(); + if (threadIdx.x < nPeers) { + memoryChannelsLocal[threadIdx.x].signal(); + memoryChannelsLocal[threadIdx.x].wait(); + } + __syncthreads(); + for (uint32_t idx = threadIdx.x; idx < nInt4ForCopy; idx += blockDim.x) { + int rankIdx = idx / nInt4PerBlock; + if (rankIdx == rank) continue; + uint32_t offsetIdx = rankIdx * nInt4PerRank + offset4 + (idx % nInt4PerBlock); + if (offsetIdx > lastInt4Index) continue; + if (offsetIdx == lastInt4Index && remainder != 0) { + for (uint32_t i = 0; i < remainder; i++) { + ((T*)&resultBuff4[offsetIdx])[i] = ((T*)&scratch4[offsetIdx])[i]; + } + continue; + } + resultBuff4[offsetIdx] = scratch4[offsetIdx]; + } +} + +template +struct AllreduceRsAgAdapter { + static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, + DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, + size_t, int rank, int nRanksPerNode, int worldSize, size_t inputSize, cudaStream_t stream, + void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + using ChannelType = DeviceHandle; + size_t nelems = inputSize / sizeof(T); + if (nBlocks == 0 || nThreadsPerBlock == 0) { + nThreadsPerBlock = 1024; + nBlocks = 64; + } + allreduceRsAg<<>>( + (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, + nRanksPerNode, worldSize, nelems); + return cudaGetLastError(); + } +}; + +void AllreduceRsAg::initialize(std::shared_ptr comm) { + this->conns_ = setupConnections(comm); + nChannelsPerConnection_ = 64; + comm_ = comm; + // setup semaphores + this->scratchSemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_); + RegisteredMemory localMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc); + this->remoteScratchMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), localMemory); + localScratchMemory_ = std::move(localMemory); + + this->baseChannels_ = setupBaseMemoryChannels(this->conns_, this->scratchSemaphores_, nChannelsPerConnection_); + this->baseMemoryChannelHandles_ = setupBaseMemoryChannelDeviceHandles(baseChannels_); + std::vector remoteMemoryHandles; + for (const auto& remoteMemory : this->remoteScratchMemories_) { + remoteMemoryHandles.push_back(remoteMemory.data()); + } + this->remoteMemoryHandles_ = detail::gpuCallocShared(remoteMemoryHandles.size()); + gpuMemcpy(this->remoteMemoryHandles_.get(), remoteMemoryHandles.data(), remoteMemoryHandles.size(), + cudaMemcpyHostToDevice); +} + +CommResult AllreduceRsAg::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, + size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, + int nBlocks, int nThreadsPerBlock, + const std::unordered_map&, DataType accumDtype) { + auto algoCtx = std::static_pointer_cast(ctx); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); + if (!allreduce) { + WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), + ", dtype=", static_cast(dtype)); + return CommResult::CommInvalidArgument; + } + if (inputSize > this->scratchBufferSize_) { + WARN(ALGO, "Input size ", inputSize, " exceeds scratch buffer size ", this->scratchBufferSize_); + return 
CommResult::CommInvalidArgument; + } + std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; + cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(), + this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, 0, algoCtx->rank, + algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, nullptr, 0, 0, + numBlocksAndThreads.first, numBlocksAndThreads.second); + if (error != cudaSuccess) { + WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); + return CommResult::CommUnhandledCudaError; + } + return CommResult::CommSuccess; +} + +AlgorithmCtxKey AllreduceRsAg::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { + return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0}; +} + +std::shared_ptr AllreduceRsAg::initAllreduceContext(std::shared_ptr comm, const void*, void*, + size_t, DataType) { + auto ctx = std::make_shared(); + ctx->rank = comm->bootstrap()->getRank(); + ctx->workSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + + ctx->memorySemaphores = this->scratchSemaphores_; + ctx->registeredMemories = this->remoteScratchMemories_; + return ctx; +} + +std::shared_ptr AllreduceRsAg::build() { + auto self = std::make_shared((uintptr_t)scratchBuffer_, scratchBufferSize_); + return std::make_shared( + "default_allreduce_rsag", "allreduce", + [self](std::shared_ptr comm) { self->initialize(comm); }, + [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, + [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { + return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, + extras, accumDtype); + }, + [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, + [[maybe_unused]] size_t outputSize, + DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); +} +} // namespace collective +} // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu new file mode 100644 index 00000000..eabe3dc5 --- /dev/null +++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu @@ -0,0 +1,337 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#include "allreduce/allreduce_rsag_pipeline.hpp" +#include "allreduce/common.hpp" +#include "collective_utils.hpp" +#include "logger.hpp" + +namespace mscclpp { +namespace collective { +constexpr int MAX_NBLOCKS_FOR_PUT = 32; +constexpr int MAX_NBLOCKS_FOR_RECV = 32; +constexpr int MAX_NBLOCKS_FOR_REDUCE = 64; +constexpr int REDUCE_COPY_RATIO = 2; +__device__ DeviceSemaphore semaphoreForSend[MAX_NBLOCKS_FOR_REDUCE]; +__device__ DeviceSemaphore semaphoreForRecv[MAX_NBLOCKS_FOR_REDUCE]; +__device__ DeviceSemaphore semaphoreForReduce[MAX_NBLOCKS_FOR_REDUCE]; + +// TODO: move it to a common header file +template +__device__ __forceinline__ int4 loadVec(const T* buff, size_t i, size_t nelems) { + constexpr size_t ElemsPerInt4 = sizeof(int4) / sizeof(T); + size_t offset = i * ElemsPerInt4; + if (offset + ElemsPerInt4 <= nelems) { + return reinterpret_cast(buff)[i]; + } else { + union { + int4 i; + T t[ElemsPerInt4]; + } vec; + vec.i = make_int4(0, 0, 0, 0); + for (size_t j = 0; j < ElemsPerInt4 && offset + j < nelems; ++j) { + vec.t[j] = buff[offset + j]; + } + return vec.i; + } +} + +template +__device__ __forceinline__ void storeVec(T* buff, size_t i, int4 val, size_t nelems) { + constexpr size_t ElemsPerInt4 = sizeof(int4) / sizeof(T); + size_t offset = i * ElemsPerInt4; + if (offset + ElemsPerInt4 <= nelems) { + reinterpret_cast(buff)[i] = val; + } else { + union { + int4 i; + T t[ElemsPerInt4]; + } vec; + vec.i = val; + for (size_t j = 0; j < ElemsPerInt4 && offset + j < nelems; ++j) { + buff[offset + j] = vec.t[j]; + } + } +} + +// Pipelined Reduce-Scatter + All-Gather (RSAG) allreduce. +// +// This is a pipelined variant of the basic RSAG allreduce that overlaps +// communication and computation by splitting the data into chunks processed +// across multiple iterations. Three groups of thread blocks run concurrently +// with different roles, synchronized via device semaphores: +// +// PUT blocks — Read local input chunks and write them into peers' scratch +// buffers via remote memory handles (CudaIpc). +// +// REDUCE blocks — After a signal/wait barrier confirming PUT completion, +// reduce the local chunk with data received from all peers +// in the scratch buffer. Write the reduced result to both +// the local output and peers' scratch (for the AG phase). +// +// RECV blocks — After a signal/wait barrier confirming REDUCE completion, +// copy other ranks' reduced chunks from scratch into the +// local result buffer, completing the all-gather. +// +// Pipelining is achieved by using a circular scratch buffer (pipelineDepth +// stages). PUT blocks wait on a semaphore before reusing a scratch slot, +// allowing the next iteration's PUT to overlap with the current iteration's +// REDUCE and RECV. Each REDUCE block handles a subset of the PUT block's +// data (controlled by REDUCE_COPY_RATIO), enabling finer-grained overlap. +// +// Data is processed in int4-sized (16-byte) units with vectorized load/store +// helpers that handle tail elements. 
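Editor's aside: the PUT/REDUCE/RECV handshake described above is a credit-based pipeline. A minimal host-side analogue (C++20; not part of the change; std::counting_semaphore stands in for the DeviceSemaphore arrays, and the REDUCE_COPY_RATIO fan-out is simplified to a 1:1:1 chain) shows how pipelineDepth credits let PUT run at most that many iterations ahead of RECV:

#include <cstdio>
#include <semaphore>
#include <thread>

int main() {
  constexpr int pipelineDepth = 2, nIters = 6;
  // send starts with pipelineDepth credits, mirroring semaphoreForSend[bid].set(pipelineDepth).
  std::counting_semaphore<pipelineDepth> send(pipelineDepth);
  std::counting_semaphore<nIters> reduce(0), recv(0);

  std::thread put([&] {
    for (int i = 0; i < nIters; ++i) {
      send.acquire();    // wait for a free scratch slot
      std::printf("PUT    iter %d -> slot %d\n", i, i % pipelineDepth);
      reduce.release();  // slot filled; REDUCE may start
    }
  });
  std::thread red([&] {
    for (int i = 0; i < nIters; ++i) {
      reduce.acquire();  // wait for PUT of iter i
      std::printf("REDUCE iter %d\n", i);
      recv.release();    // reduced data ready; RECV may copy it out
    }
  });
  std::thread rcv([&] {
    for (int i = 0; i < nIters; ++i) {
      recv.acquire();    // wait for REDUCE of iter i
      std::printf("RECV   iter %d\n", i);
      send.release();    // slot drained; PUT may reuse it
    }
  });
  put.join(); red.join(); rcv.join();
  return 0;
}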
+ +template +__global__ void __launch_bounds__(1024, 1) + allreduceRsAgPipeline(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, + DeviceHandle* switchChannels, void* remoteMemories, int rank, + int nRanksPerNode, int worldSize, size_t nelems, size_t scratchSize, uint32_t nblocksForPut, + uint32_t nblocksForReduce, uint32_t nblocksForRecv) { + uint32_t bid = blockIdx.x; + constexpr uint32_t nStepsPerIter = 4; + uint32_t nInt4 = (nelems * sizeof(T) + sizeof(int4) - 1) / sizeof(int4); + uint32_t nInt4PerIter = nblocksForReduce * blockDim.x * nStepsPerIter; + const uint32_t chunkSize = nInt4PerIter * worldSize; + uint32_t nIters = (nInt4 + chunkSize - 1) / chunkSize; + uint32_t nPeers = nRanksPerNode - 1; + int4* scratch4 = reinterpret_cast((char*)scratch); + const uint32_t scratchIterStride = 2 * chunkSize; // one slot for the reduce-scatter data, one for the all-gather data + const uint32_t pipelineDepth = scratchSize / sizeof(int4) / scratchIterStride; + assert(pipelineDepth >= 1); + + if (bid < nblocksForPut) { + if (threadIdx.x == 0) { + semaphoreForSend[bid].set(pipelineDepth); + } + for (uint32_t iter = 0; iter < nIters; iter++) { + if (threadIdx.x == 0) { + semaphoreForSend[bid].acquire(); + } + __syncthreads(); + uint32_t threadIdInPut = bid * blockDim.x + threadIdx.x; + for (uint32_t peer = 0; peer < nPeers; peer++) { + int remoteRankId = (rank + peer + 1) % nRanksPerNode; + int peerId = remoteRankId < rank ? remoteRankId : remoteRankId - 1; + // Read chunk[remoteRankId] from local buff, write to peer's scratch[rank] (sender's slot) + uint32_t srcOffset = iter * chunkSize + remoteRankId * nInt4PerIter; + uint32_t dstOffset = (iter % pipelineDepth) * scratchIterStride + rank * nInt4PerIter; + int4 tmp[nStepsPerIter * REDUCE_COPY_RATIO]; +#pragma unroll + for (uint32_t step = 0; step < nStepsPerIter * REDUCE_COPY_RATIO; step++) { + uint32_t offset = srcOffset + threadIdInPut + step * blockDim.x * nblocksForPut; + tmp[step] = loadVec(buff, offset, nelems); + } +#pragma unroll + for (uint32_t step = 0; step < nStepsPerIter * REDUCE_COPY_RATIO; step++) { + uint32_t offset = dstOffset + threadIdInPut + step * blockDim.x * nblocksForPut; + mscclpp::write(((void**)remoteMemories)[peerId], offset, tmp[step]); + } + } + __syncthreads(); + if (threadIdx.x < REDUCE_COPY_RATIO) { + semaphoreForReduce[bid * REDUCE_COPY_RATIO + threadIdx.x].release(); + } + } + } else if (bid < nblocksForPut + nblocksForReduce) { + uint32_t bidInReduce = bid - nblocksForPut; + DeviceHandle* localMemoryChannels = memoryChannels + bidInReduce * nPeers; + // Map REDUCE blocks to PUT blocks: REDUCE blocks 0,1 handle PUT block 0's data + uint32_t putBlockId = bidInReduce / REDUCE_COPY_RATIO; + uint32_t subBlockId = bidInReduce % REDUCE_COPY_RATIO; + for (uint32_t iter = 0; iter < nIters; iter++) { + if (threadIdx.x == 0) { + semaphoreForReduce[bidInReduce].acquire(); + } + uint32_t baseOffset = (iter % pipelineDepth) * scratchIterStride; + uint32_t baseSrcOffset = iter * chunkSize; + + // Use same thread mapping as PUT: putBlockId * blockDim.x + threadIdx.x + uint32_t threadIdInPut = putBlockId * blockDim.x + threadIdx.x; + __syncthreads(); + if (threadIdx.x < nPeers) { + localMemoryChannels[threadIdx.x].signal(); + localMemoryChannels[threadIdx.x].wait(); + } + __syncthreads(); +#pragma unroll nStepsPerIter + for (uint32_t step = 0; step < nStepsPerIter; step++) { + // Map to PUT's step pattern: each REDUCE block handles nStepsPerIter steps + // subBlockId determines which subset of the REDUCE_COPY_RATIO * nStepsPerIter steps this block handles + uint32_t 
putStep = subBlockId * nStepsPerIter + step; + uint32_t myChunkOffset = + baseSrcOffset + rank * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; + int4 tmp = loadVec(buff, myChunkOffset, nelems); + // Add data from each peer's slot in scratch (peer sent their chunk[rank] to our scratch[peer]) + for (uint32_t peer = 0; peer < nPeers; peer++) { + int remoteRankId = (rank + peer + 1) % nRanksPerNode; + uint32_t peerSlotOffset = + baseOffset + remoteRankId * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; + int4 data = scratch4[peerSlotOffset]; + tmp = calVector(data, tmp); + } + storeVec(resultBuff, myChunkOffset, tmp, nelems); + // Broadcast reduced result to all peers' scratch at SCATTER_AG_OFFSET + rank * nInt4PerIter + uint32_t dstOffset = + baseOffset + chunkSize + rank * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; + for (uint32_t i = 0; i < nPeers; i++) { + int peerIdx = (rank + i + 1) % nRanksPerNode; + int index = peerIdx < rank ? peerIdx : peerIdx - 1; + mscclpp::write(((void**)remoteMemories)[index], dstOffset, tmp); + } + } + __syncthreads(); + if (threadIdx.x == 0) { + semaphoreForRecv[bidInReduce].release(); + } + } + } else if (bid < nblocksForPut + nblocksForReduce + nblocksForRecv) { + uint32_t bidInRecv = bid - nblocksForPut - nblocksForReduce; + DeviceHandle* localMemoryChannels = memoryChannels + (nblocksForReduce + bidInRecv) * nPeers; + for (uint32_t iter = 0; iter < nIters; iter++) { + if (threadIdx.x < REDUCE_COPY_RATIO) { + semaphoreForRecv[bidInRecv * REDUCE_COPY_RATIO + threadIdx.x].acquire(); + } + uint32_t baseOffset = scratchIterStride * (iter % pipelineDepth); + uint32_t baseDstOffset = chunkSize * iter; + int threadIdInRecv = bidInRecv * blockDim.x + threadIdx.x; + __syncthreads(); + if (threadIdx.x < nPeers) { + localMemoryChannels[threadIdx.x].signal(); + localMemoryChannels[threadIdx.x].wait(); + } + __syncthreads(); + // Copy other ranks' reduced chunks from scratch to result + for (uint32_t peer = 0; peer < nPeers; peer++) { + int remoteRankId = (rank + peer + 1) % nRanksPerNode; + for (uint32_t step = 0; step < nStepsPerIter * REDUCE_COPY_RATIO; step++) { + uint32_t offset = baseOffset + chunkSize + remoteRankId * nInt4PerIter + threadIdInRecv + + step * blockDim.x * nblocksForRecv; + uint32_t dstOffset = + baseDstOffset + remoteRankId * nInt4PerIter + threadIdInRecv + step * blockDim.x * nblocksForRecv; + storeVec(resultBuff, dstOffset, scratch4[offset], nelems); + } + } + __syncthreads(); + if (threadIdx.x == 0) { + semaphoreForSend[bidInRecv].release(); + } + } + } +} + +template +struct AllreduceRsAgPipelineAdapter { + static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, + DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, + size_t scratchSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + using ChannelType = DeviceHandle; + size_t nelems = inputSize / sizeof(T); + uint32_t nblocksForPut = MAX_NBLOCKS_FOR_PUT; + uint32_t nblocksForReduce = MAX_NBLOCKS_FOR_REDUCE; + uint32_t nblocksForRecv = MAX_NBLOCKS_FOR_RECV; + int maxNblocks = nblocksForPut + nblocksForReduce + nblocksForRecv; + if (nBlocks == 0 || nThreadsPerBlock == 0) { + nThreadsPerBlock = 1024; + nBlocks = maxNblocks; + } else { + nBlocks = nBlocks / (REDUCE_COPY_RATIO + 2) * (REDUCE_COPY_RATIO + 2); + if (nBlocks > maxNblocks) { + WARN(ALGO, "The number 
of blocks is too large for the allreduce pipeline algorithm, reducing it to ", + maxNblocks); + nBlocks = maxNblocks; + } + nblocksForPut = nBlocks / (REDUCE_COPY_RATIO + 2); + nblocksForReduce = nblocksForPut * REDUCE_COPY_RATIO; + nblocksForRecv = nblocksForPut; + } + allreduceRsAgPipeline<<>>( + (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, + nRanksPerNode, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv); + return cudaGetLastError(); + } +}; + +void AllreduceRsAgPipeline::initialize(std::shared_ptr comm) { + this->conns_ = setupConnections(comm); + nChannelsPerConnection_ = MAX_NBLOCKS_FOR_REDUCE + MAX_NBLOCKS_FOR_RECV; + comm_ = comm; + // setup semaphores + this->scratchSemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_); + RegisteredMemory localMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc); + this->remoteScratchMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), localMemory); + localScratchMemory_ = std::move(localMemory); + + this->baseChannels_ = setupBaseMemoryChannels(this->conns_, this->scratchSemaphores_, nChannelsPerConnection_); + this->baseMemoryChannelHandles_ = setupBaseMemoryChannelDeviceHandles(baseChannels_); + std::vector remoteMemoryHandles; + for (const auto& remoteMemory : this->remoteScratchMemories_) { + remoteMemoryHandles.push_back(remoteMemory.data()); + } + this->remoteMemoryHandles_ = detail::gpuCallocShared(remoteMemoryHandles.size()); + gpuMemcpy(this->remoteMemoryHandles_.get(), remoteMemoryHandles.data(), remoteMemoryHandles.size(), + cudaMemcpyHostToDevice); +} + +CommResult AllreduceRsAgPipeline::allreduceKernelFunc( + const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, + cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { + auto algoCtx = std::static_pointer_cast(ctx); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); + if (!allreduce) { + WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), + ", dtype=", static_cast(dtype)); + return CommResult::CommInvalidArgument; + } + std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; + cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(), + this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, this->scratchBufferSize_, + algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, nullptr, 0, + 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + if (error != cudaSuccess) { + WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); + return CommResult::CommUnhandledCudaError; + } + return CommResult::CommSuccess; +} + +AlgorithmCtxKey AllreduceRsAgPipeline::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { + return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0}; +} + +std::shared_ptr AllreduceRsAgPipeline::initAllreduceContext(std::shared_ptr comm, const void*, + void*, size_t, DataType) { + auto ctx = std::make_shared(); + ctx->rank = comm->bootstrap()->getRank(); + ctx->workSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + + ctx->memorySemaphores = this->scratchSemaphores_; + ctx->registeredMemories = this->remoteScratchMemories_; + return ctx; +} + +std::shared_ptr 
AllreduceRsAgPipeline::build() { + auto self = std::make_shared((uintptr_t)scratchBuffer_, scratchBufferSize_); + return std::make_shared( + "default_allreduce_rsag_pipeline", "allreduce", + [self](std::shared_ptr comm) { self->initialize(comm); }, + [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, + [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { + return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, + extras, accumDtype); + }, + [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, + [[maybe_unused]] size_t outputSize, + DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); +} +} // namespace collective +} // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu new file mode 100644 index 00000000..f95ba7e3 --- /dev/null +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -0,0 +1,247 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include + +#include "allreduce/allreduce_rsag_zero_copy.hpp" +#include "allreduce/common.hpp" +#include "collective_utils.hpp" +#include "logger.hpp" + +namespace mscclpp { +namespace collective { + +__device__ mscclpp::DeviceSyncer globalSyncer; + +// Zero-copy Reduce-Scatter + All-Gather (RSAG) allreduce. +// +// Unlike the standard RSAG which copies input into a scratch buffer first, +// this variant reads directly from peers' input buffers and writes reduced +// results directly to peers' output buffers — eliminating the need for a +// separate scratch buffer and reducing memory traffic. +// +// The algorithm runs in a single kernel with the following steps: +// +// 1. Barrier: Signal and wait on all peers to ensure input buffers are ready. +// +// 2. Reduce-Scatter: Each rank reads its assigned chunk from every peer's +// input buffer (via CudaIpc remote memory handles), reduces all values +// locally, then writes the reduced result to its own output buffer AND +// directly to every peer's output buffer at the same offset. +// +// 3. Global sync + Barrier: A device-wide sync ensures all writes complete, +// followed by a final signal/wait to guarantee all peers have finished +// writing, making the full output buffer valid on every rank. +// +// This approach requires registering both input and output buffers as remote +// memories (2 * nPeers handles), but avoids scratch buffer allocation and +// the extra copy steps of the standard RSAG. The NRanksPerNode template +// parameter enables compile-time unrolling of peer loops (supports 4 or 8). 
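Editor's aside: the dense peer indexing and the input/output handle layout used by the zero-copy kernel below are easy to misread, so here is a host-side sketch (illustrative only, not part of the change) of the mapping for a hypothetical rank 2 of 4:

#include <cstdio>

int main() {
  const int nRanksPerNode = 4, rank = 2, nPeers = nRanksPerNode - 1;
  // In allreduceRsAgZeroCopy, remoteMemories[0..nPeers) hold peers' input buffers and
  // remoteMemories[nPeers..2*nPeers) their output buffers (outputRemoteBufferOffset == nPeers).
  for (int i = 0; i < nPeers; ++i) {
    int remoteRankId = (rank + i + 1) % nRanksPerNode;  // visit peers round-robin, starting after self
    int peerIdx = remoteRankId < rank ? remoteRankId : remoteRankId - 1;  // dense index, local rank skipped
    std::printf("peer %d: rank %d -> input handle %d, output handle %d\n",
                i, remoteRankId, peerIdx, nPeers + peerIdx);
  }
  return 0;
}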
+ +template +__global__ void __launch_bounds__(1024, 1) + allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, + DeviceHandle* switchChannels, void* remoteMemories, int rank, int worldSize, + size_t nelems) { + int blockId = blockIdx.x; + + assert((uintptr_t)buff % sizeof(int4) == 0); + assert((uintptr_t)resultBuff % sizeof(int4) == 0); + + constexpr int NPeers = NRanksPerNode - 1; + constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T); + const uint32_t outputRemoteBufferOffset = NRanksPerNode - 1; + uint32_t alignedNelems = ((nelems + NRanksPerNode - 1) / NRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 * + nelemsPerInt4 * NRanksPerNode; + uint32_t nelemsPerRank = alignedNelems / NRanksPerNode; + uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4; + uint32_t nInt4Total = (nelems + nelemsPerInt4 - 1) / nelemsPerInt4; + + int4* resultBuff4 = reinterpret_cast((char*)resultBuff); + int4* buff4 = reinterpret_cast((char*)buff); + DeviceHandle* memoryChannelsLocal = memoryChannels + blockId * NPeers; + + uint32_t nInt4PerBlock = nInt4PerRank / gridDim.x; + uint32_t remainderForBlock = nInt4PerRank % gridDim.x; + uint32_t offset4 = blockId * nInt4PerBlock; + if (blockId == (int)(gridDim.x - 1)) { + nInt4PerBlock += remainderForBlock; + } + if (nInt4PerBlock == 0) return; + + if (threadIdx.x < NPeers) { + memoryChannelsLocal[threadIdx.x].relaxedSignal(); + memoryChannelsLocal[threadIdx.x].relaxedWait(); + } + __syncthreads(); + int4 data[NPeers]; + // AccumVec: when AccumT != T, use a wider accumulator type. + // For AccumT == T, this is just int4 (no-op conversion). + constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T); + // When T == AccumT, stay with raw int4 to avoid type mismatch in identity path. + using AccumVec = std::conditional_t, int4, mscclpp::VectorType>; + for (uint32_t idx = threadIdx.x; idx < nInt4PerBlock; idx += blockDim.x) { + uint32_t offset = idx + offset4 + rank * nInt4PerRank; + if (offset >= nInt4Total) continue; + int4 tmp_raw = buff4[offset]; +#pragma unroll + for (int i = 0; i < NPeers; i++) { + int rankIdx = (rank + i + 1) % NRanksPerNode; + int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; + data[i] = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); + } + AccumVec acc = mscclpp::upcastVector(tmp_raw); + for (int i = 0; i < NPeers; i++) { + acc = mscclpp::calVectorAccum(acc, data[i]); + } + int4 tmp = mscclpp::downcastVector(acc); +#pragma unroll + for (int i = 0; i < NPeers; i++) { + int rankIdx = (rank + i + 1) % NRanksPerNode; + int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; + mscclpp::write(((void**)remoteMemories)[outputRemoteBufferOffset + peerIdx], offset, tmp); + } + resultBuff4[offset] = tmp; + } + // Using a device barrier gives better performance here. 
+ globalSyncer.sync(gridDim.x); + if (blockIdx.x == 0 && threadIdx.x < NPeers) { + memoryChannelsLocal[threadIdx.x].signal(); + memoryChannelsLocal[threadIdx.x].wait(); + } +} + +template +struct AllreduceRsAgZeroCopyAdapter { + static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, + DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, + size_t, int rank, int nRanksPerNode, int worldSize, size_t inputSize, cudaStream_t stream, + void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + using ChannelType = DeviceHandle; + size_t nelems = inputSize / sizeof(T); + if (nBlocks == 0 || nThreadsPerBlock == 0) { + nThreadsPerBlock = 1024; + nBlocks = 64; + if (inputSize >= (1 << 26)) { + nBlocks = 128; + } + } + if (nRanksPerNode == 4) { + allreduceRsAgZeroCopy<4, OpType, T, AccumT> + <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, + switchChannel, remoteMemories, rank, worldSize, nelems); + } else if (nRanksPerNode == 8) { + allreduceRsAgZeroCopy<8, OpType, T, AccumT> + <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, + switchChannel, remoteMemories, rank, worldSize, nelems); + } else { + THROW(ALGO, Error, ErrorCode::InvalidUsage, "Unsupported number of ranks per node: ", nRanksPerNode); + } + return cudaGetLastError(); + } +}; + +void AllreduceRsAgZeroCopy::initialize(std::shared_ptr comm) { + this->conns_ = setupConnections(comm); + nChannelsPerConnection_ = 128; + comm_ = comm; + // setup semaphores + this->semaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_); + this->baseChannels_ = setupBaseMemoryChannels(this->conns_, this->semaphores_, nChannelsPerConnection_); + this->baseMemoryChannelHandles_ = setupBaseMemoryChannelDeviceHandles(baseChannels_); +} + +CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, + size_t inputSize, DataType dtype, ReduceOp op, + cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + const std::unordered_map&, + DataType accumDtype) { + auto algoCtx = std::static_pointer_cast(ctx); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); + if (!allreduce) { + WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), + ", dtype=", static_cast(dtype)); + return CommResult::CommInvalidArgument; + } + std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; + cudaError_t error = + allreduce(input, nullptr, output, this->baseMemoryChannelHandles_.get(), algoCtx->remoteMemoryHandles.get(), + nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, + nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + if (error != cudaSuccess) { + WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); + return CommResult::CommUnhandledCudaError; + } + return CommResult::CommSuccess; +} + +AlgorithmCtxKey AllreduceRsAgZeroCopy::generateAllreduceContextKey(const void* inputBuffer, void* outputBuffer, + size_t size, DataType, bool symmetricMemory) { + // For non-symmetric algorithms, we use both input and output buffer pointers in the key. 
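+ // (Editor's note, not part of the upstream change: ++tag below makes every non-symmetric key unique, so those contexts bypass the context cache and are rebuilt on each call, while symmetric-memory keys hash by base allocation and can be reused.)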
+ static int tag = 0; + if (symmetricMemory) { + size_t inputBytes, outputBytes; + CUdeviceptr inputBasePtr, outputBasePtr; + MSCCLPP_CUTHROW(cuMemGetAddressRange(&inputBasePtr, &inputBytes, (CUdeviceptr)inputBuffer)); + MSCCLPP_CUTHROW(cuMemGetAddressRange(&outputBasePtr, &outputBytes, (CUdeviceptr)outputBuffer)); + return AlgorithmCtxKey{(void*)inputBasePtr, (void*)outputBasePtr, inputBytes, outputBytes, 0}; + } + return AlgorithmCtxKey{(void*)inputBuffer, outputBuffer, size, size, ++tag}; +} + +std::shared_ptr AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_ptr comm, const void* input, + void* output, size_t size, DataType) { + auto ctx = std::make_shared(); + ctx->rank = comm->bootstrap()->getRank(); + ctx->workSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + + ctx->memorySemaphores = this->semaphores_; + + // register input and output memories + RegisteredMemory inputMemory = comm->registerMemory((void*)input, size, Transport::CudaIpc); + RegisteredMemory outputMemory = comm->registerMemory(output, size, Transport::CudaIpc); + this->inputMemories_.push_back(inputMemory); + this->outputMemories_.push_back(outputMemory); + + auto remoteInputMemories = setupRemoteMemories(comm, ctx->rank, inputMemory); + auto remoteOutputMemories = setupRemoteMemories(comm, ctx->rank, outputMemory); + ctx->registeredMemories.insert(ctx->registeredMemories.end(), remoteInputMemories.begin(), remoteInputMemories.end()); + ctx->registeredMemories.insert(ctx->registeredMemories.end(), remoteOutputMemories.begin(), + remoteOutputMemories.end()); + std::vector remoteMemoryHandles; + for (const auto& remoteMemory : ctx->registeredMemories) { + remoteMemoryHandles.push_back(remoteMemory.data()); + } + ctx->remoteMemoryHandles = detail::gpuCallocShared(remoteMemoryHandles.size()); + gpuMemcpy(ctx->remoteMemoryHandles.get(), remoteMemoryHandles.data(), remoteMemoryHandles.size(), + cudaMemcpyHostToDevice); + + // store local registered memories to ctx for lifetime management + ctx->registeredMemories.push_back(inputMemory); + ctx->registeredMemories.push_back(outputMemory); + return ctx; +} + +std::shared_ptr AllreduceRsAgZeroCopy::build() { + auto self = std::make_shared(); + return std::make_shared( + "default_allreduce_rsag_zero_copy", "allreduce", + [self](std::shared_ptr comm) { self->initialize(comm); }, + [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, + [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { + return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, + extras, accumDtype); + }, + [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, + [[maybe_unused]] size_t outputSize, + DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); +} +} // namespace collective +} // namespace mscclpp diff --git a/src/ext/collectives/alltoallv/alltoallv_fullmesh.cu b/src/ext/collectives/alltoallv/alltoallv_fullmesh.cu index 0ed8ff3c..20e420b2 100644 --- a/src/ext/collectives/alltoallv/alltoallv_fullmesh.cu +++ 
b/src/ext/collectives/alltoallv/alltoallv_fullmesh.cu @@ -67,7 +67,8 @@ std::shared_ptr AlltoallvFullmesh::build() { [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, [[maybe_unused]] ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) { + const std::unordered_map& extras, + [[maybe_unused]] DataType accumDtype) -> CommResult { return self->alltoallvKernelFunc(ctx, input, output, inputSize, outputSize, dtype, stream, nBlocks, nThreadsPerBlock, extras); }, @@ -77,7 +78,8 @@ std::shared_ptr AlltoallvFullmesh::build() { return self->initAlltoallvContext(comm, input, output, inputSize, outputSize, dtype); }, // Context key generation function - [self](const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype) { + [self](const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, + [[maybe_unused]] bool symmetricMemory) { return self->generateAlltoallvContextKey(input, output, inputSize, outputSize, dtype); }); diff --git a/src/ext/collectives/include/allgather/allgather_fullmesh.hpp b/src/ext/collectives/include/allgather/allgather_fullmesh.hpp index 085f4ac4..d1a4bbcd 100644 --- a/src/ext/collectives/include/allgather/allgather_fullmesh.hpp +++ b/src/ext/collectives/include/allgather/allgather_fullmesh.hpp @@ -25,7 +25,7 @@ class AllgatherFullmesh : public AlgorithmBuilder { std::shared_ptr initAllgatherContext(std::shared_ptr comm, const void*, void* output, size_t, mscclpp::DataType); - mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType); + mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; diff --git a/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp b/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp index ea176ba1..56783e3b 100644 --- a/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp +++ b/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp @@ -11,11 +11,11 @@ namespace collective { class AllgatherFullmesh2 : public AlgorithmBuilder { public: - AllgatherFullmesh2(); + AllgatherFullmesh2() = default; std::shared_ptr build() override; private: - bool disableChannelCache_; + bool symmetricMemory_; std::vector conns_; std::vector> memorySemaphores_; const int nChannelsPerConnection_ = 35; @@ -27,7 +27,7 @@ class AllgatherFullmesh2 : public AlgorithmBuilder { std::shared_ptr initAllgatherContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, DataType, bool); }; } // namespace collective diff --git a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp index e995b940..362308b2 100644 --- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp @@ -9,19 +9,22 @@ namespace mscclpp { namespace collective { class AllreduceAllpairPacket : public AlgorithmBuilder { public: - AllreduceAllpairPacket(uintptr_t scratchBuffer, size_t scratchBufferSize) - : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + AllreduceAllpairPacket(uintptr_t scratchBuffer, 
size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize) + : scratchBuffer_((void*)scratchBuffer), + scratchBufferSize_(scratchBufferSize), + flagBuffer_(flagBuffer), + flagBufferSize_(flagBufferSize){}; std::shared_ptr build() override; private: void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; @@ -30,9 +33,8 @@ class AllreduceAllpairPacket : public AlgorithmBuilder { std::vector conns_; std::vector> memorySemaphores_; std::vector registeredMemories_; - std::shared_ptr flags_; - std::shared_ptr flags7_; - std::shared_ptr flags28_; + uintptr_t flagBuffer_; + size_t flagBufferSize_; }; } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp index 31a7f145..a54352b3 100644 --- a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp @@ -16,11 +16,11 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; std::shared_ptr comm_; @@ -32,6 +32,7 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder { RegisteredMemory localScratchMemory_; std::unordered_map, std::shared_ptr>>> memoryChannelsMap_; + bool symmetricMemory_ = false; }; } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp similarity index 72% rename from src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp rename to src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp index 1077b122..81b74add 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp @@ -1,14 +1,17 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
+#ifndef MSCCLPP_EXT_ALLREDUCE_NVLS_BLOCK_PIPELINE_HPP_ +#define MSCCLPP_EXT_ALLREDUCE_NVLS_BLOCK_PIPELINE_HPP_ + #include namespace mscclpp { namespace collective { -class AllreduceNvlsWithCopy : public AlgorithmBuilder { +class AllreduceNvlsBlockPipeline : public AlgorithmBuilder { public: - AllreduceNvlsWithCopy(uintptr_t scratchBuffer, size_t scratchBufferSize) + AllreduceNvlsBlockPipeline(uintptr_t scratchBuffer, size_t scratchBufferSize) : scratchBuffer_(reinterpret_cast(scratchBuffer)), scratchBufferSize_(scratchBufferSize){}; std::shared_ptr build() override; @@ -16,11 +19,11 @@ class AllreduceNvlsWithCopy : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); const size_t nvlsBufferSize_ = (1 << 30); void* scratchBuffer_; @@ -29,6 +32,9 @@ class AllreduceNvlsWithCopy : public AlgorithmBuilder { std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; std::vector conns_; + std::vector> nvlsConnections_; }; } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp + +#endif // MSCCLPP_EXT_ALLREDUCE_NVLS_BLOCK_PIPELINE_HPP_ diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp index 8761162a..fb0c63b8 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp @@ -10,27 +10,32 @@ namespace mscclpp { namespace collective { class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder { public: - AllreduceNvlsPacket(uintptr_t scratchBuffer, size_t scratchBufferSize) - : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + AllreduceNvlsPacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize) + : scratchBuffer_((void*)scratchBuffer), + scratchBufferSize_(scratchBufferSize), + flagBuffer_(flagBuffer), + flagBufferSize_(flagBufferSize){}; std::shared_ptr build() override; private: void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras); + int nThreadsPerBlock, const std::unordered_map& extras, + mscclpp::DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, mscclpp::DataType); - mscclpp::AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, mscclpp::DataType); + mscclpp::AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, mscclpp::DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; const size_t nvlsBufferSize_ = (1 << 30); const int maxBlockNum_ = 16; - std::shared_ptr flags_; - std::shared_ptr flags4_; - std::shared_ptr flags8_; + uintptr_t flagBuffer_; + size_t flagBufferSize_; + 
std::vector> nvlsConnections_; + std::vector switchChannels_; }; } // namespace collective } // namespace mscclpp diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp similarity index 73% rename from src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp rename to src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp index 7bfa9822..8f02a873 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp @@ -1,17 +1,17 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -#ifndef MSCCLPP_EXT_ALLREDUCE_NVLS_WITH_COPY_2_HPP_ -#define MSCCLPP_EXT_ALLREDUCE_NVLS_WITH_COPY_2_HPP_ +#ifndef MSCCLPP_EXT_ALLREDUCE_NVLS_WARP_PIPELINE_HPP_ +#define MSCCLPP_EXT_ALLREDUCE_NVLS_WARP_PIPELINE_HPP_ #include namespace mscclpp { namespace collective { -class AllreduceNvlsWithCopy2 : public AlgorithmBuilder { +class AllreduceNvlsWarpPipeline : public AlgorithmBuilder { public: - AllreduceNvlsWithCopy2(uintptr_t scratchBuffer, size_t scratchBufferSize) + AllreduceNvlsWarpPipeline(uintptr_t scratchBuffer, size_t scratchBufferSize) : scratchBuffer_(reinterpret_cast(scratchBuffer)), scratchBufferSize_(scratchBufferSize){}; std::shared_ptr build() override; @@ -19,11 +19,11 @@ class AllreduceNvlsWithCopy2 : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); const size_t nvlsBufferSize_ = (1 << 30); void* scratchBuffer_; @@ -32,8 +32,9 @@ class AllreduceNvlsWithCopy2 : public AlgorithmBuilder { std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; std::vector conns_; + std::vector> nvlsConnections_; }; } // namespace collective } // namespace mscclpp -#endif // MSCCLPP_EXT_ALLREDUCE_NVLS_WITH_COPY_2_HPP_ \ No newline at end of file +#endif // MSCCLPP_EXT_ALLREDUCE_NVLS_WARP_PIPELINE_HPP_ diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp similarity index 60% rename from src/ext/collectives/include/allreduce/allreduce_nvls.hpp rename to src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp index 4591cb42..d53ea180 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp @@ -1,6 +1,9 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
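The AllreduceAllpairPacket and AllreduceNvlsPacket constructors above replace internally allocated flag buffers (shared_ptr members) with a caller-owned flag buffer passed as a uintptr_t plus a size. A self-contained sketch of that ownership pattern, with all names invented:

#include <cstddef>
#include <cstdint>
#include <vector>

class PacketBuilder {  // stand-in for the builders above, not mscclpp code
 public:
  PacketBuilder(uintptr_t scratchBuffer, size_t scratchBufferSize,
                uintptr_t flagBuffer, size_t flagBufferSize)
      : scratchBuffer_(reinterpret_cast<void*>(scratchBuffer)),
        scratchBufferSize_(scratchBufferSize),
        flagBuffer_(flagBuffer),
        flagBufferSize_(flagBufferSize) {}

 private:
  void* scratchBuffer_;
  size_t scratchBufferSize_;
  uintptr_t flagBuffer_;  // lifetime owned by the caller, not the builder
  size_t flagBufferSize_;
};

int main() {
  // One caller-owned allocation can back the flags of several packet
  // algorithms, instead of each builder holding its own shared_ptr buffers.
  std::vector<uint32_t> flags(1024, 0);
  std::vector<char> scratch(1 << 20);
  PacketBuilder builder(reinterpret_cast<uintptr_t>(scratch.data()), scratch.size(),
                        reinterpret_cast<uintptr_t>(flags.data()),
                        flags.size() * sizeof(uint32_t));
  (void)builder;
  return 0;
}

Passing raw addresses keeps the builder headers free of allocator details and makes reuse of one flag allocation across algorithms explicit.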
+#ifndef MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_ +#define MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_ + #include namespace mscclpp { @@ -12,21 +15,30 @@ class AllreduceNvls : public AlgorithmBuilder { std::shared_ptr build() override; private: + bool symmetricMemory_ = false; void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); - const size_t nvlsBufferSize_ = (1 << 30); + // Large buffer size because cuMemMap requires offset=0 for multicast handles, so the entire + // user allocation must be mapped. This only reserves virtual address space; no physical memory + // is consumed beyond what is actually bound. + const size_t nvlsBufferSize_ = (1UL << 34); uint32_t nSwitchChannels_; std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; std::vector conns_; + std::vector> nvlsConnections_; + std::vector> nvlsOutConnections_; + int computeCapabilityMajor_{0}; }; } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp + +#endif // MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_ \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_packet.hpp index f562aca5..de7ca471 100644 --- a/src/ext/collectives/include/allreduce/allreduce_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_packet.hpp @@ -9,28 +9,32 @@ namespace mscclpp { namespace collective { class AllreducePacket : public AlgorithmBuilder { public: - AllreducePacket(uintptr_t scratchBuffer, size_t scratchBufferSize) - : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + AllreducePacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize) + : scratchBuffer_((void*)scratchBuffer), + scratchBufferSize_(scratchBufferSize), + flagBuffer_(flagBuffer), + flagBufferSize_(flagBufferSize){}; std::shared_ptr build() override; private: void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; const int nSegmentsForScratchBuffer_ = 2; const int maxBlockNum_ = 56; std::vector conns_; + uintptr_t flagBuffer_; + size_t flagBufferSize_; std::vector> memorySemaphores_; std::vector registeredMemories_; - std::shared_ptr flags_; }; } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag.hpp 
b/src/ext/collectives/include/allreduce/allreduce_rsag.hpp new file mode 100644 index 00000000..1fd663da --- /dev/null +++ b/src/ext/collectives/include/allreduce/allreduce_rsag.hpp @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_EXT_ALLREDUCE_RSAG_HPP_ +#define MSCCLPP_EXT_ALLREDUCE_RSAG_HPP_ + +#include + +namespace mscclpp { +namespace collective { + +class AllreduceRsAg : public mscclpp::AlgorithmBuilder { + public: + AllreduceRsAg(uintptr_t scratchBuffer, size_t scratchBufferSize) + : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + std::shared_ptr build() override; + + private: + void initialize(std::shared_ptr comm); + CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, + DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + const std::unordered_map& extras, DataType accumDtype); + + std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, + DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); + void* scratchBuffer_; + size_t scratchBufferSize_; + std::shared_ptr comm_; + int nChannelsPerConnection_; + std::vector conns_; + std::vector> scratchSemaphores_; + std::vector remoteScratchMemories_; + RegisteredMemory localScratchMemory_; + + std::vector baseChannels_; + std::shared_ptr> baseMemoryChannelHandles_; + std::shared_ptr remoteMemoryHandles_; +}; +} // namespace collective +} // namespace mscclpp + +#endif // MSCCLPP_EXT_ALLREDUCE_RSAG_HPP_ \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp new file mode 100644 index 00000000..7629f2fe --- /dev/null +++ b/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
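The (1UL << 34) nvlsBufferSize_ comment in the zero-copy header above relies on CUDA's virtual memory management separating address-space reservation from physical backing: reserving a large range is nearly free, and only the portion that is actually created and mapped consumes memory. A hedged driver-API sketch of that property (error handling elided; illustrative only, not code from this patch):

#include <cuda.h>
#include <cstdio>

int main() {
  cuInit(0);
  CUdevice dev; cuDeviceGet(&dev, 0);
  CUcontext ctx; cuCtxCreate(&ctx, 0, dev);

  const size_t vaSize = 1ULL << 34;  // 16 GiB of VA, no physical cost yet
  CUdeviceptr base;
  cuMemAddressReserve(&base, vaSize, 0, 0, 0);

  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = dev;
  size_t gran;
  cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);

  CUmemGenericAllocationHandle h;
  cuMemCreate(&h, gran, &prop, 0);  // physical memory: one granule only
  cuMemMap(base, gran, 0, h, 0);    // bind it at offset 0 of the reserved range

  CUmemAccessDesc acc = {};
  acc.location = prop.location;
  acc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  cuMemSetAccess(base, gran, &acc, 1);

  printf("reserved %zu bytes of VA, backed %zu bytes\n", vaSize, gran);

  cuMemUnmap(base, gran);
  cuMemRelease(h);
  cuMemAddressFree(base, vaSize);
  cuCtxDestroy(ctx);
  return 0;
}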
+ +#ifndef MSCCLPP_EXT_ALLREDUCE_RSAG_PIPELINE_HPP_ +#define MSCCLPP_EXT_ALLREDUCE_RSAG_PIPELINE_HPP_ + +#include + +namespace mscclpp { +namespace collective { + +class AllreduceRsAgPipeline : public mscclpp::AlgorithmBuilder { + public: + AllreduceRsAgPipeline(uintptr_t scratchBuffer, size_t scratchBufferSize) + : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + std::shared_ptr build() override; + + private: + void initialize(std::shared_ptr comm); + CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, + DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + const std::unordered_map& extras, DataType accumDtype); + + std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, + DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); + void* scratchBuffer_; + size_t scratchBufferSize_; + std::shared_ptr comm_; + int nChannelsPerConnection_; + std::vector conns_; + std::vector> scratchSemaphores_; + std::vector remoteScratchMemories_; + RegisteredMemory localScratchMemory_; + + std::vector baseChannels_; + std::shared_ptr> baseMemoryChannelHandles_; + std::shared_ptr remoteMemoryHandles_; +}; +} // namespace collective +} // namespace mscclpp + +#endif // MSCCLPP_EXT_ALLREDUCE_RSAG_PIPELINE_HPP_ \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp new file mode 100644 index 00000000..05bf2ef3 --- /dev/null +++ b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_EXT_ALLREDUCE_RSAG_ZERO_COPY_HPP_ +#define MSCCLPP_EXT_ALLREDUCE_RSAG_ZERO_COPY_HPP_ + +#include + +namespace mscclpp { +namespace collective { + +class AllreduceRsAgZeroCopy : public mscclpp::AlgorithmBuilder { + public: + AllreduceRsAgZeroCopy() = default; + std::shared_ptr build() override; + + private: + void initialize(std::shared_ptr comm); + CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, + DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + const std::unordered_map& extras, DataType accumDtype); + + std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, + DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); + std::shared_ptr comm_; + int nChannelsPerConnection_; + std::vector conns_; + std::vector> semaphores_; + std::vector inputMemories_; + std::vector outputMemories_; + + std::vector baseChannels_; + std::shared_ptr> baseMemoryChannelHandles_; +}; +} // namespace collective +} // namespace mscclpp + +#endif // MSCCLPP_EXT_ALLREDUCE_RSAG_ZERO_COPY_HPP_ \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp index 10eecf7e..1e0e7e69 100644 --- a/src/ext/collectives/include/allreduce/common.hpp +++ b/src/ext/collectives/include/allreduce/common.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
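The new AllreduceRsAg* builders above implement allreduce as reduce-scatter followed by all-gather: each rank first owns the fully reduced 1/N shard of the buffer, then the reduced shards are exchanged. A plain host-side sketch of that schedule (toy code, no mscclpp types; nElems must be divisible by nRanks):

#include <cstdio>
#include <vector>

int main() {
  const int nRanks = 4, nElems = 8;
  std::vector<std::vector<int>> buf(nRanks, std::vector<int>(nElems));
  for (int r = 0; r < nRanks; ++r)
    for (int i = 0; i < nElems; ++i) buf[r][i] = r + i;

  const int shard = nElems / nRanks;
  // Phase 1: reduce-scatter. Rank r accumulates shard r from every peer;
  // each rank writes only its own shard, so the phases need no copies.
  for (int r = 0; r < nRanks; ++r)
    for (int peer = 0; peer < nRanks; ++peer)
      if (peer != r)
        for (int i = r * shard; i < (r + 1) * shard; ++i)
          buf[r][i] += buf[peer][i];

  // Phase 2: all-gather. Every rank pulls each reduced shard from its owner.
  for (int r = 0; r < nRanks; ++r)
    for (int owner = 0; owner < nRanks; ++owner)
      if (owner != r)
        for (int i = owner * shard; i < (owner + 1) * shard; ++i)
          buf[r][i] = buf[owner][i];

  // Every element i should now equal the sum over ranks of (r + i) = 6 + 4*i.
  for (int i = 0; i < nElems; ++i)
    printf("elem %d: %d (expect %d)\n", i, buf[0][i], 6 + 4 * i);
  return 0;
}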
-#ifndef MSCCLPP_ALLREDUCE_COMMOM_HPP_ -#define MSCCLPP_ALLREDUCE_COMMOM_HPP_ +#ifndef MSCCLPP_ALLREDUCE_COMMON_HPP_ +#define MSCCLPP_ALLREDUCE_COMMON_HPP_ #include #include @@ -10,6 +10,8 @@ #include #include +#include "reduce_kernel.hpp" + #if defined(ENABLE_NPKIT) #include #endif @@ -22,438 +24,6 @@ constexpr ReduceOp MIN = ReduceOp::MIN; #if defined(MSCCLPP_DEVICE_COMPILE) -template -__forceinline__ __device__ To bit_cast(const From& src) { - static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); - - union { - From f; - To t; - } u; - u.f = src; - return u.t; -} - -template -__forceinline__ __device__ T clip(T val) { - return val; -} - -template <> -__forceinline__ __device__ __half clip(__half val) { - val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff)); - val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff)); - - return val; -} - -template <> -__forceinline__ __device__ __half2 clip(__half2 val) { - val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff)); - val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff)); - val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff)); - val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff)); - return val; -} - -template <> -__forceinline__ __device__ __bfloat16 clip(__bfloat16 val) { - val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80)); - val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80)); - return val; -} - -template <> -__forceinline__ __device__ __bfloat162 clip(__bfloat162 val) { - val.x = __hmax(val.x, bit_cast<__bfloat16, unsigned short>(0xff80)); - val.x = __hmin(val.x, bit_cast<__bfloat16, unsigned short>(0x7f80)); - val.y = __hmax(val.y, bit_cast<__bfloat16, unsigned short>(0xff80)); - val.y = __hmin(val.y, bit_cast<__bfloat16, unsigned short>(0x7f80)); - return val; -} - -template -__forceinline__ __device__ T add_elements(T a, T b) { - if constexpr (UseClip) { - return clip(a + b); - } else { - return a + b; - } -} - -template -__forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) { - if constexpr (UseClip) { - return clip(__hadd2(a, b)); - } else { - return __hadd2(a, b); - } -} - -template -__forceinline__ __device__ __bfloat162 add_elements(__bfloat162 a, __bfloat162 b) { - if constexpr (UseClip) { - return clip(__hadd2(a, b)); - } else { - return __hadd2(a, b); - } -} - -template -__forceinline__ __device__ T min_elements(T a, T b) { - return (a < b ? 
a : b); -} - -template <> -__forceinline__ __device__ __half2 min_elements(__half2 a, __half2 b) { -#if defined(__HIP_PLATFORM_AMD__) - __half2 val; - val.x = __hmin(a.x, b.x); - val.y = __hmin(a.y, b.y); - return val; -#else - return __hmin2(a, b); -#endif -} - -template <> -__forceinline__ __device__ __bfloat162 min_elements(__bfloat162 a, __bfloat162 b) { - return __hmin2(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -// FP8 E4M3 clipping function -template <> -__forceinline__ __device__ __fp8_e4m3 clip(__fp8_e4m3 val) { - // FP8 E4M3 has range [-448, 448], no infinities - // Built-in saturation in FP8 arithmetic - return val; -} - -// FP8 E5M2 clipping function - prevent infinities by clamping to max finite value -template <> -__forceinline__ __device__ __fp8_e5m2 clip(__fp8_e5m2 val) { - // FP8 E5M2 has infinities - clamp to max finite value to prevent overflow - // Max finite value for E5M2 is 57344.0f (0x7B), min is -57344.0f (0xFB) - float fval = float(val); - fval = fmaxf(fval, -57344.0f); - fval = fminf(fval, 57344.0f); - return __fp8_e5m2(fval); -} - -// FP8 E4M3 addition using __hadd for efficiency (single element) -template -__forceinline__ __device__ __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__) - // Optimized assembly for gfx942 - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false); -#elif !defined(__HIP_PLATFORM_AMD__) - // NVIDIA CUDA FP8 addition (CUDA 11.8+) - __fp8_e4m3 result = __fp8_e4m3(__hadd(__half(a), __half(b))); - return UseClip ? clip(result) : result; -#else - // Fallback for non-gfx942 HIP platforms - __fp8_e4m3 result = __fp8_e4m3(float(a) + float(b)); - return UseClip ? 
clip(result) : result; -#endif -} - -// FP8 E4M3 vectorized addition for 2 elements -template -__forceinline__ __device__ __fp8x2_e4m3 add_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__) - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false); -#elif !defined(__HIP_PLATFORM_AMD__) - // CUDA: Convert to half2, add using optimized __hadd2, convert back - __fp8x2_e4m3 result = __fp8x2_e4m3(__hadd2(__half2(a), __half2(b))); - return result; -#else - // Fallback for non-gfx942 HIP: element-wise using single-element operations - union { - __fp8_e4m3 fp8[2]; - __fp8x2_e4m3 fp8x2; - } ua, ub, result; - ua.fp8x2 = a; - ub.fp8x2 = b; - result.fp8[0] = add_elements(ua.fp8[0], ub.fp8[0]); - result.fp8[1] = add_elements(ua.fp8[1], ub.fp8[1]); - return result.fp8x2; -#endif -} - -// FP8 E4M3 vectorized addition for 4 elements (via 2x __fp8x2_e4m3) -template -__forceinline__ __device__ __fp8x4_e4m3 add_elements(__fp8x4_e4m3 a, __fp8x4_e4m3 b) { - // Process as two __fp8x2_e4m3 using add_elements for 2 elements - __fp8x2_e4m3* a_pair = reinterpret_cast<__fp8x2_e4m3*>(&a); - __fp8x2_e4m3* b_pair = reinterpret_cast<__fp8x2_e4m3*>(&b); - - __fp8x2_e4m3 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e4m3*>(result); -} - -// FP8 E5M2 addition using __hadd for efficiency (single element) -template -__forceinline__ __device__ __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__) - // Optimized assembly for gfx942 (bfloat8) - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false); -#elif !defined(__HIP_PLATFORM_AMD__) - // NVIDIA CUDA FP8 addition - __fp8_e5m2 result = __fp8_e5m2(__hadd(__half(a), __half(b))); - return UseClip ? clip(result) : result; -#else - // Fallback for non-gfx942 HIP platforms - __fp8_e5m2 result = __fp8_e5m2(float(a) + float(b)); - return UseClip ? 
clip(result) : result; -#endif -} - -#if !defined(__HIP_PLATFORM_AMD__) -// FP8 E5M2 vectorized addition for 2 elements (CUDA only) -template -__forceinline__ __device__ __fp8x2_e5m2 add_elements(__fp8x2_e5m2 a, __fp8x2_e5m2 b) { - // CUDA: Convert to half2, add using optimized __hadd2, convert back - __fp8x2_e5m2 result = __fp8x2_e5m2(__hadd2(__half2(a), __half2(b))); - return result; -} - -// FP8 E5M2 vectorized addition for 4 elements (CUDA only - via 2x __fp8x2_e5m2) -template -__forceinline__ __device__ __fp8x4_e5m2 add_elements(__fp8x4_e5m2 a, __fp8x4_e5m2 b) { - // Process as two __fp8x2_e5m2 using add_elements for 2 elements - __fp8x2_e5m2* a_pair = reinterpret_cast<__fp8x2_e5m2*>(&a); - __fp8x2_e5m2* b_pair = reinterpret_cast<__fp8x2_e5m2*>(&b); - - __fp8x2_e5m2 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e5m2*>(result); -} -#endif // !defined(__HIP_PLATFORM_AMD__) - -// FP8 E4M3 min operation (single element) -template <> -__forceinline__ __device__ __fp8_e4m3 min_elements(__fp8_e4m3 a, __fp8_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) - return __fp8_e4m3(fminf(float(a), float(b))); -#else - return __fp8_e4m3(__hmin(__half(a), __half(b))); -#endif -} - -// FP8 E4M3 vectorized min for 2 elements -__forceinline__ __device__ __fp8x2_e4m3 min_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) - // HIP implementation: use union and process element-wise - union { - __fp8_e4m3 fp8[2]; - __fp8x2_e4m3 fp8x2; - } ua, ub, result; - ua.fp8x2 = a; - ub.fp8x2 = b; - result.fp8[0] = min_elements(ua.fp8[0], ub.fp8[0]); - result.fp8[1] = min_elements(ua.fp8[1], ub.fp8[1]); - return result.fp8x2; -#else - return __fp8x2_e4m3(__hmin2(__half2(a), __half2(b))); -#endif -} - -// FP8 E4M3 vectorized min for 4 elements -__forceinline__ __device__ __fp8x4_e4m3 min_elements(__fp8x4_e4m3 a, __fp8x4_e4m3 b) { - // Process as two __fp8x2_e4m3 using min_elements for 2 elements - union { - __fp8x4_e4m3 vec4; - __fp8x2_e4m3 vec2[2]; - } ua, ub, uresult; - ua.vec4 = a; - ub.vec4 = b; - - uresult.vec2[0] = min_elements(ua.vec2[0], ub.vec2[0]); - uresult.vec2[1] = min_elements(ua.vec2[1], ub.vec2[1]); - - return uresult.vec4; -} - -// FP8 E5M2 min operation (single element) -template <> -__forceinline__ __device__ __fp8_e5m2 min_elements(__fp8_e5m2 a, __fp8_e5m2 b) { -#if defined(__HIP_PLATFORM_AMD__) - return __fp8_e5m2(fminf(float(a), float(b))); -#else - return __fp8_e5m2(__hmin(__half(a), __half(b))); -#endif -} - -#if !defined(__HIP_PLATFORM_AMD__) -// FP8 E5M2 vectorized min for 2 elements (CUDA only) -__forceinline__ __device__ __fp8x2_e5m2 min_elements(__fp8x2_e5m2 a, __fp8x2_e5m2 b) { - return __fp8x2_e5m2(__hmin2(__half2(a), __half2(b))); -} - -// FP8 E5M2 vectorized min for 4 elements (CUDA only) -__forceinline__ __device__ __fp8x4_e5m2 min_elements(__fp8x4_e5m2 a, __fp8x4_e5m2 b) { - // Process as two __fp8x2_e5m2 using min_elements for 2 elements - union { - __fp8x4_e5m2 vec4; - __fp8x2_e5m2 vec2[2]; - } ua, ub, uresult; - ua.vec4 = a; - ub.vec4 = b; - - uresult.vec2[0] = min_elements(ua.vec2[0], ub.vec2[0]); - uresult.vec2[1] = min_elements(ua.vec2[1], ub.vec2[1]); - - return uresult.vec4; -} -#endif // !defined(__HIP_PLATFORM_AMD__) -#endif // __FP8_TYPES_EXIST__ - -template -__forceinline__ __device__ T cal_elements(T a, T b) { - if constexpr (OpType == SUM) { - return add_elements(a, b); - } else if constexpr (OpType == MIN) { - return min_elements(a, b); - } - // 
Should never reach here - return a; -} - -template -__forceinline__ __device__ int4 cal_vectors_helper(int4 a, int4 b) { - int4 ret; - ret.w = bit_cast(cal_elements(bit_cast(a.w), bit_cast(b.w))); - ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); - ret.z = bit_cast(cal_elements(bit_cast(a.z), bit_cast(b.z))); - return ret; -} - -template -__forceinline__ __device__ uint2 cal_vectors_helper(uint2 a, uint2 b) { - uint2 ret; - ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); - return ret; -} - -template -__forceinline__ __device__ int cal_vectors_helper(int a, int b) { - return bit_cast(cal_elements(bit_cast(a), bit_cast(b))); -} - -#if defined(__HIP_PLATFORM_AMD__) && defined(__FP8_TYPES_EXIST__) && defined(__gfx942__) -// Helper function to perform FP8 vector addition - dispatches based on scalar type -// Uses AMD builtins from hip/amd_detail/amd_hip_fp8.h: -// - __builtin_amdgcn_cvt_pk_f32_fp8/bf8: Convert 2 FP8 values to 2 floats -// - __builtin_amdgcn_cvt_pk_fp8/bf8_f32: Convert 2 floats to 2 FP8 values -// The 'word' parameter (false/true) selects low/high 16-bit word from uint32_t -template -__forceinline__ __device__ int add_fp8x4_hip(int a, int b) { - uint32_t a32 = static_cast(a); - uint32_t b32 = static_cast(b); - - float2 v_low, v_high; - uint32_t ival = 0; - - if constexpr (std::is_same_v) { - // E4M3 using fp8 conversion - process low word (false) and high word (true) - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_low) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a32, false)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b32, false))); - uint16_t result_low = __builtin_amdgcn_cvt_pk_fp8_f32(v_low.x, v_low.y, ival, false); - - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_high) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a32, true)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b32, true))); - uint16_t result_high = __builtin_amdgcn_cvt_pk_fp8_f32(v_high.x, v_high.y, ival, false); - - uint32_t result = (static_cast(result_high) << 16) | result_low; - return static_cast(result); - } else { // __fp8_e5m2 - // E5M2 using bf8 conversion - process low word (false) and high word (true) - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_low) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a32, false)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b32, false))); - uint16_t result_low = __builtin_amdgcn_cvt_pk_bf8_f32(v_low.x, v_low.y, ival, false); - - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_high) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a32, true)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b32, true))); - uint16_t result_high = __builtin_amdgcn_cvt_pk_bf8_f32(v_high.x, v_high.y, ival, false); - - uint32_t result = (static_cast(result_high) << 16) | result_low; - return static_cast(result); - } -} -#endif - -template -__forceinline__ __device__ DataType cal_vectors(DataType a, DataType b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__FP8_TYPES_EXIST__) && defined(__gfx942__) - // For FP8 types on HIP gfx942, use specialized helper that dispatches based on scalar type - if constexpr (std::is_same_v || std::is_same_v) { - if constexpr (OpType == SUM) { - if constexpr (std::is_same_v || std::is_same_v) { - // Handle int/uint32_t (4 FP8 elements) - return add_fp8x4_hip(a, b); - } else if constexpr (std::is_same_v) { - // Handle int4 (16 FP8 elements) - process as 4 ints - int4 ret; - ret.w = add_fp8x4_hip(a.w, b.w); - ret.x = add_fp8x4_hip(a.x, b.x); - ret.y = 
add_fp8x4_hip(a.y, b.y); - ret.z = add_fp8x4_hip(a.z, b.z); - return ret; - } else if constexpr (std::is_same_v) { - // Handle uint2 (8 FP8 elements) - process as 2 ints - uint2 ret; - ret.x = add_fp8x4_hip(a.x, b.x); - ret.y = add_fp8x4_hip(a.y, b.y); - return ret; - } - } - } -#endif - - // Define the vectorized computation type based on the element type - using CompType = typename std::conditional_t< - std::is_same_v, __half2, - std::conditional_t, __bfloat162, -#if defined(__FP8_TYPES_EXIST__) - std::conditional_t, __fp8x4_e4m3, - std::conditional_t, __fp8x4_e5m2, -#endif - T -#if defined(__FP8_TYPES_EXIST__) - >>>>; -#else - >>; -#endif - return cal_vectors_helper(a, b); -} - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 template MSCCLPP_DEVICE_INLINE constexpr std::size_t calcVectorSize() { @@ -472,7 +42,12 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src // nvls can only handle 4 bytes alignment MSCCLPP_ASSERT_DEVICE(size % 4 == 0, "size must be 4 bytes aligned"); constexpr size_t nElem = calcVectorSize(); - using vectorType = mscclpp::VectorType; + // For integer types, use 1-element vectors since multimem doesn't support vectorized integer operations + constexpr size_t vecSize = (std::is_same_v || std::is_same_v || std::is_same_v || + std::is_same_v) + ? 1 + : nElem; + using vectorType = mscclpp::VectorType; const size_t nVec = size / sizeof(vectorType); const size_t srcOffset4 = srcOffset / sizeof(vectorType); const size_t dstOffset4 = dstOffset / sizeof(vectorType); @@ -500,53 +75,53 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src using AllreduceFunc = std::function*, mscclpp::DeviceHandle*, size_t, size_t, size_t, int, int, int, - size_t, cudaStream_t, void*, uint32_t, int, int)>; + size_t, cudaStream_t, void*, uint32_t, uint32_t, int, int)>; -template
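The vecSize logic added to handleMultiLoadReduceStore above selects the vector width at compile time, falling back to scalar (width 1) accesses for integer element types because the multimem reduction path does not support vectorized integer operations. A stand-alone sketch of that dispatch; the concrete integer type list and the 16-byte vector width are assumptions for illustration, not mscclpp's exact values:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <type_traits>

template <typename T>
constexpr size_t vecWidth() {
  // Multi-element vectors (16-byte accesses) for floating-point types;
  // integers are assumed scalar-only on the multimem path, hence width 1.
  constexpr bool isInteger = std::is_same_v<T, int32_t> ||
                             std::is_same_v<T, uint32_t> ||
                             std::is_same_v<T, int64_t> ||
                             std::is_same_v<T, uint64_t>;
  return isInteger ? 1 : 16 / sizeof(T);
}

int main() {
  printf("float: %zu, double: %zu, int32: %zu\n",
         vecWidth<float>(), vecWidth<double>(), vecWidth<int32_t>());
  return 0;
}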