Merge latest multinode branch

This commit is contained in:
Qinghua Zhou
2026-04-21 18:19:40 +00:00
226 changed files with 14140 additions and 4497 deletions

View File

@@ -0,0 +1,93 @@
trigger:
branches:
include:
- main
- release/*
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
jobs:
- job: CodeCoverageA100
timeoutInMinutes: 40
pool:
name: msccl-ci
variables:
- group: mscclpp
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
gpuArch: '80'
- job: CodeCoverageH100
timeoutInMinutes: 40
pool:
name: msccl-ci-h100
variables:
- group: mscclpp
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
gpuArch: '90'
- job: CodeCoverageMI300X
timeoutInMinutes: 40
pool:
name: msccl-ci-mi300x
variables:
- group: mscclpp
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
platform: rocm
gpuArch: gfx942

View File

@@ -1,114 +0,0 @@
trigger:
branches:
include:
- main
- release/*
paths:
exclude:
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
jobs:
- job: IntegrationTestRocm
displayName: Integration test ROCm
strategy:
matrix:
rocm6.2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
pool:
name: mscclpp-rocm
container:
image: $[ variables['containerImage'] ]
options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallRcclTest
displayName: Install rccl-test
inputs:
targetType: 'inline'
script: |
git clone https://github.com/ROCm/rccl-tests.git
cd rccl-tests
make MPI=1 MPI_HOME=/usr/local/mpi HIP_HOME=/opt/rocm -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallDep
displayName: Install dependencies
inputs:
targetType: 'inline'
script: |
set -e
git clone https://github.com/Azure/msccl-tools.git
cd msccl-tools
pip3 install .
- task: Bash@3
name: GenerateExectionFiles
displayName: Generate execution files
inputs:
targetType: 'inline'
script: |
set -e
git clone https://$(GIT_USER):$(GIT_PAT)@msazure.visualstudio.com/DefaultCollection/One/_git/msccl-users
cd msccl-users
mkdir execution-files
python3 algos/allreduce_mi300_packet.py 8 8 > execution-files/allreduce_mi300_packet.json
python3 algos/allreduce_mi300_sm_mscclpp.py 8 8 > execution-files/allreduce_mi300_sm_mscclpp.json
- task: Bash@3
name: AllReduceTest
displayName: Run mscclpp allReduce test
inputs:
targetType: 'inline'
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/lib/libmscclpp_nccl.so" \
-x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllReduceWithExecutionFileTest
displayName: Run mscclpp allReduce with execution file
inputs:
targetType: 'inline'
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
-x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \
-x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \
-x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \
-b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -41,11 +41,10 @@ jobs:
image: $(containerImage)
steps:
- template: templates/integration-test.yaml
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: IntegrationTestH100
@@ -61,10 +60,9 @@ jobs:
image: $(containerImage)
steps:
- template: templates/integration-test.yaml
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
gpuArch: '90'

View File

@@ -37,33 +37,6 @@ jobs:
image: $[ variables['containerImage'] ]
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: mscclpp-ssh.key
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: Bash@3
displayName: Add HostEntry
inputs:
@@ -77,107 +50,46 @@ jobs:
echo "Entry already exists, nothing to do."
fi
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name mscclit-vmss --resource-group msccl-IT
- template: templates/deploy.yml
parameters:
subscription: msccl-it
vmssName: mscclit-vmss
resourceGroup: msccl-IT
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: templates/run-remote-task.yml
parameters:
name: RunMscclppTest
displayName: Run multi-nodes mscclpp-test
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
- task: Bash@3
name: RunMscclppTest
displayName: Run multi-nodes mscclpp-test
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test'
kill $CHILD_PID
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodeUnitTest
displayName: Run multi-nodes unit tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
- task: Bash@3
name: RunMultiNodeUnitTest
displayName: Run multi-nodes unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut'
kill $CHILD_PID
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonTests
displayName: Run multi-nodes python tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh pytests
- task: Bash@3
name: RunMultiNodePythonTests
displayName: Run multi-nodes python tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests'
kill $CHILD_PID
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
- task: Bash@3
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark'
kill $CHILD_PID
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name mscclit-vmss --resource-group msccl-IT
- template: templates/stop.yml
parameters:
subscription: msccl-it
vmssName: mscclit-vmss
resourceGroup: msccl-IT

View File

@@ -40,11 +40,10 @@ jobs:
image: $(containerImage)
steps:
- template: templates/nccl-test.yaml
- template: templates/nccl-test.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
nvccGencode: "-gencode=arch=compute_80,code=sm_80"
- job: NcclTestH100
@@ -61,9 +60,8 @@ jobs:
image: $(containerImage)
steps:
- template: templates/nccl-test.yaml
- template: templates/nccl-test.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
nvccGencode: "-gencode=arch=compute_90,code=sm_90"

View File

@@ -0,0 +1,47 @@
trigger:
branches:
include:
- main
- release/*
paths:
exclude:
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
jobs:
- job: RcclTestMI300X
displayName: Run MSCCLPP over RCCL Test (MI300X)
pool:
name: msccl-ci-mi300x
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/rccl-test.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
gpuArch: gfx942

View File

@@ -0,0 +1,110 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: ${{ parameters.platform }}
gpuArch: ${{ parameters.gpuArch }}
buildType: Debug
cmakeArgs: '-DMSCCLPP_ENABLE_COVERAGE=ON'
buildDisplayName: 'Build with coverage'
buildName: BuildCoverage
deployArgs: 'single-node-test true ${{ parameters.platform }}'
- template: run-remote-task.yml
parameters:
name: TestsCoverageNonPerf
displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
remoteScript: |
BUILD_PREFIX=$(cat build/BUILD_PREFIX)
STRIP_COUNT=$(echo $BUILD_PREFIX | tr -cd / | wc -c)
export GCOV_PREFIX=/root/mscclpp
export GCOV_PREFIX_STRIP=$STRIP_COUNT
echo "Running unit_tests..."
./build/bin/unit_tests
echo "unit_tests: PASSED"
echo "Running mp_unit_tests -np 2..."
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests
echo "mp_unit_tests -np 2: PASSED"
echo "Running mp_unit_tests -np 4..."
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests
echo "mp_unit_tests -np 4: PASSED"
- template: run-remote-task.yml
parameters:
name: CaptureCoverage
displayName: Capture coverage data with lcov
remoteScript: |
BUILD_PREFIX=$(cat build/BUILD_PREFIX)
GCOV_TOOL_ARG=""
if [ "${{ parameters.platform }}" = "rocm" ]; then
apt-get update -qq && apt-get install -y -qq llvm 2>/dev/null | tail -1
GCOV_WRAPPER=$(mktemp)
printf '#!/bin/sh\nexec llvm-cov gcov "$@"\n' > "$GCOV_WRAPPER"
chmod +x "$GCOV_WRAPPER"
GCOV_TOOL_ARG="--gcov-tool ${GCOV_WRAPPER}"
fi
lcov --version
LCOV_CAPTURE_ARGS=""
if lcov --help 2>&1 | grep -q "inconsistent"; then
LCOV_CAPTURE_ARGS="--ignore-errors inconsistent"
fi
lcov ${GCOV_TOOL_ARG} --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}
if [ ! -s coverage.info ]; then
echo "ERROR: coverage.info was not generated."
exit 1
fi
lcov ${GCOV_TOOL_ARG} --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info
lcov --list coverage.info
ls -la coverage.info
- task: Bash@3
name: FetchCoverage
displayName: Fetch coverage data from remote VM
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
HOST=$(head -1 ${HOSTFILE})
ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
'sudo docker cp mscclpp-test:/root/mscclpp/coverage.info /tmp/coverage.info'
scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: UploadCodecov
displayName: Upload coverage to Codecov
inputs:
targetType: 'inline'
script: |
set -e
curl -Os https://cli.codecov.io/latest/linux/codecov
chmod +x codecov
./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -0,0 +1,151 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: resourceGroup
type: string
default: mscclpp
# Build parameters
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
default: ''
- name: buildType
type: string
default: 'Release'
- name: buildTests
type: string
default: 'true'
- name: cmakeArgs
type: string
default: ''
- name: buildName
type: string
default: 'Build'
- name: buildDisplayName
type: string
default: 'Build'
# Deploy parameters
- name: deployArgs
type: string
default: ''
steps:
# 0. Ensure Azure CLI exists before running AzureCLI@2 tasks.
- task: Bash@3
name: EnsureAzureCLI
displayName: Ensure Azure CLI Installed
inputs:
targetType: inline
script: |
set -e
if command -v az >/dev/null 2>&1; then
az version >/dev/null
exit 0
fi
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
# 1. Build
- task: Bash@3
name: ${{ parameters.buildName }}
displayName: ${{ parameters.buildDisplayName }}
inputs:
targetType: 'inline'
script: |
set -e
rm -rf build
mkdir -p build && cd build
BUILD_TESTS_ARG=""
if [ "${{ parameters.buildTests }}" = "true" ]; then
BUILD_TESTS_ARG="-DMSCCLPP_BUILD_TESTS=ON"
fi
GPU_ARCH_ARG=""
if [ -n "${{ parameters.gpuArch }}" ]; then
GPU_ARCH_ARG="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}"
fi
CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}'
if [ "${{ parameters.platform }}" = "rocm" ]; then
eval CXX=/opt/rocm/bin/hipcc cmake \
-DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
-DMSCCLPP_USE_ROCM=ON \
${BUILD_TESTS_ARG} \
${GPU_ARCH_ARG} \
${CMAKE_EXTRA_ARGS} ..
else
eval cmake \
-DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
-DMSCCLPP_USE_CUDA=ON \
${BUILD_TESTS_ARG} \
${GPU_ARCH_ARG} \
${CMAKE_EXTRA_ARGS} ..
fi
make -j
cd ..
pwd > build/BUILD_PREFIX
echo "=== Build artifacts ==="
ls -la build/bin/ || echo "ERROR: build/bin/ missing after build"
du -sh build/bin/* 2>/dev/null || true
workingDirectory: '$(System.DefaultWorkingDirectory)'
# 2. Write CMake args for pip install on remote VMs
- task: Bash@3
name: WritePipCmakeArgs
displayName: Write pip CMake args
inputs:
targetType: 'inline'
script: |
set -e
PIP_CMAKE_ARGS=""
if [ -n "${{ parameters.gpuArch }}" ]; then
PIP_CMAKE_ARGS="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}"
fi
CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}'
if [ -n "${CMAKE_EXTRA_ARGS}" ]; then
PIP_CMAKE_ARGS="${PIP_CMAKE_ARGS} ${CMAKE_EXTRA_ARGS}"
fi
echo "${PIP_CMAKE_ARGS}" > pip_cmake_args.txt
echo "pip CMake args: $(cat pip_cmake_args.txt)"
workingDirectory: '$(System.DefaultWorkingDirectory)'
# 3. Download SSH key + install packages + start VMSS
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: mscclpp.pem
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}
# 4. Deploy test environment
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: ${{ parameters.deployArgs }}
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -1,242 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
- name: gpuArch
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: inline
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: inline
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllGatherTest
displayName: Run mscclpp AllGather test
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
set -e; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: SendRecvTest
displayName: Run mscclpp SendRecv test
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllReduceTest
displayName: Run mscclpp AllReduce test
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllToAll
displayName: Run mscclpp AllToAll test
inputs:
targetType: 'inline'
script: |
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: CheckPerfNumber
displayName: Check collective primitives performance
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PythonAllReduceBenchmark
displayName: Python Allreduce Benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 -m pip install .; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: FifoPerfBenchmark
displayName: FIFO Performance Benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
./build/bin/perf/fifo_test"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,76 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test'
- template: run-remote-task.yml
parameters:
name: AllGatherTest
displayName: Run mscclpp AllGather test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: SendRecvTest
displayName: Run mscclpp SendRecv test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: AllReduceTest
displayName: Run mscclpp AllReduce test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: AllToAll
displayName: Run mscclpp AllToAll test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: CheckPerfNumber
displayName: Check collective primitives performance
remoteScript: |
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}
- template: run-remote-task.yml
parameters:
name: PythonAllReduceBenchmark
displayName: Python Allreduce Benchmark
remoteScript: |
python3 -m pip install .
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -1,280 +0,0 @@
# .azure-pipelines/templates/nccl-test.yaml
# ----------------------------------------
# A step template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
#
# Parameters:
#   subscription       Azure subscription to use for VMSS start/stop
#   vmssName           name of the VMSS to start/stop
#   sshKeySecureFile   the secureFile name for your SSH key
#   nvccGencode        NVCC -gencode flags used when building NCCL from source
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: sshKeySecureFile
    type: string
  - name: nvccGencode
    type: string
    default: "-gencode=arch=compute_80,code=sm_80"
steps:
  # Check out both this repo and the msccl-users repo (algorithm/plan sources).
  - checkout: self
  - checkout: git://One/msccl-users
  # Build MSCCLPP on the agent; the GPU check is bypassed because the build
  # agent itself has no GPU.
  - task: Bash@3
    name: Build
    displayName: Build
    inputs:
      targetType: 'inline'
      script: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
        make -j
      workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'
  - task: DownloadSecureFile@1
    name: SshKeyFile
    displayName: Download key file
    inputs:
      secureFile: ${{ parameters.sshKeySecureFile }}
  # pssh provides parallel-ssh/parallel-scp used below to drive the GPU nodes.
  - task: Bash@3
    name: InstallPackages
    displayName: Install Packages
    inputs:
      targetType: 'inline'
      script: |
        sudo apt-get update -y
        sudo apt-get install pssh -y
        curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
  - task: AzureCLI@2
    name: StartVMSS
    displayName: Start VMSS
    inputs:
      azureSubscription: ${{ parameters.subscription }}
      scriptType: bash
      scriptLocation: inlineScript
      inlineScript: |
        az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
  - task: Bash@3
    name: DeployTestEnv
    displayName: Deploy Test Env
    inputs:
      targetType: filePath
      filePath: mscclpp/test/deploy/deploy.sh
      arguments: nccltest-single-node
      workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp
  # Copy the msccl-users checkout to every node listed in the CI hostfile.
  - task: Bash@3
    name: CopyMscclUsers
    displayName: Copy msccl-users
    inputs:
      targetType: inline
      script: |
        set -e
        HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
        ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users
        SSH_OPTION="StrictHostKeyChecking=no"
        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
        DST_DIR="/tmp/mscclpp/msccl-users"
        parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
      workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: GenerateExecutionFile
# displayName: Generate execution file
# inputs:
# targetType: 'inline'
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp/msccl-users; \
# mkdir -p execution-files; \
# cd /root/mscclpp/msccl-users; \
# bash algos/mscclpp_a100/generate_execution_plan.sh"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
  # Clone and build NVIDIA nccl-tests (with MPI support) inside the test
  # container on every remote node.
  - task: Bash@3
    name: InstallNcclTests
    displayName: Install NCCL Tests
    inputs:
      targetType: inline
      script: |
        set -e
        HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
        ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
        SSH_OPTION="StrictHostKeyChecking=no"
        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
        parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
          -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
          cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
          cd nccl-tests; \
          MPI=1 MPI_HOME=/usr/local/mpi make -j"'
      workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclAllReduceTest
# displayName: Run NCCL AllReduce Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclAllGatherTest
# displayName: Run NCCL AllGather Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclReduceScatterTest
# displayName: Run NCCL Reduce Scatter Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
  # Build NCCL from source so the fallback tests below can point
  # MSCCLPP_NCCL_LIB_PATH at a real libnccl.so.
  - task: Bash@3
    name: InstallNccl
    displayName: Install NCCL
    inputs:
      targetType: inline
      script: |
        set -e
        HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
        ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
        SSH_OPTION="StrictHostKeyChecking=no"
        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
        parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
          -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
          cd; git clone https://github.com/NVIDIA/nccl.git; \
          cd nccl; \
          make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}"'
      workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunNcclAllGatherFallbaclkToNcclTest
displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunNcclAllReduceFallbaclkToNcclTest
displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunNcclBroadcastFallbaclkToNcclTest
displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclReduceScatterFallbaclkToNcclTest
# displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation
# inputs:
# targetType: 'inline'
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
  # Always deallocate the VMSS, even when earlier steps failed, so GPU
  # capacity is not left running.
  - task: AzureCLI@2
    name: StopVMSS
    displayName: Deallocate VMSS
    condition: always()
    inputs:
      azureSubscription: ${{ parameters.subscription }}
      scriptType: bash
      scriptLocation: inlineScript
      inlineScript: |
        az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,76 @@
# .azure-pipelines/templates/nccl-test.yml
# ----------------------------------------
# A step template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
#
# Parameters:
#   subscription   Azure subscription to use for VMSS start/stop
#   vmssName       name of the VMSS to start/stop
#   nvccGencode    NVCC -gencode flags used when building NCCL from source
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: nvccGencode
    type: string
    default: "-gencode=arch=compute_80,code=sm_80"
steps:
  # Build, start the VMSS, and deploy the test environment to the nodes.
  - template: deploy.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}
      deployArgs: 'nccltest-single-node'
  # Build NVIDIA nccl-tests (with MPI support) on the remote node.
  - template: run-remote-task.yml
    parameters:
      name: InstallNcclTests
      displayName: Install NCCL Tests
      remoteScript: |
        cd
        git clone https://github.com/NVIDIA/nccl-tests.git
        cd nccl-tests
        MPI=1 MPI_HOME=/usr/local/mpi make -j
  # Build the latest tagged NCCL release from source so the fallback tests
  # below can point MSCCLPP_NCCL_LIB_PATH at a real libnccl.so. Fails fast
  # if the GitHub releases API does not yield a tag.
  - template: run-remote-task.yml
    parameters:
      name: InstallNccl
      displayName: Install NCCL
      remoteScript: |
        LATEST_TAG=$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\" -f4)
        if [ -z "$LATEST_TAG" ]; then
          echo "Failed to fetch latest NCCL tag"
          exit 1
        fi
        cd
        git clone --branch $LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git
        cd nccl
        make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}
- template: run-remote-task.yml
parameters:
name: RunNcclAllGatherFallbaclkToNcclTest
displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
parameters:
name: RunNcclAllReduceFallbaclkToNcclTest
displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
parameters:
name: RunNcclBroadcastFallbaclkToNcclTest
displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
  # Deallocate the VMSS (stop.yml runs with condition: always()).
  - template: stop.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}

View File

@@ -0,0 +1,63 @@
# .azure-pipelines/templates/rccl-test.yml
# ------------------------------------------------
# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container.
#
# Parameters:
#   subscription   Azure subscription to use for VMSS start/stop
#   vmssName       VMSS name to start/stop
#   gpuArch        GPU architecture (e.g. gfx942)
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: gpuArch
    type: string
    default: "gfx942"
steps:
  - template: deploy.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}
      platform: rocm
      gpuArch: ${{ parameters.gpuArch }}
      buildTests: false
      deployArgs: 'single-node-test true rocm'
  # rccl-tests lives inside the rocm-systems monorepo; a cone sparse-checkout
  # pulls only that project before building with MPI support.
  - template: run-remote-task.yml
    parameters:
      name: InstallRcclTests
      displayName: Install RCCL Tests
      remoteScript: |
        cd
        git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git
        cd rocm-systems
        git sparse-checkout init --cone
        git sparse-checkout set projects/rccl-tests
        git checkout
        cd projects/rccl-tests
        MPI=1 MPI_HOME=/usr/local/mpi make -j
  # Each perf test runs twice: once preloading the MSCCLPP NCCL shim, once
  # without (plain RCCL) as a baseline.
  - template: run-remote-task.yml
    parameters:
      name: RunRcclAllGatherTest
      displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
      remoteScript: |
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
        mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
  - template: run-remote-task.yml
    parameters:
      name: RunRcclAllReduceTest
      displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib
      remoteScript: |
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
        mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
  # Deallocate the VMSS (stop.yml runs with condition: always()).
  - template: stop.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}

View File

@@ -0,0 +1,27 @@
# Step template wrapping test/deploy/run-remote.sh: executes `remoteScript`
# on the remote test node(s). The script is fed through a quoted heredoc
# ('REMOTE_CMD'), so nothing is expanded by the local shell; the ${{ }}
# substitutions are resolved by Azure Pipelines before the step runs.
#
# Parameters:
#   name              optional step name (omitted when empty)
#   displayName       step display name
#   runRemoteArgs     extra arguments passed to run-remote.sh
#   remoteScript      script body to execute remotely
#   workingDirectory  directory to invoke run-remote.sh from
parameters:
  - name: name
    type: string
    default: ''
  - name: displayName
    type: string
  - name: runRemoteArgs
    type: string
    default: ''
  - name: remoteScript
    type: string
  - name: workingDirectory
    type: string
    default: '$(System.DefaultWorkingDirectory)'
steps:
  - task: Bash@3
    # Only assign a step name when one was provided (empty names are invalid).
    ${{ if ne(parameters.name, '') }}:
      name: ${{ parameters.name }}
    displayName: ${{ parameters.displayName }}
    inputs:
      targetType: 'inline'
      script: |
        test/deploy/run-remote.sh ${{ parameters.runRemoteArgs }} <<'REMOTE_CMD'
        ${{ parameters.remoteScript }}
        REMOTE_CMD
      workingDirectory: ${{ parameters.workingDirectory }}

View File

@@ -0,0 +1,20 @@
# Step template: deallocate the VMSS used by a test job. Runs with
# condition: always() so capacity is released even when earlier steps failed.
#
# Parameters:
#   subscription   Azure subscription to use
#   vmssName       VMSS to deallocate
#   resourceGroup  resource group holding the VMSS (default: mscclpp)
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: resourceGroup
    type: string
    default: mscclpp
steps:
  - task: AzureCLI@2
    name: StopVMSS
    displayName: Deallocate VMSS
    condition: always()
    inputs:
      azureSubscription: ${{ parameters.subscription }}
      scriptType: bash
      scriptLocation: inlineScript
      inlineScript: |
        az vmss deallocate --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}

View File

@@ -0,0 +1,42 @@
# Step template: run the executor test suite on one CUDA/ROCm node.
#
# Parameters:
#   subscription  Azure subscription to use for VMSS start/stop
#   vmssName      VMSS name to start/stop
#   platform      'cuda' (default) or 'rocm'
#   gpuArch       GPU architecture passed through to the build
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: platform
    type: string
    default: 'cuda'
  - name: gpuArch
    type: string
steps:
  - template: deploy.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}
      platform: ${{ parameters.platform }}
      gpuArch: ${{ parameters.gpuArch }}
      deployArgs: 'single-node-test true ${{ parameters.platform }}'
  # Run each bundled execution plan through the executor on 2 ranks.
  # NOTE(review): the nvls plans presumably require NVLS-capable hardware on
  # the target pool — confirm when adding new pools.
  - template: run-remote-task.yml
    parameters:
      name: ExecutorTest
      displayName: Run executor tests
      remoteScript: |
        python3 -m pip install .
        PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans
        TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack_tbg.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_tbg.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack_tbg.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls_pipeline.json --size 32M --in_place
  - template: stop.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}

View File

@@ -1,89 +0,0 @@
# Step template (legacy): build MSCCLPP on the agent, start the VMSS, and
# deploy the test environment before running the remote pytests below.
#
# Parameters:
#   subscription       Azure subscription to use for VMSS start/stop
#   vmssName           VMSS name to start/stop
#   sshKeySecureFile   secureFile name of the SSH key for the nodes
#   gpuArch            GPU architecture passed to CMake
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: sshKeySecureFile
    type: string
  - name: gpuArch
    type: string
steps:
  # GPU check is bypassed: the build agent itself has no GPU.
  - task: Bash@3
    name: Build
    displayName: Build
    inputs:
      targetType: 'inline'
      script: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
        make -j
      workingDirectory: '$(System.DefaultWorkingDirectory)'
  - task: DownloadSecureFile@1
    name: SshKeyFile
    displayName: Download key file
    inputs:
      secureFile: ${{ parameters.sshKeySecureFile }}
  # pssh provides parallel-ssh used below to reach the GPU nodes.
  - task: Bash@3
    name: InstallPackages
    displayName: Install Packages
    inputs:
      targetType: 'inline'
      script: |
        sudo apt-get update -y
        sudo apt-get install pssh -y
        curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
  - task: AzureCLI@2
    name: StartVMSS
    displayName: Start VMSS
    inputs:
      azureSubscription: ${{ parameters.subscription }}
      scriptType: bash
      scriptLocation: inlineScript
      inlineScript: |
        az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
  - task: Bash@3
    name: DeployTestEnv
    displayName: Deploy Test Env
    inputs:
      targetType: filePath
      filePath: test/deploy/deploy.sh
      arguments: single-node-test false
      workingDirectory: $(System.DefaultWorkingDirectory)
- task: Bash@3
name: PyTests
displayName: Run pytests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py::test_executor -x"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
  # Always deallocate the VMSS, even when earlier steps failed, so GPU
  # capacity is not left running.
  - task: AzureCLI@2
    name: StopVMSS
    displayName: Deallocate VMSS
    condition: always()
    inputs:
      azureSubscription: ${{ parameters.subscription }}
      scriptType: bash
      scriptLocation: inlineScript
      inlineScript: |
        az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,95 @@
# Step template: run unit tests and pytests twice — first with a build that
# has InfiniBand disabled, then with an IB-enabled rebuild.
#
# Parameters:
#   subscription  Azure subscription to use for VMSS start/stop
#   vmssName      VMSS name to start/stop
#   gpuArch       GPU architecture passed to CMake
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: gpuArch
    type: string
steps:
  # First pass: build with InfiniBand support explicitly disabled.
  - template: deploy.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}
      gpuArch: ${{ parameters.gpuArch }}
      cmakeArgs: '-DMSCCLPP_USE_IB=OFF'
      deployArgs: 'single-node-test false'
  - template: run-remote-task.yml
    parameters:
      name: UnitTests
      displayName: Run mscclpp unit tests
      remoteScript: |
        ./build/bin/unit_tests
  # Multi-process unit tests at three rank counts.
  - template: run-remote-task.yml
    parameters:
      name: MpUnitTests
      displayName: Run mscclpp multi-process unit tests
      remoteScript: |
        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
        mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
  - template: run-remote-task.yml
    parameters:
      name: PyTests
      displayName: Run pytests
      remoteScript: |
        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
  # Tear down the test container so the IB-enabled build can be redeployed
  # cleanly; `|| true` keeps this best-effort when no container exists.
  - template: run-remote-task.yml
    parameters:
      name: StopContainer
      displayName: Stop existing container
      runRemoteArgs: '--no-docker --no-log'
      remoteScript: |
        sudo docker stop mscclpp-test || true
        sudo docker rm mscclpp-test || true
  - task: Bash@3
    displayName: Remove generated SSH key files
    inputs:
      targetType: 'inline'
      script: |
        rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
      workingDirectory: '$(System.DefaultWorkingDirectory)'
  # Second pass: rebuild without -DMSCCLPP_USE_IB=OFF.
  # NOTE(review): no explicit -DMSCCLPP_USE_IB=ON is passed; presumably IB is
  # the CMake default — confirm against the project's CMakeLists.
  - task: Bash@3
    name: BuildWithIb
    displayName: Rebuild with IB
    inputs:
      targetType: 'inline'
      script: |
        set -e
        rm -rf build
        mkdir -p build && cd build
        cmake \
          -DCMAKE_BUILD_TYPE=Release \
          -DMSCCLPP_BYPASS_GPU_CHECK=ON \
          -DMSCCLPP_USE_CUDA=ON \
          -DMSCCLPP_BUILD_TESTS=ON \
          -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
        make -j
      workingDirectory: '$(System.DefaultWorkingDirectory)'
  - task: Bash@3
    name: DeployTestEnvWithIb
    displayName: Deploy Test Env (with IB build)
    inputs:
      targetType: filePath
      filePath: test/deploy/deploy.sh
      arguments: single-node-test false
      workingDirectory: $(System.DefaultWorkingDirectory)
  # Re-run the pytests against the IB build, still skipping IB-specific tests.
  - template: run-remote-task.yml
    parameters:
      name: PyTestsWithIbBuildDisableIb
      displayName: Run pytests (IB build, IB tests disabled)
      remoteScript: |
        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
  # Deallocate the VMSS (stop.yml runs with condition: always()).
  - template: stop.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}

View File

@@ -1,145 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
steps:
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: inline
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
set -e; \
cd /root/mscclpp; \
mkdir -p build && cd build; \
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
make -j"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PyTests
displayName: Run pytests
inputs:
targetType: 'inline'
script: |
# set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,57 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
deployArgs: 'single-node-test'
- template: run-remote-task.yml
parameters:
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
remoteScript: |
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter="ExecutorTest.TwoNodesAllreduce"
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json
- template: run-remote-task.yml
parameters:
name: PyTests
displayName: Run pytests
remoteScript: |
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_UNPACK_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -1,135 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: UnitTests
displayName: Run mscclpp unit tests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
./build/bin/unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PyTests
displayName: Run pytests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,49 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: ${{ parameters.platform }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test true ${{ parameters.platform }}'
- template: run-remote-task.yml
parameters:
name: UnitTests
displayName: Run mscclpp unit tests
remoteScript: |
./build/bin/unit_tests
- template: run-remote-task.yml
parameters:
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
remoteScript: |
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- template: run-remote-task.yml
parameters:
name: PyTests
displayName: Run pytests
remoteScript: |
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_fp8_accum.py -x
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -37,17 +37,16 @@ jobs:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut.yaml
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: UnitTestWithNpKitA100
@@ -59,17 +58,16 @@ jobs:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-npkit.yaml
- template: templates/ut-npkit.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: UnitTestH100
@@ -79,17 +77,16 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut.yaml
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestWithNpKitH100
@@ -99,21 +96,20 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-npkit.yaml
- template: templates/ut-npkit.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestNoIBEnv
timeoutInMinutes: 40
timeoutInMinutes: 60
displayName: Test No IB Environment
pool:
name: msccl-ci-h100
@@ -121,15 +117,55 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-no-ib-env.yaml
- template: templates/ut-no-ib-env.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestMI300X
timeoutInMinutes: 40
pool:
name: msccl-ci-mi300x
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
platform: rocm
gpuArch: gfx942
- job: UnitTestExecutor
timeoutInMinutes: 60
displayName: Test DSL Executor
pool:
name: msccl-ci-h100
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-executor.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
gpuArch: '90'

24
.codecov.yml Normal file
View File

@@ -0,0 +1,24 @@
codecov:
require_ci_to_pass: yes
coverage:
status:
project:
default:
target: 68%
threshold: 1%
patch:
default:
target: 80%
flag_management:
default_rules:
carryforward: true
ignore:
- "test/"
- "examples/"
- "python/"
- "tools/"
- "docs/"
- "docker/"

View File

@@ -25,7 +25,7 @@ For C/C++/CUDA source code:
```
## Formatting
If you have modified any code in the project, run `./tools/lint.sh` to automatically format the entire source code before finishing iterations. Note that this script formats only staged files.
If you have modified any code in the project, run `./tools/lint.sh` to automatically format the entire source code before finishing iterations. Note that this script formats only files that are tracked by git, so if you have added new files, make sure to `git add` them first.
## Building and Testing
The following commands are commonly used for building and testing the project. See `docs/quickstart.md` for more detailed instructions.
@@ -40,10 +40,10 @@ cd ..
For testing after successful build:
```bash
# To run all tests
# To run tests with two GPUs - two is enough for most tests
mpirun -np 2 ./build/bin/mp_unit_tests
# To run tests excluding IB-related ones (when IB is not available)
mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib*
mpirun -np 2 ./build/bin/mp_unit_tests --filter=-*Ib*
```
For building a Python package:
@@ -51,6 +51,12 @@ For building a Python package:
python3 -m pip install -e .
```
For Python tests after building the package:
```bash
# Run tests with 8 GPUs - adjust the number as needed
mpirun -np 8 python3 -m pytest ./python/test/test_mscclpp.py -vx
```
For building documentation (see dependencies in `docs/requirements.txt`):
```bash
cd docs

View File

@@ -40,7 +40,7 @@ jobs:
fail-fast: false
matrix:
language: [ 'cpp', 'python' ]
version: [ 'cuda11.8', 'cuda12.8' ]
version: [ 'cuda11.8', 'cuda12.9' ]
steps:
- name: Checkout repository
@@ -51,7 +51,7 @@ jobs:
df -h
- name: Initialize CodeQL
uses: github/codeql-action/init@v3
uses: github/codeql-action/init@v4
with:
languages: ${{ matrix.language }}
@@ -62,11 +62,11 @@ jobs:
- name: Build
run: |
rm -rf build && mkdir build && cd build
cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
make -j
cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=OFF ..
make -j4
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v3
uses: github/codeql-action/analyze@v4
with:
category: "/language:${{matrix.language}}/version:${{matrix.version}}"
@@ -96,7 +96,7 @@ jobs:
df -h
- name: Initialize CodeQL
uses: github/codeql-action/init@v3
uses: github/codeql-action/init@v4
with:
languages: ${{ matrix.language }}
@@ -107,10 +107,10 @@ jobs:
- name: Build
run: |
rm -rf build && mkdir build && cd build
CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON ..
make -j
CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=OFF ..
make -j4
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v3
uses: github/codeql-action/analyze@v4
with:
category: "/language:${{matrix.language}}/version:${{matrix.version}}"

View File

@@ -1,69 +0,0 @@
name: IntegrationTest
on: workflow_dispatch
jobs:
IntegrationTest:
runs-on: [ self-hosted, A100 ]
defaults:
run:
shell: bash
strategy:
matrix:
cuda: [ cuda11.8, cuda12.2 ]
container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
- name: Lock GPU clock frequency
run: |
sudo nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
- name: Run mscclpp AllGather test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
- name: Run mscclpp SendRecv test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- name: Run mscclpp AllReduce test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
- name: Run mscclpp AllToAll test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
- name: Check collective primitives performance
run: |
set -e
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl

View File

@@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
version: [ 'cuda11.8', 'cuda12.8' ]
version: [ 'cuda11.8', 'cuda12.9' ]
steps:
- uses: actions/checkout@v4

View File

@@ -1,52 +0,0 @@
name: UnitTest
on: workflow_dispatch
jobs:
UnitTest:
runs-on: [ self-hosted, A100 ]
defaults:
run:
shell: bash
timeout-minutes: 30
strategy:
matrix:
cuda: [ cuda11.8, cuda12.2 ]
container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
working-directory: ${{ github.workspace }}
- name: LockGPUClock
run: |
sudo nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
- name: UnitTests
run: |
./build/bin/unit_tests
- name: MpUnitTests
run: |
set -e
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- name: PyTests
run: |
set -e
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x

7
.gitignore vendored
View File

@@ -1,10 +1,9 @@
.vscode/
.hypothesis/
build/
dist/
build_coverage/
__pycache__
.*.swp
.idea/
*.so
.pytest_cache/
_codeql_detected_source_root
docs/_static/versions.js
_codeql_detected_source_root

View File

@@ -1,5 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.25)
project(mscclpp LANGUAGES CXX)
@@ -47,7 +47,7 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
# Options
option(MSCCLPP_ENABLE_TRACE "Enable tracing" OFF)
option(MSCCLPP_BUILD_TESTS "Build tests" ON)
option(MSCCLPP_BUILD_TESTS "Build tests" OFF)
option(MSCCLPP_BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
option(MSCCLPP_BUILD_EXT_NCCL "Build NCCL interfaces" ON)
option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON)
@@ -56,6 +56,8 @@ option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF)
option(MSCCLPP_DISABLE_NB_LEAK_WARNINGS "Disable Nanobind leak warnings" ON)
set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).")
if(MSCCLPP_BYPASS_GPU_CHECK)
@@ -98,6 +100,62 @@ else()
message(FATAL_ERROR "No compatible GPU found. Set MSCCLPP_USE_CUDA or MSCCLPP_USE_ROCM to ON.")
endif()
endif()
# Code coverage setup
if(MSCCLPP_ENABLE_COVERAGE)
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
message(WARNING "Code coverage results with an optimized (non-Debug) build may be misleading")
endif()
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
message(STATUS "Code coverage enabled")
# Add coverage flags to C++ targets only (not CUDA)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:--coverage>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-O0>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-g>)
add_link_options($<$<LINK_LANGUAGE:CXX>:--coverage>)
# Find lcov
find_program(LCOV_PATH lcov)
if(NOT LCOV_PATH)
message(WARNING "lcov not found. Install lcov to generate coverage reports.")
endif()
if(LCOV_PATH)
# Add coverage target
add_custom_target(coverage
COMMAND ${CMAKE_COMMAND} -E echo "Removing old coverage data..."
COMMAND ${LCOV_PATH} --directory . --zerocounters
COMMAND ${CMAKE_COMMAND} -E echo "Running tests..."
COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure
COMMAND ${CMAKE_COMMAND} -E echo "Collecting coverage data..."
COMMAND ${LCOV_PATH} --directory . --capture --output-file coverage.info
COMMAND ${CMAKE_COMMAND} -E echo "Filtering coverage data..."
COMMAND ${LCOV_PATH} --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info
COMMAND ${CMAKE_COMMAND} -E echo "Coverage report generated in coverage.info"
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
COMMENT "Generating code coverage report"
)
# Add coverage clean target
add_custom_target(coverage-clean
COMMAND ${CMAKE_COMMAND} -E remove coverage.info
COMMAND ${LCOV_PATH} --directory . --zerocounters
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
COMMENT "Cleaning coverage data"
)
endif()
else()
message(WARNING "Code coverage is only supported with GCC or Clang compilers")
endif()
endif()
if(MSCCLPP_GPU_ARCHS)
string(STRIP "${MSCCLPP_GPU_ARCHS}" MSCCLPP_GPU_ARCHS)
string(REPLACE " " ";" MSCCLPP_GPU_ARCHS "${MSCCLPP_GPU_ARCHS}")
@@ -166,12 +224,35 @@ if(MSCCLPP_USE_IB)
if(NOT IBVERBS_FOUND)
message(FATAL_ERROR "IBVerbs not found. Install libibverbs-dev or rdma-core-devel. If you want to disable InfiniBand, add `-DMSCCLPP_USE_IB=OFF` in your cmake command.")
endif()
find_package(MLX5)
if(MLX5_FOUND)
message(STATUS "MLX5 Direct Verbs found: ${MLX5_LIBRARIES}")
else()
message(STATUS "MLX5 Direct Verbs not found, mlx5dv optimizations disabled")
endif()
endif()
find_package(NUMA REQUIRED)
find_package(Threads REQUIRED)
option(MSCCLPP_USE_GDRCOPY "Use GDRCopy for direct GPU memory access from host." ON)
if(MSCCLPP_USE_ROCM)
set(MSCCLPP_USE_GDRCOPY OFF)
endif()
if(MSCCLPP_USE_GDRCOPY)
find_package(GDRCopy)
if(NOT GDRCOPY_FOUND)
message(STATUS "GDRCopy not found, disabling GDRCopy support")
set(MSCCLPP_USE_GDRCOPY OFF)
else()
message(STATUS "GDRCopy found: ${GDRCOPY_LIBRARIES}")
endif()
endif()
include(FetchContent)
FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
FetchContent_Declare(json
GIT_REPOSITORY https://github.com/nlohmann/json.git
GIT_TAG v3.12.0
)
FetchContent_MakeAvailable(json)
if("${INSTALL_PREFIX}" STREQUAL "")

View File

@@ -3,13 +3,16 @@
[![Latest Release](https://img.shields.io/github/release/microsoft/mscclpp.svg)](https://github.com/microsoft/mscclpp/releases/latest)
[![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE)
[![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml)
[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yaml/badge.svg)](https://microsoft.github.io/mscclpp/)
[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yml/badge.svg)](https://microsoft.github.io/mscclpp/)
[![codecov](https://codecov.io/gh/microsoft/mscclpp/graph/badge.svg?token=DAV9DGHAY2)](https://codecov.io/gh/microsoft/mscclpp)
| Testing Pipelines | Build Status |
|--------------------------|-------------------|
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| Integration Tests (ROCm) | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test-rocm?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=399295&branchName=main) |
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestH100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestMI300X)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main&jobName=Integration%20test%20H100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| NCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?repoName=microsoft%2Fmscclpp&branchName=main&jobName=Run%20MSCCLPP%20over%20NCCL%20Test%20(H100))](https://msazure.visualstudio.com/One/_build/latest?definitionId=320665&repoName=microsoft%2Fmscclpp&branchName=main) |
| RCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main&jobName=Run%20MSCCLPP%20over%20RCCL%20Test%20(MI300X))](https://msazure.visualstudio.com/One/_build/latest?definitionId=448013&branchName=main) |
A GPU-driven communication stack for scalable AI applications.

View File

@@ -1 +1 @@
0.8.0
0.9.0

50
cmake/FindGDRCopy.cmake Normal file
View File

@@ -0,0 +1,50 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Find the GDRCopy libraries (>= 2.5 required for gdr_pin_buffer_v2 / GDR_PIN_FLAG_FORCE_PCIE)
#
# The following variables are optionally searched for defaults
# GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found
# GDRCOPY_INCLUDE_DIR: Directory where GDRCopy headers are found
# GDRCOPY_LIB_DIR: Directory where GDRCopy libraries are found
# The following are set after configuration is done:
# GDRCOPY_FOUND
# GDRCOPY_INCLUDE_DIRS
# GDRCOPY_LIBRARIES
find_path(GDRCOPY_INCLUDE_DIRS
NAMES gdrapi.h
HINTS
${GDRCOPY_INCLUDE_DIR}
${GDRCOPY_ROOT_DIR}
${GDRCOPY_ROOT_DIR}/include
/usr/local/include
/usr/include)
find_library(GDRCOPY_LIBRARIES
NAMES gdrapi
HINTS
${GDRCOPY_LIB_DIR}
${GDRCOPY_ROOT_DIR}
${GDRCOPY_ROOT_DIR}/lib
/usr/local/lib
/usr/lib
/usr/lib/x86_64-linux-gnu)
if(GDRCOPY_INCLUDE_DIRS)
include(CheckSymbolExists)
set(CMAKE_REQUIRED_INCLUDES ${GDRCOPY_INCLUDE_DIRS})
set(CMAKE_REQUIRED_LIBRARIES ${GDRCOPY_LIBRARIES})
check_symbol_exists(gdr_pin_buffer_v2 "gdrapi.h" GDRCOPY_HAS_PIN_BUFFER_V2)
unset(CMAKE_REQUIRED_LIBRARIES)
unset(CMAKE_REQUIRED_INCLUDES)
if(NOT GDRCOPY_HAS_PIN_BUFFER_V2)
message(STATUS "GDRCopy found but too old (gdr_pin_buffer_v2 not available). Requires >= 2.5.")
set(GDRCOPY_INCLUDE_DIRS GDRCOPY_INCLUDE_DIRS-NOTFOUND)
endif()
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
mark_as_advanced(GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)

38
cmake/FindMLX5.cmake Normal file
View File

@@ -0,0 +1,38 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Find the MLX5 Direct Verbs (mlx5dv) library
#
# The following variables are optionally searched for defaults
# MLX5_ROOT_DIR: Base directory where all MLX5 components are found
# MLX5_INCLUDE_DIR: Directory where MLX5 headers are found
# MLX5_LIB_DIR: Directory where MLX5 libraries are found
# The following are set after configuration is done:
# MLX5_FOUND
# MLX5_INCLUDE_DIRS
# MLX5_LIBRARIES
find_path(MLX5_INCLUDE_DIRS
NAMES infiniband/mlx5dv.h
HINTS
${MLX5_INCLUDE_DIR}
${MLX5_ROOT_DIR}
${MLX5_ROOT_DIR}/include
/usr/local/include
/usr/include)
find_library(MLX5_LIBRARIES
NAMES mlx5
HINTS
${MLX5_LIB_DIR}
${MLX5_ROOT_DIR}
${MLX5_ROOT_DIR}/lib
/usr/local/lib
/usr/lib
/usr/lib/x86_64-linux-gnu)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MLX5 DEFAULT_MSG MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
mark_as_advanced(MLX5_INCLUDE_DIRS MLX5_LIBRARIES)

View File

@@ -7,13 +7,38 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
RUN apt-get update && \
apt-get install -y --no-install-recommends \
htop \
lcov \
vim \
&& \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*
# Install lcov 2.2
RUN LCOV_VERSION="2.2" && \
apt-get update && \
apt-get install -y --no-install-recommends \
cpanminus \
gcc \
make \
perl \
&& \
cpanm --notest \
Capture::Tiny \
DateTime \
JSON::XS \
Memory::Process \
TimeDate \
&& \
cd /tmp && \
curl -L https://github.com/linux-test-project/lcov/releases/download/v${LCOV_VERSION}/lcov-${LCOV_VERSION}.tar.gz -o lcov.tar.gz && \
tar xzf lcov.tar.gz && \
cd lcov-${LCOV_VERSION} && \
make install && \
cd / && rm -rf /tmp/lcov* && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*
# Install CMake 3.26.4
RUN OS_ARCH=$(uname -m) && \
CMAKE_VERSION="3.26.4" && \
@@ -24,6 +49,33 @@ RUN OS_ARCH=$(uname -m) && \
rm -rf ${CMAKE_HOME}.tar.gz && \
ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/
# Install GDRCopy userspace library for CUDA targets
ARG TARGET="cuda13.0"
RUN if echo "$TARGET" | grep -q "^cuda"; then \
GDRCOPY_VERSION="2.5.2" && \
apt-get update -y && \
apt-get install -y --no-install-recommends devscripts debhelper fakeroot pkg-config dkms && \
cd /tmp && \
curl -L https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -o gdrcopy.tar.gz && \
tar xzf gdrcopy.tar.gz && \
cd gdrcopy-${GDRCOPY_VERSION}/packages && \
./build-deb-packages.sh -k -t && \
dpkg -i libgdrapi_*.deb && \
cd / && rm -rf /tmp/gdrcopy* && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*; \
fi
# Install ROCm-specific packages if building for ROCm
RUN if echo "$TARGET" | grep -q "^rocm"; then \
apt-get update -y && \
apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*; \
fi
# Create Python venv
RUN python3 -m venv /root/venv && \
echo 'source /root/venv/bin/activate' >> /root/.bashrc
@@ -32,10 +84,13 @@ ENV PATH="/root/venv/bin:${PATH}"
# Install Python dependencies
ADD . /tmp/mscclpp
WORKDIR /tmp/mscclpp
ARG TARGET="cuda13.0"
RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
if echo "$TARGET" | grep -q "^rocm"; then \
export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \
fi && \
pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r python/requirements_${target_type}.txt
pip install --no-cache-dir -r python/requirements_${target_type}.txt && \
pip install --no-cache-dir coverage xlsxwriter
# Cleanup
RUN rm -rf /tmp/mscclpp

View File

@@ -1,19 +0,0 @@
ARG BASE_IMAGE
FROM ${BASE_IMAGE}
LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
ENV DEBIAN_FRONTEND=noninteractive
ENV RCCL_VERSION=rocm-6.2.0
ARG GPU_ARCH=gfx942
ENV ARCH_TARGET=${GPU_ARCH}
RUN cd /tmp && \
git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \
cd rccl && \
./install.sh --prefix=/opt/rocm --amdgpu_targets ${ARCH_TARGET} && \
cd .. && \
rm -rf /tmp/rccl
WORKDIR /

View File

@@ -4,38 +4,39 @@ set -e
declare -A baseImageTable
baseImageTable=(
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu22.04"
["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu24.04"
["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04"
["rocm6.2"]="rocm/rocm-terminal:6.2.1"
["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2"
)
declare -A extraLdPathTable
extraLdPathTable=(
["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
["cuda11.8"]="/usr/local/cuda-11.8/compat"
["cuda12.4"]="/usr/local/cuda-12.4/compat"
["cuda12.8"]="/usr/local/cuda-12.8/compat"
["cuda12.9"]="/usr/local/cuda-12.9/compat"
["cuda13.0"]="/usr/local/cuda-13.0/compat"
["rocm6.2"]="/opt/rocm/lib"
)
declare -A ofedVersionTable
ofedVersionTable=(
["cuda11.8"]="23.07-0.5.1.2"
["cuda12.4"]="23.07-0.5.1.2"
["cuda12.8"]="24.10-1.1.4.0"
["cuda12.9"]="24.10-1.1.4.0"
["cuda13.0"]="24.10-3.2.5.0"
["rocm6.2"]="24.10-1.1.4.0"
)
TARGET=${1}
OS_ARCH=$(uname -m)
print_usage() {
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
echo "Usage: $0 [cuda11.8|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
}
if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
@@ -68,18 +69,11 @@ docker build -t ${TAG_TMP} \
if [[ ${TARGET} == rocm* ]]; then
echo "Building ROCm base image..."
docker build -t ${TAG_BASE} \
-f docker/base-x-rocm.dockerfile \
--build-arg BASE_IMAGE=${TAG_TMP} \
--build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
--build-arg TARGET=${TARGET} \
--build-arg GPU_ARCH="gfx942" .
docker rmi ${TAG_TMP}
else
echo "Building CUDA base image..."
docker tag ${TAG_TMP} ${TAG_BASE}
docker rmi --no-prune ${TAG_TMP}
fi
docker tag ${TAG_TMP} ${TAG_BASE}
docker rmi --no-prune ${TAG_TMP}
docker build -t ${TAG_BASE_DEV} \
-f docker/base-dev-x.dockerfile \

View File

@@ -5,7 +5,7 @@
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SPHINXMULTIVERSION ?= sphinx-multiversion
SPHINXMULTIVERSION ?= python3 build_multiversion.py
SOURCEDIR = .
BUILDDIR = _build

View File

@@ -26,27 +26,53 @@
* @returns {string} The base path (e.g., '/mscclpp' or '')
*/
function detectBasePath() {
const path = window.location.pathname;
// Match pattern: /base-path/vX.Y.Z/... or /base-path/main/...
// The base path is everything before the version or main directory
const match = path.match(/^(\/[^\/]+)?(?=\/(v\d+\.\d+\.\d+|main)\/)/);
if (match && match[1]) {
return match[1];
}
// Check if we're at a root that's actually a project site
// Look for common indicators like the repository name in the path
const projectMatch = path.match(/^(\/[^\/]+)(?=\/)/);
if (projectMatch) {
// Verify this isn't a version path at root
const potentialBase = projectMatch[1];
if (!potentialBase.match(/^\/v\d+\.\d+\.\d+$/) && potentialBase !== '/main') {
// Check if the remaining path contains version info
const remainingPath = path.substring(potentialBase.length);
if (remainingPath.match(/^\/(v\d+\.\d+\.\d+|main)\//)) {
return potentialBase;
// Most reliable method: detect from this script's own URL
// The script is always at {base}/_static/version-selector.js or {base}/vX.Y.Z/_static/version-selector.js
const scripts = document.getElementsByTagName('script');
for (let i = 0; i < scripts.length; i++) {
const src = scripts[i].src;
if (src && (src.includes('/_static/version-selector.js') || src.endsWith('version-selector.js'))) {
try {
const url = new URL(src);
const scriptPath = url.pathname;
// Extract base path: everything before /_static/version-selector.js
// But also strip version directories like /v0.8.0/ or /main/
const match = scriptPath.match(/^(.*?)\/_static\/version-selector\.js$/);
if (match) {
let basePath = match[1] || '';
// Remove version suffix if present (e.g., /mscclpp/v0.8.0 -> /mscclpp)
basePath = basePath.replace(/\/(v\d+\.\d+\.\d+|main)$/, '');
return basePath;
}
} catch (e) {
// URL parsing failed, continue to fallback
// Log a warning to aid debugging when the primary detection method fails.
if (typeof console !== 'undefined' && typeof console.warn === 'function') {
console.warn('version-selector: Failed to parse script URL for base path detection; falling back to location-based detection.', src, e);
}
}
}
}
// Fallback: try to detect from URL path
const path = window.location.pathname;
const segments = path.split('/').filter(s => s.length > 0);
if (segments.length >= 1) {
const firstSegment = segments[0];
// If first segment is not a version tag (vX.Y.Z), not 'main', and
// does not look like a file name (no '.' in the segment), then it's
// the GitHub Pages project base path (e.g., 'mscclpp').
// This handles both:
// /mscclpp/v0.8.0/index.html -> base is /mscclpp
// /mscclpp/index.html -> base is /mscclpp
// while avoiding treating root files like /index.html as a base path.
if (!firstSegment.match(/^v\d+\.\d+\.\d+$/) && firstSegment !== 'main' && !firstSegment.includes('.')) {
return '/' + firstSegment;
}
}
// No base path (root site or local development)
return '';
}

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Wrapper around sphinx-multiversion that patches copy_tree to generate
_version.py in each tag checkout. This is needed because setuptools_scm
generates _version.py at build time, but sphinx-multiversion uses
`git archive` which only contains committed files.
Usage (called by Makefile):
python3 build_multiversion.py <sourcedir> <outputdir> [sphinx-opts...]
"""
import os
import re
import subprocess
import sys
import sphinx_multiversion.git as smv_git
from sphinx_multiversion import main as smv_main
# Save the original copy_tree
_original_copy_tree = smv_git.copy_tree
def _patched_copy_tree(gitroot, src, dst, reference, sourcepath="."):
"""Call original copy_tree, then generate _version.py from the VERSION file."""
_original_copy_tree(gitroot, src, dst, reference, sourcepath)
# Extract version from the tag name (e.g., "v0.9.0" -> "0.9.0")
refname = getattr(reference, "refname", "") or ""
match = re.search(r"v(\d+\.\d+\.\d+)", refname)
if not match:
return
version = match.group(1)
version_py_dir = os.path.join(dst, "python", "mscclpp")
if os.path.isdir(version_py_dir):
version_py = os.path.join(version_py_dir, "_version.py")
if not os.path.exists(version_py):
with open(version_py, "w") as f:
f.write(f'__version__ = "{version}"\n')
# Monkey-patch
smv_git.copy_tree = _patched_copy_tree
if __name__ == "__main__":
sys.exit(smv_main(sys.argv[1:]))

View File

@@ -11,6 +11,18 @@
import sys
import importlib.util
from pathlib import Path
from unittest.mock import MagicMock
class NamedMock(MagicMock):
def __getattr__(self, name):
attr = super().__getattr__(name)
if isinstance(attr, MagicMock):
# Assigns __name__ and __qualname__ to satisfy Sphinx autodoc inspection.
attr.__name__ = name
attr.__qualname__ = name
return attr
# Add the python package to sys.path so Sphinx can find it
project_root = Path(__file__).parent.parent
@@ -63,7 +75,7 @@ autodoc_default_options = {
"show-inheritance": True,
}
# only mock the C-extension when using the source tree
autodoc_mock_imports = ["mscclpp._version", "mscclpp._mscclpp", "blake3", "cupy", "mpi4py", "numpy", "sortedcontainers"]
autodoc_mock_imports = ["mscclpp._version", "blake3", "cupy", "mpi4py", "numpy", "sortedcontainers"]
autodoc_typehints = "description"
napoleon_google_docstring = True
napoleon_numpy_docstring = True
@@ -71,6 +83,10 @@ intersphinx_mapping = {
"python": ("https://docs.python.org/3", None),
"numpy": ("https://numpy.org/doc/stable/", None),
}
mock_mscclpp = NamedMock()
# Set attributes to satisfy Sphinx autodoc inspection.
mock_mscclpp.env.return_value.cache_dir = "_mscclpp"
sys.modules["mscclpp._mscclpp"] = mock_mscclpp
templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

View File

@@ -12,6 +12,10 @@ After finishing the installation in the quick start section, you can add the fol
python3 -m mscclpp --install
```
This installs bundled default execution plans into `~/.cache/mscclpp/default` by default.
If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed into `MSCCLPP_CACHE_DIR/default`.
`MSCCLPP_CACHE_DIR` specifies the cache root directory, so it should be set without `default` in the path.
## Your First Algorithm: AllGather
Let's walk through a simple AllGather algorithm to understand the DSL basics. This example demonstrates the key concepts without diving into all the advanced features.

View File

@@ -56,9 +56,12 @@ python3 -m mscclpp --install
After installation, the generated JSON execution plan can be found at:
```
~/.cache/mscclpp_default/
~/.cache/mscclpp/default/
```
If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed under `MSCCLPP_CACHE_DIR/default/`.
`MSCCLPP_CACHE_DIR` specifies the cache root directory, so it should be set without `default` in the path.
**Performance Results:**
The figure below shows the performance characteristics for small message sizes in a two-node configuration:

View File

@@ -129,7 +129,7 @@ class CustomizedComm:
self._algo_large = [
algo for algo in algorithms
if algo.collective == "allreduce"
and algo.name == "default_allreduce_nvls_with_copy"
and algo.name == "default_allreduce_nvls_warp_pipeline"
][0]
def all_reduce(self, tensor: torch.Tensor, stream=None):
@@ -332,7 +332,8 @@ public:
size_t inputSize, size_t outputSize,
mscclpp::DataType dtype, mscclpp::ReduceOp op,
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>& extras) {
const std::unordered_map<std::string, uintptr_t>& extras,
[[maybe_unused]] mscclpp::DataType accumDtype) {
return self->kernelFunc(ctx, input, output, inputSize, dtype, stream);
},
// Context initialization function
@@ -343,8 +344,8 @@ public:
},
// Context key generation function
[self](const void* input, void* output,
size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->generateContextKey(input, output, inputSize, outputSize, dtype);
size_t inputSize, size_t outputSize, mscclpp::DataType dtype, bool symmetricMemory) {
return self->generateContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory);
}
);
}
@@ -468,3 +469,196 @@ stream_handle = torch.cuda.current_stream().cuda_stream
All examples are in [`examples/torch-integration/`](../../examples/torch-integration/).
---
## Performance Tuning
The default algorithms use a fixed heuristic to select algorithms based on message size. For production workloads, you can achieve significantly better performance by **auto-tuning** — benchmarking every candidate algorithm, block count, and thread count for each message size at startup, then using the fastest configuration at runtime.
**Full example:** [customized_comm_with_tuning.py](../../examples/torch-integration/customized_comm_with_tuning.py)
### How It Works
1. **Candidate selection** — For each power-of-two message size from 1 KB to 128 MB, the tuner picks the applicable algorithms:
- All sizes (when NVLS is supported): `default_allreduce_nvls_zero_copy`
- Small messages (≤ 4 MB): `default_allreduce_nvls_packet`, `default_allreduce_packet`
- Large messages (≥ 512 KB): `default_allreduce_rsag_zero_copy`
2. **Grid search** — Each candidate is run with every combination of block counts (`4, 8, 16, … 128`) and thread counts (`512, 768, 1024`). Results are captured in a CUDA graph and timed.
3. **Cross-rank consensus** — Elapsed times are averaged across all ranks with an allreduce so every GPU selects the same configuration.
4. **Runtime dispatch** — `get_tuned_config()` rounds the actual message size up to the next power of two and returns the winning `(algorithm, nblocks, nthreads)` triple.
### Symmetric Memory Allocation
Algorithms like `default_allreduce_nvls_zero_copy` require **symmetric memory** — memory where the buffer offset is the same for each rank, allocated via `mscclpp.RawGpuBuffer` (`cuMemAlloc`). Regular `torch.rand()` or `torch.empty()` allocations cannot be used with these algorithms because they do not guarantee the same offset across ranks. Instead, allocate a single large buffer and reuse it for all message sizes:
```python
# Allocate symmetric memory via RawGpuBuffer and wrap as a PyTorch tensor
tune_tensor = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
tune_tensor = torch.utils.dlpack.from_dlpack(tune_tensor)
tune_tensor.normal_()
```
When executing an algorithm with symmetric memory, pass `symmetric_memory=True`:
```python
def _run_algo(self, algo, tensor, size, nblocks, nthreads):
return algo.execute(
comm=self.comm.communicator,
input_buffer=tensor.data_ptr(),
output_buffer=tensor.data_ptr(),
input_size=size,
output_size=size,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
op=mscclpp.ReduceOp.SUM,
stream=torch.cuda.current_stream().cuda_stream,
nblocks=nblocks,
nthreads_per_block=nthreads,
symmetric_memory=True,
)
```
### Loading Candidate Algorithms
The same `load_algorithms` helper from Approach 1 is reused. The tuner extracts multiple algorithm objects:
```python
algorithms = load_algorithms(scratch_buffer=self.scratch_buffer, rank=self.rank)
self._algorithm_nvls_packet = [
algo for algo in algorithms
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_packet"
][0]
self._algorithm_rsag_zero_copy = [
algo for algo in algorithms
if algo.collective == "allreduce" and algo.name == "default_allreduce_rsag_zero_copy"
][0]
self._algorithm_packet = [
algo for algo in algorithms
if algo.collective == "allreduce" and algo.name == "default_allreduce_packet"
][0]
# NVLS zero-copy is only available on supported hardware
if mscclpp.is_nvls_supported():
self._algorithm_nvls_zero_copy = [
algo for algo in algorithms
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_zero_copy"
][0]
```
### The Tuning Loop
The tuning loop iterates over message sizes, candidate algorithms, and kernel launch parameters. CUDA graphs are used for accurate timing. Note the use of `RawGpuBuffer` for symmetric memory:
```python
def _tune(self, n_warmup, n_graph_launches, n_ops_per_graph):
sizes = [1 << i for i in range(10, 28)]
self.best_configs = {1024: (self._algorithm_nvls_packet, 0, 0)}
# Use RawGpuBuffer for symmetric memory allocation
tune_tensor = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
tune_tensor = torch.utils.dlpack.from_dlpack(tune_tensor)
tune_tensor.normal_()
candidates_nblocks = [4, 8, 16, 24, 32, 48, 64, 128]
candidates_nthreads = [512, 768, 1024]
for size in sizes:
algos = []
if mscclpp.is_nvls_supported():
algos.append(self._algorithm_nvls_zero_copy)
if size <= 4 * 1024 * 1024:
algos.append(self._algorithm_nvls_packet)
algos.append(self._algorithm_packet)
if size >= 512 * 1024:
algos.append(self._algorithm_rsag_zero_copy)
best_time = float("inf")
best_config = None
for algo in algos:
for nb in candidates_nblocks:
for nt in candidates_nthreads:
if self._run_algo(algo, tune_tensor, size, nb, nt) != 0:
continue # skip unsupported configs
# Warmup, then time with CUDA graphs
# ... (see full example for graph capture logic)
# Average timing across ranks
time_tensor = torch.full(
(self.world_size,), elapsed, dtype=torch.float64, device="cuda"
).to(dtype=torch.float32)
self.all_reduce(time_tensor, op=torch.distributed.ReduceOp.SUM)
avg_time = time_tensor[self.rank].item() / self.world_size
if avg_time < best_time:
best_time = avg_time
best_config = (algo, nb, nt)
if best_config:
self.best_configs[size] = best_config
```
### Dispatching with Tuned Configuration
At runtime, round the message size to the next power of two and look up the best configuration. When the tensor is allocated from `RawGpuBuffer` (`cuMemAlloc`) and the buffer offset is the same for each rank, pass `symmetric_memory=True` to the `execute()` call (see the [Symmetric Memory Allocation](#symmetric-memory-allocation) section above):
```python
def get_tuned_config(self, size):
if size < 1024:
target_size = 1024
elif size > 256 * 1024 * 1024:
target_size = 256 * 1024 * 1024
else:
target_size = 1 << (size - 1).bit_length()
return self.best_configs.get(target_size)
def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None):
config = self.get_tuned_config(tensor.nbytes)
algo, nblocks, nthreads = config if config else (self._algorithm_nvls_packet, 0, 0)
algo.execute(
comm=self.comm.communicator,
input_buffer=tensor.data_ptr(),
output_buffer=tensor.data_ptr(),
input_size=tensor.nbytes,
output_size=tensor.nbytes,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
op=mscclpp.ReduceOp.SUM,
stream=stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream,
nblocks=nblocks,
nthreads_per_block=nthreads,
)
```
### Benchmarking with Symmetric Memory
When benchmarking tuned configurations, use the same `RawGpuBuffer` allocation pattern. Create one large buffer and slice it for each message size:
```python
def benchmark(self, n_warmup=10, n_graph_launches=10, n_iter_per_graph=100):
# Allocate a single large RawGpuBuffer (symmetric memory) and reuse for all sizes
dtype = torch.float16
bench_buf = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(dtype))
bench_buf = torch.utils.dlpack.from_dlpack(bench_buf)
bench_buf.normal_()
for size in sizes:
n_elements = size // bench_buf.element_size()
tensor = bench_buf[:n_elements]
# Capture CUDA graph, warmup, and time...
with torch.cuda.graph(g, stream=capture_stream):
for _ in range(n_iter_per_graph):
self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
```
### Running the Tuning Example
```bash
MSCCLPP_MASTER_ADDR=<ip> MSCCLPP_MASTER_PORT=<port> \
torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_tuning.py
```

View File

@@ -7,6 +7,4 @@ This reference organizes the MSCCL++ Python API.
:toctree: py_api
:recursive:
mscclpp.comm
mscclpp.utils
mscclpp.language
mscclpp

View File

@@ -31,6 +31,9 @@
```
If you don't want to build Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)).
* (Optional, for benchmarks) MPI
* (Optional, for NVIDIA platforms) [GDRCopy](https://github.com/NVIDIA/gdrcopy) >= 2.5.1
* GDRCopy is required for IB `HostNoAtomic` mode, which uses CPU-side signal forwarding to GPU memory via BAR1 mappings. This mode is used on platforms where RDMA atomics are not available (e.g., when using Data Direct Virtual Functions).
* Install GDRCopy from source or via packages. See the [GDRCopy installation guide](https://github.com/NVIDIA/gdrcopy#installation).
* Others
* For RDMA (InfiniBand or RoCE) support on NVIDIA platforms, [GPUDirect RDMA](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#gpudirect-rdma-and-gpudirect-storage) should be supported by the system. See the detailed prerequisites from [this NVIDIA documentation](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#common-prerequisites).
* For NVLink SHARP (NVLS) support on NVIDIA platforms, the Linux kernel version should be 5.6 or above.
@@ -42,7 +45,7 @@ We provide docker images which package all prerequisites for MSCCL++. You can se
```bash
# For NVIDIA platforms
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8 bash
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 bash
# For AMD platforms
$ docker run -it --privileged --net=host --ipc=host --security-opt=seccomp=unconfined --group-add=video --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 bash
```
@@ -171,7 +174,6 @@ We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/ap
For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
@@ -189,14 +191,12 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec
Example 1, Allreduce will fallback to NCCL ncclAllReduce since allreduce is in the fallback list.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
Example 2, ReduceScatter will still use msccl++ implementation since reducescatter is not in the fallbacklist.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/$PATH_TO_EXECUTION_PLANS/execution-files ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
On AMD platforms, you need to add `RCCL_MSCCL_ENABLE=0` to avoid conflicts with the fallback features.

View File

@@ -78,7 +78,7 @@ mscclpp::GpuBuffer buffer(bufferBytes);
mscclpp::RegisteredMemory localRegMem = comm.registerMemory(buffer.data(), buffer.bytes(), transport);
```
Here, we first allocate GPU device memory using `mscclpp::GpuBuffer` and then register its memory region with the `registerMemory()` method of the `Communicator`. If you are using the `Context` interface as shown in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can use `context.registerMemory()` instead. The `transport` parameter specifies the transport types that this memory region can be accessed with. In this example, we use only `mscclpp::Transport::CudaIpc`, which allows the memory to be accessed by other processes using CUDA/HIP IPC. The `CudaIpc` transport type is typically used for intra-node communication, but with certain hardware configurations, it can also be used for inter-node communication (such as [NVL72](https://www.nvidia.com/en-us/data-center/gb300-nvl72) on NVIDIA Grace Blackwell platforms). We will introduce other transport types in later tutorials.
Here, we first allocate GPU device memory using `mscclpp::GpuBuffer` and then register its memory region with the `registerMemory()` method of the `Communicator`. If you are using the `Context` interface as shown in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can use `context.registerMemory()` instead. The `transport` parameter specifies the transport types that this memory region can be accessed with. In this example, we use only `mscclpp::Transport::CudaIpc`, which allows the memory to be accessed by other processes using CUDA/HIP IPC. The `CudaIpc` transport type is typically used for intra-node communication, but with certain hardware configurations, it can also be used for inter-node communication (will be explained in a later section: {ref}`mc-cross-node`). We will introduce other transport types in later tutorials.
**GpuBuffer** is NOT required for creating a `RegisteredMemory`; you can register any pre-allocated GPU memory region with `registerMemory()`. However, it is the user's responsibility to ensure that the memory region is suitable for their communication operations. Depending on the hardware platform, some communication methods may require specific memory allocation to ensure data consistency and correctness. `GpuBuffer` is a convenient way to allocate GPU memory that is compatible with the communication methods that MSCCL++ supports. It provides a simple interface for allocating GPU memory and automatically handles memory deallocation when it goes out of scope.
@@ -251,6 +251,37 @@ columns 2
Since the flags take 50% of the packet size, the goodput of communication using packets is only 50% compared to transferring raw data. However, this doesn't matter because packets are designed for small data transfers. Packets transfer small data efficiently because the integrity of the user data is guaranteed by only waiting for the correct flags (done by `unpackPackets()`); explicit memory synchronization (signal and wait) is not needed.
(mc-cross-node)=
## Cross-node Execution
For **inter-node** communication, using `PortChannel` (will be explained in the following tutorial) is usually a more accessible option that leverages more widely-used networking interfaces. However, `MemoryChannel` can still be used as long as the underlying hardware allows memory mapping between the two GPUs, such as [Multi-Node NVLink (MNNVL)](https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/overview.html) on NVIDIA Grace Blackwell platforms.
We can use the same example code to test inter-node `MemoryChannel`. Users can consult the [NVIDIA MNNVL verification guide](https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/verifying.html) for verification steps and detailed environment requirements for MNNVL.
Run the program on two nodes with command line arguments:
```
./bidir_memory_channel [<ip_port> <rank> <gpu_id>]
```
For example, assume we use `192.168.0.1:50000` as the bootstrap IP address and port, and both nodes use GPU 0 locally.
On Node 0 (Rank 0):
```bash
$ ./bidir_memory_channel 192.168.0.1:50000 0 0
```
On Node 1 (Rank 1):
```bash
$ ./bidir_memory_channel 192.168.0.1:50000 1 0
```
You should see output indicating successful data transfer.
```{tip}
If your bootstrap IP address is not on the default network interface of your node, you can specify the network interface by passing `interface_name:ip:port` as the first argument (such as `eth1:192.168.0.1:50000`).
```
## Summary and Next Steps
In this tutorial, you have learned how to use `MemoryChannel` for efficient data transfer between GPUs. You have also learned how to create communication buffers using `RegisteredMemory` and `GpuBuffer`, and how to use packets for small data transfers. You can find more complex usage of `MemoryChannel` in the {ref}`mscclpp-test`.

View File

@@ -101,15 +101,17 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
"allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
[[maybe_unused]] mscclpp::DataType accumDtype) {
return self->allgatherKernelFunc(ctx, input, output, inputSize, stream);
},
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize,
mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype,
bool symmetricMemory) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize,
static_cast<ncclDataType_t>(dtype));
static_cast<ncclDataType_t>(dtype), symmetricMemory);
});
return allgatherAlgo;
}
@@ -191,7 +193,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
}
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize,
size_t outputSize, ncclDataType_t dtype) {
size_t outputSize, ncclDataType_t dtype, bool) {
return {(void*)input, output, inputSize, outputSize, 0};
}
};

View File

@@ -69,14 +69,16 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
"allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
[[maybe_unused]] mscclpp::DataType accumDtype) {
return self->allgatherKernelFunc(ctx, input, output, inputSize, dtype, stream);
},
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize,
mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype);
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype,
bool symmetricMemory) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory);
});
return allgatherAlgo;
}
@@ -159,7 +161,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
}
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize,
size_t outputSize, mscclpp::DataType dtype) {
size_t outputSize, mscclpp::DataType dtype, bool) {
return {(void*)input, output, inputSize, outputSize, 0};
}
};

View File

@@ -15,7 +15,9 @@ import ipaddress
def load_algorithms(scratch_buffer: torch.tensor, rank: int) -> mscclpp.AlgorithmCollection:
collection_builder = mscclpp.ext.AlgorithmCollectionBuilder()
return collection_builder.build_default_algorithms(
scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank
scratch_buffer=scratch_buffer.data_ptr(),
scratch_buffer_size=scratch_buffer.nbytes,
rank=rank,
)
@@ -59,7 +61,7 @@ class CustomizedComm:
self._algorithm_nvls_nonzero_copy = [
algo
for algo in algorithms
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_with_copy"
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_warp_pipeline"
][0]
def all_reduce(self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM, stream: torch.cuda.Stream = None):

View File

@@ -0,0 +1,476 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py
import os
import ipaddress
import netifaces as ni
import torch
import mscclpp
import mscclpp.ext
import mscclpp.utils as mscclpp_utils
# -- Helpers ------------------------------------------------------------------
def _make_tensor(size_bytes: int, dtype: torch.dtype) -> torch.Tensor:
"""Allocate a tensor backed by RawGpuBuffer (symmetric memory)."""
# PyTorch's from_dlpack does not support certain float8 DLPack type codes.
# Work around by importing as uint8 and reinterpreting via .view().
_DLPACK_UNSUPPORTED = (torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz)
if dtype in _DLPACK_UNSUPPORTED:
dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(torch.uint8))
return torch.utils.dlpack.from_dlpack(dlpack).view(dtype)
dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(dtype))
return torch.utils.dlpack.from_dlpack(dlpack)
def _load_algorithms(scratch: torch.Tensor, rank: int):
return mscclpp.ext.AlgorithmCollectionBuilder().build_default_algorithms(
scratch_buffer=scratch.data_ptr(),
scratch_buffer_size=scratch.nbytes,
rank=rank,
)
def _interfaces_for_ip(ip: str):
target = ipaddress.ip_address(ip)
for iface in ni.interfaces():
addrs = ni.ifaddresses(iface)
if ni.AF_INET in addrs:
for link in addrs[ni.AF_INET]:
if "addr" in link and ipaddress.ip_address(link["addr"]) == target:
return iface
return None
def _to_mscclpp_op(op) -> mscclpp.ReduceOp:
if op == torch.distributed.ReduceOp.SUM:
return mscclpp.ReduceOp.SUM
if op == torch.distributed.ReduceOp.MIN:
return mscclpp.ReduceOp.MIN
raise ValueError(f"unsupported op: {op}")
def _round_pow2(size: int) -> int:
"""Round up to next power-of-2, clamped to [1024, 256 MB]."""
size = max(size, 1024)
size = min(size, 256 << 20)
return 1 << (size - 1).bit_length()
# -- CustomizedComm -----------------------------------------------------------
class CustomizedComm:
"""Exposes all_reduce, all_gather, barrier with lazy per-size tuning."""
_TUNE_N_WARMUP = 5
_TUNE_N_GRAPH_LAUNCHES = 10
_TUNE_N_OPS_PER_GRAPH = 100
_CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 128]
_CANDIDATE_NTHREADS = [512, 768, 1024]
_NBLOCKS_LIMIT = {
"default_allreduce_nvls_packet": 16,
"default_allreduce_packet": 56,
"default_allreduce_allpair_packet": 56,
"default_allreduce_fullmesh": 64,
"default_allgather_fullmesh2": 32,
}
def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
self.comm = comm
self.rank = comm.my_rank
self.world_size = comm.nranks
self.symmetric_memory = symmetric_memory
self._nvls = mscclpp.is_nvls_supported()
self._scratch = _make_tensor(1 << 27, torch.float16)
self._barrier_tensor = _make_tensor(4096, torch.float32)
algos = _load_algorithms(self._scratch, self.rank)
self._algos = {(a.collective, a.name): a for a in algos}
# {collective: {rounded_size: (algo, nblocks, nthreads)}}
self._tune_cache: dict[str, dict[int, tuple]] = {"allreduce": {}, "allgather": {}}
self._tune_buf = None
self._time_buf = None
def _algo(self, collective: str, name: str):
return self._algos.get((collective, name))
def _default_ar_config(self):
"""Fallback allreduce config for barrier / timing sync."""
pkt = self._algo("allreduce", "default_allreduce_nvls_packet")
if self._nvls and pkt:
return (pkt, 0, 0)
return (self._algo("allreduce", "default_allreduce_packet"), 0, 0)
# -- low-level execute --
def _exec_ar(self, tensor, algo, nb, nt, op=mscclpp.ReduceOp.SUM, stream=None, accum_dtype=None, sym=True):
s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream
ret = algo.execute(
comm=self.comm.communicator,
input_buffer=tensor.data_ptr(),
output_buffer=tensor.data_ptr(),
input_size=tensor.nbytes,
output_size=tensor.nbytes,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
op=op,
stream=s,
nblocks=nb,
nthreads_per_block=nt,
symmetric_memory=sym,
accum_dtype=accum_dtype,
)
if ret != 0:
print(f"Rank {self.rank}: {algo.name} failed ({ret})")
return ret
def _exec_ag(self, inp, out, algo, nb, nt, stream=None, sym=None):
if sym is None:
sym = self.symmetric_memory
s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream
ret = algo.execute(
comm=self.comm.communicator,
input_buffer=inp.data_ptr(),
output_buffer=out.data_ptr(),
input_size=inp.nbytes,
output_size=out.nbytes,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(inp.dtype),
op=mscclpp.ReduceOp.NOP,
stream=s,
nblocks=nb,
nthreads_per_block=nt,
symmetric_memory=sym,
)
if ret != 0:
print(f"Rank {self.rank}: AG {algo.name} failed ({ret})")
return ret
def _barrier_internal(self):
a, nb, nt = self._default_ar_config()
self._exec_ar(self._barrier_tensor, a, nb, nt, sym=True)
# -- lazy tuning --
def _ensure_tune_bufs(self):
if self._tune_buf is None:
self._tune_buf = _make_tensor(1 << 27, torch.float16)
self._tune_buf.normal_()
self._time_buf = _make_tensor(4096, torch.float32)
return self._tune_buf
def _ar_candidates(self, size: int):
out = []
if size <= 4 << 20:
a = self._algo("allreduce", "default_allreduce_nvls_packet")
if self._nvls and a:
out.append(a)
a = self._algo("allreduce", "default_allreduce_packet")
if a:
out.append(a)
a = self._algo("allreduce", "default_allreduce_allpair_packet")
if a:
out.append(a)
if size >= 512 << 10:
a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
if self._nvls and self.symmetric_memory and a:
out.append(a)
a = self._algo("allreduce", "default_allreduce_rsag_zero_copy")
if a:
out.append(a)
if torch.version.hip is not None:
a = self._algo("allreduce", "default_allreduce_fullmesh")
if a:
out.append(a)
return out
def _ag_candidates(self):
a = self._algo("allgather", "default_allgather_fullmesh2")
return [a] if a else []
def _run_tune(self, collective, algo, buf, size, nb, nt):
"""Single tune invocation for either collective."""
if collective == "allreduce":
return algo.execute(
comm=self.comm.communicator,
input_buffer=buf.data_ptr(),
output_buffer=buf.data_ptr(),
input_size=size,
output_size=size,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype),
op=mscclpp.ReduceOp.SUM,
stream=torch.cuda.current_stream().cuda_stream,
nblocks=nb,
nthreads_per_block=nt,
symmetric_memory=True,
)
else:
total = size * self.world_size
out_ptr = buf.data_ptr()
return algo.execute(
comm=self.comm.communicator,
input_buffer=out_ptr + self.rank * size,
output_buffer=out_ptr,
input_size=size,
output_size=total,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype),
op=mscclpp.ReduceOp.NOP,
stream=torch.cuda.current_stream().cuda_stream,
nblocks=nb,
nthreads_per_block=nt,
symmetric_memory=False,
)
def _tune_size(self, collective: str, target_size: int):
"""Auto-tune one (collective, target_size) pair and cache result."""
buf = self._ensure_tune_bufs()
cands = self._ar_candidates(target_size) if collective == "allreduce" else self._ag_candidates()
best_time, best_cfg = float("inf"), None
used = set()
run = lambda a, nb, nt: self._run_tune(collective, a, buf, target_size, nb, nt)
for algo in cands:
nb_limit = self._NBLOCKS_LIMIT.get(algo.name, 128)
for nb in self._CANDIDATE_NBLOCKS:
if nb > nb_limit:
continue
for nt in self._CANDIDATE_NTHREADS:
# Feasibility — sync result across ranks so all agree
ret = run(algo, nb, nt)
torch.cuda.synchronize()
self._time_buf[0] = float(ret)
self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=True)
if self._time_buf[0].item() != 0:
continue
used.add(algo)
# Warmup
for _ in range(self._TUNE_N_WARMUP):
run(algo, nb, nt)
# CUDA-graph timed benchmark
cs = torch.cuda.Stream()
cs.wait_stream(torch.cuda.current_stream())
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g, stream=cs):
for _ in range(self._TUNE_N_OPS_PER_GRAPH):
run(algo, nb, nt)
start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
start.record(cs)
with torch.cuda.stream(cs):
for _ in range(self._TUNE_N_GRAPH_LAUNCHES):
g.replay()
end.record(cs)
end.synchronize()
elapsed = start.elapsed_time(end)
# Cross-rank timing sync
self._time_buf.fill_(elapsed)
torch.cuda.current_stream().wait_stream(cs)
self._exec_ar(self._time_buf, *self._default_ar_config(), sym=True)
avg = self._time_buf[self.rank].item() / self.world_size
if avg < best_time:
best_time, best_cfg = avg, (algo, nb, nt)
if best_cfg:
self._tune_cache[collective][target_size] = best_cfg
if self.rank == 0:
n = self._TUNE_N_GRAPH_LAUNCHES * self._TUNE_N_OPS_PER_GRAPH
print(
f"[tune] {collective} size={target_size}: {best_cfg[0].name} "
f"nb={best_cfg[1]} nt={best_cfg[2]} time={best_time / n * 1000:.2f}us",
flush=True,
)
else:
fb = (
self._default_ar_config()
if collective == "allreduce"
else ((self._ag_candidates()[0], 32, 512) if self._ag_candidates() else None)
)
self._tune_cache[collective][target_size] = fb
torch.cuda.synchronize()
self._barrier_internal()
for a in used:
a.reset()
# -- public API --
def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None, accum_dtype=None):
sz = _round_pow2(tensor.nbytes)
if sz not in self._tune_cache["allreduce"]:
self._tune_size("allreduce", sz)
a, nb, nt = self._tune_cache["allreduce"][sz]
self._exec_ar(
tensor, a, nb, nt, op=_to_mscclpp_op(op), stream=stream, accum_dtype=accum_dtype, sym=self.symmetric_memory
)
def all_gather(self, output_tensor, input_tensor, stream=None):
sz = _round_pow2(input_tensor.nbytes)
if sz not in self._tune_cache["allgather"]:
self._tune_size("allgather", sz)
a, nb, nt = self._tune_cache["allgather"][sz]
self._exec_ag(input_tensor, output_tensor, a, nb, nt, stream=stream, sym=self.symmetric_memory)
def barrier(self):
self._barrier_internal()
def destroy(self):
self._algos.clear()
self._tune_cache = {"allreduce": {}, "allgather": {}}
self._tune_buf = self._time_buf = self._barrier_tensor = self._scratch = self.comm = None
# -- Benchmarks (standalone) --------------------------------------------------
def _bench_sizes(low=5 * 1024, high=80 << 20):
sizes, c = [], low
while c <= high:
sizes.append(c)
c *= 2
return sizes
def benchmark_allreduce(
comm: CustomizedComm, dtype=torch.float16, accum_dtype=None, n_warmup=10, n_graph_launches=10, n_iter=100
):
sizes = _bench_sizes()
if comm.rank == 0:
print(f"\n{'='*60}\nAllreduce Benchmark\n{'='*60}")
print(f"{'Nelements':<18} {'Size(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}")
cs = torch.cuda.Stream()
buf = _make_tensor(1 << 27, dtype)
buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0)
for size in sizes:
nelems = size // buf.element_size()
t = buf[: size // buf.element_size()]
comm.all_reduce(t, accum_dtype=accum_dtype)
torch.cuda.synchronize()
cs.wait_stream(torch.cuda.current_stream())
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g, stream=cs):
for _ in range(n_iter):
comm.all_reduce(t, accum_dtype=accum_dtype)
with torch.cuda.stream(cs):
for _ in range(n_warmup):
g.replay()
comm.barrier()
cs.synchronize()
s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
s.record(cs)
with torch.cuda.stream(cs):
for _ in range(n_graph_launches):
g.replay()
e.record(cs)
e.synchronize()
ms = s.elapsed_time(e) / (n_graph_launches * n_iter)
if comm.rank == 0:
print(f"{nelems:<18} {size:<18} {ms*1000:<18.2f} {size/(ms*1e-3)/1e9:<18.2f}")
def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, n_graph_launches=10, n_iter=100):
sizes = _bench_sizes()
if comm.rank == 0:
print(f"\n{'='*60}\nAllgather Benchmark\n{'='*60}")
print(f"{'PerRank(B)':<18} {'Total(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}")
cs = torch.cuda.Stream()
buf = _make_tensor(1 << 27, dtype)
buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0)
for prs in sizes:
total = prs * comm.world_size
if total > buf.nbytes:
break
nt = total // buf.element_size()
npr = prs // buf.element_size()
out = buf[:nt]
inp = out[comm.rank * npr : (comm.rank + 1) * npr]
comm.all_gather(out, inp)
torch.cuda.synchronize()
cs.wait_stream(torch.cuda.current_stream())
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g, stream=cs):
for _ in range(n_iter):
comm.all_gather(out, inp)
with torch.cuda.stream(cs):
for _ in range(n_warmup):
g.replay()
comm.barrier()
cs.synchronize()
s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
s.record(cs)
with torch.cuda.stream(cs):
for _ in range(n_graph_launches):
g.replay()
e.record(cs)
e.synchronize()
ms = s.elapsed_time(e) / (n_graph_launches * n_iter)
if comm.rank == 0:
print(f"{prs:<18} {total:<18} {ms*1000:<18.2f} {total/(ms*1e-3)/1e9:<18.2f}")
# -- Bootstrap & main ---------------------------------------------------------
def init_dist() -> mscclpp.CommGroup:
addr = os.environ.get("MSCCLPP_MASTER_ADDR")
if addr:
rank, world = int(os.environ["RANK"]), int(os.environ["WORLD_SIZE"])
port = os.environ["MSCCLPP_MASTER_PORT"]
iface = _interfaces_for_ip(addr)
if not iface:
raise ValueError(f"No interface for {addr}")
return mscclpp.CommGroup(interfaceIpPortTrio=f"{iface}:{addr}:{port}", rank=rank, size=world)
import torch.distributed as dist
dist.init_process_group(backend="gloo")
return mscclpp.CommGroup(torch_group=dist.group.WORLD)
def main():
local = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local)
dtype_str = os.environ.get("DTYPE", "float16")
dtype = getattr(torch, dtype_str, torch.float16)
accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16}
accum_str = os.environ.get("ACCUM_DTYPE")
accum_dtype = accum_map.get(accum_str) if accum_str else None
comm_group = init_dist()
cc = CustomizedComm(comm_group)
print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...")
benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype)
cc.barrier()
torch.cuda.synchronize()
benchmark_allgather(cc, dtype=dtype)
cc.barrier()
torch.cuda.synchronize()
cc.destroy()
print(f"rank {local} completed successfully.")
if __name__ == "__main__":
main()

View File

@@ -1,19 +1,20 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
import os
from typing import Any, Dict
import torch, torch.distributed as dist
import mscclpp
import mscclpp.ext
from mscclpp.language.collectives import AllReduce
from mscclpp.language.channel import SwitchChannel, MemoryChannel, BufferType, SyncType
from mscclpp.language.program import CollectiveProgram
from mscclpp.language.rank import Rank
from mscclpp.language.utils import AlgoSpec
def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
def allreduce_nvls(spec: AlgoSpec) -> CollectiveProgram:
gpu_size = spec.world_size
with CollectiveProgram.from_spec(spec) as program:
# Creating Channels
@@ -63,8 +64,8 @@ def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
return program
def setup_plan(algo_collection_builder: mscclpp.AlgorithmCollectionBuilder, rank: int, world_size: int):
spec = mscclpp.AlgoSpec(
def setup_plan(algo_collection_builder: mscclpp.ext.AlgorithmCollectionBuilder, rank: int, world_size: int):
spec = AlgoSpec(
name="allreduce_nvls",
collective=AllReduce(8, 1, True),
nranks_per_node=8,
@@ -94,10 +95,10 @@ def init_dist():
rank = int(os.environ["RANK"])
world = int(os.environ["WORLD_SIZE"])
local = int(os.environ["LOCAL_RANK"])
algorithm_collection_builder = mscclpp.AlgorithmCollectionBuilder()
algorithm_collection_builder = mscclpp.ext.AlgorithmCollectionBuilder()
setup_plan(algorithm_collection_builder, rank, world)
algorithm_collection_builder.set_algorithm_selector(selector)
dist.init_process_group(backend="nccl", device_id=local)
dist.init_process_group(backend="nccl", device_id=torch.device("cuda", local))
return rank, world, local

View File

@@ -9,7 +9,7 @@
#include <sstream>
template <typename... Args>
void log(Args &&...args) {
void log(Args&&... args) {
std::stringstream ss;
(ss << ... << args);
ss << std::endl;
@@ -23,7 +23,7 @@ __device__ void spin_cycles(unsigned long long cycles) {
}
}
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
for (int i = 0; i < iter; ++i) {
devHandle->relaxedWait();
@@ -34,7 +34,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in
}
}
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
for (int i = 0; i < iter; ++i) {
devHandle->relaxedSignal();
@@ -88,7 +88,7 @@ int main() {
mscclpp::Semaphore sema0(/*localSemaphoreStub*/ semaStub0, /*remoteSemaphoreStub*/ semaStub1);
mscclpp::BaseMemoryChannel memChan0(sema0);
mscclpp::BaseMemoryChannelDeviceHandle memChanHandle0 = memChan0.deviceHandle();
void *devHandle0;
void* devHandle0;
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle0, sizeof(mscclpp::BaseMemoryChannelDeviceHandle)));
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle0, &memChanHandle0, sizeof(memChanHandle0), cudaMemcpyHostToDevice));
@@ -98,14 +98,14 @@ int main() {
mscclpp::Semaphore sema1(/*localSemaphoreStub*/ semaStub1, /*remoteSemaphoreStub*/ semaStub0);
mscclpp::BaseMemoryChannel memChan1(sema1);
mscclpp::BaseMemoryChannelDeviceHandle memChanHandle1 = memChan1.deviceHandle();
void *devHandle1;
void* devHandle1;
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle1, sizeof(mscclpp::BaseMemoryChannelDeviceHandle)));
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle1, &memChanHandle1, sizeof(memChanHandle1), cudaMemcpyHostToDevice));
log("GPU 0: Launching gpuKernel0 ...");
MSCCLPP_CUDATHROW(cudaSetDevice(0));
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle0), iter);
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle0), iter);
MSCCLPP_CUDATHROW(cudaGetLastError());
log("GPU 1: Launching gpuKernel1 ...");
@@ -115,7 +115,7 @@ int main() {
MSCCLPP_CUDATHROW(cudaEventCreate(&start));
MSCCLPP_CUDATHROW(cudaEventCreate(&end));
MSCCLPP_CUDATHROW(cudaEventRecord(start));
gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle1), iter);
gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle1), iter);
MSCCLPP_CUDATHROW(cudaGetLastError());
MSCCLPP_CUDATHROW(cudaEventRecord(end));
MSCCLPP_CUDATHROW(cudaEventSynchronize(end));

View File

@@ -14,7 +14,7 @@
#define PORT_NUMBER "50505"
template <typename... Args>
void log(Args &&...args) {
void log(Args&&... args) {
std::stringstream ss;
(ss << ... << args);
ss << std::endl;
@@ -50,7 +50,7 @@ __device__ void spin_cycles(unsigned long long cycles) {
}
}
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
for (int i = 0; i < iter; ++i) {
devHandle->relaxedWait();
@@ -61,7 +61,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in
}
}
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
for (int i = 0; i < iter; ++i) {
devHandle->relaxedSignal();
@@ -115,14 +115,14 @@ void worker(int gpuId) {
mscclpp::BaseMemoryChannel memChan(sema);
auto memChanHandle = memChan.deviceHandle();
void *devHandle;
void* devHandle;
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle)));
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice));
log("GPU ", gpuId, ": Launching a GPU kernel ...");
if (gpuId == 0) {
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter);
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle), iter);
MSCCLPP_CUDATHROW(cudaGetLastError());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
} else {
@@ -130,7 +130,7 @@ void worker(int gpuId) {
MSCCLPP_CUDATHROW(cudaEventCreate(&start));
MSCCLPP_CUDATHROW(cudaEventCreate(&end));
MSCCLPP_CUDATHROW(cudaEventRecord(start));
gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter);
gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle), iter);
MSCCLPP_CUDATHROW(cudaGetLastError());
MSCCLPP_CUDATHROW(cudaEventRecord(end));
MSCCLPP_CUDATHROW(cudaEventSynchronize(end));

View File

@@ -16,7 +16,7 @@
#define PORT_NUMBER "50505"
template <typename... Args>
void log(Args &&...args) {
void log(Args&&... args) {
std::stringstream ss;
(ss << ... << args);
ss << std::endl;
@@ -47,7 +47,7 @@ int wait_process(int pid) {
__device__ mscclpp::DeviceSyncer devSyncer;
__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid == 0) {
devHandle->relaxedSignal();
@@ -65,7 +65,7 @@ __global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si
}
}
__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid == 0) {
devHandle->relaxedSignal();
@@ -79,7 +79,7 @@ __global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si
devHandle->get(srcOffset, dstOffset, copyBytes, /*threadId*/ tid, /*numThreads*/ blockDim.x * gridDim.x);
}
__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank,
__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank,
uint32_t flag) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid == 0) {
@@ -95,9 +95,8 @@ __global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHand
devHandle->unpackPackets(pktBufOffset, dstOffset, copyBytes, tid, blockDim.x * gridDim.x, flag);
}
void worker(int gpuId) {
void worker(int myRank, int gpuId, const std::string& ipPort) {
MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
const int myRank = gpuId;
const int remoteRank = myRank == 0 ? 1 : 0;
const int nRanks = 2;
const int iter = 1000;
@@ -105,11 +104,11 @@ void worker(int gpuId) {
const size_t bufferBytes = 256 * 1024 * 1024;
const size_t pktBufferBytes = 256 * 1024 * 1024;
log("GPU ", gpuId, ": Preparing for tests ...");
log("Rank ", myRank, " (GPU ", gpuId, "): Preparing for tests ...");
// Build a connection and a semaphore
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks);
bootstrap->initialize("lo:127.0.0.1:" PORT_NUMBER);
bootstrap->initialize(ipPort);
mscclpp::Communicator comm(bootstrap);
auto conn = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get();
auto sema = comm.buildSemaphore(conn, remoteRank).get();
@@ -133,8 +132,8 @@ void worker(int gpuId) {
auto memChanHandle = memChan.deviceHandle();
auto memPktChanHandle = memPktChan.deviceHandle();
void *devHandle;
void *devPktHandle;
void* devHandle;
void* devPktHandle;
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle)));
MSCCLPP_CUDATHROW(cudaMalloc(&devPktHandle, sizeof(memPktChanHandle)));
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice));
@@ -146,23 +145,23 @@ void worker(int gpuId) {
std::function<void(size_t)> kernels[3];
kernels[0] = [&](size_t copyBytes) {
bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle),
copyBytes, myRank);
bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devHandle), copyBytes,
myRank);
};
kernels[1] = [&](size_t copyBytes) {
bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle),
copyBytes, myRank);
bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devHandle), copyBytes,
myRank);
};
kernels[2] = [&](size_t copyBytes) {
static uint32_t flag = 1;
bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devPktHandle),
bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devPktHandle),
copyBytes, myRank, flag++);
};
cudaEvent_t start, end;
if (gpuId == 0) {
if (myRank == 0) {
MSCCLPP_CUDATHROW(cudaEventCreate(&start));
MSCCLPP_CUDATHROW(cudaEventCreate(&end));
}
@@ -189,13 +188,13 @@ void worker(int gpuId) {
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
bootstrap->barrier();
if (gpuId == 0) {
if (myRank == 0) {
MSCCLPP_CUDATHROW(cudaEventRecord(start, stream));
}
MSCCLPP_CUDATHROW(cudaGraphLaunch(graphExec, stream));
if (gpuId == 0) {
if (myRank == 0) {
MSCCLPP_CUDATHROW(cudaEventRecord(end, stream));
MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
float elapsedTime;
@@ -204,8 +203,8 @@ void worker(int gpuId) {
MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end));
elapsedTimePerIter = elapsedTime / iter;
gbps = float(copyBytes) / elapsedTimePerIter * 1e-6f;
log("GPU ", gpuId, ": [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ",
gbps, " GB/s");
log("Rank ", myRank, " (GPU ", gpuId, "): [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter,
" ms/iter, BW ", gbps, " GB/s");
}
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
MSCCLPP_CUDATHROW(cudaGraphExecDestroy(graphExec));
@@ -216,23 +215,47 @@ void worker(int gpuId) {
bootstrap->barrier();
}
int main() {
int pid0 = spawn_process([]() { worker(0); });
int pid1 = spawn_process([]() { worker(1); });
if (pid0 < 0 || pid1 < 0) {
log("Failed to spawn processes.");
int main(int argc, char** argv) {
if (argc == 1) {
int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); });
int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); });
if (pid0 < 0 || pid1 < 0) {
log("Failed to spawn processes.");
return -1;
}
int status0 = wait_process(pid0);
int status1 = wait_process(pid1);
if (status0 < 0 || status1 < 0) {
log("Failed to wait for processes.");
return -1;
}
if (status0 != 0 || status1 != 0) {
log("One of the processes failed.");
return -1;
}
log("Succeed!");
return 0;
} else if (argc == 4) {
std::string ipPort = argv[1];
int rank, gpuId;
try {
rank = std::stoi(argv[2]);
gpuId = std::stoi(argv[3]);
} catch (const std::exception&) {
log("Error: rank and gpu_id must be valid integers.");
return -1;
}
if (rank < 0 || rank > 2 || gpuId < 0) {
log("Error: rank must be between 0 and 1 and gpu_id must be non-negative.");
return -1;
}
worker(rank, gpuId, ipPort);
log("Rank ", rank, ": Succeed!");
return 0;
} else {
std::cerr << "Usage:\n"
<< " " << argv[0] << " Run in intra-node mode\n"
<< " " << argv[0] << " <ip_port> <rank> <gpu_id> Run in inter-node mode\n";
return -1;
}
int status0 = wait_process(pid0);
int status1 = wait_process(pid1);
if (status0 < 0 || status1 < 0) {
log("Failed to wait for processes.");
return -1;
}
if (status0 != 0 || status1 != 0) {
log("One of the processes failed.");
return -1;
}
log("Succeed!");
return 0;
}

View File

@@ -16,7 +16,7 @@
#define PORT_NUMBER "50505"
template <typename... Args>
void log(Args &&...args) {
void log(Args&&... args) {
std::stringstream ss;
(ss << ... << args);
ss << std::endl;
@@ -45,7 +45,7 @@ int wait_process(int pid) {
return -1;
}
__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid == 0) {
devHandle->signal();
@@ -58,7 +58,7 @@ __global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size
}
}
void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport transport) {
void worker(int rank, int gpuId, const std::string& ipPort, mscclpp::Transport transport) {
MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
const int myRank = rank;
const int remoteRank = myRank == 0 ? 1 : 0;
@@ -90,7 +90,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t
auto portChanHandle = portChan.deviceHandle();
void *devHandle;
void* devHandle;
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(portChanHandle)));
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &portChanHandle, sizeof(portChanHandle), cudaMemcpyHostToDevice));
@@ -100,7 +100,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t
std::function<void(size_t)> kernels[1];
kernels[0] = [&](size_t copyBytes) {
bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast<mscclpp::PortChannelDeviceHandle *>(devHandle), copyBytes,
bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast<mscclpp::PortChannelDeviceHandle*>(devHandle), copyBytes,
myRank);
};
@@ -166,7 +166,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t
bootstrap->barrier();
}
mscclpp::Transport parseTransport(const std::string &transportStr) {
mscclpp::Transport parseTransport(const std::string& transportStr) {
if (transportStr == "CudaIpc") return mscclpp::Transport::CudaIpc;
if (transportStr == "IB0") return mscclpp::Transport::IB0;
if (transportStr == "IB1") return mscclpp::Transport::IB1;
@@ -180,7 +180,7 @@ mscclpp::Transport parseTransport(const std::string &transportStr) {
throw std::runtime_error("Unknown transport: " + transportStr);
}
int main(int argc, char **argv) {
int main(int argc, char** argv) {
if (argc == 1) {
int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); });
int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); });

View File

@@ -0,0 +1,15 @@
# Build the bidir_switch_channel example with nvcc, targeting the GPU
# architecture of the build machine (-arch=native) and linking mscclpp.
CUDA_HOME ?= /usr/local/cuda
COMPILER := $(CUDA_HOME)/bin/nvcc
ARCH_FLAG := -arch=native

TARGET = bidir_switch_channel
SRC = bidir_switch_channel.cu

all: $(TARGET)

# Recipe lines must begin with a hard tab, as required by make.
$(TARGET): $(SRC)
	$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp

clean:
	rm -f $(TARGET)

# `all` and `clean` do not produce files with those names.
.PHONY: all clean

View File

@@ -0,0 +1,177 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <sys/wait.h>
#include <unistd.h>
#include <functional>
#include <iostream>
#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/switch_channel.hpp>
#include <mscclpp/switch_channel_device.hpp>
#include <sstream>
#define PORT_NUMBER "50505"
/// Format all arguments into one buffer and write it to stdout with a single
/// stream insertion, so concurrent processes produce unbroken lines.
/// A trailing newline is always appended.
template <typename... Args>
void log(Args&&... args) {
  std::stringstream buffer;
  ((buffer << std::forward<Args>(args)), ...);
  buffer << std::endl;
  std::cout << buffer.str();
}
/// Fork a child process that executes `func` and then exits with status 0.
/// @param func Work to run in the child process.
/// @return The child's pid on success, or -1 if fork() failed.
int spawn_process(std::function<void()> func) {
  const pid_t child = fork();
  if (child == 0) {
    // Child: run the payload, then terminate immediately so the child never
    // returns into the parent's control flow.
    func();
    exit(0);
  }
  // Parent: fork() returned the child's pid, or -1 on failure.
  return (child < 0) ? -1 : child;
}
/// Block until the given child process terminates.
/// @param pid Process id, e.g. as returned by spawn_process().
/// @return The child's exit code if it exited normally; -1 if waitpid()
///         failed or the child terminated abnormally (e.g. by a signal).
int wait_process(int pid) {
  int status = 0;
  const bool waited = (waitpid(pid, &status, 0) >= 0);
  if (waited && WIFEXITED(status)) {
    return WEXITSTATUS(status);
  }
  return -1;
}
// Switch-channel handle shared by all kernel launches; the host populates it
// via cudaMemcpyToSymbol before launching kernelSwitchReduce.
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan;
// Device-wide syncer. NOTE(review): not referenced by kernelSwitchReduce in
// this file — confirm whether it is needed or can be removed.
__device__ mscclpp::DeviceSyncer devSyncer;
// Reduce-then-broadcast over the switch channel. Each rank covers one half of
// the element range (rank 0 the first half, rank 1 the second), with threads
// striding by the total number of launched threads.
// @param rank        This rank's id (0 or 1).
// @param numElements Total number of f32 elements; each rank processes half.
__global__ void kernelSwitchReduce(int rank, int numElements) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;
  // rank 0 performs on first half of data and rank 1 on second half
  int min = rank * (numElements / 2);
  int max = (rank + 1) * (numElements / 2);
  for (int i = tid + min; i < max; i += stride) {
    // Load-reduce the f32 value at offset i across ranks, then write the
    // reduced value back through the channel's broadcast operation.
    auto val = gConstSwitchChan.reduce<mscclpp::f32x1>(i);
    gConstSwitchChan.broadcast(i, val);
  }
}
// Per-process worker: binds to a GPU, bootstraps a 2-rank communicator over
// TCP, binds an NVLS switch channel to a shared GPU buffer, and times
// kernelSwitchReduce for several message sizes. Rank 0 measures and prints
// the achieved bandwidth.
// @param myRank This process's rank (0 or 1).
// @param gpuId  CUDA device ordinal to use.
// @param ipPort Bootstrap address, e.g. "lo:127.0.0.1:50505".
void worker(int myRank, int gpuId, const std::string &ipPort) {
  MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
  const int nRanks = 2;
  const int iter = 1000;
  const size_t bufferBytes = 128 * 1024 * 1024;
  log("Rank ", myRank, " (GPU ", gpuId, "): Preparing for tests ...");
  // Bootstrap a 2-rank communicator over TCP.
  auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks);
  bootstrap->initialize(ipPort);
  std::shared_ptr<mscclpp::Communicator> comm = std::make_shared<mscclpp::Communicator>(bootstrap);
  std::vector<int> ranks;
  ranks.reserve(nRanks);
  for (int i = 0; i < nRanks; i++) ranks.push_back(i);
  // NOTE(review): GpuBuffer<float> is constructed with bufferBytes; if the
  // constructor argument counts elements rather than bytes, this allocates
  // 4x the intended size — confirm against the GpuBuffer API.
  auto buffer = mscclpp::GpuBuffer<float>(bufferBytes);
  // Create the NVLS collective connection across both ranks and bind the
  // allocated buffer to obtain a switch channel.
  auto nvlsConnection = mscclpp::connectNvlsCollective(comm, ranks, bufferBytes);
  auto switchChannel = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer.data()), bufferBytes);
  auto deviceHandle = switchChannel.deviceHandle();
  // Publish the channel handle to the __constant__ symbol read by the kernel.
  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gConstSwitchChan, &deviceHandle, sizeof(deviceHandle)));
  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
  // Call the kernel in a loop for perf evaluation at increasing sizes.
  for (size_t numElements : {1024, 1024 * 1024, 32 * 1024 * 1024}) {
    // Only rank 0 keeps timing events; both ranks synchronize via barriers so
    // the measured window covers both ranks' work.
    // NOTE(review): the events are never destroyed (no cudaEventDestroy),
    // leaking two events per size iteration on rank 0.
    cudaEvent_t start, end;
    if (myRank == 0) {
      MSCCLPP_CUDATHROW(cudaEventCreate(&start));
      MSCCLPP_CUDATHROW(cudaEventCreate(&end));
    }
    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
    bootstrap->barrier();
    if (myRank == 0) {
      MSCCLPP_CUDATHROW(cudaEventRecord(start, 0));
    }
    for (int i = 0; i < iter; ++i) {
      kernelSwitchReduce<<<256, 1024>>>(myRank, numElements);
    }
    MSCCLPP_CUDATHROW(cudaGetLastError());
    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
    comm->bootstrap()->barrier();
    if (myRank == 0) {
      MSCCLPP_CUDATHROW(cudaEventRecord(end, 0));
      MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
      float elapsedTime;
      float elapsedTimePerIter;
      float gbps;
      MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end));
      elapsedTimePerIter = elapsedTime / iter;
      // Bytes moved per iteration (4 bytes per f32 element).
      float dataSize = numElements * 4;
      // bytes/ms * 1e-6 == GB/s.
      gbps = dataSize / elapsedTimePerIter * 1e-6f;
      log("Rank ", myRank, " (GPU ", gpuId, "): bytes ", dataSize, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ",
          gbps, " GB/s");
    }
  }
}
/// Entry point. With no arguments, runs in intra-node mode: forks two local
/// workers (ranks 0 and 1) bootstrapping over loopback. With three arguments
/// (<ip_port> <rank> <gpu_id>), runs a single worker in inter-node mode.
/// @return 0 on success, -1 on any failure.
int main(int argc, char **argv) {
  if (argc == 1) {
    // Intra-node mode: two child processes on GPUs 0 and 1.
    int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); });
    int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); });
    if (pid0 < 0 || pid1 < 0) {
      log("Failed to spawn processes.");
      return -1;
    }
    int status0 = wait_process(pid0);
    int status1 = wait_process(pid1);
    if (status0 < 0 || status1 < 0) {
      log("Failed to wait for processes.");
      return -1;
    }
    if (status0 != 0 || status1 != 0) {
      log("One of the processes failed.");
      return -1;
    }
    log("Succeed!");
    return 0;
  } else if (argc == 4) {
    // Inter-node mode: parse and validate the command-line arguments.
    std::string ipPort = argv[1];
    int rank, gpuId;
    try {
      rank = std::stoi(argv[2]);
      gpuId = std::stoi(argv[3]);
    } catch (const std::exception &) {
      log("Error: rank and gpu_id must be valid integers.");
      return -1;
    }
    // Only ranks 0 and 1 participate (worker hard-codes nRanks = 2). The
    // previous check (`rank > 2`) wrongly accepted rank == 2, contradicting
    // the error message below.
    if (rank < 0 || rank > 1 || gpuId < 0) {
      log("Error: rank must be between 0 and 1 and gpu_id must be non-negative.");
      return -1;
    }
    worker(rank, gpuId, ipPort);
    log("Rank ", rank, ": Succeed!");
    return 0;
  } else {
    std::cerr << "Usage:\n"
              << " " << argv[0] << " Run in intra-node mode\n"
              << " " << argv[0] << " <ip_port> <rank> <gpu_id> Run in inter-node mode\n";
    return -1;
  }
}

View File

@@ -84,6 +84,11 @@ class Algorithm {
/// @return The Constraint struct specifying worldSize and nRanksPerNode requirements.
virtual Constraint constraint() const = 0;
/// Set the valid message size range for this algorithm.
/// @param minMessageSize Minimum supported message size in bytes.
/// @param maxMessageSize Maximum supported message size in bytes.
virtual void setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) = 0;
/// Execute the algorithm.
/// @param comm The communicator to use.
/// @param input Pointer to the input buffer.
@@ -96,12 +101,16 @@ class Algorithm {
/// @param executor The executor for DSL algorithms (may be nullptr for native).
/// @param nBlocks Number of CUDA blocks (0 for auto-selection).
/// @param nThreadsPerBlock Number of threads per block (0 for auto-selection).
/// @param symmetricMemory Whether to use symmetric memory optimization.
/// @param extras Additional parameters for algorithm-specific customization.
/// @param accumDtype Data type for accumulation during reduction. DataType::AUTO resolves to dtype.
/// @return The result of the operation.
virtual CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
const std::unordered_map<std::string, uintptr_t>& extras = {}) = 0;
bool symmetricMemory = false,
const std::unordered_map<std::string, uintptr_t>& extras = {},
DataType accumDtype = DataType::AUTO) = 0;
/// Reset the algorithm state, clearing any cached contexts.
virtual void reset() = 0;
@@ -179,10 +188,11 @@ class NativeAlgorithm : public Algorithm {
/// @param nBlocks Number of CUDA blocks.
/// @param nThreadsPerBlock Number of threads per block.
/// @param extras Additional algorithm-specific parameters.
/// @param accumDtype Data type for accumulation (resolved from input dtype if sentinel).
/// @return The result of the operation.
using KernelFunc =
std::function<CommResult(const std::shared_ptr<void>, const void*, void*, size_t, size_t, DataType, ReduceOp,
cudaStream_t, int, int, const std::unordered_map<std::string, uintptr_t>&)>;
cudaStream_t, int, int, const std::unordered_map<std::string, uintptr_t>&, DataType)>;
/// Function type for creating algorithm contexts.
/// @param comm The communicator.
@@ -201,9 +211,10 @@ class NativeAlgorithm : public Algorithm {
/// @param inputSize Size of the input buffer.
/// @param outputSize Size of the output buffer.
/// @param dtype Data type of the elements.
/// @param symmetricMemory Whether symmetric memory is enabled.
/// @return A key uniquely identifying this buffer configuration.
using ContextKeyGenFunc = std::function<AlgorithmCtxKey(const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype)>;
size_t outputSize, DataType dtype, bool symmetricMemory)>;
/// Construct a NativeAlgorithm.
/// @param name Human-readable name of the algorithm.
@@ -225,10 +236,12 @@ class NativeAlgorithm : public Algorithm {
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
bool symmetricMemory = false, const std::unordered_map<std::string, uintptr_t>& extras = {},
DataType accumDtype = DataType::AUTO) override;
const std::string& name() const override;
const std::string& collective() const override;
const std::pair<size_t, size_t>& messageRange() const override;
void setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) override;
const std::unordered_map<std::string, uint64_t>& tags() const override;
const CollectiveBufferMode& bufferMode() const override;
AlgorithmType type() const override { return AlgorithmType::Native; }
@@ -269,12 +282,14 @@ class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enab
const std::string& name() const override;
const std::string& collective() const override;
const std::pair<size_t, size_t>& messageRange() const override;
void setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) override;
const std::unordered_map<std::string, uint64_t>& tags() const override;
const CollectiveBufferMode& bufferMode() const override;
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
bool symmetricMemory = false, const std::unordered_map<std::string, uintptr_t>& extras = {},
DataType accumDtype = DataType::AUTO) override;
AlgorithmType type() const override { return AlgorithmType::DSL; }
Constraint constraint() const override;
void reset() override;
@@ -299,6 +314,7 @@ struct CollectiveRequest {
const void* inputBuffer;
void* outputBuffer;
size_t messageSize;
cudaStream_t stream;
const std::string& collective;
const DataType dtype;
const std::unordered_map<std::string, std::vector<uint64_t>>& hints;
@@ -358,6 +374,10 @@ class AlgorithmCollection {
AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
};
/// Get a default GPU flag buffer (allocated once and reused).
/// @return A pair of (shared_ptr to the flag buffer, size in bytes).
std::pair<std::shared_ptr<void>, size_t> getFlagBuffer();
} // namespace mscclpp
#endif // MSCCLPP_ALGORITHM_HPP_

View File

@@ -19,11 +19,11 @@
#else // defined(DEBUG_BUILD)
#if defined(MSCCLPP_DEVICE_HIP)
extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
const char *__function);
extern "C" __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line,
const char* __function);
#else // !defined(MSCCLPP_DEVICE_HIP)
extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
const char *__function) __THROW;
extern "C" __host__ __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line,
const char* __function) __THROW;
#endif // !defined(MSCCLPP_DEVICE_HIP)
/// Assert a condition on the device and print a message if the condition is false.

View File

@@ -38,7 +38,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_o
return cuda::atomic_ref<T, Scope>{*ptr}.fetch_add(val, memoryOrder);
}
#elif defined(MSCCLPP_DEVICE_HIP)
#else // !defined(MSCCLPP_DEVICE_CUDA)
constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED;
constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE;
@@ -46,7 +46,6 @@ constexpr auto memoryOrderRelease = __ATOMIC_RELEASE;
constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL;
constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST;
// HIP does not have thread scope enums like CUDA
constexpr auto scopeSystem = 0;
constexpr auto scopeDevice = 0;
@@ -65,7 +64,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrde
return __atomic_fetch_add(ptr, val, memoryOrder);
}
#endif // defined(MSCCLPP_DEVICE_HIP)
#endif // !defined(MSCCLPP_DEVICE_CUDA)
} // namespace mscclpp

View File

@@ -381,11 +381,19 @@ struct EndpointConfig {
/// These settings are only used when the transport is an InfiniBand type (IB0-IB7); they are ignored for other
/// transports.
struct Ib {
/// IB mode for signaling, used to select between different implementations.
enum class Mode {
Default, // Use the MSCCLPP_IBV_MODE environment variable (or "host" if unset).
Host, // Use the host stack with RDMA atomics.
HostNoAtomic // Use the host stack with write-with-immediate signaling (no RDMA atomics).
};
static constexpr int DefaultPort = -1;
static constexpr int DefaultGidIndex = 0;
static constexpr int DefaultGidIndex = -1;
static constexpr int DefaultMaxCqSize = 1024;
static constexpr int DefaultMaxCqPollNum = 1;
static constexpr int DefaultMaxSendWr = 8192;
static constexpr int DefaultMaxRecvWr = 16;
static constexpr int DefaultMaxWrPerSend = 64;
/// Device index. Currently ignored; use transport type (IB0-IB7) to select device.
@@ -394,32 +402,41 @@ struct EndpointConfig {
int port;
/// GID index.
int gidIndex;
/// Maximum size of the completion queue.
/// Maximum size of the send completion queue.
int maxCqSize;
/// Maximum number of completion queue polls per operation.
/// Maximum number of send completion queue polls per operation.
int maxCqPollNum;
/// Maximum number of outstanding send work requests.
int maxSendWr;
/// Maximum number of outstanding receive work requests (used in HostNoAtomic mode for write-with-immediate).
int maxRecvWr;
/// Maximum number of work requests per send operation.
int maxWrPerSend;
/// IB mode for signaling. When set to Default, uses the MSCCLPP_IBV_MODE environment variable.
Mode mode;
/// Constructor.
/// @param deviceIndex Device index.
/// @param port Port number.
/// @param gidIndex GID index.
/// @param maxCqSize Maximum completion queue size.
/// @param maxCqPollNum Maximum completion queue poll count.
/// @param gidIndex GID index. If -1 (default), uses `MSCCLPP_IB_GID_INDEX` env variable.
/// @param maxCqSize Maximum send completion queue size.
/// @param maxCqPollNum Maximum send completion queue poll count.
/// @param maxSendWr Maximum outstanding send work requests.
/// @param maxRecvWr Maximum outstanding receive work requests (for HostNoAtomic mode).
/// @param maxWrPerSend Maximum work requests per send operation.
/// @param mode IB mode for signaling (Default uses MSCCLPP_IBV_MODE env variable).
Ib(int deviceIndex = -1, int port = DefaultPort, int gidIndex = DefaultGidIndex, int maxCqSize = DefaultMaxCqSize,
int maxCqPollNum = DefaultMaxCqPollNum, int maxSendWr = DefaultMaxSendWr, int maxWrPerSend = DefaultMaxWrPerSend)
int maxCqPollNum = DefaultMaxCqPollNum, int maxSendWr = DefaultMaxSendWr, int maxRecvWr = DefaultMaxRecvWr,
int maxWrPerSend = DefaultMaxWrPerSend, Mode mode = Mode::Default)
: deviceIndex(deviceIndex),
port(port),
gidIndex(gidIndex),
maxCqSize(maxCqSize),
maxCqPollNum(maxCqPollNum),
maxSendWr(maxSendWr),
maxWrPerSend(maxWrPerSend) {}
maxRecvWr(maxRecvWr),
maxWrPerSend(maxWrPerSend),
mode(mode) {}
};
/// Communication transport type (e.g., CudaIpc, IB0-IB7, Ethernet).
@@ -658,6 +675,7 @@ class Connection {
friend class SemaphoreStub;
friend class Semaphore;
friend class ProxyService;
friend class BaseConnection;
};
/// SemaphoreStub object only used for constructing Semaphore, not for direct use by the user.

View File

@@ -54,6 +54,12 @@ class Env {
/// default libibverbs library found in the system.
const std::string ibvSo;
/// Env name: `MSCCLPP_IBV_MODE`. Selects the IB stack implementation for PortChannel.
/// Allowed values:
/// - "host": use the host stack with RDMA atomics (default).
/// - "host-no-atomic": use the host stack with write-with-immediate signaling (no RDMA atomics).
const std::string ibvMode;
/// Env name: `MSCCLPP_HOSTID`. A string that uniquely identifies the host. If unset, it will use the hostname.
/// This is used to determine whether the host is the same across different processes.
const std::string hostid;
@@ -70,9 +76,9 @@ class Env {
/// Env name: `MSCCLPP_COMM_ID`. To be deprecated; don't use this.
const std::string commId;
/// Env name: `MSCCLPP_EXECUTION_PLAN_DIR`. The directory to find execution plans from. This should be set to
/// use execution plans for the NCCL API. Unset by default.
const std::string executionPlanDir;
/// Env name: `MSCCLPP_CACHE_DIR`. The directory to use for caching execution plans and other temporary files.
/// If unset, it defaults to `~/.cache/mscclpp`.
const std::string cacheDir;
/// Env name: `MSCCLPP_NPKIT_DUMP_DIR`. The directory to dump NPKIT traces to. If this is set, NPKIT will be
/// enabled and will dump traces to this directory. Unset by default.
@@ -92,17 +98,27 @@ class Env {
/// debugging purposes. Currently supports `all`, `broadcast`, `allreduce`, `reducescatter`, and `allgather`.
const std::string forceNcclFallbackOperation;
/// Env name: `MSCCLPP_DISABLE_CHANNEL_CACHE`. If set to true, it will disable the channel cache for NCCL APIs.
/// Currently, this should be set to true if the application may call NCCL APIs on the same local buffer with
/// different remote buffers, e.g., in the case of a dynamic communicator. If CUDA/HIP graphs are used, disabling
/// the channel cache won't affect the performance, but otherwise it may lead to performance degradation.
/// Env name: `MSCCLPP_NCCL_SYMMETRIC_MEMORY`. If set to true, it indicates that the application uses symmetric memory
/// allocation across all ranks, making it safe to cache memory handles for all NCCL algorithms. If set to false, the
/// system will either use non-zero-copy algorithms (when CUDA/HIP graphs are not enabled) or set up new connections
/// every time (when CUDA/HIP graphs are enabled). This should be set to false if the application may call NCCL APIs
/// on the same local buffer with different remote buffers, e.g., in the case of a dynamic communicator.
/// Default is false.
const bool disableChannelCache;
const bool ncclSymmetricMemory;
/// Env name: `MSCCLPP_FORCE_DISABLE_NVLS`. If set to true, it will disable the NVLS support in MSCCL++.
/// Default is false.
const bool forceDisableNvls;
/// Env name: `MSCCLPP_FORCE_DISABLE_GDR`. If set to true, it will disable the GDRCopy support in MSCCL++.
/// When false (default), GDRCopy is auto-detected and enabled if the gdrcopy driver is loaded.
/// Default is false.
const bool forceDisableGdr;
/// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport.
/// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified).
const int ibGidIndex;
private:
Env();

View File

@@ -47,7 +47,8 @@ class AlgorithmCollectionBuilder {
/// @return The built AlgorithmCollection containing all registered algorithms.
AlgorithmCollection build();
AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, int rank);
AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer,
size_t flagBufferSize, int rank);
private:
AlgorithmCollectionBuilder() = default;
@@ -55,7 +56,8 @@ class AlgorithmCollectionBuilder {
AlgoSelectFunc algoSelector_ = nullptr;
AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize);
AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize,
uintptr_t flagBuffer, size_t flagBufferSize);
AlgorithmCollection buildDefaultDslAlgorithms(int rank);
static std::shared_ptr<AlgorithmCollectionBuilder> gAlgorithmCollectionBuilder_;

View File

@@ -15,6 +15,7 @@ using cudaGraphExec_t = hipGraphExec_t;
using cudaDeviceProp = hipDeviceProp_t;
using cudaStream_t = hipStream_t;
using cudaStreamCaptureMode = hipStreamCaptureMode;
using cudaStreamCaptureStatus = hipStreamCaptureStatus;
using cudaMemcpyKind = hipMemcpyKind;
using cudaIpcMemHandle_t = hipIpcMemHandle_t;
@@ -35,6 +36,9 @@ constexpr auto cudaErrorNotSupported = hipErrorNotSupported;
constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking;
constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal;
constexpr auto cudaStreamCaptureModeRelaxed = hipStreamCaptureModeRelaxed;
constexpr auto cudaStreamCaptureStatusNone = hipStreamCaptureStatusNone;
constexpr auto cudaStreamCaptureStatusActive = hipStreamCaptureStatusActive;
constexpr auto cudaStreamCaptureStatusInvalidated = hipStreamCaptureStatusInvalidated;
constexpr auto cudaHostAllocMapped = hipHostMallocMapped;
constexpr auto cudaHostAllocWriteCombined = hipHostMallocWriteCombined;
constexpr auto cudaMemcpyDefault = hipMemcpyDefault;
@@ -98,6 +102,7 @@ constexpr auto CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = HIP_POINTER_ATTRIBUTE_DEVIC
#define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__)
#define cudaStreamEndCapture(...) hipStreamEndCapture(__VA_ARGS__)
#define cudaStreamDestroy(...) hipStreamDestroy(__VA_ARGS__)
#define cudaStreamIsCapturing(...) hipStreamIsCapturing(__VA_ARGS__)
#define cudaGraphCreate(...) hipGraphCreate(__VA_ARGS__)
#define cudaGraphInstantiate(...) hipGraphInstantiate(__VA_ARGS__)
#define cudaGraphLaunch(...) hipGraphLaunch(__VA_ARGS__)

File diff suppressed because it is too large Load Diff

View File

@@ -29,7 +29,9 @@ class Proxy {
public:
/// Constructor.
/// @param handler Handler for each FIFO trigger.
/// @param threadInit Optional function run in proxy thread before FIFO consumption.
/// @param threadInit Optional function run once in the proxy thread before FIFO consumption.
/// The function should initialize thread runtime context before any CUDA API call in that thread
/// (for example, set CUDA device and optionally bind NUMA affinity).
/// @param fifoSize FIFO size (default: DEFAULT_FIFO_SIZE).
Proxy(ProxyHandler handler, std::function<void()> threadInit, int fifoSize = DEFAULT_FIFO_SIZE);

View File

@@ -16,6 +16,7 @@ namespace mscclpp {
class Host2DeviceSemaphore {
private:
Semaphore semaphore_;
std::shared_ptr<uint64_t> inboundToken_;
detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
std::unique_ptr<uint64_t> outboundToken_;
@@ -29,6 +30,15 @@ class Host2DeviceSemaphore {
/// @param connection The connection associated with this semaphore.
Host2DeviceSemaphore(Communicator& communicator, const Connection& connection);
/// Destructor.
~Host2DeviceSemaphore();
/// Move constructor.
Host2DeviceSemaphore(Host2DeviceSemaphore&&) noexcept = default;
/// Move assignment operator.
Host2DeviceSemaphore& operator=(Host2DeviceSemaphore&&) noexcept = default;
/// Returns the connection.
/// @return The connection associated with this semaphore.
Connection& connection();
@@ -82,7 +92,6 @@ class MemoryDevice2DeviceSemaphore {
private:
Semaphore semaphore_;
detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
detail::UniqueGpuPtr<uint64_t> outboundToken_;
public:
/// Constructor.

View File

@@ -82,19 +82,20 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
/// Signal remote device, ensures prior memory ops complete.
MSCCLPP_DEVICE_INLINE void signal() {
auto outbound = incOutbound();
#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ == 800)
// Using memoryOrderSeqCst is faster for A100.
atomicStore(remoteInboundToken, outbound, memoryOrderSeqCst);
#else
atomicStore(remoteInboundToken, outbound, memoryOrderRelease);
#if defined(MSCCLPP_DEVICE_CUDA)
asm volatile("red.release.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory");
#elif defined(MSCCLPP_DEVICE_HIP)
(void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelease);
#endif
}
/// Relaxed signal; no memory completion guarantee. Use it only for synchronizing execution, not data.
MSCCLPP_DEVICE_INLINE void relaxedSignal() {
auto outbound = incOutbound();
atomicStore(remoteInboundToken, outbound, memoryOrderRelaxed);
#if defined(MSCCLPP_DEVICE_CUDA)
asm volatile("red.relaxed.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory");
#elif defined(MSCCLPP_DEVICE_HIP)
(void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelaxed);
#endif
}
/// Thread-safe read of expected inbound value.
@@ -121,27 +122,12 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
return atomicLoad<uint64_t, scopeSystem>(inboundToken, memoryOrderRelaxed);
}
/// Thread-safe read of outbound value.
/// @return The outbound value.
MSCCLPP_DEVICE_INLINE uint64_t loadOutbound() {
return atomicLoad<uint64_t, scopeDevice>(outboundToken, memoryOrderRelaxed);
}
/// Thread-safe increment of outbound value.
/// @return The incremented outbound value.
MSCCLPP_DEVICE_INLINE uint64_t incOutbound() {
return atomicFetchAdd<uint64_t, scopeDevice>(outboundToken, 1, memoryOrderRelaxed) + 1;
}
#endif // defined(MSCCLPP_DEVICE_COMPILE)
/// A local memory space where the remote device will write its semaphore value and the local device will read it.
uint64_t* inboundToken;
/// A local memory space where the local device stores the semaphore value to be written to the remote device.
uint64_t* outboundToken;
/// A remote memory space where the local device writes its outboundToken on. This is inboundToken of the
/// remote device.
/// A remote memory space where the local device atomically increments. This is inboundToken of the remote device.
uint64_t* remoteInboundToken;
/// A local memory space where the local device stores the expected value of the inboundToken to wait for.

View File

@@ -80,26 +80,26 @@ struct SwitchChannelDeviceHandle {
: "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
: "l"(ptr)
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e4m3x4>) {
} else if constexpr (std::is_same_v<VectorType, f8_e4m3x4>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e4m3x8>) {
} else if constexpr (std::is_same_v<VectorType, f8_e4m3x8>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];"
: "=r"(val.words[0]), "=r"(val.words[1])
: "l"(ptr)
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e4m3x16>) {
} else if constexpr (std::is_same_v<VectorType, f8_e4m3x16>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];"
: "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
: "l"(ptr)
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e5m2x4>) {
} else if constexpr (std::is_same_v<VectorType, f8_e5m2x4>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e5m2x8>) {
} else if constexpr (std::is_same_v<VectorType, f8_e5m2x8>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];"
: "=r"(val.words[0]), "=r"(val.words[1])
: "l"(ptr)
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e5m2x16>) {
} else if constexpr (std::is_same_v<VectorType, f8_e5m2x16>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];"
: "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
: "l"(ptr)
@@ -148,23 +148,23 @@ struct SwitchChannelDeviceHandle {
asm volatile("multimem.st.relaxed.sys.global.v4.bf16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]),
"r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3])
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e4m3x4>) {
} else if constexpr (std::is_same_v<VectorType, f8_e4m3x4>) {
asm volatile("multimem.st.relaxed.sys.global.e4m3x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e4m3x8>) {
} else if constexpr (std::is_same_v<VectorType, f8_e4m3x8>) {
asm volatile("multimem.st.relaxed.sys.global.v2.e4m3x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]),
"r"(val.words[1])
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e4m3x16>) {
} else if constexpr (std::is_same_v<VectorType, f8_e4m3x16>) {
asm volatile("multimem.st.relaxed.sys.global.v4.e4m3x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]),
"r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3])
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e5m2x4>) {
} else if constexpr (std::is_same_v<VectorType, f8_e5m2x4>) {
asm volatile("multimem.st.relaxed.sys.global.e5m2x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e5m2x8>) {
} else if constexpr (std::is_same_v<VectorType, f8_e5m2x8>) {
asm volatile("multimem.st.relaxed.sys.global.v2.e5m2x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]),
"r"(val.words[1])
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e5m2x16>) {
} else if constexpr (std::is_same_v<VectorType, f8_e5m2x16>) {
asm volatile("multimem.st.relaxed.sys.global.v4.e5m2x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]),
"r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3])
: "memory");

View File

@@ -4,6 +4,10 @@
add_subdirectory(csrc)
add_subdirectory(test)
target_compile_definitions(mscclpp_py PRIVATE
$<$<BOOL:${MSCCLPP_DISABLE_NB_LEAK_WARNINGS}>:MSCCLPP_DISABLE_NB_LEAK_WARNINGS>
)
add_custom_target(pytest_lib_copy ALL
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_mscclpp.*.so
@@ -12,4 +16,4 @@ add_custom_target(pytest_lib_copy ALL
${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_ext.*.so
${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp
DEPENDS mscclpp_py mscclpp_py_test
)
)

View File

@@ -24,4 +24,7 @@ set_target_properties(mscclpp_py PROPERTIES OUTPUT_NAME _mscclpp)
set_target_properties(mscclpp_py PROPERTIES INSTALL_RPATH "\$ORIGIN/lib")
target_link_libraries(mscclpp_py PRIVATE dlpack mscclpp mscclpp_collectives ${GPU_LIBRARIES})
target_include_directories(mscclpp_py SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
if(MSCCLPP_USE_ROCM)
target_compile_definitions(mscclpp_py PRIVATE MSCCLPP_USE_ROCM)
endif()
install(TARGETS mscclpp_py LIBRARY DESTINATION .)

View File

@@ -16,14 +16,16 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_algorithm(nb::module_& m) {
nb::enum_<CollectiveBufferMode>(m, "CollectiveBufferMode")
nb::enum_<CollectiveBufferMode>(m, "CppCollectiveBufferMode")
.value("ANY", CollectiveBufferMode::Any)
.value("IN_PLACE", CollectiveBufferMode::InPlace)
.value("OUT_OF_PLACE", CollectiveBufferMode::OutOfPlace);
nb::enum_<AlgorithmType>(m, "AlgorithmType").value("NATIVE", AlgorithmType::Native).value("DSL", AlgorithmType::DSL);
nb::enum_<AlgorithmType>(m, "CppAlgorithmType")
.value("NATIVE", AlgorithmType::Native)
.value("DSL", AlgorithmType::DSL);
nb::enum_<CommResult>(m, "CommResult")
nb::enum_<CommResult>(m, "CppCommResult")
.value("COMM_SUCCESS", CommResult::CommSuccess)
.value("COMM_UNHANDLED_CUDA_ERROR", CommResult::CommUnhandledCudaError)
.value("COMM_SYSTEM_ERROR", CommResult::CommSystemError)
@@ -34,13 +36,13 @@ void register_algorithm(nb::module_& m) {
.value("COMM_IN_PROGRESS", CommResult::CommInProgress)
.value("COMM_NUM_RESULTS", CommResult::CommNumResults);
nb::enum_<ReduceOp>(m, "ReduceOp")
nb::enum_<ReduceOp>(m, "CppReduceOp")
.value("SUM", ReduceOp::SUM)
.value("MIN", ReduceOp::MIN)
.value("NOP", ReduceOp::NOP);
auto algorithmClass =
nb::class_<Algorithm>(m, "Algorithm")
nb::class_<Algorithm>(m, "CppAlgorithm")
.def_static(
"from_native_capsule",
[](nb::capsule cap) {
@@ -58,6 +60,12 @@ void register_algorithm(nb::module_& m) {
.def_prop_ro("name", &Algorithm::name)
.def_prop_ro("collective", &Algorithm::collective)
.def_prop_ro("message_range", &Algorithm::messageRange)
.def(
"set_message_size_range",
[](Algorithm& self, size_t minMessageSize, size_t maxMessageSize) {
self.setMessageSizeRange(minMessageSize, maxMessageSize);
},
nb::arg("min_message_size"), nb::arg("max_message_size"))
.def_prop_ro("tags", &Algorithm::tags)
.def_prop_ro("buffer_mode", &Algorithm::bufferMode)
.def_prop_ro("constraint", &Algorithm::constraint)
@@ -67,16 +75,19 @@ void register_algorithm(nb::module_& m) {
"execute",
[](Algorithm& self, std::shared_ptr<Communicator> comm, uintptr_t input, uintptr_t output,
size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream,
std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock,
std::unordered_map<std::string, uintptr_t> extras) {
std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock, bool symmetricMemory,
std::unordered_map<std::string, uintptr_t> extras, int32_t accumDtype) {
return self.execute(comm, reinterpret_cast<const void*>(input), reinterpret_cast<void*>(output),
inputSize, outputSize, dtype, op, reinterpret_cast<cudaStream_t>(stream), executor,
nBlocks, nThreadsPerBlock, extras);
nBlocks, nThreadsPerBlock, symmetricMemory, extras,
static_cast<DataType>(accumDtype));
},
nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"),
nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr,
nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0,
nb::arg("extras") = std::unordered_map<std::string, uintptr_t>());
nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, nb::arg("symmetric_memory") = false,
nb::arg("extras") = std::unordered_map<std::string, uintptr_t>(),
nb::arg("accum_dtype") = static_cast<int32_t>(DataType::AUTO))
.def("reset", &Algorithm::reset);
nb::class_<Algorithm::Constraint>(algorithmClass, "Constraint")
.def(nb::init<>())
@@ -84,21 +95,21 @@ void register_algorithm(nb::module_& m) {
.def_rw("world_size", &Algorithm::Constraint::worldSize)
.def_rw("n_ranks_per_node", &Algorithm::Constraint::nRanksPerNode);
nb::class_<AlgorithmBuilder>(m, "AlgorithmBuilder").def("build", &AlgorithmBuilder::build);
nb::class_<AlgorithmBuilder>(m, "CppAlgorithmBuilder").def("build", &AlgorithmBuilder::build);
nb::class_<DslAlgorithm, Algorithm>(m, "DslAlgorithm")
nb::class_<DslAlgorithm, Algorithm>(m, "CppDslAlgorithm")
.def(nb::init<std::string, ExecutionPlan, std::unordered_map<std::string, uint64_t>, Algorithm::Constraint>(),
nb::arg("id"), nb::arg("plan"), nb::arg("tags") = std::unordered_map<std::string, uint64_t>(),
nb::arg("constraint") = Algorithm::Constraint())
.def("build", &DslAlgorithm::build);
nb::class_<AlgorithmCollection>(m, "AlgorithmCollection")
nb::class_<AlgorithmCollection>(m, "CppAlgorithmCollection")
.def("register_algorithm", &AlgorithmCollection::registerAlgorithm, nb::arg("collective"), nb::arg("algo_name"),
nb::arg("algorithm"))
.def("get_algorithms_by_collective", &AlgorithmCollection::getAlgorithmsByCollective, nb::arg("collective"))
.def("to_list", &AlgorithmCollection::getAllAlgorithms);
nb::class_<CollectiveRequest>(m, "CollectiveRequest")
nb::class_<CollectiveRequest>(m, "CppCollectiveRequest")
.def_ro("world_size", &CollectiveRequest::worldSize)
.def_ro("n_ranks_per_node", &CollectiveRequest::nRanksPerNode)
.def_ro("rank", &CollectiveRequest::rank)
@@ -107,8 +118,22 @@ void register_algorithm(nb::module_& m) {
.def_prop_ro("output_buffer",
[](const CollectiveRequest& self) { return reinterpret_cast<uintptr_t>(self.outputBuffer); })
.def_ro("message_size", &CollectiveRequest::messageSize)
.def_prop_ro("stream", [](const CollectiveRequest& self) { return reinterpret_cast<uintptr_t>(self.stream); })
.def_prop_ro("collective", [](const CollectiveRequest& self) { return self.collective; })
.def_ro("dtype", &CollectiveRequest::dtype)
.def_prop_ro("hints", [](const CollectiveRequest& self) { return self.hints; })
.def("buffer_mode", &CollectiveRequest::bufferMode);
m.def(
"cpp_get_flag_buffer",
[]() {
auto [buffer, size] = getFlagBuffer();
uintptr_t ptr = reinterpret_cast<uintptr_t>(buffer.get());
// Transfer shared_ptr ownership into a capsule so Python's GC manages the lifetime.
auto prevent = std::make_unique<std::shared_ptr<void>>(std::move(buffer));
nb::capsule owner(prevent.get(), [](void* p) noexcept { delete static_cast<std::shared_ptr<void>*>(p); });
prevent.release(); // capsule now owns the pointer
return nb::make_tuple(ptr, size, owner);
},
"Get the default flag buffer. Returns a tuple of (buffer_ptr, buffer_size, owner).");
}

View File

@@ -32,21 +32,25 @@ extern void register_algorithm_collection_builder(nb::module_& m);
template <typename T>
void def_shared_future(nb::handle& m, const std::string& typestr) {
std::string pyclass_name = std::string("shared_future_") + typestr;
std::string pyclass_name = std::string("CppSharedFuture_") + typestr;
nb::class_<std::shared_future<T>>(m, pyclass_name.c_str()).def("get", &std::shared_future<T>::get);
}
void register_core(nb::module_& m) {
m.def("version", &version);
nb::enum_<DataType>(m, "DataType")
nb::enum_<DataType>(m, "CppDataType")
.value("int32", DataType::INT32)
.value("uint32", DataType::UINT32)
.value("float16", DataType::FLOAT16)
.value("float32", DataType::FLOAT32)
.value("bfloat16", DataType::BFLOAT16);
.value("bfloat16", DataType::BFLOAT16)
.value("float8_e4m3", DataType::FLOAT8_E4M3)
.value("float8_e5m2", DataType::FLOAT8_E5M2)
.value("uint8", DataType::UINT8)
.value("float8_e4m3b15", DataType::FLOAT8_E4M3B15);
nb::class_<Bootstrap>(m, "Bootstrap")
nb::class_<Bootstrap>(m, "CppBootstrap")
.def("get_rank", &Bootstrap::getRank)
.def("get_n_ranks", &Bootstrap::getNranks)
.def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode)
@@ -71,7 +75,7 @@ void register_core(nb::module_& m) {
.def("recv", static_cast<void (Bootstrap::*)(std::vector<char>&, int, int)>(&Bootstrap::recv), nb::arg("data"),
nb::arg("peer"), nb::arg("tag"));
nb::class_<UniqueId>(m, "UniqueId")
nb::class_<UniqueId>(m, "CppUniqueId")
.def(nb::init<>())
.def("__setstate__",
[](UniqueId& self, nb::bytes b) {
@@ -81,7 +85,7 @@ void register_core(nb::module_& m) {
.def("__getstate__",
[](const UniqueId& self) { return nb::bytes(reinterpret_cast<const char*>(self.data()), UniqueIdBytes); });
nb::class_<TcpBootstrap, Bootstrap>(m, "TcpBootstrap")
nb::class_<TcpBootstrap, Bootstrap>(m, "CppTcpBootstrap")
.def(nb::init<int, int>(), "Do not use this constructor. Use create instead.")
.def_static(
"create", [](int rank, int nRanks) { return std::make_shared<TcpBootstrap>(rank, nRanks); }, nb::arg("rank"),
@@ -93,7 +97,7 @@ void register_core(nb::module_& m) {
.def("initialize", static_cast<void (TcpBootstrap::*)(const std::string&, int64_t)>(&TcpBootstrap::initialize),
nb::call_guard<nb::gil_scoped_release>(), nb::arg("if_ip_port_trio"), nb::arg("timeout_sec") = 30);
nb::enum_<Transport>(m, "Transport")
nb::enum_<Transport>(m, "CppTransport")
.value("Unknown", Transport::Unknown)
.value("CudaIpc", Transport::CudaIpc)
.value("IB0", Transport::IB0)
@@ -106,7 +110,7 @@ void register_core(nb::module_& m) {
.value("IB7", Transport::IB7)
.value("NumTransports", Transport::NumTransports);
nb::class_<TransportFlags>(m, "TransportFlags")
nb::class_<TransportFlags>(m, "CppTransportFlags")
.def(nb::init<>())
.def(nb::init_implicit<Transport>(), nb::arg("transport"))
.def("has", &TransportFlags::has, nb::arg("transport"))
@@ -130,12 +134,12 @@ void register_core(nb::module_& m) {
.def(nb::self == nb::self)
.def(nb::self != nb::self);
nb::enum_<DeviceType>(m, "DeviceType")
nb::enum_<DeviceType>(m, "CppDeviceType")
.value("Unknown", DeviceType::Unknown)
.value("CPU", DeviceType::CPU)
.value("GPU", DeviceType::GPU);
nb::class_<Device>(m, "Device")
nb::class_<Device>(m, "CppDevice")
.def(nb::init<>())
.def(nb::init_implicit<DeviceType>(), nb::arg("type"))
.def(nb::init<DeviceType, int>(), nb::arg("type"), nb::arg("id") = -1)
@@ -147,24 +151,33 @@ void register_core(nb::module_& m) {
return ss.str();
});
nb::class_<EndpointConfig::Ib>(m, "EndpointConfigIb")
nb::enum_<EndpointConfig::Ib::Mode>(m, "CppIbMode")
.value("Default", EndpointConfig::Ib::Mode::Default)
.value("Host", EndpointConfig::Ib::Mode::Host)
.value("HostNoAtomic", EndpointConfig::Ib::Mode::HostNoAtomic);
nb::class_<EndpointConfig::Ib>(m, "CppEndpointConfigIb")
.def(nb::init<>())
.def(nb::init<int, int, int, int, int, int, int>(), nb::arg("device_index") = -1,
.def(nb::init<int, int, int, int, int, int, int, int, EndpointConfig::Ib::Mode>(), nb::arg("device_index") = -1,
nb::arg("port") = EndpointConfig::Ib::DefaultPort,
nb::arg("gid_index") = EndpointConfig::Ib::DefaultGidIndex,
nb::arg("max_cq_size") = EndpointConfig::Ib::DefaultMaxCqSize,
nb::arg("max_cq_poll_num") = EndpointConfig::Ib::DefaultMaxCqPollNum,
nb::arg("max_send_wr") = EndpointConfig::Ib::DefaultMaxSendWr,
nb::arg("max_wr_per_send") = EndpointConfig::Ib::DefaultMaxWrPerSend)
nb::arg("max_recv_wr") = EndpointConfig::Ib::DefaultMaxRecvWr,
nb::arg("max_wr_per_send") = EndpointConfig::Ib::DefaultMaxWrPerSend,
nb::arg("mode") = EndpointConfig::Ib::Mode::Default)
.def_rw("device_index", &EndpointConfig::Ib::deviceIndex)
.def_rw("port", &EndpointConfig::Ib::port)
.def_rw("gid_index", &EndpointConfig::Ib::gidIndex)
.def_rw("max_cq_size", &EndpointConfig::Ib::maxCqSize)
.def_rw("max_cq_poll_num", &EndpointConfig::Ib::maxCqPollNum)
.def_rw("max_send_wr", &EndpointConfig::Ib::maxSendWr)
.def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend);
.def_rw("max_recv_wr", &EndpointConfig::Ib::maxRecvWr)
.def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend)
.def_rw("mode", &EndpointConfig::Ib::mode);
nb::class_<RegisteredMemory>(m, "RegisteredMemory")
nb::class_<RegisteredMemory>(m, "CppRegisteredMemory")
.def(nb::init<>())
.def("data", [](RegisteredMemory& self) { return reinterpret_cast<uintptr_t>(self.data()); })
.def("size", &RegisteredMemory::size)
@@ -172,7 +185,7 @@ void register_core(nb::module_& m) {
.def("serialize", &RegisteredMemory::serialize)
.def_static("deserialize", &RegisteredMemory::deserialize, nb::arg("data"));
nb::class_<Endpoint>(m, "Endpoint")
nb::class_<Endpoint>(m, "CppEndpoint")
.def("config", &Endpoint::config)
.def("transport", &Endpoint::transport)
.def("device", &Endpoint::device)
@@ -180,7 +193,7 @@ void register_core(nb::module_& m) {
.def("serialize", &Endpoint::serialize)
.def_static("deserialize", &Endpoint::deserialize, nb::arg("data"));
nb::class_<Connection>(m, "Connection")
nb::class_<Connection>(m, "CppConnection")
.def("write", &Connection::write, nb::arg("dst"), nb::arg("dstOffset"), nb::arg("src"), nb::arg("srcOffset"),
nb::arg("size"))
.def(
@@ -197,7 +210,7 @@ void register_core(nb::module_& m) {
.def("local_device", &Connection::localDevice)
.def("get_max_write_queue_size", &Connection::getMaxWriteQueueSize);
nb::class_<EndpointConfig>(m, "EndpointConfig")
nb::class_<EndpointConfig>(m, "CppEndpointConfig")
.def(nb::init<>())
.def(nb::init_implicit<Transport>(), nb::arg("transport"))
.def(nb::init<Transport, Device, int, EndpointConfig::Ib>(), nb::arg("transport"), nb::arg("device"),
@@ -223,12 +236,18 @@ void register_core(nb::module_& m) {
.def_prop_rw(
"ib_max_send_wr", [](EndpointConfig& self) { return self.ib.maxSendWr; },
[](EndpointConfig& self, int v) { self.ib.maxSendWr = v; })
.def_prop_rw(
"ib_max_recv_wr", [](EndpointConfig& self) { return self.ib.maxRecvWr; },
[](EndpointConfig& self, int v) { self.ib.maxRecvWr = v; })
.def_prop_rw(
"ib_max_wr_per_send", [](EndpointConfig& self) { return self.ib.maxWrPerSend; },
[](EndpointConfig& self, int v) { self.ib.maxWrPerSend = v; })
.def_prop_rw(
"ib_mode", [](EndpointConfig& self) { return self.ib.mode; },
[](EndpointConfig& self, EndpointConfig::Ib::Mode v) { self.ib.mode = v; })
.def_rw("max_write_queue_size", &EndpointConfig::maxWriteQueueSize);
nb::class_<Context>(m, "Context")
nb::class_<Context>(m, "CppContext")
.def_static("create", &Context::create)
.def(
"register_memory",
@@ -239,13 +258,13 @@ void register_core(nb::module_& m) {
.def("create_endpoint", &Context::createEndpoint, nb::arg("config"))
.def("connect", &Context::connect, nb::arg("local_endpoint"), nb::arg("remote_endpoint"));
nb::class_<SemaphoreStub>(m, "SemaphoreStub")
nb::class_<SemaphoreStub>(m, "CppSemaphoreStub")
.def(nb::init<const Connection&>(), nb::arg("connection"))
.def("memory", &SemaphoreStub::memory)
.def("serialize", &SemaphoreStub::serialize)
.def_static("deserialize", &SemaphoreStub::deserialize, nb::arg("data"));
nb::class_<Semaphore>(m, "Semaphore")
nb::class_<Semaphore>(m, "CppSemaphore")
.def(nb::init<>())
.def(nb::init<const SemaphoreStub&, const SemaphoreStub&>(), nb::arg("local_stub"), nb::arg("remote_stub"))
.def("connection", &Semaphore::connection)
@@ -256,7 +275,7 @@ void register_core(nb::module_& m) {
def_shared_future<Connection>(m, "Connection");
def_shared_future<Semaphore>(m, "Semaphore");
nb::class_<Communicator>(m, "Communicator")
nb::class_<Communicator>(m, "CppCommunicator")
.def(nb::init<std::shared_ptr<Bootstrap>, std::shared_ptr<Context>>(), nb::arg("bootstrap"),
nb::arg("context") = nullptr)
.def("bootstrap", &Communicator::bootstrap)
@@ -289,6 +308,9 @@ void register_core(nb::module_& m) {
}
NB_MODULE(_mscclpp, m) {
#ifdef MSCCLPP_DISABLE_NB_LEAK_WARNINGS
nb::set_leak_warnings(false);
#endif
register_env(m);
register_error(m);
register_port_channel(m);
@@ -306,4 +328,4 @@ NB_MODULE(_mscclpp, m) {
// ext
register_algorithm_collection_builder(m);
}
}

View File

@@ -11,7 +11,7 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_env(nb::module_& m) {
nb::class_<Env>(m, "Env")
nb::class_<Env>(m, "CppEnv")
.def_ro("debug", &Env::debug)
.def_ro("debug_subsys", &Env::debugSubsys)
.def_ro("debug_file", &Env::debugFile)
@@ -20,9 +20,11 @@ void register_env(nb::module_& m) {
.def_ro("socket_family", &Env::socketFamily)
.def_ro("socket_ifname", &Env::socketIfname)
.def_ro("comm_id", &Env::commId)
.def_ro("execution_plan_dir", &Env::executionPlanDir)
.def_ro("ibv_mode", &Env::ibvMode)
.def_ro("cache_dir", &Env::cacheDir)
.def_ro("npkit_dump_dir", &Env::npkitDumpDir)
.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream);
.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream)
.def_ro("ib_gid_index", &Env::ibGidIndex);
m.def("env", &env);
}

View File

@@ -11,18 +11,18 @@ using namespace mscclpp;
#define REGISTER_EXCEPTION_TRANSLATOR(name_) \
nb::register_exception_translator( \
[](const std::exception_ptr &p, void *payload) { \
[](const std::exception_ptr& p, void* payload) { \
try { \
std::rethrow_exception(p); \
} catch (const name_ &e) { \
PyErr_SetObject(reinterpret_cast<PyObject *>(payload), \
} catch (const name_& e) { \
PyErr_SetObject(reinterpret_cast<PyObject*>(payload), \
PyTuple_Pack(2, PyLong_FromLong(long(e.getErrorCode())), PyUnicode_FromString(e.what()))); \
} \
}, \
m.attr(#name_).ptr());
void register_error(nb::module_ &m) {
nb::enum_<ErrorCode>(m, "ErrorCode")
void register_error(nb::module_& m) {
nb::enum_<ErrorCode>(m, "CppErrorCode")
.value("SystemError", ErrorCode::SystemError)
.value("InternalError", ErrorCode::InternalError)
.value("RemoteError", ErrorCode::RemoteError)

View File

@@ -15,16 +15,16 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_executor(nb::module_& m) {
nb::enum_<PacketType>(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16);
nb::enum_<PacketType>(m, "CppPacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16);
nb::class_<ExecutionPlan>(m, "ExecutionPlan")
nb::class_<ExecutionPlan>(m, "CppExecutionPlan")
.def(nb::init<const std::string&, int>(), nb::arg("planPath"), nb::arg("rank"))
.def_prop_ro("name", [](const ExecutionPlan& self) -> std::string { return self.name(); })
.def_prop_ro("collective", [](const ExecutionPlan& self) -> std::string { return self.collective(); })
.def_prop_ro("min_message_size", [](const ExecutionPlan& self) -> size_t { return self.minMessageSize(); })
.def_prop_ro("max_message_size", [](const ExecutionPlan& self) -> size_t { return self.maxMessageSize(); });
nb::class_<Executor>(m, "Executor")
nb::class_<Executor>(m, "CppExecutor")
.def(nb::init<std::shared_ptr<Communicator>>(), nb::arg("comm"))
.def(
"execute",

View File

@@ -4,6 +4,7 @@
#include <nanobind/nanobind.h>
#include <nanobind/stl/function.h>
#include <nanobind/stl/shared_ptr.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/unordered_map.h>
#include <nanobind/stl/vector.h>
@@ -15,7 +16,7 @@ using namespace mscclpp;
using namespace mscclpp::collective;
void register_algorithm_collection_builder(nb::module_& m) {
nb::class_<AlgorithmCollectionBuilder>(m, "AlgorithmCollectionBuilder")
nb::class_<AlgorithmCollectionBuilder>(m, "CppAlgorithmCollectionBuilder")
.def_static("get_instance", &AlgorithmCollectionBuilder::getInstance)
.def("add_algorithm_builder", &AlgorithmCollectionBuilder::addAlgorithmBuilder, nb::arg("builder"))
.def(
@@ -29,6 +30,6 @@ void register_algorithm_collection_builder(nb::module_& m) {
nb::arg("selector"))
.def("build", &AlgorithmCollectionBuilder::build)
.def("build_default_algorithms", &AlgorithmCollectionBuilder::buildDefaultAlgorithms, nb::arg("scratch_buffer"),
nb::arg("scratch_buffer_size"), nb::arg("rank"))
nb::arg("scratch_buffer_size"), nb::arg("flag_buffer"), nb::arg("flag_buffer_size"), nb::arg("rank"))
.def_static("reset", &AlgorithmCollectionBuilder::reset);
}

View File

@@ -9,7 +9,7 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_fifo(nb::module_& m) {
nb::class_<ProxyTrigger>(m, "ProxyTrigger")
nb::class_<ProxyTrigger>(m, "CppProxyTrigger")
.def_prop_rw(
"fst", [](const ProxyTrigger& self) { return self.fst; },
[](ProxyTrigger& self, uint64_t v) { self.fst = v; })
@@ -17,7 +17,7 @@ void register_fifo(nb::module_& m) {
"snd", [](const ProxyTrigger& self) { return self.snd; },
[](ProxyTrigger& self, uint64_t v) { self.snd = v; });
nb::class_<FifoDeviceHandle>(m, "FifoDeviceHandle")
nb::class_<FifoDeviceHandle>(m, "CppFifoDeviceHandle")
.def_rw("triggers", &FifoDeviceHandle::triggers)
.def_rw("tail", &FifoDeviceHandle::tail)
.def_rw("head", &FifoDeviceHandle::head)
@@ -26,7 +26,7 @@ void register_fifo(nb::module_& m) {
return nb::bytes(reinterpret_cast<const char*>(&self), sizeof(self));
});
nb::class_<Fifo>(m, "Fifo")
nb::class_<Fifo>(m, "CppFifo")
.def(nb::init<int>(), nb::arg("size") = DEFAULT_FIFO_SIZE)
.def("poll", &Fifo::poll)
.def("pop", &Fifo::pop)

View File

@@ -34,6 +34,19 @@ static DLDataType getDlType(std::string type) {
return DLDataType{kDLBfloat, 16, 1};
} else if (type == "torch.float16") {
return DLDataType{kDLFloat, 16, 1};
} else if (type == "torch.float8_e4m3fn") {
return DLDataType{kDLFloat8_e4m3fn, 8, 1};
} else if (type == "torch.float8_e4m3fnuz") {
return DLDataType{kDLFloat8_e4m3fnuz, 8, 1};
} else if (type == "torch.float8_e5m2") {
return DLDataType{kDLFloat8_e5m2, 8, 1};
} else if (type == "torch.float8_e5m2fnuz") {
return DLDataType{kDLFloat8_e5m2fnuz, 8, 1};
} else if (type == "torch.uint8") {
return DLDataType{kDLUInt, 8, 1};
} else if (type == "fp8_e4m3b15") {
// No standard DLPack code for fp8_e4m3b15; store as raw uint8 bytes.
return DLDataType{kDLUInt, 8, 1};
} else {
throw Error("Unsupported type: " + type, ErrorCode::InvalidUsage);
}
@@ -101,7 +114,7 @@ static nb::capsule toDlpack(GpuBuffer<char> buffer, std::string dataType, std::v
void register_gpu_utils(nb::module_& m) {
m.def("is_nvls_supported", &isNvlsSupported);
nb::class_<GpuBuffer<char>>(m, "RawGpuBuffer")
nb::class_<GpuBuffer<char>>(m, "CppRawGpuBuffer")
.def(nb::init<size_t>(), nb::arg("nelems"))
.def("nelems", &GpuBuffer<char>::nelems)
.def("bytes", &GpuBuffer<char>::bytes)

View File

@@ -11,20 +11,20 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_memory_channel(nb::module_& m) {
nb::class_<BaseMemoryChannel>(m, "BaseMemoryChannel")
nb::class_<BaseMemoryChannel>(m, "CppBaseMemoryChannel")
.def(nb::init<>())
.def(nb::init<std::shared_ptr<MemoryDevice2DeviceSemaphore>>(), nb::arg("semaphore"))
.def(nb::init<const Semaphore&>(), nb::arg("semaphore"))
.def("device_handle", &BaseMemoryChannel::deviceHandle);
nb::class_<BaseMemoryChannel::DeviceHandle>(m, "BaseMemoryChannelDeviceHandle")
nb::class_<BaseMemoryChannel::DeviceHandle>(m, "CppBaseMemoryChannelDeviceHandle")
.def(nb::init<>())
.def_rw("semaphore_", &BaseMemoryChannel::DeviceHandle::semaphore_)
.def_prop_ro("raw", [](const BaseMemoryChannel::DeviceHandle& self) -> nb::bytes {
return nb::bytes(reinterpret_cast<const char*>(&self), sizeof(self));
});
nb::class_<MemoryChannel>(m, "MemoryChannel")
nb::class_<MemoryChannel>(m, "CppMemoryChannel")
.def(nb::init<>())
.def(
"__init__",
@@ -42,7 +42,7 @@ void register_memory_channel(nb::module_& m) {
nb::arg("semaphore"), nb::arg("dst"), nb::arg("src"), nb::arg("packet_buffer") = 0)
.def("device_handle", &MemoryChannel::deviceHandle);
nb::class_<MemoryChannel::DeviceHandle>(m, "MemoryChannelDeviceHandle")
nb::class_<MemoryChannel::DeviceHandle>(m, "CppMemoryChannelDeviceHandle")
.def(nb::init<>())
.def_rw("semaphore_", &MemoryChannel::DeviceHandle::semaphore_)
.def_rw("dst_", &MemoryChannel::DeviceHandle::dst_)

View File

@@ -8,8 +8,8 @@
namespace nb = nanobind;
void register_npkit(nb::module_ &m) {
nb::module_ sub_m = m.def_submodule("npkit", "NPKit functions");
void register_npkit(nb::module_& m) {
nb::module_ sub_m = m.def_submodule("cpp_npkit", "NPKit functions");
sub_m.def("init", &NpKit::Init);
sub_m.def("dump", &NpKit::Dump);
sub_m.def("shutdown", &NpKit::Shutdown);

View File

@@ -6,8 +6,8 @@ int getDeviceNumaNode(int cudaDev);
void numaBind(int node);
}; // namespace mscclpp
void register_numa(nb::module_ &m) {
nb::module_ sub_m = m.def_submodule("numa", "numa functions");
void register_numa(nb::module_& m) {
nb::module_ sub_m = m.def_submodule("cpp_numa", "numa functions");
sub_m.def("get_device_numa_node", &mscclpp::getDeviceNumaNode);
sub_m.def("numa_bind", &mscclpp::numaBind);
}

View File

@@ -11,11 +11,11 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_port_channel(nb::module_& m) {
nb::class_<BaseProxyService>(m, "BaseProxyService")
nb::class_<BaseProxyService>(m, "CppBaseProxyService")
.def("start_proxy", &BaseProxyService::startProxy, nb::arg("blocking") = false)
.def("stop_proxy", &BaseProxyService::stopProxy);
nb::class_<ProxyService, BaseProxyService>(m, "ProxyService")
nb::class_<ProxyService, BaseProxyService>(m, "CppProxyService")
.def(nb::init<int>(), nb::arg("fifo_size") = DEFAULT_FIFO_SIZE)
.def("start_proxy", &ProxyService::startProxy, nb::arg("blocking") = false)
.def("stop_proxy", &ProxyService::stopProxy)
@@ -31,13 +31,13 @@ void register_port_channel(nb::module_& m) {
.def("base_port_channel", &ProxyService::basePortChannel, nb::arg("id"))
.def("port_channel", &ProxyService::portChannel, nb::arg("id"), nb::arg("dst"), nb::arg("src"));
nb::class_<BasePortChannel>(m, "BasePortChannel")
nb::class_<BasePortChannel>(m, "CppBasePortChannel")
.def(nb::init<>())
.def(nb::init<SemaphoreId, std::shared_ptr<Host2DeviceSemaphore>, std::shared_ptr<Proxy>>(),
nb::arg("semaphore_id"), nb::arg("semaphore"), nb::arg("proxy"))
.def("device_handle", &BasePortChannel::deviceHandle);
nb::class_<BasePortChannel::DeviceHandle>(m, "BasePortChannelDeviceHandle")
nb::class_<BasePortChannel::DeviceHandle>(m, "CppBasePortChannelDeviceHandle")
.def(nb::init<>())
.def_rw("semaphore_id_", &BasePortChannel::DeviceHandle::semaphoreId_)
.def_rw("semaphore_", &BasePortChannel::DeviceHandle::semaphore_)
@@ -46,13 +46,13 @@ void register_port_channel(nb::module_& m) {
return nb::bytes(reinterpret_cast<const char*>(&self), sizeof(self));
});
nb::class_<PortChannel>(m, "PortChannel")
nb::class_<PortChannel>(m, "CppPortChannel")
.def(nb::init<>())
.def(nb::init<SemaphoreId, std::shared_ptr<Host2DeviceSemaphore>, std::shared_ptr<Proxy>, MemoryId, MemoryId>(),
nb::arg("semaphore_id"), nb::arg("semaphore"), nb::arg("proxy"), nb::arg("dst"), nb::arg("src"))
.def("device_handle", &PortChannel::deviceHandle);
nb::class_<PortChannel::DeviceHandle>(m, "PortChannelDeviceHandle")
nb::class_<PortChannel::DeviceHandle>(m, "CppPortChannelDeviceHandle")
.def(nb::init<>())
.def_rw("semaphore_id_", &PortChannel::DeviceHandle::semaphoreId_)
.def_rw("semaphore_", &PortChannel::DeviceHandle::semaphore_)

View File

@@ -10,7 +10,7 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_semaphore(nb::module_& m) {
nb::class_<Host2DeviceSemaphore> host2DeviceSemaphore(m, "Host2DeviceSemaphore");
nb::class_<Host2DeviceSemaphore> host2DeviceSemaphore(m, "CppHost2DeviceSemaphore");
host2DeviceSemaphore.def(nb::init<const Semaphore&>(), nb::arg("semaphore"))
.def(nb::init<Communicator&, const Connection&>(), nb::arg("communicator"), nb::arg("connection"))
.def("connection", &Host2DeviceSemaphore::connection)
@@ -25,7 +25,7 @@ void register_semaphore(nb::module_& m) {
return nb::bytes(reinterpret_cast<const char*>(&self), sizeof(self));
});
nb::class_<Host2HostSemaphore>(m, "Host2HostSemaphore")
nb::class_<Host2HostSemaphore>(m, "CppHost2HostSemaphore")
.def(nb::init<const Semaphore&>(), nb::arg("semaphore"))
.def(nb::init<Communicator&, const Connection&>(), nb::arg("communicator"), nb::arg("connection"))
.def("connection", &Host2HostSemaphore::connection)
@@ -34,7 +34,7 @@ void register_semaphore(nb::module_& m) {
.def("wait", &Host2HostSemaphore::wait, nb::call_guard<nb::gil_scoped_release>(),
nb::arg("max_spin_count") = 10000000);
nb::class_<MemoryDevice2DeviceSemaphore> memoryDevice2DeviceSemaphore(m, "MemoryDevice2DeviceSemaphore");
nb::class_<MemoryDevice2DeviceSemaphore> memoryDevice2DeviceSemaphore(m, "CppMemoryDevice2DeviceSemaphore");
memoryDevice2DeviceSemaphore.def(nb::init<const Semaphore&>(), nb::arg("semaphore"))
.def(nb::init<Communicator&, const Connection&>(), nb::arg("communicator"), nb::arg("connection"))
.def("connection", &MemoryDevice2DeviceSemaphore::connection)
@@ -43,7 +43,6 @@ void register_semaphore(nb::module_& m) {
nb::class_<MemoryDevice2DeviceSemaphore::DeviceHandle>(memoryDevice2DeviceSemaphore, "DeviceHandle")
.def(nb::init<>())
.def_rw("inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::inboundToken)
.def_rw("outbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::outboundToken)
.def_rw("remote_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::remoteInboundToken)
.def_rw("expected_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::expectedInboundToken)
.def_prop_ro("raw", [](const MemoryDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes {

View File

@@ -15,11 +15,11 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_nvls(nb::module_& m) {
nb::class_<SwitchChannel>(m, "SwitchChannel")
nb::class_<SwitchChannel>(m, "CppSwitchChannel")
.def("get_device_ptr", [](SwitchChannel* self) { return (uintptr_t)self->getDevicePtr(); })
.def("device_handle", &SwitchChannel::deviceHandle);
nb::class_<SwitchChannel::DeviceHandle>(m, "DeviceHandle")
nb::class_<SwitchChannel::DeviceHandle>(m, "CppSwitchChannelDeviceHandle")
.def(nb::init<>())
.def_rw("device_ptr", &SwitchChannel::DeviceHandle::devicePtr)
.def_rw("mc_ptr", &SwitchChannel::DeviceHandle::mcPtr)
@@ -28,7 +28,7 @@ void register_nvls(nb::module_& m) {
return nb::bytes(reinterpret_cast<const char*>(&self), sizeof(self));
});
nb::class_<NvlsConnection>(m, "NvlsConnection")
nb::class_<NvlsConnection>(m, "CppNvlsConnection")
.def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("device_ptr"), nb::arg("size"));
m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("all_ranks"),

View File

@@ -23,35 +23,37 @@ version = {
from ._core import *
from ._mscclpp import (
Device,
DeviceType,
Communicator,
Connection,
CppDevice as Device,
CppDeviceType as DeviceType,
CppCommunicator as Communicator,
CppConnection as Connection,
connect_nvls_collective,
EndpointConfig,
Fifo,
Semaphore,
Host2DeviceSemaphore,
Host2HostSemaphore,
numa,
ProxyService,
RegisteredMemory,
PortChannel,
MemoryChannel,
MemoryDevice2DeviceSemaphore,
TcpBootstrap,
Transport,
TransportFlags,
DataType,
ErrorCode,
Executor,
ExecutionPlan,
PacketType,
RawGpuBuffer,
ReduceOp,
CppEndpointConfig as EndpointConfig,
CppEndpointConfigIb as EndpointConfigIb,
CppIbMode as IbMode,
CppFifo as Fifo,
CppSemaphore as Semaphore,
CppHost2DeviceSemaphore as Host2DeviceSemaphore,
CppHost2HostSemaphore as Host2HostSemaphore,
cpp_numa as numa,
CppProxyService as ProxyService,
CppRegisteredMemory as RegisteredMemory,
CppPortChannel as PortChannel,
CppMemoryChannel as MemoryChannel,
CppMemoryDevice2DeviceSemaphore as MemoryDevice2DeviceSemaphore,
CppTcpBootstrap as TcpBootstrap,
CppTransport as Transport,
CppTransportFlags as TransportFlags,
CppDataType as DataType,
CppErrorCode as ErrorCode,
CppExecutor as Executor,
CppExecutionPlan as ExecutionPlan,
CppPacketType as PacketType,
CppRawGpuBuffer as RawGpuBuffer,
CppReduceOp as ReduceOp,
env,
is_nvls_supported,
npkit,
cpp_npkit as npkit,
)
__all__ = [
@@ -61,6 +63,8 @@ __all__ = [
"Connection",
"connect_nvls_collective",
"EndpointConfig",
"EndpointConfigIb",
"IbMode",
"ErrorCode",
"Fifo",
"Semaphore",

View File

@@ -6,7 +6,7 @@ import shutil
import argparse
from pathlib import Path
from mscclpp.language import default_algos as def_algo
from mscclpp import default_algos as def_algo
from mscclpp.language.collectives import *
from mscclpp.language.utils import AlgoSpec
@@ -57,7 +57,7 @@ default_algo_configs = [
def create_default_plans():
plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp_default")
plan_dir = os.path.join(os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp"), "default")
plan_path = Path(plan_dir)
if plan_path.exists():
shutil.rmtree(plan_path)

View File

@@ -5,9 +5,3 @@ from .algorithm import *
from .comm import *
from .compiler import *
from .buffer import *
__all__ = []
__all__ += algorithm.__all__
__all__ += comm.__all__
__all__ += compiler.__all__
__all__ += buffer.__all__

View File

@@ -4,18 +4,22 @@
from __future__ import annotations
from typing import Optional, Tuple, Dict
from functools import cached_property
import cupy as cp
from mscclpp._mscclpp import (
Algorithm as _Algorithm,
DslAlgorithm as _DslAlgorithm,
AlgorithmType as _AlgorithmType,
Communicator,
CollectiveBufferMode,
DataType,
Executor,
ExecutionPlan,
ReduceOp,
CppAlgorithm,
CppDslAlgorithm,
CppAlgorithmType,
CppCommunicator,
CppCollectiveBufferMode,
CppDataType,
CppExecutor,
CppExecutionPlan,
CppReduceOp,
CppAlgorithmBuilder,
CppAlgorithmCollection,
cpp_get_flag_buffer,
)
__all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"]
@@ -45,7 +49,7 @@ class Algorithm:
"""
def __init__(self, world_size: int = 0, n_ranks_per_node: int = 0):
self._constraint = _Algorithm.Constraint(world_size, n_ranks_per_node)
self._constraint = CppAlgorithm.Constraint(world_size, n_ranks_per_node)
@property
def world_size(self) -> int:
@@ -58,23 +62,23 @@ class Algorithm:
def __init__(
self,
id: Optional[str] = None,
execution_plan: Optional[ExecutionPlan] = None,
native_handle: Optional[_Algorithm] = None,
execution_plan: Optional[CppExecutionPlan] = None,
native_handle: Optional[CppAlgorithm] = None,
tags: Optional[Dict[str, int]] = None,
constraint: Optional[Constraint] = None,
):
if execution_plan is not None:
self._algorithm = _DslAlgorithm(
self._algorithm = CppDslAlgorithm(
id,
execution_plan,
tags=tags if tags is not None else {},
constraint=constraint._constraint if constraint is not None else _Algorithm.Constraint(),
constraint=constraint._constraint if constraint is not None else CppAlgorithm.Constraint(),
)
elif native_handle is not None:
self._algorithm = native_handle
@classmethod
def create_from_native_handle(cls, handle: _Algorithm):
def create_from_native_handle(cls, handle: CppAlgorithm):
"""Create an Algorithm instance from a native C++ algorithm handle.
Args:
@@ -97,7 +101,7 @@ class Algorithm:
Returns:
A new Algorithm instance wrapping the algorithm from the capsule.
"""
handle = _Algorithm.from_native_capsule(obj)
handle = CppAlgorithm.from_native_capsule(obj)
return cls(native_handle=handle)
@cached_property
@@ -110,18 +114,31 @@ class Algorithm:
"""The collective operation this algorithm implements (e.g., "allreduce", "allgather")."""
return self._algorithm.collective
@cached_property
@property
def message_size_range(self) -> Tuple[int, int]:
"""The valid message size range (min_size, max_size) in bytes."""
return (self._algorithm.message_range[0], self._algorithm.message_range[1])
def set_message_size_range(self, min_message_size: int, max_message_size: int):
"""Set the valid message size range in bytes.
Args:
min_message_size: Minimum supported message size in bytes.
max_message_size: Maximum supported message size in bytes.
Only supported for native algorithms. Raises TypeError for DSL algorithms.
"""
if self.is_dsl_algorithm():
raise TypeError("set_message_size_range is only supported for native algorithms")
self._algorithm.set_message_size_range(min_message_size, max_message_size)
@cached_property
def tags(self) -> Dict[str, int]:
"""Dictionary of tag names to tag values for algorithm selection hints."""
return self._algorithm.tags
@cached_property
def buffer_mode(self) -> CollectiveBufferMode:
def buffer_mode(self) -> CppCollectiveBufferMode:
"""The buffer mode supported by this algorithm (IN_PLACE, OUT_OF_PLACE, or ANY)."""
return self._algorithm.buffer_mode
@@ -131,7 +148,7 @@ class Algorithm:
Returns:
True if this algorithm is defined using DSL/execution plan, False otherwise.
"""
if self._algorithm.type == _AlgorithmType.DSL:
if self._algorithm.type == CppAlgorithmType.DSL:
return True
return False
@@ -141,24 +158,26 @@ class Algorithm:
Returns:
True if this algorithm is implemented natively, False otherwise.
"""
if self._algorithm.type == _AlgorithmType.NATIVE:
if self._algorithm.type == CppAlgorithmType.NATIVE:
return True
return False
def execute(
self,
comm: Communicator,
comm: CppCommunicator,
input_buffer: int,
output_buffer: int,
input_size: int,
output_size: int,
dtype: DataType,
op: ReduceOp = ReduceOp.NOP,
dtype: CppDataType,
op: CppReduceOp = CppReduceOp.NOP,
stream: int = 0,
executor: Optional[Executor] = None,
executor: Optional[CppExecutor] = None,
nblocks=0,
nthreads_per_block=0,
symmetric_memory: bool = False,
extras: Optional[Dict[str, int]] = None,
accum_dtype: Optional[CppDataType] = None,
) -> int:
"""Execute the collective algorithm.
@@ -174,11 +193,16 @@ class Algorithm:
executor: The executor for DSL algorithms (required for DSL, optional for native).
nblocks: Number of CUDA blocks (0 for auto-selection).
nthreads_per_block: Number of threads per block (0 for auto-selection).
symmetric_memory: Whether to use symmetric memory optimization (default: False).
extras: Additional algorithm-specific parameters.
accum_dtype: Data type for accumulation during reduction. If None, defaults to
the same as dtype. Use DataType.float32 for high-precision FP8 accumulation.
Returns:
The result code (0 for success).
"""
merged_extras = dict(extras) if extras is not None else {}
accum_dtype = accum_dtype if accum_dtype is not None else dtype
return self._algorithm.execute(
comm,
int(input_buffer),
@@ -191,12 +215,18 @@ class Algorithm:
executor,
nblocks,
nthreads_per_block,
extras if extras is not None else {},
symmetric_memory,
merged_extras,
int(accum_dtype),
)
def reset(self):
"""Reset the internal state of the algorithm, if applicable."""
self._algorithm.reset()
class AlgorithmBuilder:
def __init__(self, algorithm_builder: _AlgorithmBuilder):
def __init__(self, algorithm_builder: CppAlgorithmBuilder):
self._algorithm_builder = algorithm_builder
def build(self) -> Algorithm:
@@ -204,7 +234,7 @@ class AlgorithmBuilder:
class AlgorithmCollection:
def __init__(self, native_collection: _AlgorithmCollection):
def __init__(self, native_collection: CppAlgorithmCollection):
self._native_collection = native_collection
self._algorithms = [Algorithm.create_from_native_handle(algo) for algo in self._native_collection.to_list()]
@@ -228,3 +258,24 @@ class AlgorithmCollection:
"""Register an algorithm for a collective operation."""
self._native_collection.register_algorithm(collective, algo_name, algorithm._algorithm)
self._algorithms.append(algorithm)
_flag_buffer_cache = None


def get_flag_buffer() -> cp.ndarray:
    """Return the shared GPU flag buffer used by default algorithm selection.

    The native runtime owns the underlying allocation; this function wraps it
    as a uint32 CuPy view without taking ownership. The wrapper is cached at
    module level so every caller shares the same array object.

    Returns:
        A CuPy uint32 ndarray viewing the native flag buffer.
    """
    global _flag_buffer_cache
    if _flag_buffer_cache is not None:
        return _flag_buffer_cache
    ptr, nbytes, owner = cpp_get_flag_buffer()
    # Wrap the externally-owned allocation; `owner` keeps it alive.
    raw = cp.cuda.UnownedMemory(ptr, nbytes, owner)
    memptr = cp.cuda.MemoryPointer(raw, 0)
    # 4 bytes per uint32 flag word.
    _flag_buffer_cache = cp.ndarray((nbytes // 4,), dtype=cp.uint32, memptr=memptr)
    return _flag_buffer_cache

View File

@@ -6,7 +6,7 @@ from typing import Union, Tuple
import cupy as cp
import numpy as np
from mscclpp._mscclpp import RawGpuBuffer
from mscclpp._mscclpp import CppRawGpuBuffer
__all__ = ["GpuBuffer"]
@@ -25,6 +25,6 @@ class GpuBuffer(cp.ndarray):
if any(s <= 0 for s in shape):
raise ValueError("Shape must be positive.")
# Create the buffer
buffer = RawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)

View File

@@ -6,21 +6,21 @@ from typing import Type
import cupy as cp
from mscclpp._mscclpp import (
Communicator,
Connection,
CppCommunicator,
CppConnection,
connect_nvls_collective,
EndpointConfig,
Semaphore,
ProxyService,
RegisteredMemory,
PortChannel,
MemoryChannel,
TcpBootstrap,
Transport,
TransportFlags,
CppEndpointConfig,
CppSemaphore,
CppProxyService,
CppRegisteredMemory,
CppPortChannel,
CppMemoryChannel,
CppTcpBootstrap,
CppTransport,
CppTransportFlags,
)
import mpi4py
import numpy as np
import pickle
from mscclpp.utils import is_torch_tensor
@@ -29,27 +29,47 @@ __all__ = ["CommGroup"]
class CommGroup:
def __init__(
self, mpi_comm: mpi4py.MPI.Comm = None, interfaceIpPortTrio: str = "", rank: int = None, size: int = None
self,
mpi_comm: "mpi4py.MPI.Comm" = None,
torch_group: "dist.ProcessGroup" = None,
interfaceIpPortTrio: str = "",
rank: int = None,
size: int = None,
):
if interfaceIpPortTrio == "":
self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
if interfaceIpPortTrio == "" and (mpi_comm is not None or torch_group is not None):
uniq_id = None
if mpi_comm.rank == 0:
# similar to NCCL's unique id
rank, size = (
(mpi_comm.Get_rank(), mpi_comm.Get_size())
if mpi_comm is not None
else (torch_group.rank(), torch_group.size())
)
self.bootstrap = CppTcpBootstrap.create(rank, size)
if rank == 0:
uniq_id = self.bootstrap.create_unique_id()
uniq_id_global = mpi_comm.bcast(uniq_id, 0)
if mpi_comm is not None:
import mpi4py
uniq_id_global = mpi_comm.bcast(uniq_id, 0)
else:
import torch
import torch.distributed as dist
if rank == 0:
uniq_id_global = uniq_id
pickled_data = pickle.dumps(uniq_id)
data_tensor = torch.frombuffer(bytearray(pickled_data), dtype=torch.uint8).clone()
else:
data_tensor = torch.zeros(256, dtype=torch.uint8)
dist.broadcast(data_tensor, src=0, group=torch_group)
uniq_id_global = pickle.loads(data_tensor.numpy().tobytes())
self.bootstrap.initialize(uniq_id_global)
elif mpi_comm:
# use this instead
self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
self.bootstrap.initialize(interfaceIpPortTrio)
elif not interfaceIpPortTrio == "":
assert rank >= 0 and size >= 1
self.bootstrap = TcpBootstrap.create(rank, size)
self.bootstrap = CppTcpBootstrap.create(rank, size)
self.bootstrap.initialize(interfaceIpPortTrio)
else:
raise RuntimeError("Either the interface or mpi_group need to be specified")
self.communicator = Communicator(self.bootstrap)
self.communicator = CppCommunicator(self.bootstrap)
self.my_rank = self.bootstrap.get_rank()
self.nranks = self.bootstrap.get_n_ranks()
self.nranks_per_node = self.bootstrap.get_n_ranks_per_node()
@@ -63,43 +83,43 @@ class CommGroup:
def recv(self, tensor: np.ndarray, peer: int, tag: int):
self.bootstrap.recv(tensor.ctypes.data, tensor.size * tensor.itemsize, peer, tag)
def my_ib_device(self, local_rank: int) -> Transport:
def my_ib_device(self, local_rank: int) -> CppTransport:
if local_rank == 0:
return Transport.IB0
return CppTransport.IB0
if local_rank == 1:
return Transport.IB1
return CppTransport.IB1
if local_rank == 2:
return Transport.IB2
return CppTransport.IB2
if local_rank == 3:
return Transport.IB3
return CppTransport.IB3
if local_rank == 4:
return Transport.IB4
return CppTransport.IB4
if local_rank == 5:
return Transport.IB5
return CppTransport.IB5
if local_rank == 6:
return Transport.IB6
return CppTransport.IB6
if local_rank == 7:
return Transport.IB7
return CppTransport.IB7
else:
assert False # only 8 IBs are supported
def make_connection(
self,
all_ranks: list[int],
endpoints: EndpointConfig | Transport | dict[int, EndpointConfig] | dict[int, Transport],
endpoints: CppEndpointConfig | CppTransport | dict[int, CppEndpointConfig] | dict[int, CppTransport],
use_switch: bool = False,
) -> dict[int, Connection]:
if type(endpoints) is Transport:
endpoints = EndpointConfig(endpoints)
) -> dict[int, CppConnection]:
if type(endpoints) is CppTransport:
endpoints = CppEndpointConfig(endpoints)
elif type(endpoints) is dict:
endpoints = {k: EndpointConfig(v) if type(v) is Transport else v for k, v in endpoints.items()}
endpoints = {k: CppEndpointConfig(v) if type(v) is CppTransport else v for k, v in endpoints.items()}
connections = {}
for rank in all_ranks:
if type(endpoints) is dict:
endpoint = endpoints[rank]
else:
endpoint = endpoints
if endpoint.transport == Transport.CudaIpc and use_switch:
if endpoint.transport == CppTransport.CudaIpc and use_switch:
return connect_nvls_collective(self.communicator, all_ranks, 2**30)
else:
connections[rank] = self.communicator.connect(endpoint, rank)
@@ -107,8 +127,8 @@ class CommGroup:
return connections
def register_tensor_with_connections(
self, tensor: Type[cp.ndarray] | Type[np.ndarray], connections: dict[int, Connection]
) -> dict[int, RegisteredMemory]:
self, tensor: Type[cp.ndarray] | Type[np.ndarray], connections: dict[int, CppConnection]
) -> dict[int, CppRegisteredMemory]:
local_reg_memory = self.register_local_memory(tensor, connections)
all_registered_memories = {}
all_registered_memories[self.my_rank] = local_reg_memory
@@ -121,8 +141,8 @@ class CommGroup:
return all_registered_memories
def _register_memory_with_connections(
self, memory: RegisteredMemory, connections: dict[int, Connection]
) -> dict[int, RegisteredMemory]:
self, memory: CppRegisteredMemory, connections: dict[int, CppConnection]
) -> dict[int, CppRegisteredMemory]:
all_registered_memories = {}
all_registered_memories[self.my_rank] = memory
future_memories = {}
@@ -133,18 +153,20 @@ class CommGroup:
all_registered_memories[rank] = future_memories[rank].get()
return all_registered_memories
def make_semaphores(self, connections: dict[int, Connection]) -> dict[int, Semaphore]:
def make_semaphores(self, connections: dict[int, CppConnection]) -> dict[int, CppSemaphore]:
future_semaphores = {}
for rank in connections:
future_semaphores[rank] = self.communicator.build_semaphore(connections[rank], rank)
return {rank: future.get() for rank, future in future_semaphores.items()}
def make_memory_channels(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> dict[int, MemoryChannel]:
def make_memory_channels(
self, tensor: cp.ndarray, connections: dict[int, CppConnection]
) -> dict[int, CppMemoryChannel]:
semaphores = self.make_semaphores(connections)
registered_memories = self.register_tensor_with_connections(tensor, connections)
channels = {}
for rank in connections:
channels[rank] = MemoryChannel(
channels[rank] = CppMemoryChannel(
semaphores[rank], registered_memories[rank], registered_memories[self.my_rank]
)
return channels
@@ -152,9 +174,9 @@ class CommGroup:
def make_memory_channels_with_scratch(
self,
tensor: cp.ndarray,
registeredScratchBuffer: RegisteredMemory,
connections: dict[int, Connection],
) -> dict[int, MemoryChannel]:
registeredScratchBuffer: CppRegisteredMemory,
connections: dict[int, CppConnection],
) -> dict[int, CppMemoryChannel]:
semaphores = self.make_semaphores(connections)
registered_memories = self._register_memory_with_connections(registeredScratchBuffer, connections)
channels = {}
@@ -162,17 +184,17 @@ class CommGroup:
tensor_size = (
tensor.numel() * tensor.element_size() if is_torch_tensor(tensor) else tensor.size * tensor.itemsize
)
local_registered_memory = self.communicator.register_memory(tensor_data_ptr, tensor_size, TransportFlags())
local_registered_memory = self.communicator.register_memory(tensor_data_ptr, tensor_size, CppTransportFlags())
scratch_data_ptr = registeredScratchBuffer.data()
for rank in connections:
channels[rank] = MemoryChannel(
channels[rank] = CppMemoryChannel(
semaphores[rank], registered_memories[rank], local_registered_memory, scratch_data_ptr
)
return channels
def make_port_channels(
self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection]
) -> dict[int, PortChannel]:
self, proxy_service: CppProxyService, tensor: cp.ndarray, connections: dict[int, CppConnection]
) -> dict[int, CppPortChannel]:
semaphores = self.make_semaphores(connections)
registered_memories = self.register_tensor_with_connections(tensor, connections)
memory_ids = {}
@@ -188,12 +210,12 @@ class CommGroup:
def make_port_channels_with_scratch(
self,
proxy_service: ProxyService,
proxy_service: CppProxyService,
tensor: cp.ndarray,
registeredScratchBuffer: RegisteredMemory,
connections: dict[int, Connection],
) -> dict[int, PortChannel]:
transport_flags = TransportFlags()
registeredScratchBuffer: CppRegisteredMemory,
connections: dict[int, CppConnection],
) -> dict[int, CppPortChannel]:
transport_flags = CppTransportFlags()
for rank in connections:
transport_flags |= connections[rank].transport()
data_ptr = (
@@ -223,8 +245,8 @@ class CommGroup:
return channels
def register_semaphore_with_proxy(
self, proxy_service: ProxyService, connections: dict[int, Connection]
) -> dict[int, PortChannel]:
self, proxy_service: CppProxyService, connections: dict[int, CppConnection]
) -> dict[int, CppPortChannel]:
semaphores = self.make_semaphores(connections)
semaphore_ids = {}
for rank in semaphores:
@@ -235,7 +257,7 @@ class CommGroup:
return channels
def register_memory_with_proxy(
self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection]
self, proxy_service: CppProxyService, tensor: cp.ndarray, connections: dict[int, CppConnection]
) -> dict[int, int]:
registered_memories = self.register_tensor_with_connections(tensor, connections)
memory_ids = {}
@@ -243,8 +265,8 @@ class CommGroup:
memory_ids[rank] = proxy_service.add_memory(registered_memories[rank])
return memory_ids
def register_local_memory(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> RegisteredMemory:
transport_flags = TransportFlags()
def register_local_memory(self, tensor: cp.ndarray, connections: dict[int, CppConnection]) -> CppRegisteredMemory:
transport_flags = CppTransportFlags()
for rank in connections:
transport_flags |= connections[rank].transport()
data_ptr = (

View File

@@ -26,9 +26,7 @@ from mscclpp.language.program import CollectiveProgram
from mscclpp.language.utils import AlgoSpec
from mscclpp.utils import get_device_arch
from mscclpp._mscclpp import (
ExecutionPlan,
)
from mscclpp._mscclpp import CppExecutionPlan, env
logging.basicConfig(level=logging.INFO)
@@ -51,7 +49,7 @@ class DslCompiler:
into execution plans that can be run on GPUs. The compiled plans are cached
to disk for reuse.
The cache location can be configured via the `MSCCLPP_EXECUTION_PLAN_DIR`
The cache location can be configured via the `MSCCLPP_CACHE_DIR`
environment variable (defaults to `~/.cache/mscclpp`).
Example:
@@ -138,7 +136,7 @@ class DslCompiler:
)
).hexdigest()
plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp")
plan_dir = Path(env().cache_dir)
os.makedirs(plan_dir, exist_ok=True)
filename = f"{plan_id}.json"
plan_path = os.path.join(plan_dir, filename)
@@ -157,7 +155,7 @@ class DslCompiler:
os.remove(tmp_path)
except Exception:
Path(plan_path).unlink(missing_ok=True)
execution_plan = ExecutionPlan(plan_path, rank)
execution_plan = CppExecutionPlan(plan_path, rank)
return Algorithm(
id=plan_id,
execution_plan=execution_plan,
@@ -179,8 +177,8 @@ class NativeCodeCompiler:
based on the runtime environment. Compiled modules are cached to avoid
recompilation.
The cache location can be configured via the `MSCCLPP_NATIVE_CACHE_DIR`
environment variable (defaults to `~/.cache/mscclpp/native`).
The cache location can be configured via the `MSCCLPP_CACHE_DIR`
environment variable (defaults to `~/.cache/mscclpp`).
Attributes:
_is_hip: True if running on AMD/ROCm, False for NVIDIA/CUDA.
@@ -226,8 +224,7 @@ class NativeCodeCompiler:
"-L" + os.path.join(self._lib_home, "lib"),
"-lmscclpp",
]
cache_root = os.environ.get("MSCCLPP_NATIVE_CACHE_DIR", Path.home() / ".cache/mscclpp/native")
self._cache_dir = Path(cache_root)
self._cache_dir = Path(env().cache_dir) / "native"
self._cache_dir.mkdir(parents=True, exist_ok=True)
def _get_compiler(self) -> str:
@@ -283,7 +280,7 @@ class NativeCodeCompiler:
Note:
- The source file should include pybind11 bindings to expose functions.
- MSCCLPP headers are automatically included in the compilation.
- The module is cached in `MSCCLPP_NATIVE_CACHE_DIR` (default: ~/.cache/mscclpp/native).
- The module is cached in `MSCCLPP_CACHE_DIR` (default: ~/.cache/mscclpp).
- File locking is used to prevent race conditions during parallel compilation.
Example:

View File

@@ -3,12 +3,10 @@
from __future__ import annotations
from typing import Union
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_flag_buffer
import atexit
from mscclpp._mscclpp import (
AlgorithmCollectionBuilder as _AlgorithmCollectionBuilder,
)
from mscclpp._mscclpp import CppAlgorithmCollectionBuilder
__all__ = ["AlgorithmCollectionBuilder"]
@@ -24,13 +22,14 @@ class AlgorithmCollectionBuilder:
@classmethod
def reset(cls):
if cls._instance is not None:
_AlgorithmCollectionBuilder.reset()
CppAlgorithmCollectionBuilder.reset()
cls._instance = None
def __init__(self):
if not hasattr(self, "_initialized"):
self._builder = _AlgorithmCollectionBuilder.get_instance()
self._builder = CppAlgorithmCollectionBuilder.get_instance()
self._initialized = True
self._flag_buffer = None
def add_algorithm_builder(self, algorithm_builder: Union[AlgorithmBuilder, Algorithm]):
if isinstance(algorithm_builder, AlgorithmBuilder):
@@ -52,8 +51,17 @@ class AlgorithmCollectionBuilder:
collection = self._builder.build()
return AlgorithmCollection(collection)
def build_default_algorithms(self, scratch_buffer: int, scratch_buffer_size: int, rank: int) -> AlgorithmCollection:
native_collection = self._builder.build_default_algorithms(int(scratch_buffer), scratch_buffer_size, rank)
def build_default_algorithms(
self,
scratch_buffer: int,
scratch_buffer_size: int,
rank: int,
) -> AlgorithmCollection:
if self._flag_buffer is None:
self._flag_buffer = get_flag_buffer()
native_collection = self._builder.build_default_algorithms(
int(scratch_buffer), scratch_buffer_size, self._flag_buffer.data.ptr, self._flag_buffer.nbytes, rank
)
return AlgorithmCollection(native_collection)

View File

@@ -24,11 +24,11 @@ def _a2av_dbg(msg: str):
if _DEBUG_A2AV:
print(msg, file=sys.stderr, flush=True)
from mscclpp._mscclpp import (
Communicator,
TcpBootstrap,
DataType,
ReduceOp,
CommResult,
CppCommunicator as Communicator,
CppTcpBootstrap as TcpBootstrap,
CppDataType as DataType,
CppReduceOp as ReduceOp,
CppCommResult as CommResult,
)
from mscclpp.ext.algorithm_collection_builder import AlgorithmCollectionBuilder
@@ -375,6 +375,7 @@ class MscclppAlltoAllV:
None, # executor (not needed for native algos)
0, # nblocks (auto)
0, # nthreads_per_block (auto)
False, # symmetric_memory
self._extras,
)

View File

@@ -140,7 +140,7 @@ class MemoryChannel:
for tb_id in tb_list:
tb_chunk_id = get_program().setup_remote_chunk(self.src_rank, tb_id, remote_chunk, self.channel_type)
tb_channel_ids = get_program().setup_channel(tb, self)
tb_channel_ids = get_program().setup_channel(tb_id, self)
op = GetOperation(
src_buff=[RemoteChunk(src_chunk.buffer, src_chunk.index, src_chunk.size, tb_chunk_id)],
dst_buff=[LocalChunk(dst_chunk.buffer, dst_chunk.index, dst_chunk.size)],

View File

@@ -534,6 +534,7 @@ class PutOperation(BaseOperation):
self.dst_buff = dst_buff
self.channel_ids = channel_ids
self.channel_type = channel_type
self.from_packet = from_packet
self.to_packet = to_packet
self.with_signal = with_signal
self.with_signal_and_flush = with_signal_and_flush
@@ -579,6 +580,25 @@ class PutOperation(BaseOperation):
with_signal=self.with_signal,
with_signal_and_flush=self.with_signal_and_flush,
)
elif (
isinstance(other, PutOperation)
and self.name == Instruction.read_put_packet
and self.name == other.name
and self.src_buff == other.src_buff
and self.channel_type == other.channel_type
and self.tbg_info == other.tbg_info
):
fused_operation = PutOperation(
src_buff=self.src_buff,
dst_buff=self.dst_buff + other.dst_buff,
channel_ids=self.channel_ids + other.channel_ids,
channel_type=self.channel_type,
tbg_info=self.tbg_info,
from_packet=self.from_packet,
to_packet=self.to_packet,
with_signal=self.with_signal,
with_signal_and_flush=self.with_signal_and_flush,
)
return fused_operation
@@ -725,7 +745,7 @@ class ReduceOperation(BaseOperation):
remote_dst_buff=self.remote_dst_buff + other.dst_buff,
channel_ids=self.channel_ids,
put_channel_ids=self.put_channel_ids + other.channel_ids,
channel_type=self.channel_type,
channel_type=other.channel_type,
reduce_operation=self.reduce_operation,
tbg_info=self.tbg_info,
packet=self.packet,

View File

@@ -0,0 +1,78 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def allgather_example(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
    """Emit an LL-protocol all-gather execution plan as JSON on stdout.

    Each rank packs its input chunk into the upper half of its scratch buffer,
    pushes that packet into every peer's scratch buffer over memory channels,
    and finally unpacks all received packets into its output buffer.

    Args:
        name: Name embedded in the generated program.
        gpu_size: Number of participating GPUs (ranks).
        num_threads_per_block: Threads per block used by the plan.
        min_message_size: Smallest message size (bytes) the plan covers.
        max_message_size: Largest message size (bytes) the plan covers.
    """
    chunks_per_loop = 1
    collective = AllGather(gpu_size, chunks_per_loop, True)
    with CollectiveProgram(
        name,
        collective,
        gpu_size,
        protocol="LL",
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=True,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    ):
        # The second half of each scratch buffer stages outgoing packets.
        scratch_offset = gpu_size
        # One scratch buffer per rank, sized for send + receive halves.
        scratch = [Buffer(g, 2 * gpu_size) for g in range(gpu_size)]

        # Stage 1: pack the local input chunk into this rank's scratch slot.
        for g in range(gpu_size):
            sender = Rank(g)
            src = sender.get_input_buffer()
            sender.copy_packets(
                scratch[g][scratch_offset + g : scratch_offset + g + 1], src[0:1], tb=0
            )

        # Stage 2: push the staged packet into every peer's scratch buffer.
        for g in range(gpu_size):
            sender = Rank(g)
            # NOTE(review): result unused here; call kept in case the DSL
            # records buffer usage as a side effect — TODO confirm.
            sender.get_output_buffer()
            for step in range(1, gpu_size):
                dst = (g + step) % gpu_size
                channel = MemoryChannel(dst, g)
                channel.read_put_packets(
                    scratch[dst][g : g + 1],
                    scratch[g][scratch_offset + g : scratch_offset + g + 1],
                    0,
                )

        # Stage 3: unpack every received packet into the output buffer.
        for g in range(gpu_size):
            receiver = Rank(g)
            out = receiver.get_output_buffer()
            for step in range(1, gpu_size):
                src_rank = (g + step) % gpu_size
                receiver.unpack_packets(
                    out[src_rank : src_rank + 1],
                    scratch[g][src_rank : src_rank + 1],
                    tb=0,
                )
        print(JSON())
# CLI entry point: parse plan parameters and emit the all-gather program.
cli = argparse.ArgumentParser()
cli.add_argument("--name", type=str, help="name of the program")
cli.add_argument("--num_gpus", type=int, help="number of gpus")
cli.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
cli.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
cli.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
opts = cli.parse_args()
allgather_example(
    opts.name,
    opts.num_gpus,
    opts.num_threads_per_block,
    opts.min_message_size,
    opts.max_message_size,
)

Some files were not shown because too many files have changed in this diff Show More